verso-db 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/CHANGELOG.md +13 -0
  2. package/README.md +13 -7
  3. package/dist/BinaryHeap.d.ts +11 -1
  4. package/dist/BinaryHeap.d.ts.map +1 -1
  5. package/dist/BinaryHeap.js +138 -0
  6. package/dist/BinaryHeap.js.map +1 -0
  7. package/dist/Collection.d.ts +30 -4
  8. package/dist/Collection.d.ts.map +1 -1
  9. package/dist/Collection.js +1186 -0
  10. package/dist/Collection.js.map +1 -0
  11. package/dist/HNSWIndex.d.ts +59 -0
  12. package/dist/HNSWIndex.d.ts.map +1 -1
  13. package/dist/HNSWIndex.js +2818 -0
  14. package/dist/HNSWIndex.js.map +1 -0
  15. package/dist/MaxBinaryHeap.d.ts +2 -64
  16. package/dist/MaxBinaryHeap.d.ts.map +1 -1
  17. package/dist/MaxBinaryHeap.js +5 -0
  18. package/dist/MaxBinaryHeap.js.map +1 -0
  19. package/dist/SearchWorker.d.ts +57 -4
  20. package/dist/SearchWorker.d.ts.map +1 -1
  21. package/dist/SearchWorker.js +573 -0
  22. package/dist/SearchWorker.js.map +1 -0
  23. package/dist/VectorDB.d.ts.map +1 -1
  24. package/dist/VectorDB.js +246 -0
  25. package/dist/VectorDB.js.map +1 -0
  26. package/dist/WorkerPool.d.ts +32 -2
  27. package/dist/WorkerPool.d.ts.map +1 -1
  28. package/dist/WorkerPool.js +266 -0
  29. package/dist/WorkerPool.js.map +1 -0
  30. package/dist/backends/JsDistanceBackend.d.ts.map +1 -1
  31. package/dist/backends/JsDistanceBackend.js +163 -0
  32. package/dist/backends/JsDistanceBackend.js.map +1 -0
  33. package/dist/encoding/DeltaEncoder.d.ts +2 -2
  34. package/dist/encoding/DeltaEncoder.d.ts.map +1 -1
  35. package/dist/encoding/DeltaEncoder.js +199 -0
  36. package/dist/encoding/DeltaEncoder.js.map +1 -0
  37. package/dist/errors.js +97 -0
  38. package/dist/errors.js.map +1 -0
  39. package/dist/index.d.ts +3 -3
  40. package/dist/index.d.ts.map +1 -1
  41. package/dist/index.js +61 -42
  42. package/dist/index.js.map +1 -9
  43. package/dist/presets.js +205 -0
  44. package/dist/presets.js.map +1 -0
  45. package/dist/quantization/ScalarQuantizer.d.ts +0 -34
  46. package/dist/quantization/ScalarQuantizer.d.ts.map +1 -1
  47. package/dist/quantization/ScalarQuantizer.js +346 -0
  48. package/dist/quantization/ScalarQuantizer.js.map +1 -0
  49. package/dist/storage/BatchWriter.js +351 -0
  50. package/dist/storage/BatchWriter.js.map +1 -0
  51. package/dist/storage/BunStorageBackend.d.ts +7 -3
  52. package/dist/storage/BunStorageBackend.d.ts.map +1 -1
  53. package/dist/storage/BunStorageBackend.js +182 -0
  54. package/dist/storage/BunStorageBackend.js.map +1 -0
  55. package/dist/storage/MemoryBackend.js +109 -0
  56. package/dist/storage/MemoryBackend.js.map +1 -0
  57. package/dist/storage/OPFSBackend.d.ts.map +1 -1
  58. package/dist/storage/OPFSBackend.js +325 -0
  59. package/dist/storage/OPFSBackend.js.map +1 -0
  60. package/dist/storage/StorageBackend.js +12 -0
  61. package/dist/storage/StorageBackend.js.map +1 -0
  62. package/dist/storage/WriteAheadLog.js +321 -0
  63. package/dist/storage/WriteAheadLog.js.map +1 -0
  64. package/dist/storage/createStorageBackend.d.ts +4 -0
  65. package/dist/storage/createStorageBackend.d.ts.map +1 -1
  66. package/dist/storage/createStorageBackend.js +119 -0
  67. package/dist/storage/createStorageBackend.js.map +1 -0
  68. package/{src/storage/index.ts → dist/storage/index.js} +7 -27
  69. package/dist/storage/index.js.map +1 -0
  70. package/dist/storage/nodeFsRuntime.d.ts +14 -0
  71. package/dist/storage/nodeFsRuntime.d.ts.map +1 -0
  72. package/dist/storage/nodeFsRuntime.js +105 -0
  73. package/dist/storage/nodeFsRuntime.js.map +1 -0
  74. package/package.json +9 -7
  75. package/src/BinaryHeap.ts +0 -136
  76. package/src/Collection.ts +0 -1262
  77. package/src/HNSWIndex.ts +0 -2894
  78. package/src/MaxBinaryHeap.ts +0 -181
  79. package/src/SearchWorker.ts +0 -264
  80. package/src/VectorDB.ts +0 -319
  81. package/src/WorkerPool.ts +0 -222
  82. package/src/backends/JsDistanceBackend.ts +0 -171
  83. package/src/encoding/DeltaEncoder.ts +0 -236
  84. package/src/errors.ts +0 -110
  85. package/src/index.ts +0 -106
  86. package/src/presets.ts +0 -229
  87. package/src/quantization/ScalarQuantizer.ts +0 -487
  88. package/src/storage/BatchWriter.ts +0 -420
  89. package/src/storage/BunStorageBackend.ts +0 -199
  90. package/src/storage/MemoryBackend.ts +0 -122
  91. package/src/storage/OPFSBackend.ts +0 -348
  92. package/src/storage/StorageBackend.ts +0 -74
  93. package/src/storage/WriteAheadLog.ts +0 -379
  94. package/src/storage/createStorageBackend.ts +0 -137
package/src/Collection.ts DELETED
@@ -1,1262 +0,0 @@
1
- import { HNSWIndex, DistanceMetric } from './HNSWIndex';
2
- import type { StorageBackend } from './storage/StorageBackend';
3
- import { DimensionMismatchError, DuplicateVectorError, VectorDBError } from './errors';
4
-
5
- /**
6
- * Configuration for adding vectors to a collection.
7
- */
8
- export interface AddConfig {
9
- /** Unique string identifiers for each vector */
10
- ids: string[];
11
- /** Array of vectors to add (must match collection dimension) */
12
- vectors: Array<number[] | Float32Array>;
13
- /** Optional metadata for each vector (same length as ids/vectors, plain JSON objects only) */
14
- metadata?: Array<Record<string, any>>;
15
- }
16
-
17
- /**
18
- * Configuration for querying a collection.
19
- *
20
- * @example
21
- * ```typescript
22
- * // Simple query
23
- * const results = await collection.query({
24
- * queryVector: [0.1, 0.2, 0.3],
25
- * k: 10
26
- * });
27
- *
28
- * // Query with metadata filter
29
- * const results = await collection.query({
30
- * queryVector: [0.1, 0.2, 0.3],
31
- * k: 10,
32
- * filter: {
33
- * category: 'science', // Exact match
34
- * year: { $gte: 2020 }, // Greater than or equal
35
- * status: { $in: ['active', 'pending'] } // In array
36
- * }
37
- * });
38
- * ```
39
- */
40
- export interface QueryConfig {
41
- /** The query vector (must match collection dimension) */
42
- queryVector: number[] | Float32Array;
43
- /** Number of nearest neighbors to return */
44
- k: number;
45
- /**
46
- * Optional metadata filter. Supports MongoDB-style operators:
47
- * - Simple equality: `{ field: value }`
48
- * - `$gt`: Greater than `{ field: { $gt: 5 } }`
49
- * - `$gte`: Greater than or equal `{ field: { $gte: 5 } }`
50
- * - `$lt`: Less than `{ field: { $lt: 10 } }`
51
- * - `$lte`: Less than or equal `{ field: { $lte: 10 } }`
52
- * - `$ne`: Not equal `{ field: { $ne: 'excluded' } }`
53
- * - `$in`: In array `{ field: { $in: ['a', 'b', 'c'] } }`
54
- * - `$nin`: Not in array `{ field: { $nin: ['x', 'y'] } }`
55
- */
56
- filter?: Record<string, any>;
57
- /** Search effort parameter (higher = better recall, slower). Default: max(k*2, 50) */
58
- efSearch?: number;
59
- /** Use int8 quantized search + float32 rescore. Default: auto (true when quantization enabled) */
60
- useQuantizedSearch?: boolean;
61
- /** Oversampling multiplier for quantized rescore (default: 3). Higher = better recall, slower. */
62
- candidateMultiplier?: number;
63
- }
64
-
65
- export interface QueryResult {
66
- ids: string[];
67
- distances: number[];
68
- metadata: Array<Record<string, any>>;
69
- }
70
-
71
- /** Supported metadata filter operators — hoisted to avoid per-call allocation */
72
- const SUPPORTED_OPERATORS = new Set(['$eq', '$ne', '$gt', '$gte', '$lt', '$lte', '$in', '$nin']);
73
- const COLLECTION_STATE_VERSION = 1;
74
-
75
- interface CollectionStorageState {
76
- version: number;
77
- indexKey: string;
78
- metaKey: string;
79
- deletedKey: string;
80
- }
81
-
82
- export class Collection {
83
- private static readonly MAX_INTERNAL_ID = 0xFFFFFFFE;
84
-
85
- private name: string;
86
- private dimension: number;
87
- private metric: DistanceMetric;
88
- private M: number; // HNSW M parameter
89
- private efConstruction: number; // HNSW efConstruction parameter
90
- private storage: StorageBackend;
91
- private readonly stateKey: string; // Storage key for atomic state pointer
92
- private readonly defaultIndexKey: string;
93
- private readonly defaultMetaKey: string;
94
- private readonly defaultDeletedKey: string;
95
- private indexKey: string; // Storage key for HNSW index data
96
- private metaKey: string; // Storage key for metadata
97
- private deletedKey: string; // Storage key for deleted IDs
98
- private hnsw: HNSWIndex;
99
- private idMap: Map<string, number>; // Maps string IDs to internal numeric IDs
100
- private idReverseMap: Map<number, string>; // Maps internal numeric IDs to string IDs
101
- private metadata: Map<number, Record<string, any>>; // Stores metadata for each vector
102
- private deletedIds: Set<number>; // Tombstone markers for deleted vectors
103
- private nextNumericId: number; // Monotonic counter for generating unique internal IDs
104
- private activeCount: number; // O(1) counter for non-deleted vectors
105
-
106
- constructor(name: string, config: { dimension: number; metric?: DistanceMetric; M?: number; efConstruction?: number }, storage: StorageBackend) {
107
- this.name = name;
108
- this.dimension = config.dimension;
109
- this.metric = config.metric ?? 'cosine';
110
- this.M = config.M ?? 16;
111
- this.efConstruction = config.efConstruction ?? 200;
112
- this.storage = storage;
113
- this.defaultIndexKey = `${name}/${name}.hnsw`;
114
- this.defaultMetaKey = `${name}/${name}.meta`;
115
- this.defaultDeletedKey = `${name}/${name}.deleted`;
116
- this.stateKey = `${name}/${name}.state`;
117
- this.indexKey = this.defaultIndexKey;
118
- this.metaKey = this.defaultMetaKey;
119
- this.deletedKey = this.defaultDeletedKey;
120
-
121
- // Initialize HNSW index with the specified parameters
122
- this.hnsw = new HNSWIndex(
123
- config.dimension,
124
- this.metric,
125
- this.M,
126
- this.efConstruction
127
- );
128
-
129
- this.idMap = new Map();
130
- this.idReverseMap = new Map();
131
- this.metadata = new Map();
132
- this.deletedIds = new Set();
133
- this.nextNumericId = 0;
134
- this.activeCount = 0;
135
- }
136
-
137
- async init(): Promise<void> {
138
- // Load existing data if files exist
139
- await this.loadFromDisk();
140
- }
141
-
142
- private reserveNumericIds(count: number): number {
143
- if (!Number.isInteger(count) || count < 0) {
144
- throw new VectorDBError(`Invalid ID reservation count: ${count}`, 'VALIDATION_ERROR');
145
- }
146
- if (count === 0) return this.nextNumericId;
147
-
148
- const startId = this.nextNumericId;
149
- const endId = startId + count - 1;
150
-
151
- if (!Number.isSafeInteger(endId) || endId > Collection.MAX_INTERNAL_ID) {
152
- throw new VectorDBError(
153
- `Collection '${this.name}' cannot allocate ${count} internal IDs: exhausted ID space (max ${Collection.MAX_INTERNAL_ID})`,
154
- 'VALIDATION_ERROR'
155
- );
156
- }
157
-
158
- this.nextNumericId = endId + 1;
159
- return startId;
160
- }
161
-
162
- private setActiveStorageKeys(indexKey: string, metaKey: string, deletedKey: string): void {
163
- this.indexKey = indexKey;
164
- this.metaKey = metaKey;
165
- this.deletedKey = deletedKey;
166
- }
167
-
168
- private isValidStorageState(value: unknown): value is CollectionStorageState {
169
- if (!value || typeof value !== 'object') return false;
170
- const state = value as Record<string, unknown>;
171
- return (
172
- state.version === COLLECTION_STATE_VERSION &&
173
- typeof state.indexKey === 'string' &&
174
- typeof state.metaKey === 'string' &&
175
- typeof state.deletedKey === 'string'
176
- );
177
- }
178
-
179
- private async loadStorageState(): Promise<CollectionStorageState | null> {
180
- const stateBuffer = await this.storage.read(this.stateKey);
181
- if (!stateBuffer) return null;
182
-
183
- try {
184
- const raw = JSON.parse(new TextDecoder().decode(stateBuffer));
185
- if (this.isValidStorageState(raw)) {
186
- return raw;
187
- }
188
- } catch {
189
- // Ignore malformed state file and fall back to legacy keys.
190
- }
191
-
192
- return null;
193
- }
194
-
195
- private getVersionedDataKeys(saveId: string): { indexKey: string; metaKey: string; deletedKey: string } {
196
- return {
197
- indexKey: `${this.name}/${this.name}.${saveId}.hnsw`,
198
- metaKey: `${this.name}/${this.name}.${saveId}.meta`,
199
- deletedKey: `${this.name}/${this.name}.${saveId}.deleted`,
200
- };
201
- }
202
-
203
- private isVersionedDataKey(key: string): boolean {
204
- if (
205
- key === this.defaultIndexKey ||
206
- key === this.defaultMetaKey ||
207
- key === this.defaultDeletedKey
208
- ) {
209
- return false;
210
- }
211
- const prefix = `${this.name}/${this.name}.`;
212
- return key.startsWith(prefix) && (
213
- key.endsWith('.hnsw') ||
214
- key.endsWith('.meta') ||
215
- key.endsWith('.deleted')
216
- );
217
- }
218
-
219
- private validateLoadedIndexConfig(index: HNSWIndex): void {
220
- if (
221
- index.getDimension() !== this.dimension ||
222
- index.getMetric() !== this.metric ||
223
- index.getM() !== this.M ||
224
- index.getEfConstruction() !== this.efConstruction
225
- ) {
226
- throw new VectorDBError(
227
- `Collection '${this.name}' index parameters do not match manifest configuration`,
228
- 'CORRUPT_COLLECTION'
229
- );
230
- }
231
- }
232
-
233
- private async loadFromDisk(): Promise<void> {
234
- // Rebuild in-memory state from storage: clear maps/counters, then load the index, metadata and tombstones.
235
- this.idMap.clear();
236
- this.idReverseMap.clear();
237
- this.metadata.clear();
238
- this.deletedIds.clear();
239
- this.nextNumericId = 0;
240
- this.activeCount = 0;
241
-
242
- const persistedState = await this.loadStorageState();
243
- if (persistedState) {
244
- this.setActiveStorageKeys(persistedState.indexKey, persistedState.metaKey, persistedState.deletedKey);
245
- } else {
246
- this.setActiveStorageKeys(this.defaultIndexKey, this.defaultMetaKey, this.defaultDeletedKey);
247
- }
248
-
249
- // Load the binary HNSW index via StorageBackend (a missing index leaves the current index in place)
250
- const indexData = await this.storage.read(this.indexKey);
251
- if (indexData) {
252
- try {
253
- const loaded = HNSWIndex.deserialize(indexData);
254
- this.validateLoadedIndexConfig(loaded);
255
- this.hnsw = loaded;
256
- } catch (e) {
257
- if (e instanceof VectorDBError && e.code === 'CORRUPT_COLLECTION') { // CORRUPT_COLLECTION is rethrown; any other failure falls back to an empty index
258
- throw e;
259
- }
260
- console.warn(`Failed to load HNSW index from ${this.indexKey}:`, e);
261
- // Reset to an empty index to avoid stale in-memory graph state.
262
- this.hnsw = new HNSWIndex(this.dimension, this.metric, this.M, this.efConstruction);
263
- }
264
- }
265
-
266
- // Load metadata via StorageBackend (one record per line; blank lines are skipped)
267
- const metaData = await this.storage.read(this.metaKey);
268
- if (metaData) {
269
- try {
270
- const metaContent = new TextDecoder().decode(metaData);
271
- const lines = metaContent.split('\n');
272
- for (let i = 0; i < lines.length; i++) {
273
- const line = lines[i];
274
- if (line.length === 0 || line.trim().length === 0) continue;
275
-
276
- try {
277
- const parsed = this.parseMetadataLine(line);
278
- if (!parsed) continue;
279
- const { id, internalId, metadata } = parsed;
280
-
281
- this.idMap.set(id, internalId);
282
- this.idReverseMap.set(internalId, id);
283
- if (metadata !== undefined) {
284
- this.metadata.set(internalId, metadata);
285
- }
286
- } catch {
287
- // Skip malformed lines (logged below) and continue with the next line
288
- console.warn(`Skipping malformed line in metadata file: ${line}`);
289
- }
290
- }
291
- } catch {
292
- // Ignore errors while decoding the metadata payload; entries already loaded are kept
293
- }
294
- }
295
-
296
- // Load deleted IDs via StorageBackend (expected to be a JSON array; anything else is ignored)
297
- const deletedData = await this.storage.read(this.deletedKey);
298
- if (deletedData) {
299
- try {
300
- const deletedContent = new TextDecoder().decode(deletedData);
301
- const deletedArray = JSON.parse(deletedContent);
302
- if (Array.isArray(deletedArray)) {
303
- this.deletedIds = new Set(deletedArray);
304
- }
305
- } catch {
306
- // Ignore malformed tombstone data and keep the empty set from the reset above
307
- }
308
- }
309
-
310
- this.reconcileLoadedState();
311
- }
312
-
313
- async add(config: AddConfig): Promise<void> {
314
- this.validateAddInput(config, false);
315
-
316
- const points: Array<{ id: number; vector: Float32Array }> = new Array(config.ids.length);
317
- const startId = this.reserveNumericIds(config.ids.length);
318
-
319
- for (let i = 0; i < config.ids.length; i++) {
320
- const vector = config.vectors[i];
321
- const numericId = startId + i;
322
- points[i] = { id: numericId, vector: new Float32Array(vector) };
323
- }
324
-
325
- await this.hnsw.addPointsBulk(points);
326
-
327
- for (let i = 0; i < config.ids.length; i++) {
328
- const id = config.ids[i];
329
- const metadata = config.metadata ? config.metadata[i] : undefined;
330
- const numericId = startId + i;
331
-
332
- this.idMap.set(id, numericId);
333
- this.idReverseMap.set(numericId, id);
334
- if (metadata !== undefined) {
335
- this.metadata.set(numericId, metadata);
336
- }
337
- }
338
-
339
- this.activeCount += config.ids.length;
340
- }
341
-
342
- async query(config: QueryConfig): Promise<QueryResult> {
343
- return this.executeQuery(config, 'Query vector');
344
- }
345
-
346
- /**
347
- * Batch query for multiple vectors at once.
348
- * Shares query semantics with query(), including adaptive candidate expansion
349
- * for filters and tombstones.
350
- *
351
- * @param configs Array of query configurations
352
- * @returns Array of query results, one per query
353
- */
354
- async queryBatch(configs: QueryConfig[]): Promise<QueryResult[]> {
355
- if (!Array.isArray(configs)) {
356
- throw new VectorDBError('Query batch must be an array of query configs', 'VALIDATION_ERROR');
357
- }
358
- if (configs.length === 0) return [];
359
-
360
- for (let i = 0; i < configs.length; i++) {
361
- if (!this.isPlainObject(configs[i])) {
362
- throw new VectorDBError(`Query ${i} must be a plain object`, 'VALIDATION_ERROR');
363
- }
364
- }
365
-
366
- const results: QueryResult[] = new Array(configs.length);
367
- for (let i = 0; i < configs.length; i++) {
368
- results[i] = this.executeQuery(configs[i], `Query ${i}`);
369
- }
370
- return results;
371
- }
372
-
373
- /**
374
- * Brute-force KNN search for validation and correctness checking.
375
- * Scans every stored vector, skips tombstoned IDs and filter mismatches, and returns the exact k nearest neighbors (ties broken by internal ID).
376
- */
377
- async queryBruteForce(config: QueryConfig): Promise<QueryResult> {
378
- this.validateQueryInput(config, 'Query vector');
379
- const { queryVector, k, filter } = config;
380
-
381
- // Compute distances to all vectors, applying filter during scan
382
- const queryFloat = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
383
- const hasFilter = this.hasFilter(filter);
384
- const distances: Array<{ id: number; distance: number }> = [];
385
- for (const [numericId, vector] of this.hnsw.getAllVectors()) {
386
- // Skip deleted vectors
387
- if (this.deletedIds.has(numericId)) continue;
388
-
389
- // Apply filter before distance computation so non-matching vectors cost no distance call
390
- if (hasFilter) {
391
- const meta = this.metadata.get(numericId) || {};
392
- if (!this.matchesFilter(meta, filter!)) continue;
393
- }
394
-
395
- const distance = this.hnsw.calculateDistance(queryFloat, vector);
396
- distances.push({ id: numericId, distance });
397
- }
398
-
399
- // Sort by distance (internal ID as tie-breaker for deterministic order) and take top k
400
- distances.sort((a, b) => {
401
- const diff = a.distance - b.distance;
402
- return diff !== 0 ? diff : a.id - b.id;
403
- });
404
- const filteredResults = distances.slice(0, k);
405
-
406
- // Prepare the output as parallel arrays
407
- const ids: string[] = [];
408
- const distancesOut: number[] = [];
409
- const metadata: Array<Record<string, any>> = [];
410
-
411
- for (const result of filteredResults) {
412
- const id = this.idReverseMap.get(result.id);
413
- if (id !== undefined) { // explicit undefined check: a truthiness test would also drop a falsy ID such as ''
414
- ids.push(id);
415
- distancesOut.push(result.distance);
416
- metadata.push(this.metadata.get(result.id) || {});
417
- }
418
- }
419
-
420
- return {
421
- ids,
422
- distances: distancesOut,
423
- metadata
424
- };
425
- }
426
-
427
- /**
428
- * Upsert vectors (insert or update).
429
- *
430
- * For existing IDs, the old vectors are tombstone-deleted and new vectors
431
- * are inserted. Use compact() periodically to reclaim space from
432
- * tombstoned vectors.
433
- *
434
- * For new IDs, behaves identically to add().
435
- */
436
- async upsert(config: AddConfig): Promise<void> {
437
- this.validateAddInput(config, true);
438
-
439
- const points: Array<{ id: number; vector: Float32Array }> = new Array(config.ids.length);
440
- const startId = this.reserveNumericIds(config.ids.length);
441
-
442
- for (let i = 0; i < config.ids.length; i++) {
443
- points[i] = { id: startId + i, vector: new Float32Array(config.vectors[i]) };
444
- }
445
-
446
- await this.hnsw.addPointsBulk(points);
447
-
448
- for (let i = 0; i < config.ids.length; i++) {
449
- const id = config.ids[i];
450
- const metadata = config.metadata ? config.metadata[i] : undefined;
451
- const replacementId = this.idMap.get(id);
452
- if (replacementId !== undefined) {
453
- if (!this.deletedIds.has(replacementId)) {
454
- this.activeCount--;
455
- }
456
- this.deletedIds.add(replacementId);
457
- this.idReverseMap.delete(replacementId);
458
- this.metadata.delete(replacementId);
459
- }
460
-
461
- const numericId = startId + i;
462
- this.idMap.set(id, numericId);
463
- this.idReverseMap.set(numericId, id);
464
- if (metadata !== undefined) {
465
- this.metadata.set(numericId, metadata);
466
- }
467
- this.activeCount++;
468
- }
469
- }
470
-
471
- /** Returns the vector dimension for this collection. */
472
- getDimension(): number { return this.dimension; }
473
-
474
- /** Returns the distance metric for this collection. */
475
- getMetric(): DistanceMetric { return this.metric; }
476
-
477
- /** Returns the HNSW M parameter for this collection. */
478
- getM(): number { return this.M; }
479
-
480
- /** Returns the HNSW efConstruction parameter for this collection. */
481
- getEfConstruction(): number { return this.efConstruction; }
482
-
483
- /**
484
- * Returns the number of active (non-deleted) vectors in the collection.
485
- * O(1) via maintained counter.
486
- */
487
- count(): number {
488
- return this.activeCount;
489
- }
490
-
491
- /**
492
- * Returns the total number of tracked vectors including deleted (tombstoned) ones.
493
- * Use this to determine when compaction might be beneficial.
494
- */
495
- countWithDeleted(): number {
496
- return this.activeCount + this.deletedIds.size;
497
- }
498
-
499
- /**
500
- * Returns the number of deleted (tombstoned) vectors awaiting compaction.
501
- */
502
- deletedCount(): number {
503
- return this.deletedIds.size;
504
- }
505
-
506
- /**
507
- * Mark a vector as deleted (tombstone deletion).
508
- * The vector remains in the index but is excluded from search results.
509
- * Use compact() to permanently remove deleted vectors and reclaim space.
510
- *
511
- * @param id The string ID of the vector to delete
512
- * @returns true if the vector was deleted, false if it didn't exist or was already deleted
513
- */
514
- delete(id: string): boolean {
515
- const numericId = this.idMap.get(id);
516
- if (numericId === undefined) return false;
517
- if (this.deletedIds.has(numericId)) return false;
518
-
519
- this.deletedIds.add(numericId);
520
- this.activeCount--;
521
- return true;
522
- }
523
-
524
- /**
525
- * Mark multiple vectors as deleted (tombstone deletion).
526
- *
527
- * @param ids Array of string IDs to delete
528
- * @returns Number of vectors that were successfully deleted
529
- */
530
- deleteBatch(ids: string[]): number {
531
- let deleted = 0;
532
- for (const id of ids) {
533
- if (this.delete(id)) deleted++;
534
- }
535
- return deleted;
536
- }
537
-
538
- /**
539
- * Check if a vector exists and is not deleted.
540
- */
541
- has(id: string): boolean {
542
- const numericId = this.idMap.get(id);
543
- if (numericId === undefined) return false;
544
- return !this.deletedIds.has(numericId);
545
- }
546
-
547
- /**
548
- * Check if a vector was deleted (tombstoned).
549
- */
550
- isDeleted(id: string): boolean {
551
- const numericId = this.idMap.get(id);
552
- if (numericId === undefined) return false;
553
- return this.deletedIds.has(numericId);
554
- }
555
-
556
- /**
557
- * Reorder the internal HNSW index for BFS cache locality.
558
- * Remaps all internal IDs so that graph neighbors are stored
559
- * contiguously in memory, improving search cache hit rates.
560
- */
561
- reorderIndex(): void {
562
- const oldToNew = this.hnsw.reorderForLocality();
563
- if (oldToNew.size === 0) return;
564
-
565
- // Remap idMap and idReverseMap
566
- const newIdMap = new Map<string, number>();
567
- const newIdReverseMap = new Map<number, string>();
568
- for (const [strId, oldNum] of this.idMap) {
569
- const newNum = oldToNew.get(oldNum);
570
- if (newNum !== undefined) {
571
- newIdMap.set(strId, newNum);
572
- newIdReverseMap.set(newNum, strId);
573
- }
574
- }
575
- this.idMap = newIdMap;
576
- this.idReverseMap = newIdReverseMap;
577
-
578
- // Remap metadata
579
- const newMetadata = new Map<number, Record<string, any>>();
580
- for (const [oldNum, meta] of this.metadata) {
581
- const newNum = oldToNew.get(oldNum);
582
- if (newNum !== undefined) {
583
- newMetadata.set(newNum, meta);
584
- }
585
- }
586
- this.metadata = newMetadata;
587
-
588
- // Remap deletedIds
589
- const newDeletedIds = new Set<number>();
590
- for (const oldNum of this.deletedIds) {
591
- const newNum = oldToNew.get(oldNum);
592
- if (newNum !== undefined) {
593
- newDeletedIds.add(newNum);
594
- }
595
- }
596
- this.deletedIds = newDeletedIds;
597
- }
598
-
599
- async saveToDisk(): Promise<void> {
600
- // Ensure the collection directory exists
601
- await this.storage.mkdir(this.name);
602
-
603
- const saveId = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
604
- const nextKeys = this.getVersionedDataKeys(saveId);
605
- const previousKeys = {
606
- indexKey: this.indexKey,
607
- metaKey: this.metaKey,
608
- deletedKey: this.deletedKey,
609
- };
610
-
611
- // Save the HNSW index as serialized binary via StorageBackend
612
- const indexBuffer = this.hnsw.serialize();
613
- const indexBytes = new Uint8Array(indexBuffer);
614
- await this.storage.write(nextKeys.indexKey, indexBytes);
615
-
616
- // Save metadata as JSON lines (safe for arbitrary string IDs).
617
- const metaLines: string[] = [];
618
- for (const [numericId, id] of this.idReverseMap) {
619
- const meta = this.metadata.get(numericId);
620
- metaLines.push(JSON.stringify({
621
- id,
622
- numericId,
623
- metadata: meta ?? {},
624
- }));
625
- }
626
- const metaBytes = new TextEncoder().encode(metaLines.join('\n'));
627
- await this.storage.write(nextKeys.metaKey, metaBytes);
628
-
629
- // Save deleted IDs
630
- const deletedJson = JSON.stringify([...this.deletedIds]);
631
- const deletedBytes = new TextEncoder().encode(deletedJson);
632
- await this.storage.write(nextKeys.deletedKey, deletedBytes);
633
-
634
- // Commit pointer last. Crash before this write keeps previous committed state.
635
- const state: CollectionStorageState = {
636
- version: COLLECTION_STATE_VERSION,
637
- indexKey: nextKeys.indexKey,
638
- metaKey: nextKeys.metaKey,
639
- deletedKey: nextKeys.deletedKey,
640
- };
641
- await this.storage.write(this.stateKey, new TextEncoder().encode(JSON.stringify(state)));
642
-
643
- this.setActiveStorageKeys(nextKeys.indexKey, nextKeys.metaKey, nextKeys.deletedKey);
644
-
645
- // Backward-compatible shadow copies at legacy keys for tooling/tests that
646
- // read fixed file names directly. Failures here are non-fatal because the
647
- // committed state file is the source of truth for correctness.
648
- await this.storage.write(this.defaultIndexKey, indexBytes).catch(() => {});
649
- await this.storage.write(this.defaultMetaKey, metaBytes).catch(() => {});
650
- await this.storage.write(this.defaultDeletedKey, deletedBytes).catch(() => {});
651
-
652
- // Best-effort cleanup: only delete old versioned files. Legacy fixed keys are
653
- // intentionally left untouched for backward compatibility.
654
- const staleKeys = [previousKeys.indexKey, previousKeys.metaKey, previousKeys.deletedKey];
655
- for (const key of staleKeys) {
656
- if (!this.isVersionedDataKey(key)) continue;
657
- if (key === this.indexKey || key === this.metaKey || key === this.deletedKey) continue;
658
- await this.storage.delete(key).catch(() => {});
659
- }
660
- }
661
-
662
- private executeQuery(config: QueryConfig, context: string): QueryResult {
663
- this.validateQueryInput(config, context);
664
-
665
- const queryVector = config.queryVector instanceof Float32Array
666
- ? config.queryVector
667
- : new Float32Array(config.queryVector);
668
- const filter = this.hasFilter(config.filter) ? config.filter : undefined;
669
- const efSearch = config.efSearch ?? Math.max(config.k * 2, 50);
670
-
671
- // Determine whether to use quantized search:
672
- // - Explicit true/false from config takes precedence
673
- // - Otherwise auto-detect: use quantized when quantization is enabled
674
- const useQuantized = config.useQuantizedSearch ?? this.hnsw.isQuantizationEnabled();
675
- const candidateMultiplier = config.candidateMultiplier ?? 3;
676
-
677
- const candidates = this.searchCandidates(queryVector, config.k, efSearch, filter, useQuantized, candidateMultiplier);
678
- return this.materializeResults(candidates, config.k);
679
- }
680
-
681
- private searchCandidates(
682
- queryVector: Float32Array,
683
- k: number,
684
- efSearch: number,
685
- filter?: Record<string, any>,
686
- useQuantized: boolean = false,
687
- candidateMultiplier: number = 3
688
- ): Array<{ id: number; distance: number }> {
689
- const totalCandidates = this.countWithDeleted();
690
- if (totalCandidates === 0) return [];
691
-
692
- let requestK = Math.min(Math.max(k * 2, 50), totalCandidates);
693
- let requestEf = Math.max(efSearch, requestK);
694
- let filtered: Array<{ id: number; distance: number }> = [];
695
-
696
- while (true) {
697
- const rawResults = useQuantized
698
- ? this.hnsw.searchKNNQuantized(queryVector, requestK, candidateMultiplier, requestEf)
699
- : this.hnsw.searchKNN(queryVector, requestK, requestEf);
700
- filtered = this.filterAndDeduplicateResults(rawResults, filter);
701
-
702
- if (filtered.length >= k || requestK >= totalCandidates) {
703
- return filtered;
704
- }
705
-
706
- const nextK = Math.min(totalCandidates, requestK * 2);
707
- if (nextK === requestK) {
708
- return filtered;
709
- }
710
- requestK = nextK;
711
- requestEf = Math.max(requestEf, requestK);
712
- }
713
- }
714
-
715
- private filterAndDeduplicateResults(
716
- results: Array<{ id: number; distance: number }>,
717
- filter?: Record<string, any>
718
- ): Array<{ id: number; distance: number }> {
719
- const filtered: Array<{ id: number; distance: number }> = [];
720
- const hasFilter = !!filter;
721
-
722
- for (const result of results) {
723
- if (this.deletedIds.has(result.id)) {
724
- continue;
725
- }
726
-
727
- if (hasFilter) {
728
- const metadata = this.metadata.get(result.id) || {};
729
- if (!this.matchesFilter(metadata, filter!)) {
730
- continue;
731
- }
732
- }
733
-
734
- filtered.push(result);
735
- }
736
-
737
- filtered.sort((a, b) => {
738
- const diff = a.distance - b.distance;
739
- return diff !== 0 ? diff : a.id - b.id;
740
- });
741
- return filtered;
742
- }
743
-
744
- private materializeResults( // Maps up to k ranked internal candidates to parallel arrays of string IDs, distances and metadata
744
- candidates: Array<{ id: number; distance: number }>,
745
- k: number
746
- ): QueryResult {
747
- const resultCount = Math.min(candidates.length, k);
748
- const ids = new Array<string>(resultCount);
749
- const distances = new Array<number>(resultCount);
750
- const metadata = new Array<Record<string, any>>(resultCount);
751
-
752
- let outIdx = 0;
753
- for (let i = 0; i < resultCount; i++) {
754
- const candidate = candidates[i];
755
- const id = this.idReverseMap.get(candidate.id);
756
- if (id === undefined) continue; // skip only unmapped internal IDs; a truthiness test would also drop a falsy ID such as ''
757
- ids[outIdx] = id;
758
- distances[outIdx] = candidate.distance;
759
- metadata[outIdx] = this.metadata.get(candidate.id) || {};
760
- outIdx++;
761
- }
762
-
763
- if (outIdx < resultCount) { // some candidates had no reverse mapping: trim the preallocated tails
764
- ids.length = outIdx;
765
- distances.length = outIdx;
766
- metadata.length = outIdx;
767
- }
768
-
769
- return { ids, distances, metadata };
770
- }
772
-
773
/**
 * Validate a query configuration before it reaches the index.
 *
 * Checks run in a fixed order: config shape, query vector type, `k`,
 * dimensionality, finiteness of every component, `efSearch`, then the filter.
 *
 * @param config Query configuration supplied by the caller.
 * @param context Label used as a prefix in error messages.
 * @throws VectorDBError with code `VALIDATION_ERROR` for malformed input.
 * @throws DimensionMismatchError when the query vector length differs from the collection dimension.
 */
private validateQueryInput(config: QueryConfig, context: string): void {
  if (!this.isPlainObject(config)) {
    throw new VectorDBError(`${context} config must be a plain object`, 'VALIDATION_ERROR');
  }

  const { queryVector, k, efSearch, filter } = config as QueryConfig;

  if (!Array.isArray(queryVector) && !(queryVector instanceof Float32Array)) {
    throw new VectorDBError(`${context} must include a queryVector (number[] or Float32Array)`, 'VALIDATION_ERROR');
  }

  if (!(Number.isInteger(k) && k > 0)) {
    throw new VectorDBError(`k must be a positive integer, got ${k}`, 'VALIDATION_ERROR');
  }

  const length = queryVector.length;
  if (length !== this.dimension) {
    throw new DimensionMismatchError(this.dimension, length, context);
  }

  for (let dim = 0; dim < length; dim++) {
    const component = queryVector[dim];
    if (Number.isFinite(component)) continue;
    throw new VectorDBError(
      `${context} contains non-finite value at dimension ${dim}: ${component}`,
      'VALIDATION_ERROR'
    );
  }

  if (efSearch !== undefined && !(Number.isInteger(efSearch) && efSearch > 0)) {
    throw new VectorDBError(`efSearch must be a positive integer, got ${efSearch}`, 'VALIDATION_ERROR');
  }

  if (filter === undefined) return;
  if (!this.isPlainObject(filter)) {
    throw new VectorDBError('Filter must be a plain object', 'VALIDATION_ERROR');
  }
  this.validateFilterOperators(filter);
}
819
-
820
/**
 * Report whether a filter is present and carries at least one own enumerable key.
 *
 * @param filter Optional metadata filter.
 * @returns `false` for a missing or empty filter, `true` otherwise.
 */
private hasFilter(filter?: Record<string, any>): boolean {
  if (!filter) {
    return false;
  }
  return Object.keys(filter).length > 0;
}
823
-
824
/**
 * Bring the ID maps, tombstones, metadata and counters back in line with the
 * vectors reported by the index.
 *
 * Steps, in order: collect indexed numeric IDs; drop string IDs whose vector
 * is not indexed; rebuild the reverse map; drop stale tombstones; drop
 * metadata without a mapping; verify every non-deleted indexed vector has a
 * mapping; verify the ID range; recompute `nextNumericId` and `activeCount`.
 *
 * @throws VectorDBError with code `CORRUPT_COLLECTION` when two string IDs share
 *         a numeric ID, when a non-deleted indexed vector has no string ID, or
 *         when the highest numeric ID exceeds `Collection.MAX_INTERNAL_ID`.
 */
private reconcileLoadedState(): void {
  const indexedIds = new Set<number>();
  let highestId = -1;
  for (const [numericId] of this.hnsw.getAllVectors()) {
    indexedIds.add(numericId);
    highestId = numericId > highestId ? numericId : highestId;
  }

  // Iterate over a snapshot so entries can be removed while scanning.
  for (const [stringId, numericId] of [...this.idMap]) {
    if (indexedIds.has(numericId)) continue;
    this.idMap.delete(stringId);
  }

  this.idReverseMap.clear();
  for (const [stringId, numericId] of this.idMap) {
    if (this.idReverseMap.has(numericId)) {
      throw new VectorDBError(
        `Collection '${this.name}' metadata is corrupt: duplicate numeric ID ${numericId}`,
        'CORRUPT_COLLECTION'
      );
    }
    this.idReverseMap.set(numericId, stringId);
  }

  for (const tombstone of [...this.deletedIds]) {
    if (indexedIds.has(tombstone)) continue;
    this.deletedIds.delete(tombstone);
  }

  for (const numericId of [...this.metadata.keys()]) {
    if (this.idReverseMap.has(numericId)) continue;
    this.metadata.delete(numericId);
  }

  // Every indexed vector that is not tombstoned must be reachable by a string ID.
  let unmappedCount = 0;
  for (const numericId of indexedIds) {
    if (!this.deletedIds.has(numericId) && !this.idReverseMap.has(numericId)) {
      unmappedCount++;
    }
  }
  if (unmappedCount > 0) {
    throw new VectorDBError(
      `Collection '${this.name}' metadata is inconsistent with index: missing ${unmappedCount} active ID mapping(s)`,
      'CORRUPT_COLLECTION'
    );
  }

  if (highestId > Collection.MAX_INTERNAL_ID) {
    throw new VectorDBError(
      `Collection '${this.name}' contains unsupported internal ID ${highestId} (max ${Collection.MAX_INTERNAL_ID})`,
      'CORRUPT_COLLECTION'
    );
  }

  this.nextNumericId = highestId + 1;

  let liveCount = 0;
  for (const numericId of this.idMap.values()) {
    if (this.deletedIds.has(numericId)) continue;
    liveCount++;
  }
  this.activeCount = liveCount;
}
892
-
893
/**
 * Test a metadata record against a filter.
 *
 * Each filter key is either a literal (deep equality against the metadata
 * value) or an operator object whose keys all start with `$`. Every key must
 * match for the record to match. Only own properties of `metadata` are read.
 *
 * @param metadata Metadata stored for a vector.
 * @param filter Filter expression keyed by metadata field.
 * @returns `true` when every filter key matches.
 * @throws VectorDBError with code `VALIDATION_ERROR` for an unsupported operator.
 */
private matchesFilter(metadata: Record<string, any>, filter: Record<string, any>): boolean {
  for (const key of Object.keys(filter)) {
    const value = filter[key];
    const metaValue = Object.prototype.hasOwnProperty.call(metadata, key)
      ? metadata[key]
      : undefined;

    if (!this.isOperatorObject(value)) {
      if (!this.deepEqual(metaValue, value)) return false;
      continue;
    }

    for (const operator of Object.keys(value)) {
      if (!SUPPORTED_OPERATORS.has(operator)) {
        throw new VectorDBError(`Unsupported filter operator '${operator}'`, 'VALIDATION_ERROR');
      }
    }

    // A missing field can only satisfy the negative operators.
    if (metaValue === undefined && value.$ne === undefined && value.$nin === undefined) {
      return false;
    }

    if (value.$eq !== undefined && !this.deepEqual(metaValue, value.$eq)) return false;
    if (value.$ne !== undefined && this.deepEqual(metaValue, value.$ne)) return false;

    // Range operators must reject missing AND null fields explicitly:
    // `undefined > 5` is simply false, and `null` is coerced to 0 so that
    // `null < 5` and `null >= 0` are true — either would let a record without
    // a real value slip through a range filter.
    const comparable = metaValue !== undefined && metaValue !== null;
    if (value.$gt !== undefined && !(comparable && metaValue > value.$gt)) return false;
    if (value.$lt !== undefined && !(comparable && metaValue < value.$lt)) return false;
    if (value.$gte !== undefined && !(comparable && metaValue >= value.$gte)) return false;
    if (value.$lte !== undefined && !(comparable && metaValue <= value.$lte)) return false;

    if (value.$in !== undefined) {
      if (!Array.isArray(value.$in) || !this.matchesInOperator(metaValue, value.$in)) return false;
    }

    if (value.$nin !== undefined) {
      if (!Array.isArray(value.$nin)) return false;
      if (this.matchesInOperator(metaValue, value.$nin)) return false;
    }
  }
  return true;
}
936
-
937
/**
 * Reject filters that use an operator outside `SUPPORTED_OPERATORS`.
 *
 * Only values recognised as operator objects are inspected; literal values
 * are left alone.
 *
 * @param filter Filter expression keyed by metadata field.
 * @throws VectorDBError with code `VALIDATION_ERROR` naming the first unsupported operator found.
 */
private validateFilterOperators(filter: Record<string, any>): void {
  for (const field of Object.keys(filter)) {
    const condition = filter[field];
    if (this.isOperatorObject(condition)) {
      const unsupported = Object.keys(condition).find((name) => !SUPPORTED_OPERATORS.has(name));
      if (unsupported !== undefined) {
        throw new VectorDBError(`Unsupported filter operator '${unsupported}'`, 'VALIDATION_ERROR');
      }
    }
  }
}
948
-
949
/**
 * Validate an add/upsert payload before any state is mutated.
 *
 * Checks run in a fixed order: config shape, `ids` and `vectors` array types
 * and matching lengths, `metadata` shape and serializability, ID types and
 * duplicates, then each vector's type, dimension and finiteness.
 *
 * @param config Payload holding `ids`, `vectors` and optional `metadata`.
 * @param allowExistingIds When `false`, IDs already present in the collection count as duplicates.
 * @throws VectorDBError with code `VALIDATION_ERROR` for malformed input.
 * @throws DuplicateVectorError listing every duplicated ID.
 * @throws DimensionMismatchError when a vector length differs from the collection dimension.
 */
private validateAddInput(config: AddConfig, allowExistingIds: boolean): void {
  if (!this.isPlainObject(config)) {
    throw new VectorDBError('Add config must be a plain object', 'VALIDATION_ERROR');
  }

  const { ids, vectors, metadata } = config;

  if (!Array.isArray(ids)) {
    throw new VectorDBError('ids must be an array of strings', 'VALIDATION_ERROR');
  }
  if (!Array.isArray(vectors)) {
    throw new VectorDBError('vectors must be an array of number[] or Float32Array', 'VALIDATION_ERROR');
  }
  if (vectors.length !== ids.length) {
    throw new VectorDBError('Number of vectors must match number of IDs', 'VALIDATION_ERROR');
  }

  if (metadata !== undefined) {
    if (!Array.isArray(metadata)) {
      throw new VectorDBError('metadata must be an array when provided', 'VALIDATION_ERROR');
    }
    if (metadata.length !== ids.length) {
      throw new VectorDBError('Number of metadata entries must match number of IDs', 'VALIDATION_ERROR');
    }

    for (const [index, entry] of metadata.entries()) {
      // An undefined slot means "no metadata for this vector".
      if (entry === undefined) continue;
      if (!this.isPlainObject(entry)) {
        throw new VectorDBError(`Metadata at index ${index} must be a plain object`, 'VALIDATION_ERROR');
      }
      try {
        JSON.stringify(entry);
      } catch {
        throw new VectorDBError(`Metadata at index ${index} is not JSON-serializable`, 'VALIDATION_ERROR');
      }
    }
  }

  const encountered = new Set<string>();
  const duplicates = new Set<string>();
  for (const [index, id] of ids.entries()) {
    if (typeof id !== 'string') {
      throw new VectorDBError(`ID at index ${index} must be a string`, 'VALIDATION_ERROR');
    }
    if (encountered.has(id)) {
      duplicates.add(id);
    } else {
      encountered.add(id);
    }
    if (!allowExistingIds && this.idMap.has(id)) {
      duplicates.add(id);
    }
  }
  if (duplicates.size > 0) {
    throw new DuplicateVectorError([...duplicates]);
  }

  for (const [index, vector] of vectors.entries()) {
    if (!Array.isArray(vector) && !(vector instanceof Float32Array)) {
      throw new VectorDBError(`Vector at index ${index} must be a number[] or Float32Array`, 'VALIDATION_ERROR');
    }
    if (vector.length !== this.dimension) {
      throw new DimensionMismatchError(this.dimension, vector.length, `Vector at index ${index}`);
    }
    for (let dim = 0; dim < vector.length; dim++) {
      const component = vector[dim];
      if (Number.isFinite(component)) continue;
      throw new VectorDBError(
        `Vector at index ${index} contains non-finite value at dimension ${dim}: ${component}`,
        'VALIDATION_ERROR'
      );
    }
  }
}
1038
-
1039
/**
 * Parse one line of the persisted metadata file.
 *
 * Two layouts are accepted: a JSON object per line (`id`, `numericId`,
 * optional `metadata`), and the legacy tab-separated form
 * `id<TAB>numericId[<TAB>metadataJson]`.
 *
 * @param line Raw line read from storage.
 * @returns The parsed record, or `null` for a blank or structurally invalid line.
 *          `JSON.parse` errors are not caught here and propagate to the caller.
 */
private parseMetadataLine(line: string): { id: string; internalId: number; metadata?: Record<string, any> } | null {
  const trimmed = line.trim();
  if (trimmed === '') return null;

  // v2+ JSONL format
  if (trimmed.startsWith('{')) {
    const { id, numericId, metadata } = JSON.parse(trimmed) as {
      id?: unknown;
      numericId?: unknown;
      metadata?: unknown;
    };

    if (typeof id !== 'string') return null;
    if (!Number.isInteger(numericId) || (numericId as number) < 0) return null;
    if (metadata !== undefined && !this.isPlainObject(metadata)) return null;

    return {
      id,
      internalId: numericId as number,
      metadata: metadata as Record<string, any> | undefined,
    };
  }

  // Legacy tab-separated format (split on the untrimmed line, as before)
  const fields = line.split('\t');
  if (fields.length < 2) return null;

  const [id, rawInternalId, ...metadataFields] = fields;
  const internalId = parseInt(rawInternalId, 10);
  if (!Number.isInteger(internalId) || internalId < 0) return null;

  if (metadataFields.length === 0) {
    return { id, internalId, metadata: undefined };
  }

  // The metadata JSON may itself contain tabs, so re-join the remainder.
  const parsedMetadata = JSON.parse(metadataFields.join('\t'));
  if (!this.isPlainObject(parsedMetadata)) return null;
  return { id, internalId, metadata: parsedMetadata as Record<string, any> };
}
1080
-
1081
/**
 * Type guard for "object, not null, not an array".
 *
 * @param value Any value.
 * @returns `true` when `typeof value` is `'object'` and the value is neither `null` nor an array.
 */
private isPlainObject(value: unknown): value is Record<string, any> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
1084
-
1085
/**
 * Decide whether a filter value is an operator object.
 *
 * @param value Filter value to classify.
 * @returns `true` only for a non-empty object whose own keys all start with `$`.
 */
private isOperatorObject(value: unknown): value is Record<string, any> {
  if (!this.isPlainObject(value)) {
    return false;
  }
  const names = Object.keys(value);
  return names.length > 0 && names.every((name) => name.startsWith('$'));
}
1094
-
1095
/**
 * Structural equality for metadata and filter values.
 *
 * Primitives compare by value, with `NaN` equal to `NaN` and `0` equal to
 * `-0`. Arrays compare element by element in order; objects compare by own
 * enumerable keys, recursively. An array never equals a non-array.
 *
 * @param a First value.
 * @param b Second value.
 * @returns `true` when the two values are structurally equal.
 */
private deepEqual(a: unknown, b: unknown): boolean {
  // Strict equality (not Object.is) so that 0 and -0 match: metadata must be
  // JSON-serializable and JSON writes -0 as 0, so distinguishing the two would
  // make a record match differently before and after a save/load cycle.
  if (a === b) return true;
  if (typeof a === 'number' && typeof b === 'number') {
    return Number.isNaN(a) && Number.isNaN(b);
  }

  if (Array.isArray(a) && Array.isArray(b)) {
    if (a.length !== b.length) return false;
    for (let i = 0; i < a.length; i++) {
      if (!this.deepEqual(a[i], b[i])) return false;
    }
    return true;
  }

  if (this.isPlainObject(a) && this.isPlainObject(b)) {
    const keysA = Object.keys(a);
    const keysB = Object.keys(b);
    if (keysA.length !== keysB.length) return false;
    for (const key of keysA) {
      if (!Object.prototype.hasOwnProperty.call(b, key)) return false;
      if (!this.deepEqual(a[key], b[key])) return false;
    }
    return true;
  }

  return false;
}
1119
-
1120
/**
 * Membership test behind the `$in` / `$nin` operators.
 *
 * A scalar metadata value matches when it deep-equals any candidate. An array
 * metadata value matches when any of its elements deep-equals any candidate.
 *
 * @param metaValue Value stored in the metadata record.
 * @param candidates Values listed in the operator.
 * @returns `true` when at least one pairing is deep-equal.
 */
private matchesInOperator(metaValue: unknown, candidates: unknown[]): boolean {
  // Treat a scalar as a one-element list so both cases share one scan.
  const stored: unknown[] = Array.isArray(metaValue) ? metaValue : [metaValue];

  for (let c = 0; c < candidates.length; c++) {
    for (let s = 0; s < stored.length; s++) {
      if (this.deepEqual(stored[s], candidates[c])) {
        return true;
      }
    }
  }
  return false;
}
1135
-
1136
/**
 * Compact the collection by rebuilding the index without deleted vectors.
 * This permanently removes tombstoned vectors and reclaims space.
 *
 * The rebuilt index and maps are assembled on the side, swapped in, and then
 * persisted. If persisting fails, the previous in-memory state and storage
 * keys are restored and the error is rethrown. Surviving vectors are
 * renumbered densely from 0 in the order the old index reports them.
 *
 * @returns Number of vectors removed during compaction (0 when nothing was tombstoned)
 * @throws Whatever the bulk insert or `saveToDisk` throws; the collection is
 *         left in its previous state in both cases.
 */
async compact(): Promise<number> {
  if (this.deletedIds.size === 0) return 0;

  const removedCount = this.deletedIds.size;

  // Rebuild into temporary state first so we can roll back on failure.
  const rebuiltHnsw = new HNSWIndex(
    this.dimension,
    this.metric,
    this.M,
    this.efConstruction
  );
  const rebuiltIdMap = new Map<string, number>();
  const rebuiltIdReverseMap = new Map<number, string>();
  const rebuiltMetadata = new Map<number, Record<string, any>>();
  const rebuiltDeletedIds = new Set<number>();
  const points: Array<{ id: number; vector: Float32Array }> = [];

  for (const [numericId, vector] of this.hnsw.getAllVectors()) {
    if (this.deletedIds.has(numericId)) continue;

    const stringId = this.idReverseMap.get(numericId);
    // Compare against undefined explicitly: the empty string is a legal ID,
    // and a truthiness test would silently drop that vector here.
    if (stringId === undefined) continue;

    const newNumericId = points.length;
    rebuiltIdMap.set(stringId, newNumericId);
    rebuiltIdReverseMap.set(newNumericId, stringId);

    const meta = this.metadata.get(numericId);
    if (meta) {
      rebuiltMetadata.set(newNumericId, meta);
    }

    points.push({ id: newNumericId, vector });
  }

  // Use bulk insert for better performance
  try {
    await rebuiltHnsw.addPointsBulk(points);
  } catch (err) {
    rebuiltHnsw.destroy();
    throw err;
  }

  const previousState = {
    hnsw: this.hnsw,
    idMap: this.idMap,
    idReverseMap: this.idReverseMap,
    metadata: this.metadata,
    deletedIds: this.deletedIds,
    nextNumericId: this.nextNumericId,
    activeCount: this.activeCount,
    indexKey: this.indexKey,
    metaKey: this.metaKey,
    deletedKey: this.deletedKey,
  };

  this.hnsw = rebuiltHnsw;
  this.idMap = rebuiltIdMap;
  this.idReverseMap = rebuiltIdReverseMap;
  this.metadata = rebuiltMetadata;
  this.deletedIds = rebuiltDeletedIds;
  this.nextNumericId = points.length;
  this.activeCount = points.length;

  try {
    await this.saveToDisk();
  } catch (err) {
    // Persisting failed: discard the rebuilt index and restore everything.
    this.hnsw.destroy();
    this.hnsw = previousState.hnsw;
    this.idMap = previousState.idMap;
    this.idReverseMap = previousState.idReverseMap;
    this.metadata = previousState.metadata;
    this.deletedIds = previousState.deletedIds;
    this.nextNumericId = previousState.nextNumericId;
    this.activeCount = previousState.activeCount;
    this.setActiveStorageKeys(previousState.indexKey, previousState.metaKey, previousState.deletedKey);
    throw err;
  }

  previousState.hnsw.destroy();
  return removedCount;
}
1239
-
1240
/**
 * Tear the collection down and release its in-memory state.
 *
 * @param save When `true` (the default) the collection is persisted first;
 *             pass `false` when the collection is being deleted entirely.
 *             If persisting throws, nothing is released.
 */
async destroy(save: boolean = true): Promise<void> {
  if (save) {
    await this.saveToDisk();
  }

  // Release the index, guarding against an index without a destroy method.
  const index = this.hnsw;
  if (index && typeof index.destroy === 'function') {
    index.destroy();
  }

  // Empty every lookup structure, then reset the live counter.
  for (const store of [this.idMap, this.idReverseMap, this.metadata, this.deletedIds]) {
    store.clear();
  }
  this.activeCount = 0;
}
1262
- }