verso-db 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/CHANGELOG.md +46 -0
  2. package/LICENSE +21 -0
  3. package/README.md +252 -0
  4. package/dist/BinaryHeap.d.ts +25 -0
  5. package/dist/BinaryHeap.d.ts.map +1 -0
  6. package/dist/Collection.d.ts +156 -0
  7. package/dist/Collection.d.ts.map +1 -0
  8. package/dist/HNSWIndex.d.ts +357 -0
  9. package/dist/HNSWIndex.d.ts.map +1 -0
  10. package/dist/MaxBinaryHeap.d.ts +63 -0
  11. package/dist/MaxBinaryHeap.d.ts.map +1 -0
  12. package/dist/Storage.d.ts +54 -0
  13. package/dist/Storage.d.ts.map +1 -0
  14. package/dist/VectorDB.d.ts +44 -0
  15. package/dist/VectorDB.d.ts.map +1 -0
  16. package/dist/backends/DistanceBackend.d.ts +5 -0
  17. package/dist/backends/DistanceBackend.d.ts.map +1 -0
  18. package/dist/backends/JsDistanceBackend.d.ts +37 -0
  19. package/dist/backends/JsDistanceBackend.d.ts.map +1 -0
  20. package/dist/encoding/DeltaEncoder.d.ts +61 -0
  21. package/dist/encoding/DeltaEncoder.d.ts.map +1 -0
  22. package/dist/errors.d.ts +58 -0
  23. package/dist/errors.d.ts.map +1 -0
  24. package/dist/index.d.ts +64 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +3732 -0
  27. package/dist/presets.d.ts +91 -0
  28. package/dist/presets.d.ts.map +1 -0
  29. package/dist/quantization/ScalarQuantizer.d.ts +114 -0
  30. package/dist/quantization/ScalarQuantizer.d.ts.map +1 -0
  31. package/dist/storage/BatchWriter.d.ts +104 -0
  32. package/dist/storage/BatchWriter.d.ts.map +1 -0
  33. package/dist/storage/BunStorageBackend.d.ts +58 -0
  34. package/dist/storage/BunStorageBackend.d.ts.map +1 -0
  35. package/dist/storage/MemoryBackend.d.ts +44 -0
  36. package/dist/storage/MemoryBackend.d.ts.map +1 -0
  37. package/dist/storage/OPFSBackend.d.ts +59 -0
  38. package/dist/storage/OPFSBackend.d.ts.map +1 -0
  39. package/dist/storage/StorageBackend.d.ts +66 -0
  40. package/dist/storage/StorageBackend.d.ts.map +1 -0
  41. package/dist/storage/WriteAheadLog.d.ts +111 -0
  42. package/dist/storage/WriteAheadLog.d.ts.map +1 -0
  43. package/dist/storage/createStorageBackend.d.ts +40 -0
  44. package/dist/storage/createStorageBackend.d.ts.map +1 -0
  45. package/dist/storage/index.d.ts +30 -0
  46. package/dist/storage/index.d.ts.map +1 -0
  47. package/package.json +98 -0
  48. package/src/BinaryHeap.ts +131 -0
  49. package/src/Collection.ts +695 -0
  50. package/src/HNSWIndex.ts +1839 -0
  51. package/src/MaxBinaryHeap.ts +175 -0
  52. package/src/Storage.ts +435 -0
  53. package/src/VectorDB.ts +109 -0
  54. package/src/backends/DistanceBackend.ts +17 -0
  55. package/src/backends/JsDistanceBackend.ts +227 -0
  56. package/src/encoding/DeltaEncoder.ts +217 -0
  57. package/src/errors.ts +110 -0
  58. package/src/index.ts +138 -0
  59. package/src/presets.ts +229 -0
  60. package/src/quantization/ScalarQuantizer.ts +383 -0
  61. package/src/storage/BatchWriter.ts +336 -0
  62. package/src/storage/BunStorageBackend.ts +161 -0
  63. package/src/storage/MemoryBackend.ts +120 -0
  64. package/src/storage/OPFSBackend.ts +250 -0
  65. package/src/storage/StorageBackend.ts +74 -0
  66. package/src/storage/WriteAheadLog.ts +326 -0
  67. package/src/storage/createStorageBackend.ts +137 -0
  68. package/src/storage/index.ts +53 -0
@@ -0,0 +1,695 @@
1
+ import { HNSWIndex, DistanceMetric } from './HNSWIndex';
2
+ import { mkdir } from 'fs/promises';
3
+ import * as path from 'path';
4
+
5
/**
 * Input accepted by `Collection.add()` and `Collection.upsert()`.
 *
 * `ids`, `vectors` and (when given) `metadata` are parallel arrays: entry `i`
 * of each one describes the same vector.
 */
export interface AddConfig {
  /** One unique string identifier per vector. */
  ids: string[];
  /** The vectors to insert; each must have the collection's dimension. */
  vectors: number[][];
  /** Optional per-vector metadata; when present it must be as long as `ids`. */
  metadata?: Record<string, any>[];
}
16
+
17
/**
 * Parameters for a nearest-neighbour query against a collection.
 *
 * @example
 * ```typescript
 * // Plain k-NN query
 * const results = await collection.query({
 *   queryVector: [0.1, 0.2, 0.3],
 *   k: 10
 * });
 *
 * // k-NN query restricted by metadata
 * const results = await collection.query({
 *   queryVector: [0.1, 0.2, 0.3],
 *   k: 10,
 *   filter: {
 *     category: 'science',                      // exact match
 *     year: { $gte: 2020 },                     // greater than or equal
 *     status: { $in: ['active', 'pending'] }    // member of a list
 *   }
 * });
 * ```
 */
export interface QueryConfig {
  /** Vector to search around; its length must equal the collection dimension. */
  queryVector: number[];
  /** How many nearest neighbours to return. */
  k: number;
  /**
   * Optional metadata filter using MongoDB-style operators:
   * - Plain equality: `{ field: value }`
   * - `$gt`: greater than, `{ field: { $gt: 5 } }`
   * - `$gte`: greater than or equal, `{ field: { $gte: 5 } }`
   * - `$lt`: less than, `{ field: { $lt: 10 } }`
   * - `$lte`: less than or equal, `{ field: { $lte: 10 } }`
   * - `$ne`: not equal, `{ field: { $ne: 'excluded' } }`
   * - `$in`: value is in the list, `{ field: { $in: ['a', 'b', 'c'] } }`
   * - `$nin`: value is not in the list, `{ field: { $nin: ['x', 'y'] } }`
   */
  filter?: Record<string, any>;
  /** Search effort (larger = better recall, slower). Default: max(k*2, 50) */
  efSearch?: number;
}
60
+
61
/**
 * Result of a collection query, as three parallel arrays ordered from the
 * closest hit to the farthest: entry `i` of each array describes the same hit.
 */
export interface QueryResult {
  /** String IDs of the matched vectors. */
  ids: string[];
  /** Distance from the query vector to each match. */
  distances: number[];
  /** Metadata of each match (`{}` when none was stored). */
  metadata: Record<string, any>[];
}
66
+
67
/**
 * A named set of vectors backed by an HNSW index.
 *
 * The collection maps caller-supplied string IDs to internal numeric IDs,
 * stores optional per-vector metadata, supports tombstone deletion, and
 * persists itself to three files next to each other:
 * `<name>.hnsw` (index), `<name>.meta` (tab-separated `id<TAB>numericId<TAB>json`
 * lines) and `<name>.deleted` (JSON array of tombstoned numeric IDs).
 */
export class Collection {
  private name: string;
  private dimension: number;
  private metric: DistanceMetric;
  private M: number; // HNSW M parameter
  private efConstruction: number; // HNSW efConstruction parameter
  private indexPath: string;
  private metaPath: string;
  private deletedPath: string; // Path for persisting deleted IDs
  private hnsw: HNSWIndex;
  private idMap: Map<string, number>; // string ID -> internal numeric ID
  private idReverseMap: Map<number, string>; // internal numeric ID -> string ID
  private metadata: Map<number, Record<string, any>>; // metadata per numeric ID
  private deletedIds: Set<number>; // tombstones for deleted vectors
  // Next unused numeric ID. Tracked explicitly (instead of using idMap.size)
  // so that gaps left by skipped lines in the .meta file cannot cause two
  // vectors to share one numeric ID.
  private nextNumericId: number;

  /**
   * @param name Collection name; also the base name of the persisted files.
   * @param config Vector dimension plus optional metric (default 'cosine'),
   *   M (default 16) and efConstruction (default 200).
   * @param collectionPath Directory in which the collection files live.
   */
  constructor(name: string, config: { dimension: number; metric?: DistanceMetric; M?: number; efConstruction?: number }, collectionPath: string) {
    this.name = name;
    this.dimension = config.dimension;
    // `||` (not `??`) on purpose: 0 and '' also fall back to the defaults.
    this.metric = config.metric || 'cosine';
    this.M = config.M || 16;
    this.efConstruction = config.efConstruction || 200;
    this.indexPath = path.join(collectionPath, `${name}.hnsw`);
    this.metaPath = path.join(collectionPath, `${name}.meta`);
    this.deletedPath = path.join(collectionPath, `${name}.deleted`);

    this.hnsw = new HNSWIndex(
      config.dimension,
      this.metric,
      this.M,
      this.efConstruction
    );

    this.idMap = new Map();
    this.idReverseMap = new Map();
    this.metadata = new Map();
    this.deletedIds = new Set();
    this.nextNumericId = 0;
  }

  /** Loads any previously persisted state for this collection. */
  async init(): Promise<void> {
    await this.loadFromDisk();
  }

  /**
   * Restores the index, the ID/metadata table and the tombstone set from
   * disk. Every file is optional; a file that cannot be read or parsed is
   * reported with `console.warn` and skipped rather than aborting the load.
   */
  private async loadFromDisk(): Promise<void> {
    // Binary HNSW index
    const indexFile = Bun.file(this.indexPath);
    if (await indexFile.exists()) {
      try {
        this.hnsw = await HNSWIndex.loadFromFile(this.indexPath);
      } catch (e) {
        console.warn(`Failed to load HNSW index from ${this.indexPath}:`, e);
      }
    }

    // ID mappings + metadata, one `id<TAB>numericId<TAB>json` record per line
    const metaFile = Bun.file(this.metaPath);
    if (await metaFile.exists()) {
      try {
        const metaContent = await metaFile.text();
        const lines = metaContent.split('\n');
        for (let i = 0; i < lines.length; i++) {
          const line = lines[i];
          if (line.trim().length === 0) continue;

          const parts = line.split('\t');
          const id = parts[0];
          const internalId = parseInt(parts[1], 10);
          if (isNaN(internalId)) {
            console.warn(`Skipping malformed line in metadata file: ${line}`);
            continue;
          }

          this.idMap.set(id, internalId);
          this.idReverseMap.set(internalId, id);
          if (internalId >= this.nextNumericId) {
            this.nextNumericId = internalId + 1;
          }

          if (parts.length > 2) {
            try {
              // Re-join in case the JSON payload itself contained tabs.
              this.metadata.set(internalId, JSON.parse(parts.slice(2).join('\t')));
            } catch (e) {
              // The ID mapping is kept; only the unreadable metadata is dropped.
              console.warn(`Skipping malformed line in metadata file: ${line}`);
            }
          }
        }
      } catch (e) {
        console.warn(`Failed to load metadata from ${this.metaPath}:`, e);
      }
    }

    // Tombstones
    const deletedFile = Bun.file(this.deletedPath);
    if (await deletedFile.exists()) {
      try {
        const deletedArray = JSON.parse(await deletedFile.text());
        if (Array.isArray(deletedArray)) {
          // Keep only tombstones that refer to a known vector; stale or
          // non-numeric entries would otherwise skew count().
          const restored = new Set<number>();
          for (const value of deletedArray) {
            if (typeof value === 'number' && this.idReverseMap.has(value)) {
              restored.add(value);
            }
          }
          this.deletedIds = restored;
        }
      } catch (e) {
        console.warn(`Failed to load deleted IDs from ${this.deletedPath}:`, e);
      }
    }
  }

  /**
   * Inserts new vectors.
   *
   * The whole batch is validated before any state is touched, so a rejected
   * call leaves the collection unchanged.
   *
   * @throws Error if the array lengths disagree, if an ID already exists in
   *   the collection or is repeated within the batch, if an ID contains a tab
   *   or newline (it could not be stored in the tab-separated .meta file), or
   *   if a vector has the wrong dimension.
   */
  async add(config: AddConfig): Promise<void> {
    const { ids, vectors } = config;

    if (vectors.length !== ids.length) {
      throw new Error('Number of vectors must match number of IDs');
    }
    if (config.metadata && config.metadata.length !== ids.length) {
      throw new Error('Number of metadata entries must match number of IDs');
    }

    // --- Validation pass (no mutation) ---
    const unstorable = /[\t\n]/;
    const duplicates: string[] = [];
    const repeated: string[] = [];
    const batchIds = new Set<string>();
    for (const id of ids) {
      if (unstorable.test(id)) {
        throw new Error(`Cannot add: ID ${JSON.stringify(id)} contains a tab or newline character and cannot be persisted`);
      }
      if (this.idMap.has(id)) {
        duplicates.push(id);
      } else if (batchIds.has(id)) {
        repeated.push(id);
      } else {
        batchIds.add(id);
      }
    }
    if (duplicates.length > 0) {
      throw new Error(
        `Cannot add: IDs already exist: ${duplicates.slice(0, 5).join(', ')}${duplicates.length > 5 ? ` and ${duplicates.length - 5} more` : ''}. ` +
        `Use upsert() to update existing vectors.`
      );
    }
    if (repeated.length > 0) {
      throw new Error(`Cannot add: duplicate IDs within the same batch: ${repeated.slice(0, 5).join(', ')}`);
    }
    for (let i = 0; i < vectors.length; i++) {
      if (vectors[i].length !== this.dimension) {
        throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${this.dimension}`);
      }
    }

    // --- Commit pass ---
    const startId = this.nextNumericId;
    const points: Array<{ id: number; vector: Float32Array }> = new Array(ids.length);
    for (let i = 0; i < ids.length; i++) {
      const numericId = startId + i;
      this.idMap.set(ids[i], numericId);
      this.idReverseMap.set(numericId, ids[i]);

      const meta = config.metadata ? config.metadata[i] : undefined;
      if (meta) {
        this.metadata.set(numericId, meta);
      }
      points[i] = { id: numericId, vector: new Float32Array(vectors[i]) };
    }
    // Reserve the ID range before awaiting so overlapping add() calls cannot
    // hand out the same numeric IDs.
    this.nextNumericId = startId + ids.length;

    await this.hnsw.addPointsBulk(points);
  }

  /**
   * Approximate k-nearest-neighbour search.
   *
   * The index is asked for `k * 2` candidates, which are then filtered
   * (tombstones, metadata filter). With a selective filter or many deleted
   * vectors fewer than `k` results may therefore be returned.
   *
   * @throws Error if the query vector has the wrong dimension.
   */
  async query(config: QueryConfig): Promise<QueryResult> {
    const { queryVector, k, filter, efSearch } = config;

    if (queryVector.length !== this.dimension) {
      throw new Error(`Query vector has dimension ${queryVector.length}, expected ${this.dimension}`);
    }

    const ef = efSearch || Math.max(k * 2, 50);
    const candidates = this.hnsw.searchKNN(new Float32Array(queryVector), k * 2, ef);
    return this.buildQueryResult(candidates, k, filter);
  }

  /**
   * Batch query for multiple vectors at once.
   * More efficient than calling query() multiple times.
   *
   * The index is searched once with the largest `k` and `efSearch` of the
   * batch; each query's own `k` and `filter` are applied afterwards.
   *
   * @param configs Array of query configurations
   * @returns Array of query results, one per query
   * @throws Error if any query vector has the wrong dimension.
   */
  async queryBatch(configs: QueryConfig[]): Promise<QueryResult[]> {
    if (configs.length === 0) return [];

    for (let i = 0; i < configs.length; i++) {
      if (configs[i].queryVector.length !== this.dimension) {
        throw new Error(`Query ${i} has dimension ${configs[i].queryVector.length}, expected ${this.dimension}`);
      }
    }

    const queries = new Array<Float32Array>(configs.length);
    let maxK = 0;
    let maxEf = 0;
    for (let i = 0; i < configs.length; i++) {
      const c = configs[i];
      queries[i] = new Float32Array(c.queryVector);
      if (c.k > maxK) maxK = c.k;
      const ef = c.efSearch || Math.max(c.k * 2, 50);
      if (ef > maxEf) maxEf = ef;
    }

    const batchResults = this.hnsw.searchKNNBatch(queries, maxK * 2, maxEf);

    const results: QueryResult[] = [];
    for (let q = 0; q < configs.length; q++) {
      results.push(this.buildQueryResult(batchResults[q], configs[q].k, configs[q].filter));
    }
    return results;
  }

  /**
   * Brute-force KNN search for validation and correctness checking.
   *
   * Scores every vector in the index and returns the true k nearest
   * neighbours among the vectors that are not tombstoned and that satisfy
   * `filter`. The filter is applied before truncating to `k`, so up to `k`
   * matches are returned whenever that many exist.
   *
   * @throws Error if the query vector has the wrong dimension.
   */
  async queryBruteForce(config: QueryConfig): Promise<QueryResult> {
    const { queryVector, k, filter } = config;

    if (queryVector.length !== this.dimension) {
      throw new Error(`Query vector has dimension ${queryVector.length}, expected ${this.dimension}`);
    }

    // Convert once instead of once per stored vector.
    const probe = new Float32Array(queryVector);
    const scored: Array<{ id: number; distance: number }> = [];
    for (const [numericId, vector] of this.hnsw.getAllVectors()) {
      scored.push({ id: numericId, distance: this.hnsw.calculateDistance(probe, vector) });
    }

    return this.buildQueryResult(scored, k, filter);
  }

  /**
   * Shared post-processing for all query methods: drops tombstoned, unmapped,
   * duplicate and filter-rejected candidates, orders the rest by ascending
   * distance and returns at most `k` of them.
   */
  private buildQueryResult(
    candidates: Array<{ id: number; distance: number }>,
    k: number,
    filter?: Record<string, any>
  ): QueryResult {
    // An empty filter object means "no filtering".
    let activeFilter: Record<string, any> | undefined;
    if (filter) {
      for (const _key in filter) { activeFilter = filter; break; }
    }

    const seen = new Set<number>();
    const survivors: Array<{ id: number; distance: number }> = [];
    for (const candidate of candidates) {
      if (this.deletedIds.has(candidate.id)) continue;
      if (seen.has(candidate.id)) continue;
      // `has` rather than a truthiness test: '' is a legal string ID.
      if (!this.idReverseMap.has(candidate.id)) continue;
      if (activeFilter && !this.matchesFilter(this.metadata.get(candidate.id) || {}, activeFilter)) continue;
      seen.add(candidate.id);
      survivors.push(candidate);
    }

    survivors.sort((a, b) => a.distance - b.distance);

    const ids: string[] = [];
    const distances: number[] = [];
    const metadata: Array<Record<string, any>> = [];
    for (let i = 0; i < survivors.length && ids.length < k; i++) {
      const hit = survivors[i];
      ids.push(this.idReverseMap.get(hit.id) as string);
      distances.push(hit.distance);
      metadata.push(this.metadata.get(hit.id) || {});
    }

    return { ids, distances, metadata };
  }

  /**
   * Upsert vectors (insert or update).
   *
   * NOTE: HNSW indices don't support efficient in-place updates.
   * Duplicate IDs will throw an error. For updates, rebuild the index
   * without the old vectors and add the new ones.
   *
   * @throws Error if any ID already exists
   */
  async upsert(config: AddConfig): Promise<void> {
    const duplicates = config.ids.filter(id => this.idMap.has(id));

    if (duplicates.length > 0) {
      throw new Error(
        `Cannot upsert: IDs already exist: ${duplicates.join(', ')}. ` +
        `HNSW indices don't support efficient updates. ` +
        `To update vectors, create a new index without the old vectors.`
      );
    }

    await this.add(config);
  }

  /**
   * Returns the number of active (non-deleted) vectors in the collection.
   */
  count(): number {
    return this.idMap.size - this.deletedIds.size;
  }

  /**
   * Returns the total number of vectors including deleted (tombstoned) ones.
   * Use this to determine when compaction might be beneficial.
   */
  countWithDeleted(): number {
    return this.idMap.size;
  }

  /**
   * Returns the number of deleted (tombstoned) vectors.
   */
  deletedCount(): number {
    return this.deletedIds.size;
  }

  /**
   * Mark a vector as deleted (tombstone deletion).
   * The vector remains in the index but is excluded from search results.
   * Use compact() to permanently remove deleted vectors and reclaim space.
   *
   * @param id The string ID of the vector to delete
   * @returns true if the vector was deleted, false if it didn't exist or was already deleted
   */
  delete(id: string): boolean {
    const numericId = this.idMap.get(id);
    if (numericId === undefined) return false;
    if (this.deletedIds.has(numericId)) return false;

    this.deletedIds.add(numericId);
    return true;
  }

  /**
   * Mark multiple vectors as deleted (tombstone deletion).
   *
   * @param ids Array of string IDs to delete
   * @returns Number of vectors that were successfully deleted
   */
  deleteBatch(ids: string[]): number {
    let deleted = 0;
    for (const id of ids) {
      if (this.delete(id)) deleted++;
    }
    return deleted;
  }

  /**
   * Check if a vector exists and is not deleted.
   */
  has(id: string): boolean {
    const numericId = this.idMap.get(id);
    if (numericId === undefined) return false;
    return !this.deletedIds.has(numericId);
  }

  /**
   * Check if a vector was deleted (tombstoned).
   */
  isDeleted(id: string): boolean {
    const numericId = this.idMap.get(id);
    if (numericId === undefined) return false;
    return this.deletedIds.has(numericId);
  }

  /**
   * Persists the index, the ID/metadata table and the tombstone set.
   * Rejects if writing any of the files fails.
   */
  async saveToDisk(): Promise<void> {
    // Best-effort directory creation: a real failure resurfaces from the
    // writes below, so the error is intentionally ignored here.
    const dirPath = path.dirname(this.indexPath);
    try {
      await mkdir(dirPath, { recursive: true });
    } catch (e) {
      // ignored, see above
    }

    await this.hnsw.saveToFile(this.indexPath);

    // One `id<TAB>numericId<TAB>json` record per vector. JSON.stringify
    // escapes tabs and newlines, so the payload never breaks the line format.
    const metaLines: string[] = [];
    for (const [numericId, id] of this.idReverseMap) {
      const meta = this.metadata.get(numericId);
      const metaStr = meta ? JSON.stringify(meta) : '{}';
      metaLines.push(`${id}\t${numericId}\t${metaStr}`);
    }
    await Bun.write(this.metaPath, metaLines.join('\n'));

    if (this.deletedIds.size > 0) {
      await Bun.write(this.deletedPath, JSON.stringify([...this.deletedIds]));
    } else {
      // No tombstones: if a stale file exists, overwrite it with an empty
      // list (the file is reset, not removed) so old tombstones are not
      // resurrected on the next load.
      const deletedFile = Bun.file(this.deletedPath);
      if (await deletedFile.exists()) {
        await Bun.write(this.deletedPath, '[]');
      }
    }
  }

  /**
   * Tests one metadata record against a MongoDB-style filter.
   *
   * Every key in `filter` must match. A non-null object value is read as a
   * set of operators (`$gt`, `$gte`, `$lt`, `$lte`, `$ne`, `$in`, `$nin`);
   * any other value is compared with strict equality.
   *
   * Range operators are written as positive comparisons so that a record
   * where the field is missing (or not comparable) is rejected: every
   * relational comparison against `undefined` evaluates to false.
   */
  private matchesFilter(metadata: Record<string, any>, filter: Record<string, any>): boolean {
    for (const key in filter) {
      const value = filter[key];
      const metaValue = metadata[key];
      if (typeof value === 'object' && value !== null) {
        if (value.$gt !== undefined && !(metaValue > value.$gt)) return false;
        if (value.$lt !== undefined && !(metaValue < value.$lt)) return false;
        if (value.$gte !== undefined && !(metaValue >= value.$gte)) return false;
        if (value.$lte !== undefined && !(metaValue <= value.$lte)) return false;
        if (value.$ne !== undefined && metaValue === value.$ne) return false;
        if (value.$in !== undefined && !value.$in.includes(metaValue)) return false;
        if (value.$nin !== undefined && value.$nin.includes(metaValue)) return false;
      } else if (metaValue !== value) {
        return false;
      }
    }
    return true;
  }

  /**
   * Compact the collection by rebuilding the index without deleted vectors.
   * This permanently removes tombstoned vectors and reclaims space.
   * Surviving vectors are renumbered 0..n-1.
   *
   * @returns Number of vectors removed during compaction
   */
  async compact(): Promise<number> {
    if (this.deletedIds.size === 0) return 0;

    const removedCount = this.deletedIds.size;

    // Snapshot every live vector together with its string ID and metadata.
    const activeVectors: Array<{ id: string; vector: Float32Array; meta?: Record<string, any> }> = [];
    for (const [numericId, vector] of this.hnsw.getAllVectors()) {
      if (this.deletedIds.has(numericId)) continue;
      const stringId = this.idReverseMap.get(numericId);
      // `undefined` check rather than truthiness: '' is a legal string ID.
      if (stringId === undefined) continue;
      activeVectors.push({ id: stringId, vector, meta: this.metadata.get(numericId) });
    }

    // NOTE(review): the old index and all mappings are discarded before the
    // rebuild completes; if addPointsBulk() rejects, the in-memory state is
    // lost. Kept in this order because HNSWIndex.destroy() semantics are not
    // visible from here.
    this.hnsw.destroy();
    this.hnsw = new HNSWIndex(
      this.dimension,
      this.metric,
      this.M,
      this.efConstruction
    );

    this.idMap.clear();
    this.idReverseMap.clear();
    this.metadata.clear();
    this.deletedIds.clear();

    const points: Array<{ id: number; vector: Float32Array }> = new Array(activeVectors.length);
    for (let i = 0; i < activeVectors.length; i++) {
      const { id, vector, meta } = activeVectors[i];
      this.idMap.set(id, i);
      this.idReverseMap.set(i, id);
      if (meta) {
        this.metadata.set(i, meta);
      }
      points[i] = { id: i, vector };
    }
    this.nextNumericId = activeVectors.length;

    await this.hnsw.addPointsBulk(points);

    return removedCount;
  }

  /**
   * Saves the collection to disk, then releases the index and all
   * in-memory mappings.
   */
  async destroy(): Promise<void> {
    await this.saveToDisk();

    if (this.hnsw && typeof this.hnsw.destroy === 'function') {
      this.hnsw.destroy();
    }

    this.idMap.clear();
    this.idReverseMap.clear();
    this.metadata.clear();
    this.deletedIds.clear();
    this.nextNumericId = 0;
  }
}