verso-db 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/LICENSE +21 -0
- package/README.md +252 -0
- package/dist/BinaryHeap.d.ts +25 -0
- package/dist/BinaryHeap.d.ts.map +1 -0
- package/dist/Collection.d.ts +156 -0
- package/dist/Collection.d.ts.map +1 -0
- package/dist/HNSWIndex.d.ts +357 -0
- package/dist/HNSWIndex.d.ts.map +1 -0
- package/dist/MaxBinaryHeap.d.ts +63 -0
- package/dist/MaxBinaryHeap.d.ts.map +1 -0
- package/dist/Storage.d.ts +54 -0
- package/dist/Storage.d.ts.map +1 -0
- package/dist/VectorDB.d.ts +44 -0
- package/dist/VectorDB.d.ts.map +1 -0
- package/dist/backends/DistanceBackend.d.ts +5 -0
- package/dist/backends/DistanceBackend.d.ts.map +1 -0
- package/dist/backends/JsDistanceBackend.d.ts +37 -0
- package/dist/backends/JsDistanceBackend.d.ts.map +1 -0
- package/dist/encoding/DeltaEncoder.d.ts +61 -0
- package/dist/encoding/DeltaEncoder.d.ts.map +1 -0
- package/dist/errors.d.ts +58 -0
- package/dist/errors.d.ts.map +1 -0
- package/dist/index.d.ts +64 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3732 -0
- package/dist/presets.d.ts +91 -0
- package/dist/presets.d.ts.map +1 -0
- package/dist/quantization/ScalarQuantizer.d.ts +114 -0
- package/dist/quantization/ScalarQuantizer.d.ts.map +1 -0
- package/dist/storage/BatchWriter.d.ts +104 -0
- package/dist/storage/BatchWriter.d.ts.map +1 -0
- package/dist/storage/BunStorageBackend.d.ts +58 -0
- package/dist/storage/BunStorageBackend.d.ts.map +1 -0
- package/dist/storage/MemoryBackend.d.ts +44 -0
- package/dist/storage/MemoryBackend.d.ts.map +1 -0
- package/dist/storage/OPFSBackend.d.ts +59 -0
- package/dist/storage/OPFSBackend.d.ts.map +1 -0
- package/dist/storage/StorageBackend.d.ts +66 -0
- package/dist/storage/StorageBackend.d.ts.map +1 -0
- package/dist/storage/WriteAheadLog.d.ts +111 -0
- package/dist/storage/WriteAheadLog.d.ts.map +1 -0
- package/dist/storage/createStorageBackend.d.ts +40 -0
- package/dist/storage/createStorageBackend.d.ts.map +1 -0
- package/dist/storage/index.d.ts +30 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/package.json +98 -0
- package/src/BinaryHeap.ts +131 -0
- package/src/Collection.ts +695 -0
- package/src/HNSWIndex.ts +1839 -0
- package/src/MaxBinaryHeap.ts +175 -0
- package/src/Storage.ts +435 -0
- package/src/VectorDB.ts +109 -0
- package/src/backends/DistanceBackend.ts +17 -0
- package/src/backends/JsDistanceBackend.ts +227 -0
- package/src/encoding/DeltaEncoder.ts +217 -0
- package/src/errors.ts +110 -0
- package/src/index.ts +138 -0
- package/src/presets.ts +229 -0
- package/src/quantization/ScalarQuantizer.ts +383 -0
- package/src/storage/BatchWriter.ts +336 -0
- package/src/storage/BunStorageBackend.ts +161 -0
- package/src/storage/MemoryBackend.ts +120 -0
- package/src/storage/OPFSBackend.ts +250 -0
- package/src/storage/StorageBackend.ts +74 -0
- package/src/storage/WriteAheadLog.ts +326 -0
- package/src/storage/createStorageBackend.ts +137 -0
- package/src/storage/index.ts +53 -0
|
@@ -0,0 +1,695 @@
|
|
|
1
|
+
import { HNSWIndex, DistanceMetric } from './HNSWIndex';
|
|
2
|
+
import { mkdir } from 'fs/promises';
|
|
3
|
+
import * as path from 'path';
|
|
4
|
+
|
|
5
|
+
/**
 * Payload accepted by `Collection.add()` and `Collection.upsert()`.
 *
 * `ids`, `vectors` and (when present) `metadata` are parallel arrays:
 * entry `i` of each one describes the same vector.
 */
export interface AddConfig {
  /** Caller-chosen string identifier of every vector; one per vector. */
  ids: Array<string>;
  /** The vectors themselves; each must have the collection's dimension. */
  vectors: Array<number[]>;
  /** Optional per-vector metadata; when given, it must be as long as `ids`. */
  metadata?: Record<string, any>[];
}
|
|
16
|
+
|
|
17
|
+
/**
 * Parameters of a nearest-neighbour query against a collection.
 *
 * @example
 * ```typescript
 * // Simple query
 * const results = await collection.query({
 *   queryVector: [0.1, 0.2, 0.3],
 *   k: 10
 * });
 *
 * // Query with metadata filter
 * const results = await collection.query({
 *   queryVector: [0.1, 0.2, 0.3],
 *   k: 10,
 *   filter: {
 *     category: 'science',                      // Exact match
 *     year: { $gte: 2020 },                     // Greater than or equal
 *     status: { $in: ['active', 'pending'] }    // In array
 *   }
 * });
 * ```
 */
export interface QueryConfig {
  /** Vector to search around; its length must equal the collection dimension. */
  queryVector: Array<number>;
  /** How many nearest neighbours to return at most. */
  k: number;
  /**
   * Optional metadata predicate, written with MongoDB-style operators.
   * Every listed field has to match for a vector to be returned.
   * - Plain value — equality: `{ field: value }`
   * - `$gt` — greater than: `{ field: { $gt: 5 } }`
   * - `$gte` — greater than or equal: `{ field: { $gte: 5 } }`
   * - `$lt` — less than: `{ field: { $lt: 10 } }`
   * - `$lte` — less than or equal: `{ field: { $lte: 10 } }`
   * - `$ne` — not equal: `{ field: { $ne: 'excluded' } }`
   * - `$in` — contained in the list: `{ field: { $in: ['a', 'b', 'c'] } }`
   * - `$nin` — not contained in the list: `{ field: { $nin: ['x', 'y'] } }`
   */
  filter?: Record<string, any>;
  /** Search effort (larger = better recall but slower). Defaults to max(k*2, 50). */
  efSearch?: number;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Outcome of a query, expressed as three parallel arrays ordered from the
 * closest match to the farthest: position `i` of each array refers to the
 * same vector.
 */
export interface QueryResult {
  /** String IDs of the matched vectors. */
  ids: Array<string>;
  /** Distance of each match from the query vector. */
  distances: Array<number>;
  /** Metadata of each match (`{}` when none was stored). */
  metadata: Record<string, any>[];
}
|
|
66
|
+
|
|
67
|
+
/**
 * A named set of fixed-dimension vectors backed by an HNSW index.
 *
 * Callers address vectors by string ID; internally each vector gets a dense
 * numeric ID, which is what the index stores. Deletion is tombstone based:
 * deleted vectors stay in the index (and in the ID maps) until `compact()`
 * rebuilds the index without them.
 *
 * State is persisted to three files under the collection path:
 * `<name>.hnsw` (binary index), `<name>.meta` (one `id<TAB>numericId<TAB>json`
 * line per vector) and `<name>.deleted` (JSON array of tombstoned numeric IDs).
 */
export class Collection {
  private name: string;
  private dimension: number;
  private metric: DistanceMetric;
  private M: number; // HNSW M parameter
  private efConstruction: number; // HNSW efConstruction parameter
  private indexPath: string;
  private metaPath: string;
  private deletedPath: string; // Path for persisting deleted IDs
  private hnsw: HNSWIndex;
  private idMap: Map<string, number>; // string ID -> internal numeric ID
  private idReverseMap: Map<number, string>; // internal numeric ID -> string ID
  private metadata: Map<number, Record<string, any>>; // metadata keyed by numeric ID
  private deletedIds: Set<number>; // tombstones (numeric IDs)

  /**
   * @param name Collection name; also the base name of the persisted files
   * @param config Vector dimension plus optional metric (default `'cosine'`),
   *               `M` (default 16) and `efConstruction` (default 200)
   * @param collectionPath Directory that holds the collection files
   */
  constructor(
    name: string,
    config: { dimension: number; metric?: DistanceMetric; M?: number; efConstruction?: number },
    collectionPath: string
  ) {
    this.name = name;
    this.dimension = config.dimension;
    this.metric = config.metric || 'cosine';
    this.M = config.M || 16;
    this.efConstruction = config.efConstruction || 200;
    this.indexPath = path.join(collectionPath, `${name}.hnsw`);
    this.metaPath = path.join(collectionPath, `${name}.meta`);
    this.deletedPath = path.join(collectionPath, `${name}.deleted`);

    this.hnsw = new HNSWIndex(config.dimension, this.metric, this.M, this.efConstruction);

    this.idMap = new Map();
    this.idReverseMap = new Map();
    this.metadata = new Map();
    this.deletedIds = new Set();
  }

  /** Loads any previously persisted state; must be awaited before first use. */
  async init(): Promise<void> {
    await this.loadFromDisk();
  }

  /**
   * Restores the index, ID mappings/metadata and tombstones from disk.
   * Loading is best effort: a file that is missing is skipped, and a file
   * that cannot be read or parsed is reported with a warning and skipped.
   */
  private async loadFromDisk(): Promise<void> {
    // Binary HNSW index.
    const indexFile = Bun.file(this.indexPath);
    if (await indexFile.exists()) {
      try {
        this.hnsw = await HNSWIndex.loadFromFile(this.indexPath);
      } catch (e) {
        console.warn(`Failed to load HNSW index from ${this.indexPath}:`, e);
      }
    }

    // ID mappings and metadata: one `id<TAB>numericId<TAB>json` line per vector.
    const metaFile = Bun.file(this.metaPath);
    if (await metaFile.exists()) {
      try {
        const metaContent = await metaFile.text();
        for (const line of metaContent.split('\n')) {
          if (line.trim().length === 0) continue;

          try {
            const parts = line.split('\t');
            const id = parts[0];
            const internalId = parseInt(parts[1], 10);
            if (isNaN(internalId)) {
              console.warn(`Skipping malformed line in metadata file: ${line}`);
              continue;
            }

            this.idMap.set(id, internalId);
            this.idReverseMap.set(internalId, id);

            if (parts.length > 2) {
              // Everything after the second tab is the JSON payload.
              const metaStr = parts.slice(2).join('\t');
              this.metadata.set(internalId, JSON.parse(metaStr));
            }
          } catch (e) {
            console.warn(`Skipping malformed line in metadata file: ${line}`);
          }
        }
      } catch (e) {
        console.warn(`Failed to read metadata file ${this.metaPath}:`, e);
      }
    }

    // Tombstones. Only numeric IDs that map to a known vector are restored, so
    // a stale or corrupted file cannot make count() drift or go negative.
    const deletedFile = Bun.file(this.deletedPath);
    if (await deletedFile.exists()) {
      try {
        const parsed: unknown = JSON.parse(await deletedFile.text());
        if (Array.isArray(parsed)) {
          const restored = new Set<number>();
          for (const value of parsed) {
            if (typeof value === 'number' && this.idReverseMap.has(value)) {
              restored.add(value);
            }
          }
          this.deletedIds = restored;
        }
      } catch (e) {
        console.warn(`Failed to read deleted-IDs file ${this.deletedPath}:`, e);
      }
    }
  }

  /**
   * Adds new vectors to the collection.
   *
   * The whole batch is validated before any state is touched, so a rejected
   * call leaves the collection unchanged.
   *
   * @throws Error if the array lengths disagree, an ID already exists (even
   *         if tombstoned), an ID is repeated within the batch, or a vector
   *         has the wrong dimension
   */
  async add(config: AddConfig): Promise<void> {
    const { ids, vectors, metadata } = config;

    if (vectors.length !== ids.length) {
      throw new Error('Number of vectors must match number of IDs');
    }

    if (metadata && metadata.length !== ids.length) {
      throw new Error('Number of metadata entries must match number of IDs');
    }

    // IDs must be new to the collection and unique within the batch. A repeat
    // inside the batch would make idMap grow by less than the number of points,
    // and later numeric IDs (derived from idMap.size) would collide.
    const duplicates: string[] = [];
    const repeated: string[] = [];
    const batchIds = new Set<string>();
    for (const id of ids) {
      if (this.idMap.has(id)) {
        duplicates.push(id);
      } else if (batchIds.has(id)) {
        repeated.push(id);
      }
      batchIds.add(id);
    }
    if (duplicates.length > 0) {
      throw new Error(
        `Cannot add: IDs already exist: ${this.summarizeIds(duplicates)}. ` +
        `Existing vectors cannot be updated in place; delete() them and compact() before re-adding.`
      );
    }
    if (repeated.length > 0) {
      throw new Error(
        `Cannot add: IDs appear more than once in the same batch: ${this.summarizeIds(repeated)}.`
      );
    }

    // Check every dimension up front; nothing has been mutated yet.
    for (let i = 0; i < vectors.length; i++) {
      if (vectors[i].length !== this.dimension) {
        throw new Error(`Vector at index ${i} has dimension ${vectors[i].length}, expected ${this.dimension}`);
      }
    }

    // Numeric IDs are dense: idMap only ever grows (compact() renumbers from 0).
    const startId = this.idMap.size;
    const points: Array<{ id: number; vector: Float32Array }> = new Array(ids.length);

    for (let i = 0; i < ids.length; i++) {
      const numericId = startId + i;

      this.idMap.set(ids[i], numericId);
      this.idReverseMap.set(numericId, ids[i]);

      const meta = metadata ? metadata[i] : undefined;
      if (meta) {
        this.metadata.set(numericId, meta);
      }

      points[i] = { id: numericId, vector: new Float32Array(vectors[i]) };
    }

    await this.hnsw.addPointsBulk(points);
  }

  /**
   * Approximate k-nearest-neighbour search.
   *
   * Twice `k` candidates are requested from the index so that some can be
   * lost to tombstones or the metadata filter; fewer than `k` results may
   * still come back when many candidates are rejected.
   *
   * @throws Error if the query vector has the wrong dimension
   * @throws RangeError if `k` is not a non-negative integer
   */
  async query(config: QueryConfig): Promise<QueryResult> {
    const { queryVector, k, filter, efSearch } = config;

    if (queryVector.length !== this.dimension) {
      throw new Error(`Query vector has dimension ${queryVector.length}, expected ${this.dimension}`);
    }
    this.assertValidK(k);

    const ef = efSearch || Math.max(k * 2, 50); // Default ef to 2*k or 50
    const candidates = this.hnsw.searchKNN(new Float32Array(queryVector), k * 2, ef);

    return this.collectResults(candidates, k, filter);
  }

  /**
   * Batch query for multiple vectors at once.
   * More efficient than calling query() multiple times.
   *
   * The index is searched once with the largest `k` and the largest search
   * effort of the batch; each result is then trimmed to its own `k`.
   *
   * @param configs Array of query configurations
   * @returns Array of query results, one per query
   * @throws Error if any query vector has the wrong dimension
   * @throws RangeError if any `k` is not a non-negative integer
   */
  async queryBatch(configs: QueryConfig[]): Promise<QueryResult[]> {
    if (configs.length === 0) return [];

    for (let i = 0; i < configs.length; i++) {
      if (configs[i].queryVector.length !== this.dimension) {
        throw new Error(`Query ${i} has dimension ${configs[i].queryVector.length}, expected ${this.dimension}`);
      }
    }

    const queries = new Array<Float32Array>(configs.length);
    let maxK = 0;
    let maxEf = 0;
    for (let i = 0; i < configs.length; i++) {
      const c = configs[i];
      this.assertValidK(c.k);
      queries[i] = new Float32Array(c.queryVector);
      if (c.k > maxK) maxK = c.k;
      const ef = c.efSearch || Math.max(c.k * 2, 50);
      if (ef > maxEf) maxEf = ef;
    }

    const batchResults = this.hnsw.searchKNNBatch(queries, maxK * 2, maxEf);

    const results: QueryResult[] = [];
    for (let q = 0; q < configs.length; q++) {
      results.push(this.collectResults(batchResults[q], configs[q].k, configs[q].filter));
    }
    return results;
  }

  /**
   * Brute-force KNN search for validation and correctness checking.
   *
   * Scans every stored vector and returns the true `k` nearest neighbours
   * under the same visibility rules as query(): tombstoned vectors are
   * excluded and the metadata filter is applied before the top-k cut.
   *
   * @throws Error if the query vector has the wrong dimension
   * @throws RangeError if `k` is not a non-negative integer
   */
  async queryBruteForce(config: QueryConfig): Promise<QueryResult> {
    const { queryVector, k, filter } = config;

    if (queryVector.length !== this.dimension) {
      throw new Error(`Query vector has dimension ${queryVector.length}, expected ${this.dimension}`);
    }
    this.assertValidK(k);

    // Convert once rather than once per stored vector.
    const probe = new Float32Array(queryVector);
    const scored: Array<{ id: number; distance: number }> = [];
    for (const [numericId, vector] of this.hnsw.getAllVectors()) {
      scored.push({ id: numericId, distance: this.hnsw.calculateDistance(probe, vector) });
    }

    return this.collectResults(scored, k, filter);
  }

  /**
   * Insert-only "upsert".
   *
   * NOTE: HNSW indices don't support efficient in-place updates, so despite
   * its name this method never updates: duplicate IDs throw. To replace a
   * vector, delete() it, compact(), then add the new version.
   *
   * @throws Error if any ID already exists
   */
  async upsert(config: AddConfig): Promise<void> {
    const duplicates = config.ids.filter(id => this.idMap.has(id));

    if (duplicates.length > 0) {
      throw new Error(
        `Cannot upsert: IDs already exist: ${duplicates.join(', ')}. ` +
        `HNSW indices don't support efficient updates. ` +
        `To update vectors, create a new index without the old vectors.`
      );
    }

    await this.add(config);
  }

  /**
   * Returns the number of active (non-deleted) vectors in the collection.
   */
  count(): number {
    return this.idMap.size - this.deletedIds.size;
  }

  /**
   * Returns the total number of vectors including deleted (tombstoned) ones.
   * Use this to determine when compaction might be beneficial.
   */
  countWithDeleted(): number {
    return this.idMap.size;
  }

  /**
   * Returns the number of deleted (tombstoned) vectors.
   */
  deletedCount(): number {
    return this.deletedIds.size;
  }

  /**
   * Mark a vector as deleted (tombstone deletion).
   * The vector remains in the index but is excluded from search results.
   * Use compact() to permanently remove deleted vectors and reclaim space.
   *
   * @param id The string ID of the vector to delete
   * @returns true if the vector was deleted, false if it didn't exist or was already deleted
   */
  delete(id: string): boolean {
    const numericId = this.idMap.get(id);
    if (numericId === undefined || this.deletedIds.has(numericId)) return false;

    this.deletedIds.add(numericId);
    return true;
  }

  /**
   * Mark multiple vectors as deleted (tombstone deletion).
   *
   * @param ids Array of string IDs to delete
   * @returns Number of vectors that were successfully deleted
   */
  deleteBatch(ids: string[]): number {
    let deleted = 0;
    for (const id of ids) {
      if (this.delete(id)) deleted++;
    }
    return deleted;
  }

  /**
   * Check if a vector exists and is not deleted.
   */
  has(id: string): boolean {
    const numericId = this.idMap.get(id);
    return numericId !== undefined && !this.deletedIds.has(numericId);
  }

  /**
   * Check if a vector was deleted (tombstoned).
   */
  isDeleted(id: string): boolean {
    const numericId = this.idMap.get(id);
    return numericId !== undefined && this.deletedIds.has(numericId);
  }

  /**
   * Persists the index, the ID mappings/metadata and the tombstones.
   *
   * NOTE(review): the metadata file is tab/newline delimited and IDs are
   * written verbatim, so an ID containing a tab or a newline will not
   * round-trip through loadFromDisk().
   */
  async saveToDisk(): Promise<void> {
    // Best-effort directory creation; a real failure surfaces in the writes below.
    const dirPath = path.dirname(this.indexPath);
    await mkdir(dirPath, { recursive: true }).catch(() => {});

    await this.hnsw.saveToFile(this.indexPath);

    const metaLines: string[] = [];
    for (const [numericId, id] of this.idReverseMap) {
      const meta = this.metadata.get(numericId);
      const metaStr = meta ? JSON.stringify(meta) : '{}';
      metaLines.push(`${id}\t${numericId}\t${metaStr}`);
    }
    await Bun.write(this.metaPath, metaLines.join('\n'));

    if (this.deletedIds.size > 0) {
      await Bun.write(this.deletedPath, JSON.stringify([...this.deletedIds]));
    } else {
      // No tombstones: reset an existing file to an empty list (it is not
      // removed) so stale IDs are not loaded again next time.
      const deletedFile = Bun.file(this.deletedPath);
      if (await deletedFile.exists()) {
        await Bun.write(this.deletedPath, '[]');
      }
    }
  }

  /**
   * Tests one metadata record against a MongoDB-style filter.
   *
   * Range operators are written as positive tests, so a missing field (or
   * any value that does not compare, such as NaN) fails `$gt`, `$gte`, `$lt`
   * and `$lte` instead of slipping through. A missing field still satisfies
   * `$ne` and `$nin`.
   *
   * @returns true only if every field listed in `filter` matches
   */
  private matchesFilter(metadata: Record<string, any>, filter: Record<string, any>): boolean {
    for (const key in filter) {
      const expected = filter[key];
      const actual = metadata[key];

      if (typeof expected === 'object' && expected !== null) {
        // Operator object such as { $gt: 25 }
        if (expected.$gt !== undefined && !(actual > expected.$gt)) return false;
        if (expected.$lt !== undefined && !(actual < expected.$lt)) return false;
        if (expected.$gte !== undefined && !(actual >= expected.$gte)) return false;
        if (expected.$lte !== undefined && !(actual <= expected.$lte)) return false;
        if (expected.$ne !== undefined && actual === expected.$ne) return false;
        if (expected.$in !== undefined && !expected.$in.includes(actual)) return false;
        if (expected.$nin !== undefined && expected.$nin.includes(actual)) return false;
      } else if (actual !== expected) {
        // Plain value: strict equality
        return false;
      }
    }
    return true;
  }

  /**
   * Compact the collection by rebuilding the index without deleted vectors.
   * This permanently removes tombstoned vectors and reclaims space.
   * Surviving vectors are renumbered densely from 0.
   *
   * The previous index is kept alive until the rebuild has succeeded; if the
   * bulk insert fails, the previous state is restored and the error rethrown.
   *
   * @returns Number of vectors removed during compaction
   */
  async compact(): Promise<number> {
    if (this.deletedIds.size === 0) return 0;

    const removedCount = this.deletedIds.size;

    const rebuiltIndex = new HNSWIndex(this.dimension, this.metric, this.M, this.efConstruction);
    const rebuiltIdMap = new Map<string, number>();
    const rebuiltReverseMap = new Map<number, string>();
    const rebuiltMetadata = new Map<number, Record<string, any>>();
    const points: Array<{ id: number; vector: Float32Array }> = [];

    for (const [oldNumericId, vector] of this.hnsw.getAllVectors()) {
      if (this.deletedIds.has(oldNumericId)) continue;

      const stringId = this.idReverseMap.get(oldNumericId);
      if (stringId === undefined) continue; // vector without a mapping cannot be kept

      const newNumericId = points.length;
      rebuiltIdMap.set(stringId, newNumericId);
      rebuiltReverseMap.set(newNumericId, stringId);

      const meta = this.metadata.get(oldNumericId);
      if (meta) {
        rebuiltMetadata.set(newNumericId, meta);
      }

      points.push({ id: newNumericId, vector });
    }

    // Swap the new state in before awaiting, remembering the old one for rollback.
    const previous = {
      hnsw: this.hnsw,
      idMap: this.idMap,
      idReverseMap: this.idReverseMap,
      metadata: this.metadata,
      deletedIds: this.deletedIds,
    };
    this.hnsw = rebuiltIndex;
    this.idMap = rebuiltIdMap;
    this.idReverseMap = rebuiltReverseMap;
    this.metadata = rebuiltMetadata;
    this.deletedIds = new Set();

    try {
      await rebuiltIndex.addPointsBulk(points);
    } catch (e) {
      this.hnsw = previous.hnsw;
      this.idMap = previous.idMap;
      this.idReverseMap = previous.idReverseMap;
      this.metadata = previous.metadata;
      this.deletedIds = previous.deletedIds;
      try {
        rebuiltIndex.destroy();
      } catch {
        // Ignore cleanup failures; the insert error is the one worth reporting.
      }
      throw e;
    }

    previous.hnsw.destroy();
    return removedCount;
  }

  /** Saves the collection to disk, then releases the index and all in-memory maps. */
  async destroy(): Promise<void> {
    await this.saveToDisk();

    if (this.hnsw && typeof this.hnsw.destroy === 'function') {
      this.hnsw.destroy();
    }

    this.idMap.clear();
    this.idReverseMap.clear();
    this.metadata.clear();
    this.deletedIds.clear();
  }

  /** Rejects a `k` that is not a non-negative integer. */
  private assertValidK(k: number): void {
    if (!Number.isInteger(k) || k < 0) {
      throw new RangeError(`k must be a non-negative integer, got ${k}`);
    }
  }

  /** Formats at most five IDs for an error message, summarising the remainder. */
  private summarizeIds(ids: string[]): string {
    const shown = ids.slice(0, 5).join(', ');
    return ids.length > 5 ? `${shown} and ${ids.length - 5} more` : shown;
  }

  /** True when the filter has at least one enumerable key (no array is allocated). */
  private hasFilterKeys(filter: Record<string, any>): boolean {
    for (const _ in filter) return true;
    return false;
  }

  /**
   * Turns raw `{ id, distance }` candidates into a QueryResult: drops
   * tombstoned, duplicate, unmapped and filtered-out candidates, sorts the
   * remainder by ascending distance and keeps the first `k`.
   * All rejections happen before the cut, so rejected candidates never take
   * a result slot.
   */
  private collectResults(
    candidates: Array<{ id: number; distance: number }>,
    k: number,
    filter?: Record<string, any>
  ): QueryResult {
    const activeFilter = filter && this.hasFilterKeys(filter) ? filter : undefined;

    const seen = new Set<number>();
    const survivors: Array<{ id: number; distance: number; stringId: string }> = [];
    for (const candidate of candidates) {
      if (seen.has(candidate.id) || this.deletedIds.has(candidate.id)) continue;

      // Compare with undefined: the empty string is a legal ID.
      const stringId = this.idReverseMap.get(candidate.id);
      if (stringId === undefined) continue;

      if (activeFilter && !this.matchesFilter(this.metadata.get(candidate.id) || {}, activeFilter)) {
        continue;
      }

      seen.add(candidate.id);
      survivors.push({ id: candidate.id, distance: candidate.distance, stringId });
    }

    survivors.sort((a, b) => a.distance - b.distance);

    const limit = Math.min(survivors.length, k);
    const ids = new Array<string>(limit);
    const distances = new Array<number>(limit);
    const metadata = new Array<Record<string, any>>(limit);
    for (let i = 0; i < limit; i++) {
      ids[i] = survivors[i].stringId;
      distances[i] = survivors[i].distance;
      metadata[i] = this.metadata.get(survivors[i].id) || {};
    }

    return { ids, distances, metadata };
  }
}
|