verso-db 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/README.md +3 -3
- package/dist/Collection.d.ts +2 -0
- package/dist/Collection.d.ts.map +1 -1
- package/dist/Collection.js +13 -6
- package/dist/Collection.js.map +1 -1
- package/dist/HNSWIndex.d.ts +4 -1
- package/dist/HNSWIndex.d.ts.map +1 -1
- package/dist/HNSWIndex.js +90 -25
- package/dist/HNSWIndex.js.map +1 -1
- package/dist/VectorDB.d.ts.map +1 -1
- package/dist/VectorDB.js +21 -5
- package/dist/VectorDB.js.map +1 -1
- package/dist/encoding/DeltaEncoder.d.ts.map +1 -1
- package/dist/encoding/DeltaEncoder.js +17 -1
- package/dist/encoding/DeltaEncoder.js.map +1 -1
- package/dist/presets.d.ts +5 -4
- package/dist/presets.d.ts.map +1 -1
- package/dist/presets.js +11 -10
- package/dist/presets.js.map +1 -1
- package/dist/quantization/ScalarQuantizer.d.ts.map +1 -1
- package/dist/quantization/ScalarQuantizer.js +28 -2
- package/dist/quantization/ScalarQuantizer.js.map +1 -1
- package/dist/storage/BunStorageBackend.d.ts +0 -2
- package/dist/storage/BunStorageBackend.d.ts.map +1 -1
- package/dist/storage/BunStorageBackend.js +9 -5
- package/dist/storage/BunStorageBackend.js.map +1 -1
- package/package.json +2 -2
package/dist/HNSWIndex.js
CHANGED
|
@@ -11,6 +11,7 @@ export class HNSWIndex {
|
|
|
11
11
|
static MAX_NODE_ID = HNSWIndex.MISSING_ID_SENTINEL - 1;
|
|
12
12
|
static MAX_ARRAY_BUFFER_BYTES = 0x7FFFFFFF;
|
|
13
13
|
static MAX_NODE_SLOTS = 50_000_000;
|
|
14
|
+
static MAX_SERIALIZED_LEVEL = 31;
|
|
14
15
|
/**
|
|
15
16
|
* Allocate a Float32Array backed by SharedArrayBuffer when available.
|
|
16
17
|
* This allows workers to read vector data without copying.
|
|
@@ -69,6 +70,7 @@ export class HNSWIndex {
|
|
|
69
70
|
vectorsAreNormalized = false;
|
|
70
71
|
// Cached distance function to avoid switch overhead
|
|
71
72
|
distanceFn;
|
|
73
|
+
random;
|
|
72
74
|
// Quantization support for 3-4x faster search with Int8
|
|
73
75
|
scalarQuantizer = null;
|
|
74
76
|
// OPTIMIZATION: Use array instead of Map for int8 vectors too
|
|
@@ -98,7 +100,7 @@ export class HNSWIndex {
|
|
|
98
100
|
constructionMode = false;
|
|
99
101
|
// Adaptive efSearch calibration stats
|
|
100
102
|
calibrationStats = null;
|
|
101
|
-
constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200) {
|
|
103
|
+
constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200, random = Math.random) {
|
|
102
104
|
if (dimension <= 0 || !Number.isInteger(dimension)) {
|
|
103
105
|
throw new VectorDBError(`Invalid dimension: must be a positive integer, got ${dimension}`, 'VALIDATION_ERROR');
|
|
104
106
|
}
|
|
@@ -108,8 +110,12 @@ export class HNSWIndex {
|
|
|
108
110
|
if (efConstruction <= 0 || !Number.isInteger(efConstruction)) {
|
|
109
111
|
throw new VectorDBError(`Invalid efConstruction parameter: must be a positive integer, got ${efConstruction}`, 'VALIDATION_ERROR');
|
|
110
112
|
}
|
|
113
|
+
if (typeof random !== 'function') {
|
|
114
|
+
throw new VectorDBError('Invalid random source: must be a function returning a number in [0, 1)', 'VALIDATION_ERROR');
|
|
115
|
+
}
|
|
111
116
|
this.dimension = dimension;
|
|
112
117
|
this.metric = metric;
|
|
118
|
+
this.random = random;
|
|
113
119
|
this.M = M;
|
|
114
120
|
this.M0 = M * 2;
|
|
115
121
|
this.efConstruction = efConstruction;
|
|
@@ -437,8 +443,18 @@ export class HNSWIndex {
|
|
|
437
443
|
normalizeInPlace(vector);
|
|
438
444
|
return vector;
|
|
439
445
|
}
|
|
446
|
+
randomFloat() {
|
|
447
|
+
const value = this.random();
|
|
448
|
+
if (!Number.isFinite(value))
|
|
449
|
+
return 0;
|
|
450
|
+
if (value <= 0)
|
|
451
|
+
return 0;
|
|
452
|
+
if (value >= 1)
|
|
453
|
+
return 1 - Number.EPSILON;
|
|
454
|
+
return value;
|
|
455
|
+
}
|
|
440
456
|
selectLevel() {
|
|
441
|
-
const r =
|
|
457
|
+
const r = this.randomFloat() || Number.MIN_VALUE;
|
|
442
458
|
const level = Math.floor(-Math.log(r) * this.levelMult);
|
|
443
459
|
return Math.max(0, Math.min(level, this.maxLayers - 1));
|
|
444
460
|
}
|
|
@@ -780,23 +796,10 @@ export class HNSWIndex {
|
|
|
780
796
|
if (this.nodes[id]) {
|
|
781
797
|
throw new VectorDBError(`Duplicate node ID ${id}: node already exists`, 'DUPLICATE_VECTOR');
|
|
782
798
|
}
|
|
783
|
-
//
|
|
784
|
-
//
|
|
785
|
-
//
|
|
786
|
-
|
|
787
|
-
let floatVector;
|
|
788
|
-
if (Array.isArray(vector)) {
|
|
789
|
-
floatVector = new Float32Array(vector);
|
|
790
|
-
}
|
|
791
|
-
else if (this.vectorsAreNormalized && !options?.skipNormalization) {
|
|
792
|
-
// Need to copy because normalizeVector modifies in place
|
|
793
|
-
floatVector = new Float32Array(vector);
|
|
794
|
-
}
|
|
795
|
-
else {
|
|
796
|
-
// No normalization needed and input is Float32Array - use directly
|
|
797
|
-
// Note: caller should not modify this array after passing it
|
|
798
|
-
floatVector = vector;
|
|
799
|
-
}
|
|
799
|
+
// Always take ownership of vector data. Reusing caller-owned
|
|
800
|
+
// Float32Arrays lets later external mutation diverge node.vector from the
|
|
801
|
+
// flat vector store and corrupt search behavior.
|
|
802
|
+
let floatVector = new Float32Array(vector);
|
|
800
803
|
if (floatVector.length !== this.dimension) {
|
|
801
804
|
throw new VectorDBError(`Vector dimension ${floatVector.length} does not match expected ${this.dimension}`, 'DIMENSION_MISMATCH');
|
|
802
805
|
}
|
|
@@ -1121,7 +1124,7 @@ export class HNSWIndex {
|
|
|
1121
1124
|
const sampleIndices = [];
|
|
1122
1125
|
const usedIndices = new Set();
|
|
1123
1126
|
while (sampleIndices.length < sampleSize) {
|
|
1124
|
-
const idx = Math.floor(Math.random() * n);
|
|
1127
|
+
const idx = Math.floor(this.randomFloat() * n);
|
|
1125
1128
|
if (!usedIndices.has(idx)) {
|
|
1126
1129
|
usedIndices.add(idx);
|
|
1127
1130
|
sampleIndices.push(idx);
|
|
@@ -1303,12 +1306,11 @@ export class HNSWIndex {
|
|
|
1303
1306
|
}
|
|
1304
1307
|
// Pre-normalize all vectors if needed
|
|
1305
1308
|
const normalizedPoints = points.map(p => {
|
|
1309
|
+
const vector = new Float32Array(p.vector);
|
|
1306
1310
|
if (this.vectorsAreNormalized && !skipNorm) {
|
|
1307
|
-
const v = new Float32Array(p.vector);
|
1308
|
-
normalizeInPlace(v);
|
|
1309
|
-
return { id: p.id, vector: v };
|
|
1311
|
+
normalizeInPlace(vector);
|
|
1310
1312
|
}
|
|
1311
|
-
return p;
|
|
1313
|
+
return { id: p.id, vector };
|
|
1312
1314
|
});
|
|
1313
1315
|
// Phase 1: Sequential seed insertion
|
|
1314
1316
|
const seedSize = Math.min(Math.max(500, Math.floor(normalizedPoints.length * seedFraction)), normalizedPoints.length);
|
|
@@ -2114,6 +2116,18 @@ export class HNSWIndex {
|
|
|
2114
2116
|
const entryPointId = readInt32('entryPointId');
|
|
2115
2117
|
const nodeCount = readUint32('nodeCount');
|
|
2116
2118
|
const vectorByteLength = dimension * 4;
|
|
2119
|
+
if (nodeCount > HNSWIndex.MAX_NODE_SLOTS) {
|
|
2120
|
+
throw new VectorDBError(`Corrupt HNSW data: node count ${nodeCount} exceeds maximum supported ${HNSWIndex.MAX_NODE_SLOTS}`, 'CORRUPT_INDEX');
|
|
2121
|
+
}
|
|
2122
|
+
if (maxLevel < -1 || maxLevel > HNSWIndex.MAX_SERIALIZED_LEVEL) {
|
|
2123
|
+
throw new VectorDBError(`Corrupt HNSW data: invalid maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
|
|
2124
|
+
}
|
|
2125
|
+
if (nodeCount === 0 && (entryPointId !== -1 || maxLevel !== -1)) {
|
|
2126
|
+
throw new VectorDBError('Corrupt HNSW data: empty index has non-empty entry point metadata', 'CORRUPT_INDEX');
|
|
2127
|
+
}
|
|
2128
|
+
if (nodeCount > 0 && (entryPointId < 0 || maxLevel < 0)) {
|
|
2129
|
+
throw new VectorDBError('Corrupt HNSW data: populated index has missing entry point metadata', 'CORRUPT_INDEX');
|
|
2130
|
+
}
|
|
2117
2131
|
// V3+ has vectorDataOffset in header
|
|
2118
2132
|
let vectorDataOffset = 0;
|
|
2119
2133
|
if (formatVersion >= 3) {
|
|
@@ -2126,6 +2140,41 @@ export class HNSWIndex {
|
|
|
2126
2140
|
index.maxLevel = maxLevel;
|
|
2127
2141
|
index.entryPointId = entryPointId;
|
|
2128
2142
|
const indexToId = new Array(nodeCount);
|
|
2143
|
+
const seenNodeIds = new Set();
|
|
2144
|
+
let observedMaxLevel = -1;
|
|
2145
|
+
const validateSerializedNode = (id, level, nodeIndex) => {
|
|
2146
|
+
if (id > HNSWIndex.MAX_NODE_ID) {
|
|
2147
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has reserved or unsupported ID ${id}`, 'CORRUPT_INDEX');
|
|
2148
|
+
}
|
|
2149
|
+
if (level > HNSWIndex.MAX_SERIALIZED_LEVEL || level > maxLevel) {
|
|
2150
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has invalid level ${level}`, 'CORRUPT_INDEX');
|
|
2151
|
+
}
|
|
2152
|
+
if (seenNodeIds.has(id)) {
|
|
2153
|
+
throw new VectorDBError(`Corrupt HNSW data: duplicate node ID ${id}`, 'CORRUPT_INDEX');
|
|
2154
|
+
}
|
|
2155
|
+
seenNodeIds.add(id);
|
|
2156
|
+
if (level > observedMaxLevel)
|
|
2157
|
+
observedMaxLevel = level;
|
|
2158
|
+
};
|
|
2159
|
+
const validateNeighborList = (nodeId, layer, neighbors) => {
|
|
2160
|
+
const maxConnections = layer === 0 ? M * 2 : M;
|
|
2161
|
+
if (neighbors.length > maxConnections) {
|
|
2162
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} layer ${layer} has ${neighbors.length} neighbors, maximum is ${maxConnections}`, 'CORRUPT_INDEX');
|
|
2163
|
+
}
|
|
2164
|
+
const seenNeighbors = new Set();
|
|
2165
|
+
for (const neighborId of neighbors) {
|
|
2166
|
+
if (!seenNodeIds.has(neighborId)) {
|
|
2167
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references missing neighbor ${neighborId}`, 'CORRUPT_INDEX');
|
|
2168
|
+
}
|
|
2169
|
+
if (neighborId === nodeId) {
|
|
2170
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references itself as a neighbor`, 'CORRUPT_INDEX');
|
|
2171
|
+
}
|
|
2172
|
+
if (seenNeighbors.has(neighborId)) {
|
|
2173
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} has duplicate neighbor ${neighborId}`, 'CORRUPT_INDEX');
|
|
2174
|
+
}
|
|
2175
|
+
seenNeighbors.add(neighborId);
|
|
2176
|
+
}
|
|
2177
|
+
};
|
|
2129
2178
|
if (formatVersion >= 3) {
|
|
2130
2179
|
// V3 format: vectors at end, supports lazy loading
|
|
2131
2180
|
const nodeMetadata = [];
|
|
@@ -2134,6 +2183,7 @@ export class HNSWIndex {
|
|
|
2134
2183
|
for (let i = 0; i < nodeCount; i++) {
|
|
2135
2184
|
const id = readUint32(`node ${i} id`);
|
|
2136
2185
|
const level = readUint32(`node ${i} level`);
|
|
2186
|
+
validateSerializedNode(id, level, i);
|
|
2137
2187
|
indexToId[i] = id;
|
|
2138
2188
|
nodeMetadata.push({ id, level });
|
|
2139
2189
|
}
|
|
@@ -2172,6 +2222,7 @@ export class HNSWIndex {
|
|
|
2172
2222
|
}
|
|
2173
2223
|
return indexToId[idx];
|
|
2174
2224
|
});
|
|
2225
|
+
validateNeighborList(nodeMetadata[i].id, l, neighbors[l]);
|
|
2175
2226
|
}
|
|
2176
2227
|
nodeNeighbors.push(neighbors);
|
|
2177
2228
|
}
|
|
@@ -2236,6 +2287,7 @@ export class HNSWIndex {
|
|
|
2236
2287
|
for (let i = 0; i < nodeCount; i++) {
|
|
2237
2288
|
const id = readUint32(`node ${i} id`);
|
|
2238
2289
|
const level = readUint32(`node ${i} level`);
|
|
2290
|
+
validateSerializedNode(id, level, i);
|
|
2239
2291
|
indexToId[i] = id;
|
|
2240
2292
|
const vector = new Float32Array(dimension);
|
|
2241
2293
|
for (let j = 0; j < dimension; j++) {
|
|
@@ -2275,6 +2327,7 @@ export class HNSWIndex {
|
|
|
2275
2327
|
}
|
|
2276
2328
|
return indexToId[idx];
|
|
2277
2329
|
});
|
|
2330
|
+
validateNeighborList(id, l, neighbors[l]);
|
|
2278
2331
|
}
|
|
2279
2332
|
const node = { id, level, vector, neighbors };
|
|
2280
2333
|
index.setNode(node);
|
|
@@ -2285,6 +2338,7 @@ export class HNSWIndex {
|
|
|
2285
2338
|
for (let i = 0; i < nodeCount; i++) {
|
|
2286
2339
|
const id = readUint32(`node ${i} id`);
|
|
2287
2340
|
const level = readUint32(`node ${i} level`);
|
|
2341
|
+
validateSerializedNode(id, level, i);
|
|
2288
2342
|
indexToId[i] = id;
|
|
2289
2343
|
const vector = new Float32Array(dimension);
|
|
2290
2344
|
for (let j = 0; j < dimension; j++) {
|
|
@@ -2310,9 +2364,20 @@ export class HNSWIndex {
|
|
|
2310
2364
|
}
|
|
2311
2365
|
node.neighbors[l][j] = indexToId[neighborIndex];
|
|
2312
2366
|
}
|
|
2367
|
+
validateNeighborList(node.id, l, node.neighbors[l]);
|
|
2313
2368
|
}
|
|
2314
2369
|
}
|
|
2315
2370
|
}
|
|
2371
|
+
if (nodeCount > 0 && observedMaxLevel !== maxLevel) {
|
|
2372
|
+
throw new VectorDBError(`Corrupt HNSW data: maxLevel ${maxLevel} does not match highest node level ${observedMaxLevel}`, 'CORRUPT_INDEX');
|
|
2373
|
+
}
|
|
2374
|
+
if (nodeCount > 0 && !seenNodeIds.has(entryPointId)) {
|
|
2375
|
+
throw new VectorDBError(`Corrupt HNSW data: entry point ${entryPointId} is not present in node table`, 'CORRUPT_INDEX');
|
|
2376
|
+
}
|
|
2377
|
+
const entryPoint = nodeCount > 0 ? index.nodes[entryPointId] : undefined;
|
|
2378
|
+
if (entryPoint && entryPoint.level !== maxLevel) {
|
|
2379
|
+
throw new VectorDBError(`Corrupt HNSW data: entry point level ${entryPoint.level} does not match maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
|
|
2380
|
+
}
|
|
2316
2381
|
return index;
|
|
2317
2382
|
}
|
|
2318
2383
|
catch (error) {
|
|
@@ -2479,7 +2544,7 @@ export class HNSWIndex {
|
|
|
2479
2544
|
continue;
|
|
2480
2545
|
const vector = this.getNodeVector(node.id);
|
|
2481
2546
|
if (vector) {
|
|
2482
|
-
result.set(node.id, vector);
|
|
2547
|
+
result.set(node.id, new Float32Array(vector));
|
|
2483
2548
|
}
|
|
2484
2549
|
}
|
|
2485
2550
|
return result;
|