verso-db 0.3.0 → 0.4.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/dist/HNSWIndex.js CHANGED
@@ -11,6 +11,7 @@ export class HNSWIndex {
11
11
  static MAX_NODE_ID = HNSWIndex.MISSING_ID_SENTINEL - 1;
12
12
  static MAX_ARRAY_BUFFER_BYTES = 0x7FFFFFFF;
13
13
  static MAX_NODE_SLOTS = 50_000_000;
14
+ static MAX_SERIALIZED_LEVEL = 31;
14
15
  /**
15
16
  * Allocate a Float32Array backed by SharedArrayBuffer when available.
16
17
  * This allows workers to read vector data without copying.
@@ -69,6 +70,7 @@ export class HNSWIndex {
69
70
  vectorsAreNormalized = false;
70
71
  // Cached distance function to avoid switch overhead
71
72
  distanceFn;
73
+ random;
72
74
  // Quantization support for 3-4x faster search with Int8
73
75
  scalarQuantizer = null;
74
76
  // OPTIMIZATION: Use array instead of Map for int8 vectors too
@@ -98,7 +100,7 @@ export class HNSWIndex {
98
100
  constructionMode = false;
99
101
  // Adaptive efSearch calibration stats
100
102
  calibrationStats = null;
101
- constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200) {
103
+ constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200, random = Math.random) {
102
104
  if (dimension <= 0 || !Number.isInteger(dimension)) {
103
105
  throw new VectorDBError(`Invalid dimension: must be a positive integer, got ${dimension}`, 'VALIDATION_ERROR');
104
106
  }
@@ -108,8 +110,12 @@ export class HNSWIndex {
108
110
  if (efConstruction <= 0 || !Number.isInteger(efConstruction)) {
109
111
  throw new VectorDBError(`Invalid efConstruction parameter: must be a positive integer, got ${efConstruction}`, 'VALIDATION_ERROR');
110
112
  }
113
+ if (typeof random !== 'function') {
114
+ throw new VectorDBError('Invalid random source: must be a function returning a number in [0, 1)', 'VALIDATION_ERROR');
115
+ }
111
116
  this.dimension = dimension;
112
117
  this.metric = metric;
118
+ this.random = random;
113
119
  this.M = M;
114
120
  this.M0 = M * 2;
115
121
  this.efConstruction = efConstruction;
@@ -437,8 +443,18 @@ export class HNSWIndex {
437
443
  normalizeInPlace(vector);
438
444
  return vector;
439
445
  }
446
+ randomFloat() {
447
+ const value = this.random();
448
+ if (!Number.isFinite(value))
449
+ return 0;
450
+ if (value <= 0)
451
+ return 0;
452
+ if (value >= 1)
453
+ return 1 - Number.EPSILON;
454
+ return value;
455
+ }
440
456
  selectLevel() {
441
- const r = Math.random() || Number.MIN_VALUE;
457
+ const r = this.randomFloat() || Number.MIN_VALUE;
442
458
  const level = Math.floor(-Math.log(r) * this.levelMult);
443
459
  return Math.max(0, Math.min(level, this.maxLayers - 1));
444
460
  }
@@ -780,23 +796,10 @@ export class HNSWIndex {
780
796
  if (this.nodes[id]) {
781
797
  throw new VectorDBError(`Duplicate node ID ${id}: node already exists`, 'DUPLICATE_VECTOR');
782
798
  }
783
- // Optimize: only copy when necessary
784
- // - Always copy arrays (need Float32Array)
785
- // - Copy Float32Array only if we need to normalize (modifies in place)
786
- // - Reuse input directly if skipNormalization is set (caller guarantees immutability)
787
- let floatVector;
788
- if (Array.isArray(vector)) {
789
- floatVector = new Float32Array(vector);
790
- }
791
- else if (this.vectorsAreNormalized && !options?.skipNormalization) {
792
- // Need to copy because normalizeVector modifies in place
793
- floatVector = new Float32Array(vector);
794
- }
795
- else {
796
- // No normalization needed and input is Float32Array - use directly
797
- // Note: caller should not modify this array after passing it
798
- floatVector = vector;
799
- }
799
+ // Always take ownership of vector data. Reusing caller-owned
800
+ // Float32Arrays lets later external mutation diverge node.vector from the
801
+ // flat vector store and corrupt search behavior.
802
+ let floatVector = new Float32Array(vector);
800
803
  if (floatVector.length !== this.dimension) {
801
804
  throw new VectorDBError(`Vector dimension ${floatVector.length} does not match expected ${this.dimension}`, 'DIMENSION_MISMATCH');
802
805
  }
@@ -1121,7 +1124,7 @@ export class HNSWIndex {
1121
1124
  const sampleIndices = [];
1122
1125
  const usedIndices = new Set();
1123
1126
  while (sampleIndices.length < sampleSize) {
1124
- const idx = Math.floor(Math.random() * n);
1127
+ const idx = Math.floor(this.randomFloat() * n);
1125
1128
  if (!usedIndices.has(idx)) {
1126
1129
  usedIndices.add(idx);
1127
1130
  sampleIndices.push(idx);
@@ -1303,12 +1306,11 @@ export class HNSWIndex {
1303
1306
  }
1304
1307
  // Pre-normalize all vectors if needed
1305
1308
  const normalizedPoints = points.map(p => {
1309
+ const vector = new Float32Array(p.vector);
1306
1310
  if (this.vectorsAreNormalized && !skipNorm) {
1307
- const v = new Float32Array(p.vector);
1308
- normalizeInPlace(v);
1309
- return { id: p.id, vector: v };
1311
+ normalizeInPlace(vector);
1310
1312
  }
1311
- return p;
1313
+ return { id: p.id, vector };
1312
1314
  });
1313
1315
  // Phase 1: Sequential seed insertion
1314
1316
  const seedSize = Math.min(Math.max(500, Math.floor(normalizedPoints.length * seedFraction)), normalizedPoints.length);
@@ -2114,6 +2116,18 @@ export class HNSWIndex {
2114
2116
  const entryPointId = readInt32('entryPointId');
2115
2117
  const nodeCount = readUint32('nodeCount');
2116
2118
  const vectorByteLength = dimension * 4;
2119
+ if (nodeCount > HNSWIndex.MAX_NODE_SLOTS) {
2120
+ throw new VectorDBError(`Corrupt HNSW data: node count ${nodeCount} exceeds maximum supported ${HNSWIndex.MAX_NODE_SLOTS}`, 'CORRUPT_INDEX');
2121
+ }
2122
+ if (maxLevel < -1 || maxLevel > HNSWIndex.MAX_SERIALIZED_LEVEL) {
2123
+ throw new VectorDBError(`Corrupt HNSW data: invalid maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
2124
+ }
2125
+ if (nodeCount === 0 && (entryPointId !== -1 || maxLevel !== -1)) {
2126
+ throw new VectorDBError('Corrupt HNSW data: empty index has non-empty entry point metadata', 'CORRUPT_INDEX');
2127
+ }
2128
+ if (nodeCount > 0 && (entryPointId < 0 || maxLevel < 0)) {
2129
+ throw new VectorDBError('Corrupt HNSW data: populated index has missing entry point metadata', 'CORRUPT_INDEX');
2130
+ }
2117
2131
  // V3+ has vectorDataOffset in header
2118
2132
  let vectorDataOffset = 0;
2119
2133
  if (formatVersion >= 3) {
@@ -2126,6 +2140,41 @@ export class HNSWIndex {
2126
2140
  index.maxLevel = maxLevel;
2127
2141
  index.entryPointId = entryPointId;
2128
2142
  const indexToId = new Array(nodeCount);
2143
+ const seenNodeIds = new Set();
2144
+ let observedMaxLevel = -1;
2145
+ const validateSerializedNode = (id, level, nodeIndex) => {
2146
+ if (id > HNSWIndex.MAX_NODE_ID) {
2147
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has reserved or unsupported ID ${id}`, 'CORRUPT_INDEX');
2148
+ }
2149
+ if (level > HNSWIndex.MAX_SERIALIZED_LEVEL || level > maxLevel) {
2150
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has invalid level ${level}`, 'CORRUPT_INDEX');
2151
+ }
2152
+ if (seenNodeIds.has(id)) {
2153
+ throw new VectorDBError(`Corrupt HNSW data: duplicate node ID ${id}`, 'CORRUPT_INDEX');
2154
+ }
2155
+ seenNodeIds.add(id);
2156
+ if (level > observedMaxLevel)
2157
+ observedMaxLevel = level;
2158
+ };
2159
+ const validateNeighborList = (nodeId, layer, neighbors) => {
2160
+ const maxConnections = layer === 0 ? M * 2 : M;
2161
+ if (neighbors.length > maxConnections) {
2162
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} layer ${layer} has ${neighbors.length} neighbors, maximum is ${maxConnections}`, 'CORRUPT_INDEX');
2163
+ }
2164
+ const seenNeighbors = new Set();
2165
+ for (const neighborId of neighbors) {
2166
+ if (!seenNodeIds.has(neighborId)) {
2167
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references missing neighbor ${neighborId}`, 'CORRUPT_INDEX');
2168
+ }
2169
+ if (neighborId === nodeId) {
2170
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references itself as a neighbor`, 'CORRUPT_INDEX');
2171
+ }
2172
+ if (seenNeighbors.has(neighborId)) {
2173
+ throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} has duplicate neighbor ${neighborId}`, 'CORRUPT_INDEX');
2174
+ }
2175
+ seenNeighbors.add(neighborId);
2176
+ }
2177
+ };
2129
2178
  if (formatVersion >= 3) {
2130
2179
  // V3 format: vectors at end, supports lazy loading
2131
2180
  const nodeMetadata = [];
@@ -2134,6 +2183,7 @@ export class HNSWIndex {
2134
2183
  for (let i = 0; i < nodeCount; i++) {
2135
2184
  const id = readUint32(`node ${i} id`);
2136
2185
  const level = readUint32(`node ${i} level`);
2186
+ validateSerializedNode(id, level, i);
2137
2187
  indexToId[i] = id;
2138
2188
  nodeMetadata.push({ id, level });
2139
2189
  }
@@ -2172,6 +2222,7 @@ export class HNSWIndex {
2172
2222
  }
2173
2223
  return indexToId[idx];
2174
2224
  });
2225
+ validateNeighborList(nodeMetadata[i].id, l, neighbors[l]);
2175
2226
  }
2176
2227
  nodeNeighbors.push(neighbors);
2177
2228
  }
@@ -2236,6 +2287,7 @@ export class HNSWIndex {
2236
2287
  for (let i = 0; i < nodeCount; i++) {
2237
2288
  const id = readUint32(`node ${i} id`);
2238
2289
  const level = readUint32(`node ${i} level`);
2290
+ validateSerializedNode(id, level, i);
2239
2291
  indexToId[i] = id;
2240
2292
  const vector = new Float32Array(dimension);
2241
2293
  for (let j = 0; j < dimension; j++) {
@@ -2275,6 +2327,7 @@ export class HNSWIndex {
2275
2327
  }
2276
2328
  return indexToId[idx];
2277
2329
  });
2330
+ validateNeighborList(id, l, neighbors[l]);
2278
2331
  }
2279
2332
  const node = { id, level, vector, neighbors };
2280
2333
  index.setNode(node);
@@ -2285,6 +2338,7 @@ export class HNSWIndex {
2285
2338
  for (let i = 0; i < nodeCount; i++) {
2286
2339
  const id = readUint32(`node ${i} id`);
2287
2340
  const level = readUint32(`node ${i} level`);
2341
+ validateSerializedNode(id, level, i);
2288
2342
  indexToId[i] = id;
2289
2343
  const vector = new Float32Array(dimension);
2290
2344
  for (let j = 0; j < dimension; j++) {
@@ -2310,9 +2364,20 @@ export class HNSWIndex {
2310
2364
  }
2311
2365
  node.neighbors[l][j] = indexToId[neighborIndex];
2312
2366
  }
2367
+ validateNeighborList(node.id, l, node.neighbors[l]);
2313
2368
  }
2314
2369
  }
2315
2370
  }
2371
+ if (nodeCount > 0 && observedMaxLevel !== maxLevel) {
2372
+ throw new VectorDBError(`Corrupt HNSW data: maxLevel ${maxLevel} does not match highest node level ${observedMaxLevel}`, 'CORRUPT_INDEX');
2373
+ }
2374
+ if (nodeCount > 0 && !seenNodeIds.has(entryPointId)) {
2375
+ throw new VectorDBError(`Corrupt HNSW data: entry point ${entryPointId} is not present in node table`, 'CORRUPT_INDEX');
2376
+ }
2377
+ const entryPoint = nodeCount > 0 ? index.nodes[entryPointId] : undefined;
2378
+ if (entryPoint && entryPoint.level !== maxLevel) {
2379
+ throw new VectorDBError(`Corrupt HNSW data: entry point level ${entryPoint.level} does not match maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
2380
+ }
2316
2381
  return index;
2317
2382
  }
2318
2383
  catch (error) {
@@ -2479,7 +2544,7 @@ export class HNSWIndex {
2479
2544
  continue;
2480
2545
  const vector = this.getNodeVector(node.id);
2481
2546
  if (vector) {
2482
- result.set(node.id, vector);
2547
+ result.set(node.id, new Float32Array(vector));
2483
2548
  }
2484
2549
  }
2485
2550
  return result;