verso-db 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/README.md +31 -16
- package/dist/Collection.d.ts +2 -0
- package/dist/Collection.d.ts.map +1 -1
- package/dist/Collection.js +13 -6
- package/dist/Collection.js.map +1 -1
- package/dist/HNSWIndex.d.ts +44 -2
- package/dist/HNSWIndex.d.ts.map +1 -1
- package/dist/HNSWIndex.js +262 -45
- package/dist/HNSWIndex.js.map +1 -1
- package/dist/SearchWorker.d.ts +36 -8
- package/dist/SearchWorker.d.ts.map +1 -1
- package/dist/SearchWorker.js +207 -51
- package/dist/SearchWorker.js.map +1 -1
- package/dist/VectorDB.d.ts.map +1 -1
- package/dist/VectorDB.js +21 -5
- package/dist/VectorDB.js.map +1 -1
- package/dist/WorkerPool.d.ts +36 -8
- package/dist/WorkerPool.d.ts.map +1 -1
- package/dist/WorkerPool.js +191 -17
- package/dist/WorkerPool.js.map +1 -1
- package/dist/encoding/DeltaEncoder.d.ts.map +1 -1
- package/dist/encoding/DeltaEncoder.js +17 -1
- package/dist/encoding/DeltaEncoder.js.map +1 -1
- package/dist/presets.d.ts +5 -4
- package/dist/presets.d.ts.map +1 -1
- package/dist/presets.js +11 -10
- package/dist/presets.js.map +1 -1
- package/dist/quantization/ScalarQuantizer.d.ts.map +1 -1
- package/dist/quantization/ScalarQuantizer.js +28 -2
- package/dist/quantization/ScalarQuantizer.js.map +1 -1
- package/dist/storage/BunStorageBackend.d.ts +0 -2
- package/dist/storage/BunStorageBackend.d.ts.map +1 -1
- package/dist/storage/BunStorageBackend.js +9 -5
- package/dist/storage/BunStorageBackend.js.map +1 -1
- package/package.json +6 -2
package/dist/HNSWIndex.js
CHANGED
|
@@ -11,6 +11,7 @@ export class HNSWIndex {
|
|
|
11
11
|
static MAX_NODE_ID = HNSWIndex.MISSING_ID_SENTINEL - 1;
|
|
12
12
|
static MAX_ARRAY_BUFFER_BYTES = 0x7FFFFFFF;
|
|
13
13
|
static MAX_NODE_SLOTS = 50_000_000;
|
|
14
|
+
static MAX_SERIALIZED_LEVEL = 31;
|
|
14
15
|
/**
|
|
15
16
|
* Allocate a Float32Array backed by SharedArrayBuffer when available.
|
|
16
17
|
* This allows workers to read vector data without copying.
|
|
@@ -32,6 +33,12 @@ export class HNSWIndex {
|
|
|
32
33
|
}
|
|
33
34
|
/** Whether flatVectors/flatInt8Vectors use SharedArrayBuffer */
|
|
34
35
|
useSharedMemory = false;
|
|
36
|
+
// Shared graph SAB state for zero-copy worker communication
|
|
37
|
+
sharedGraphIndex = null;
|
|
38
|
+
sharedGraphNeighborData = null;
|
|
39
|
+
sharedGraphMaxLayerSlots = 0;
|
|
40
|
+
sharedGraphWriteOffset = 0;
|
|
41
|
+
sharedMetadata = null;
|
|
35
42
|
M; // Max number of connections per node per level
|
|
36
43
|
M0; // Max number of connections for level 0 (typically M * 2)
|
|
37
44
|
efConstruction; // Size of candidate list during construction
|
|
@@ -63,6 +70,7 @@ export class HNSWIndex {
|
|
|
63
70
|
vectorsAreNormalized = false;
|
|
64
71
|
// Cached distance function to avoid switch overhead
|
|
65
72
|
distanceFn;
|
|
73
|
+
random;
|
|
66
74
|
// Quantization support for 3-4x faster search with Int8
|
|
67
75
|
scalarQuantizer = null;
|
|
68
76
|
// OPTIMIZATION: Use array instead of Map for int8 vectors too
|
|
@@ -92,7 +100,7 @@ export class HNSWIndex {
|
|
|
92
100
|
constructionMode = false;
|
|
93
101
|
// Adaptive efSearch calibration stats
|
|
94
102
|
calibrationStats = null;
|
|
95
|
-
constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200) {
|
|
103
|
+
constructor(dimension, metric = 'cosine', M = 24, efConstruction = 200, random = Math.random) {
|
|
96
104
|
if (dimension <= 0 || !Number.isInteger(dimension)) {
|
|
97
105
|
throw new VectorDBError(`Invalid dimension: must be a positive integer, got ${dimension}`, 'VALIDATION_ERROR');
|
|
98
106
|
}
|
|
@@ -102,8 +110,12 @@ export class HNSWIndex {
|
|
|
102
110
|
if (efConstruction <= 0 || !Number.isInteger(efConstruction)) {
|
|
103
111
|
throw new VectorDBError(`Invalid efConstruction parameter: must be a positive integer, got ${efConstruction}`, 'VALIDATION_ERROR');
|
|
104
112
|
}
|
|
113
|
+
if (typeof random !== 'function') {
|
|
114
|
+
throw new VectorDBError('Invalid random source: must be a function returning a number in [0, 1)', 'VALIDATION_ERROR');
|
|
115
|
+
}
|
|
105
116
|
this.dimension = dimension;
|
|
106
117
|
this.metric = metric;
|
|
118
|
+
this.random = random;
|
|
107
119
|
this.M = M;
|
|
108
120
|
this.M0 = M * 2;
|
|
109
121
|
this.efConstruction = efConstruction;
|
|
@@ -125,6 +137,9 @@ export class HNSWIndex {
|
|
|
125
137
|
this.visitedArraySize = 10000;
|
|
126
138
|
this.visitedArray = new Uint16Array(this.visitedArraySize);
|
|
127
139
|
this.visitedGeneration = 1;
|
|
140
|
+
// Pre-allocate batch distance buffers sized for max neighbors at layer 0
|
|
141
|
+
this.batchNeighborIds = new Uint32Array(this.M0 + 1);
|
|
142
|
+
this.batchDistances = new Float64Array(this.M0 + 1);
|
|
128
143
|
// Pre-allocate searchLayer heaps - sized for typical ef values
|
|
129
144
|
// Will be resized if needed for larger ef
|
|
130
145
|
this.heapCapacity = Math.max(efConstruction * 2, 500);
|
|
@@ -223,9 +238,10 @@ export class HNSWIndex {
|
|
|
223
238
|
this.nextAutoId = id + 1;
|
|
224
239
|
}
|
|
225
240
|
}
|
|
226
|
-
// OPTIMIZATION:
|
|
227
|
-
|
|
228
|
-
|
|
241
|
+
// OPTIMIZATION: Pre-allocated typed arrays for batch distance calculation
|
|
242
|
+
// Sized to M0+1 (max neighbors at layer 0) to eliminate dynamic resizing
|
|
243
|
+
batchNeighborIds;
|
|
244
|
+
batchDistances;
|
|
229
245
|
/**
|
|
230
246
|
* OPTIMIZATION: Batch distance calculation for better cache locality
|
|
231
247
|
* Computes distances from query to multiple neighbors at once
|
|
@@ -427,8 +443,18 @@ export class HNSWIndex {
|
|
|
427
443
|
normalizeInPlace(vector);
|
|
428
444
|
return vector;
|
|
429
445
|
}
|
|
446
|
+
randomFloat() {
|
|
447
|
+
const value = this.random();
|
|
448
|
+
if (!Number.isFinite(value))
|
|
449
|
+
return 0;
|
|
450
|
+
if (value <= 0)
|
|
451
|
+
return 0;
|
|
452
|
+
if (value >= 1)
|
|
453
|
+
return 1 - Number.EPSILON;
|
|
454
|
+
return value;
|
|
455
|
+
}
|
|
430
456
|
selectLevel() {
|
|
431
|
-
const r =
|
|
457
|
+
const r = this.randomFloat() || Number.MIN_VALUE;
|
|
432
458
|
const level = Math.floor(-Math.log(r) * this.levelMult);
|
|
433
459
|
return Math.max(0, Math.min(level, this.maxLayers - 1));
|
|
434
460
|
}
|
|
@@ -667,10 +693,6 @@ export class HNSWIndex {
|
|
|
667
693
|
}
|
|
668
694
|
// Calculate all distances at once (better cache utilization)
|
|
669
695
|
if (batchCount > 0) {
|
|
670
|
-
// Ensure batch arrays are large enough
|
|
671
|
-
if (batchDists.length < batchCount) {
|
|
672
|
-
this.batchDistances.length = batchCount;
|
|
673
|
-
}
|
|
674
696
|
this.calculateDistancesBatch(query, batchIds, batchDists, batchCount);
|
|
675
697
|
// Process batch results
|
|
676
698
|
for (let i = 0; i < batchCount; i++) {
|
|
@@ -774,23 +796,10 @@ export class HNSWIndex {
|
|
|
774
796
|
if (this.nodes[id]) {
|
|
775
797
|
throw new VectorDBError(`Duplicate node ID ${id}: node already exists`, 'DUPLICATE_VECTOR');
|
|
776
798
|
}
|
|
777
|
-
//
|
|
778
|
-
//
|
|
779
|
-
//
|
|
780
|
-
|
|
781
|
-
let floatVector;
|
|
782
|
-
if (Array.isArray(vector)) {
|
|
783
|
-
floatVector = new Float32Array(vector);
|
|
784
|
-
}
|
|
785
|
-
else if (this.vectorsAreNormalized && !options?.skipNormalization) {
|
|
786
|
-
// Need to copy because normalizeVector modifies in place
|
|
787
|
-
floatVector = new Float32Array(vector);
|
|
788
|
-
}
|
|
789
|
-
else {
|
|
790
|
-
// No normalization needed and input is Float32Array - use directly
|
|
791
|
-
// Note: caller should not modify this array after passing it
|
|
792
|
-
floatVector = vector;
|
|
793
|
-
}
|
|
799
|
+
// Always take ownership of vector data. Reusing caller-owned
|
|
800
|
+
// Float32Arrays lets later external mutation diverge node.vector from the
|
|
801
|
+
// flat vector store and corrupt search behavior.
|
|
802
|
+
let floatVector = new Float32Array(vector);
|
|
794
803
|
if (floatVector.length !== this.dimension) {
|
|
795
804
|
throw new VectorDBError(`Vector dimension ${floatVector.length} does not match expected ${this.dimension}`, 'DIMENSION_MISMATCH');
|
|
796
805
|
}
|
|
@@ -1115,7 +1124,7 @@ export class HNSWIndex {
|
|
|
1115
1124
|
const sampleIndices = [];
|
|
1116
1125
|
const usedIndices = new Set();
|
|
1117
1126
|
while (sampleIndices.length < sampleSize) {
|
|
1118
|
-
const idx = Math.floor(
|
|
1127
|
+
const idx = Math.floor(this.randomFloat() * n);
|
|
1119
1128
|
if (!usedIndices.has(idx)) {
|
|
1120
1129
|
usedIndices.add(idx);
|
|
1121
1130
|
sampleIndices.push(idx);
|
|
@@ -1297,12 +1306,11 @@ export class HNSWIndex {
|
|
|
1297
1306
|
}
|
|
1298
1307
|
// Pre-normalize all vectors if needed
|
|
1299
1308
|
const normalizedPoints = points.map(p => {
|
|
1309
|
+
const vector = new Float32Array(p.vector);
|
|
1300
1310
|
if (this.vectorsAreNormalized && !skipNorm) {
|
|
1301
|
-
|
|
1302
|
-
normalizeInPlace(v);
|
|
1303
|
-
return { id: p.id, vector: v };
|
|
1311
|
+
normalizeInPlace(vector);
|
|
1304
1312
|
}
|
|
1305
|
-
return p;
|
|
1313
|
+
return { id: p.id, vector };
|
|
1306
1314
|
});
|
|
1307
1315
|
// Phase 1: Sequential seed insertion
|
|
1308
1316
|
const seedSize = Math.min(Math.max(500, Math.floor(normalizedPoints.length * seedFraction)), normalizedPoints.length);
|
|
@@ -1400,17 +1408,27 @@ export class HNSWIndex {
|
|
|
1400
1408
|
}
|
|
1401
1409
|
batchCount++;
|
|
1402
1410
|
// Send incremental graph update: ALL modified nodes (new + existing with new connections)
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1411
|
+
if (this.hasSharedGraph()) {
|
|
1412
|
+
// Shared graph: write directly to SABs, workers see updates via shared memory
|
|
1413
|
+
this.updateSharedGraphNodes(modifiedNodeIds);
|
|
1414
|
+
this.updateSharedMetadata();
|
|
1415
|
+
}
|
|
1416
|
+
else {
|
|
1417
|
+
// Legacy: send graph data via postMessage
|
|
1418
|
+
const graphUpdate = [];
|
|
1419
|
+
for (const nid of modifiedNodeIds) {
|
|
1420
|
+
const node = this.nodes[nid];
|
|
1421
|
+
if (node) {
|
|
1422
|
+
graphUpdate.push({ id: nid, neighbors: node.neighbors });
|
|
1423
|
+
}
|
|
1408
1424
|
}
|
|
1425
|
+
pool.broadcastGraphUpdate(graphUpdate, this.entryPointId, this.maxLevel);
|
|
1409
1426
|
}
|
|
1410
|
-
pool.broadcastGraphUpdate(graphUpdate, this.entryPointId, this.maxLevel);
|
|
1411
1427
|
// Full re-sync periodically for accumulated neighbor changes
|
|
1412
|
-
//
|
|
1428
|
+
// With shared graph, this defragments dead space in neighbor data.
|
|
1429
|
+
// Without shared graph, this fixes accumulated drift from incremental updates.
|
|
1413
1430
|
if (batchCount % resyncInterval === 0 && start + batchSize < normalizedPoints.length) {
|
|
1431
|
+
this.clearSharedGraph();
|
|
1414
1432
|
pool.destroy();
|
|
1415
1433
|
await pool.init(this);
|
|
1416
1434
|
}
|
|
@@ -1572,7 +1590,6 @@ export class HNSWIndex {
|
|
|
1572
1590
|
getSharedSearchData() {
|
|
1573
1591
|
if (this.nodeCount === 0 || this.entryPointId === -1)
|
|
1574
1592
|
return null;
|
|
1575
|
-
const graphData = this.serializeGraphStructure();
|
|
1576
1593
|
const nodeLevels = new Uint8Array(this.nodeCount);
|
|
1577
1594
|
for (let i = 0; i < this.nodeCount; i++) {
|
|
1578
1595
|
const node = this.nodes[i];
|
|
@@ -1606,7 +1623,7 @@ export class HNSWIndex {
|
|
|
1606
1623
|
quantizationParams = this.scalarQuantizer.getParams();
|
|
1607
1624
|
}
|
|
1608
1625
|
}
|
|
1609
|
-
|
|
1626
|
+
const baseData = {
|
|
1610
1627
|
flatVectors,
|
|
1611
1628
|
flatInt8Vectors,
|
|
1612
1629
|
dimension: this.dimension,
|
|
@@ -1616,14 +1633,154 @@ export class HNSWIndex {
|
|
|
1616
1633
|
maxLevel: this.maxLevel,
|
|
1617
1634
|
M: this.M,
|
|
1618
1635
|
M0: this.M0,
|
|
1619
|
-
graphData,
|
|
1620
1636
|
nodeLevels,
|
|
1621
1637
|
quantizationEnabled: this.quantizationEnabled,
|
|
1622
1638
|
quantizationParams,
|
|
1623
1639
|
};
|
|
1640
|
+
// Use shared graph SABs when SharedArrayBuffer is available
|
|
1641
|
+
if (this.useSharedMemory && typeof SharedArrayBuffer !== 'undefined') {
|
|
1642
|
+
const sharedGraph = this.serializeGraphToSharedBuffers();
|
|
1643
|
+
return {
|
|
1644
|
+
...baseData,
|
|
1645
|
+
graphNeighborData: sharedGraph.graphNeighborData,
|
|
1646
|
+
graphIndex: sharedGraph.graphIndex,
|
|
1647
|
+
maxLayerSlots: sharedGraph.maxLayerSlots,
|
|
1648
|
+
sharedMetadata: sharedGraph.sharedMetadata,
|
|
1649
|
+
};
|
|
1650
|
+
}
|
|
1651
|
+
// Fallback: legacy serialized graph
|
|
1652
|
+
return {
|
|
1653
|
+
...baseData,
|
|
1654
|
+
graphData: this.serializeGraphStructure(),
|
|
1655
|
+
};
|
|
1656
|
+
}
|
|
1657
|
+
/**
|
|
1658
|
+
* Serialize graph into SAB-backed flat typed arrays for zero-copy worker sharing.
|
|
1659
|
+
* Layout:
|
|
1660
|
+
* graphIndex[(nodeId * maxLayerSlots + layer) * 2] = offset into graphNeighborData
|
|
1661
|
+
* graphIndex[(nodeId * maxLayerSlots + layer) * 2 + 1] = neighbor count
|
|
1662
|
+
* graphNeighborData[offset..offset+count] = neighbor IDs
|
|
1663
|
+
*
|
|
1664
|
+
* Pre-allocates extra capacity for growth during parallel build.
|
|
1665
|
+
*/
|
|
1666
|
+
serializeGraphToSharedBuffers() {
|
|
1667
|
+
const maxLayerSlots = Math.max(this.maxLevel + 4, 8);
|
|
1668
|
+
// Count total neighbors for sizing
|
|
1669
|
+
let totalNeighbors = 0;
|
|
1670
|
+
for (let i = 0; i < this.nodeCount; i++) {
|
|
1671
|
+
const node = this.nodes[i];
|
|
1672
|
+
if (!node)
|
|
1673
|
+
continue;
|
|
1674
|
+
for (const neighbors of node.neighbors) {
|
|
1675
|
+
totalNeighbors += neighbors?.length ?? 0;
|
|
1676
|
+
}
|
|
1677
|
+
}
|
|
1678
|
+
// Pre-allocate with capacity for ALL nodes (not just current ones).
|
|
1679
|
+
// During parallel build, serializeGraphToSharedBuffers is called after the seed phase
|
|
1680
|
+
// (e.g. 500 nodes), but the full index may grow to flatVectorsCapacity nodes.
|
|
1681
|
+
// Each node has up to M0 layer-0 neighbors + M upper-layer neighbors.
|
|
1682
|
+
// The append-only write pattern creates dead space when nodes are rewritten,
|
|
1683
|
+
// so we use 3x the estimated maximum to accommodate waste.
|
|
1684
|
+
const nodeCapacity = Math.max(this.nodeCount, this.flatVectorsCapacity);
|
|
1685
|
+
const estimatedMaxNeighbors = nodeCapacity * (this.M0 + this.M);
|
|
1686
|
+
const neighborCapacity = Math.max(Math.ceil(totalNeighbors * 3), totalNeighbors + 10000, estimatedMaxNeighbors * 3);
|
|
1687
|
+
const indexSize = nodeCapacity * maxLayerSlots * 2;
|
|
1688
|
+
const graphIndex = new Uint32Array(new SharedArrayBuffer(indexSize * 4));
|
|
1689
|
+
const graphNeighborData = new Uint32Array(new SharedArrayBuffer(neighborCapacity * 4));
|
|
1690
|
+
// Shared metadata: [nodeCount, entryPointId, maxLevel]
|
|
1691
|
+
const sharedMetadata = new Uint32Array(new SharedArrayBuffer(3 * 4));
|
|
1692
|
+
sharedMetadata[0] = this.nodeCount;
|
|
1693
|
+
sharedMetadata[1] = this.entryPointId;
|
|
1694
|
+
sharedMetadata[2] = this.maxLevel;
|
|
1695
|
+
// Serialize current graph
|
|
1696
|
+
let writeOffset = 0;
|
|
1697
|
+
for (let nodeId = 0; nodeId < this.nodeCount; nodeId++) {
|
|
1698
|
+
const node = this.nodes[nodeId];
|
|
1699
|
+
if (!node)
|
|
1700
|
+
continue;
|
|
1701
|
+
for (let l = 0; l < node.neighbors.length && l < maxLayerSlots; l++) {
|
|
1702
|
+
const neighbors = node.neighbors[l] ?? [];
|
|
1703
|
+
const base = (nodeId * maxLayerSlots + l) * 2;
|
|
1704
|
+
graphIndex[base] = writeOffset;
|
|
1705
|
+
graphIndex[base + 1] = neighbors.length;
|
|
1706
|
+
for (let n = 0; n < neighbors.length; n++) {
|
|
1707
|
+
graphNeighborData[writeOffset + n] = neighbors[n];
|
|
1708
|
+
}
|
|
1709
|
+
writeOffset += neighbors.length;
|
|
1710
|
+
}
|
|
1711
|
+
}
|
|
1712
|
+
// Store references for incremental updates during parallel build
|
|
1713
|
+
this.sharedGraphIndex = graphIndex;
|
|
1714
|
+
this.sharedGraphNeighborData = graphNeighborData;
|
|
1715
|
+
this.sharedGraphMaxLayerSlots = maxLayerSlots;
|
|
1716
|
+
this.sharedGraphWriteOffset = writeOffset;
|
|
1717
|
+
this.sharedMetadata = sharedMetadata;
|
|
1718
|
+
return { graphNeighborData, graphIndex, maxLayerSlots, sharedMetadata };
|
|
1719
|
+
}
|
|
1720
|
+
/**
|
|
1721
|
+
* Update shared graph SABs for specific nodes.
|
|
1722
|
+
* Called during parallel build after each batch to sync graph changes.
|
|
1723
|
+
* Workers see updates immediately via shared memory — no postMessage needed.
|
|
1724
|
+
*/
|
|
1725
|
+
updateSharedGraphNodes(nodeIds) {
|
|
1726
|
+
if (!this.sharedGraphIndex || !this.sharedGraphNeighborData)
|
|
1727
|
+
return;
|
|
1728
|
+
const maxLayerSlots = this.sharedGraphMaxLayerSlots;
|
|
1729
|
+
let writeOffset = this.sharedGraphWriteOffset;
|
|
1730
|
+
for (const nodeId of nodeIds) {
|
|
1731
|
+
const node = this.nodes[nodeId];
|
|
1732
|
+
if (!node)
|
|
1733
|
+
continue;
|
|
1734
|
+
for (let l = 0; l < node.neighbors.length && l < maxLayerSlots; l++) {
|
|
1735
|
+
const neighbors = node.neighbors[l] ?? [];
|
|
1736
|
+
const base = (nodeId * maxLayerSlots + l) * 2;
|
|
1737
|
+
// If capacity exhausted, clear shared graph to force fallback to
|
|
1738
|
+
// broadcastGraphUpdate. Workers keep their current SAB data (stale but valid).
|
|
1739
|
+
// The periodic resync in addPointsBulkParallel will create fresh SABs.
|
|
1740
|
+
if (writeOffset + neighbors.length > this.sharedGraphNeighborData.length) {
|
|
1741
|
+
this.sharedGraphWriteOffset = writeOffset;
|
|
1742
|
+
this.clearSharedGraph();
|
|
1743
|
+
return;
|
|
1744
|
+
}
|
|
1745
|
+
this.sharedGraphIndex[base] = writeOffset;
|
|
1746
|
+
this.sharedGraphIndex[base + 1] = neighbors.length;
|
|
1747
|
+
for (let n = 0; n < neighbors.length; n++) {
|
|
1748
|
+
this.sharedGraphNeighborData[writeOffset + n] = neighbors[n];
|
|
1749
|
+
}
|
|
1750
|
+
writeOffset += neighbors.length;
|
|
1751
|
+
}
|
|
1752
|
+
}
|
|
1753
|
+
this.sharedGraphWriteOffset = writeOffset;
|
|
1754
|
+
}
|
|
1755
|
+
/**
|
|
1756
|
+
* Update shared metadata SAB with current index state.
|
|
1757
|
+
* Workers read these values during search.
|
|
1758
|
+
*/
|
|
1759
|
+
updateSharedMetadata() {
|
|
1760
|
+
if (!this.sharedMetadata)
|
|
1761
|
+
return;
|
|
1762
|
+
this.sharedMetadata[0] = this.nodeCount;
|
|
1763
|
+
this.sharedMetadata[1] = this.entryPointId;
|
|
1764
|
+
this.sharedMetadata[2] = this.maxLevel;
|
|
1765
|
+
}
|
|
1766
|
+
/**
|
|
1767
|
+
* Check if shared graph SABs are active (for parallel build optimization).
|
|
1768
|
+
*/
|
|
1769
|
+
hasSharedGraph() {
|
|
1770
|
+
return this.sharedGraphIndex !== null;
|
|
1771
|
+
}
|
|
1772
|
+
/**
|
|
1773
|
+
* Clear shared graph references (called when pool is destroyed/re-initialized).
|
|
1774
|
+
*/
|
|
1775
|
+
clearSharedGraph() {
|
|
1776
|
+
this.sharedGraphIndex = null;
|
|
1777
|
+
this.sharedGraphNeighborData = null;
|
|
1778
|
+
this.sharedMetadata = null;
|
|
1779
|
+
this.sharedGraphWriteOffset = 0;
|
|
1624
1780
|
}
|
|
1625
1781
|
/**
|
|
1626
1782
|
* Serialize graph structure (neighbor lists) into a compact ArrayBuffer.
|
|
1783
|
+
* Legacy format for non-SAB fallback.
|
|
1627
1784
|
* Format per node: [numLayers:uint8] [numNeighbors:uint16, neighborId:uint32...] per layer
|
|
1628
1785
|
*/
|
|
1629
1786
|
serializeGraphStructure() {
|
|
@@ -1959,6 +2116,18 @@ export class HNSWIndex {
|
|
|
1959
2116
|
const entryPointId = readInt32('entryPointId');
|
|
1960
2117
|
const nodeCount = readUint32('nodeCount');
|
|
1961
2118
|
const vectorByteLength = dimension * 4;
|
|
2119
|
+
if (nodeCount > HNSWIndex.MAX_NODE_SLOTS) {
|
|
2120
|
+
throw new VectorDBError(`Corrupt HNSW data: node count ${nodeCount} exceeds maximum supported ${HNSWIndex.MAX_NODE_SLOTS}`, 'CORRUPT_INDEX');
|
|
2121
|
+
}
|
|
2122
|
+
if (maxLevel < -1 || maxLevel > HNSWIndex.MAX_SERIALIZED_LEVEL) {
|
|
2123
|
+
throw new VectorDBError(`Corrupt HNSW data: invalid maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
|
|
2124
|
+
}
|
|
2125
|
+
if (nodeCount === 0 && (entryPointId !== -1 || maxLevel !== -1)) {
|
|
2126
|
+
throw new VectorDBError('Corrupt HNSW data: empty index has non-empty entry point metadata', 'CORRUPT_INDEX');
|
|
2127
|
+
}
|
|
2128
|
+
if (nodeCount > 0 && (entryPointId < 0 || maxLevel < 0)) {
|
|
2129
|
+
throw new VectorDBError('Corrupt HNSW data: populated index has missing entry point metadata', 'CORRUPT_INDEX');
|
|
2130
|
+
}
|
|
1962
2131
|
// V3+ has vectorDataOffset in header
|
|
1963
2132
|
let vectorDataOffset = 0;
|
|
1964
2133
|
if (formatVersion >= 3) {
|
|
@@ -1971,6 +2140,41 @@ export class HNSWIndex {
|
|
|
1971
2140
|
index.maxLevel = maxLevel;
|
|
1972
2141
|
index.entryPointId = entryPointId;
|
|
1973
2142
|
const indexToId = new Array(nodeCount);
|
|
2143
|
+
const seenNodeIds = new Set();
|
|
2144
|
+
let observedMaxLevel = -1;
|
|
2145
|
+
const validateSerializedNode = (id, level, nodeIndex) => {
|
|
2146
|
+
if (id > HNSWIndex.MAX_NODE_ID) {
|
|
2147
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has reserved or unsupported ID ${id}`, 'CORRUPT_INDEX');
|
|
2148
|
+
}
|
|
2149
|
+
if (level > HNSWIndex.MAX_SERIALIZED_LEVEL || level > maxLevel) {
|
|
2150
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeIndex} has invalid level ${level}`, 'CORRUPT_INDEX');
|
|
2151
|
+
}
|
|
2152
|
+
if (seenNodeIds.has(id)) {
|
|
2153
|
+
throw new VectorDBError(`Corrupt HNSW data: duplicate node ID ${id}`, 'CORRUPT_INDEX');
|
|
2154
|
+
}
|
|
2155
|
+
seenNodeIds.add(id);
|
|
2156
|
+
if (level > observedMaxLevel)
|
|
2157
|
+
observedMaxLevel = level;
|
|
2158
|
+
};
|
|
2159
|
+
const validateNeighborList = (nodeId, layer, neighbors) => {
|
|
2160
|
+
const maxConnections = layer === 0 ? M * 2 : M;
|
|
2161
|
+
if (neighbors.length > maxConnections) {
|
|
2162
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} layer ${layer} has ${neighbors.length} neighbors, maximum is ${maxConnections}`, 'CORRUPT_INDEX');
|
|
2163
|
+
}
|
|
2164
|
+
const seenNeighbors = new Set();
|
|
2165
|
+
for (const neighborId of neighbors) {
|
|
2166
|
+
if (!seenNodeIds.has(neighborId)) {
|
|
2167
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references missing neighbor ${neighborId}`, 'CORRUPT_INDEX');
|
|
2168
|
+
}
|
|
2169
|
+
if (neighborId === nodeId) {
|
|
2170
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} references itself as a neighbor`, 'CORRUPT_INDEX');
|
|
2171
|
+
}
|
|
2172
|
+
if (seenNeighbors.has(neighborId)) {
|
|
2173
|
+
throw new VectorDBError(`Corrupt HNSW data: node ${nodeId} has duplicate neighbor ${neighborId}`, 'CORRUPT_INDEX');
|
|
2174
|
+
}
|
|
2175
|
+
seenNeighbors.add(neighborId);
|
|
2176
|
+
}
|
|
2177
|
+
};
|
|
1974
2178
|
if (formatVersion >= 3) {
|
|
1975
2179
|
// V3 format: vectors at end, supports lazy loading
|
|
1976
2180
|
const nodeMetadata = [];
|
|
@@ -1979,6 +2183,7 @@ export class HNSWIndex {
|
|
|
1979
2183
|
for (let i = 0; i < nodeCount; i++) {
|
|
1980
2184
|
const id = readUint32(`node ${i} id`);
|
|
1981
2185
|
const level = readUint32(`node ${i} level`);
|
|
2186
|
+
validateSerializedNode(id, level, i);
|
|
1982
2187
|
indexToId[i] = id;
|
|
1983
2188
|
nodeMetadata.push({ id, level });
|
|
1984
2189
|
}
|
|
@@ -2017,6 +2222,7 @@ export class HNSWIndex {
|
|
|
2017
2222
|
}
|
|
2018
2223
|
return indexToId[idx];
|
|
2019
2224
|
});
|
|
2225
|
+
validateNeighborList(nodeMetadata[i].id, l, neighbors[l]);
|
|
2020
2226
|
}
|
|
2021
2227
|
nodeNeighbors.push(neighbors);
|
|
2022
2228
|
}
|
|
@@ -2081,6 +2287,7 @@ export class HNSWIndex {
|
|
|
2081
2287
|
for (let i = 0; i < nodeCount; i++) {
|
|
2082
2288
|
const id = readUint32(`node ${i} id`);
|
|
2083
2289
|
const level = readUint32(`node ${i} level`);
|
|
2290
|
+
validateSerializedNode(id, level, i);
|
|
2084
2291
|
indexToId[i] = id;
|
|
2085
2292
|
const vector = new Float32Array(dimension);
|
|
2086
2293
|
for (let j = 0; j < dimension; j++) {
|
|
@@ -2120,6 +2327,7 @@ export class HNSWIndex {
|
|
|
2120
2327
|
}
|
|
2121
2328
|
return indexToId[idx];
|
|
2122
2329
|
});
|
|
2330
|
+
validateNeighborList(id, l, neighbors[l]);
|
|
2123
2331
|
}
|
|
2124
2332
|
const node = { id, level, vector, neighbors };
|
|
2125
2333
|
index.setNode(node);
|
|
@@ -2130,6 +2338,7 @@ export class HNSWIndex {
|
|
|
2130
2338
|
for (let i = 0; i < nodeCount; i++) {
|
|
2131
2339
|
const id = readUint32(`node ${i} id`);
|
|
2132
2340
|
const level = readUint32(`node ${i} level`);
|
|
2341
|
+
validateSerializedNode(id, level, i);
|
|
2133
2342
|
indexToId[i] = id;
|
|
2134
2343
|
const vector = new Float32Array(dimension);
|
|
2135
2344
|
for (let j = 0; j < dimension; j++) {
|
|
@@ -2155,9 +2364,20 @@ export class HNSWIndex {
|
|
|
2155
2364
|
}
|
|
2156
2365
|
node.neighbors[l][j] = indexToId[neighborIndex];
|
|
2157
2366
|
}
|
|
2367
|
+
validateNeighborList(node.id, l, node.neighbors[l]);
|
|
2158
2368
|
}
|
|
2159
2369
|
}
|
|
2160
2370
|
}
|
|
2371
|
+
if (nodeCount > 0 && observedMaxLevel !== maxLevel) {
|
|
2372
|
+
throw new VectorDBError(`Corrupt HNSW data: maxLevel ${maxLevel} does not match highest node level ${observedMaxLevel}`, 'CORRUPT_INDEX');
|
|
2373
|
+
}
|
|
2374
|
+
if (nodeCount > 0 && !seenNodeIds.has(entryPointId)) {
|
|
2375
|
+
throw new VectorDBError(`Corrupt HNSW data: entry point ${entryPointId} is not present in node table`, 'CORRUPT_INDEX');
|
|
2376
|
+
}
|
|
2377
|
+
const entryPoint = nodeCount > 0 ? index.nodes[entryPointId] : undefined;
|
|
2378
|
+
if (entryPoint && entryPoint.level !== maxLevel) {
|
|
2379
|
+
throw new VectorDBError(`Corrupt HNSW data: entry point level ${entryPoint.level} does not match maxLevel ${maxLevel}`, 'CORRUPT_INDEX');
|
|
2380
|
+
}
|
|
2161
2381
|
return index;
|
|
2162
2382
|
}
|
|
2163
2383
|
catch (error) {
|
|
@@ -2324,7 +2544,7 @@ export class HNSWIndex {
|
|
|
2324
2544
|
continue;
|
|
2325
2545
|
const vector = this.getNodeVector(node.id);
|
|
2326
2546
|
if (vector) {
|
|
2327
|
-
result.set(node.id, vector);
|
|
2547
|
+
result.set(node.id, new Float32Array(vector));
|
|
2328
2548
|
}
|
|
2329
2549
|
}
|
|
2330
2550
|
return result;
|
|
@@ -2718,9 +2938,6 @@ export class HNSWIndex {
|
|
|
2718
2938
|
}
|
|
2719
2939
|
}
|
|
2720
2940
|
if (batchCount > 0) {
|
|
2721
|
-
if (batchDists.length < batchCount) {
|
|
2722
|
-
this.batchDistances.length = batchCount;
|
|
2723
|
-
}
|
|
2724
2941
|
this.calculateDistancesBatchInt8(int8Query, batchIds, batchDists, batchCount);
|
|
2725
2942
|
for (let i = 0; i < batchCount; i++) {
|
|
2726
2943
|
const neighborId = batchIds[i];
|