native-vector-store 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -3
- package/binding.gyp +3 -2
- package/deps/parallel_hashmap/btree.h +4076 -0
- package/deps/parallel_hashmap/meminfo.h +195 -0
- package/deps/parallel_hashmap/phmap.h +5236 -0
- package/deps/parallel_hashmap/phmap_base.h +5115 -0
- package/deps/parallel_hashmap/phmap_bits.h +665 -0
- package/deps/parallel_hashmap/phmap_config.h +790 -0
- package/deps/parallel_hashmap/phmap_dump.h +335 -0
- package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
- package/deps/parallel_hashmap/phmap_utils.h +407 -0
- package/docs/index.html +52 -3
- package/lib/index.d.ts +35 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/prebuilds/darwin-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-arm64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64/native-vector-store.node +0 -0
- package/src/Makefile +26 -6
- package/src/binding.cc +185 -2
- package/src/english_abbreviations.h +197 -0
- package/src/english_dictionary.h +25185 -0
- package/src/english_punctuations.h +42 -0
- package/src/english_stop_words.h +434 -0
- package/src/simple_sentence_splitter.h +218 -0
- package/src/simple_tokenizer.cpp +92 -0
- package/src/simple_tokenizer.h +30 -0
- package/src/test_bm25.cpp +357 -0
- package/src/test_hybrid_search.cpp +496 -0
- package/src/vector_store.cpp +239 -3
- package/src/vector_store.h +52 -1
- package/prebuilds/win32-x64/native-vector-store.node +0 -0
package/README.md
CHANGED
|
@@ -20,8 +20,9 @@ This design eliminates complex state management, ensures consistent performance,
|
|
|
20
20
|
- **🚀 High Performance**: C++ implementation with OpenMP SIMD optimization
|
|
21
21
|
- **📦 Arena Allocation**: Memory-efficient storage with 64MB chunks
|
|
22
22
|
- **⚡ Fast Search**: Sub-10ms similarity search for large document collections
|
|
23
|
+
- **🔍 Hybrid Search**: Combines vector similarity (semantic) with BM25 text search (lexical)
|
|
23
24
|
- **🔧 MCP Integration**: Built for Model Context Protocol servers
|
|
24
|
-
- **🌐 Cross-Platform**: Works on Linux
|
|
25
|
+
- **🌐 Cross-Platform**: Works on Linux and macOS (Windows users: use WSL)
|
|
25
26
|
- **📊 TypeScript Support**: Full type definitions included
|
|
26
27
|
- **🔄 Producer-Consumer Loading**: Parallel document loading at 178k+ docs/sec
|
|
27
28
|
|
|
@@ -48,12 +49,11 @@ npm install native-vector-store
|
|
|
48
49
|
- **Linux**: `sudo apt-get install libgomp1` (Ubuntu/Debian) or `sudo dnf install libgomp` (Fedora)
|
|
49
50
|
- **Alpine**: `apk add libgomp`
|
|
50
51
|
- **macOS**: `brew install libomp`
|
|
51
|
-
- **Windows**:
|
|
52
|
+
- **Windows**: Use WSL (Windows Subsystem for Linux)
|
|
52
53
|
|
|
53
54
|
Prebuilt binaries are included for:
|
|
54
55
|
- Linux (x64, arm64, musl/Alpine) - x64 builds are AWS Lambda compatible (no AVX-512)
|
|
55
56
|
- macOS (x64, arm64/Apple Silicon)
|
|
56
|
-
- Windows (x64)
|
|
57
57
|
|
|
58
58
|
If building from source, you'll need:
|
|
59
59
|
- Node.js ≥14.0.0
|
|
@@ -86,8 +86,16 @@ store.finalize(); // Must call before searching!
|
|
|
86
86
|
|
|
87
87
|
// Search for similar documents
|
|
88
88
|
const queryEmbedding = new Float32Array(1536);
|
|
89
|
+
|
|
90
|
+
// Option 1: Vector-only search (traditional)
|
|
89
91
|
const results = store.search(queryEmbedding, 5); // Top 5 results
|
|
90
92
|
|
|
93
|
+
// Option 2: Hybrid search (NEW - combines vector + BM25 text search)
|
|
94
|
+
const hybridResults = store.search(queryEmbedding, 5, "your search query text");
|
|
95
|
+
|
|
96
|
+
// Option 3: BM25 text-only search
|
|
97
|
+
const textResults = store.searchBM25("your search query", 5);
|
|
98
|
+
|
|
91
99
|
// Results format - array of SearchResult objects, sorted by score (highest first):
|
|
92
100
|
console.log(results);
|
|
93
101
|
// [
|
|
@@ -278,6 +286,50 @@ if (process.env.NODE_ENV === 'development') {
|
|
|
278
286
|
}
|
|
279
287
|
```
|
|
280
288
|
|
|
289
|
+
## Hybrid Search
|
|
290
|
+
|
|
291
|
+
The vector store now supports hybrid search, combining semantic similarity (vector search) with lexical matching (BM25 text search) for improved retrieval accuracy:
|
|
292
|
+
|
|
293
|
+
```javascript
|
|
294
|
+
const { VectorStore } = require('native-vector-store');
|
|
295
|
+
|
|
296
|
+
const store = new VectorStore(1536);
|
|
297
|
+
store.loadDir('./documents');
|
|
298
|
+
|
|
299
|
+
// Hybrid search automatically combines vector and text search
|
|
300
|
+
const queryEmbedding = new Float32Array(1536);
|
|
301
|
+
const results = store.search(
|
|
302
|
+
queryEmbedding,
|
|
303
|
+
10, // Top 10 results
|
|
304
|
+
"machine learning algorithms" // Query text for BM25
|
|
305
|
+
);
|
|
306
|
+
|
|
307
|
+
// You can also use individual search methods
|
|
308
|
+
const vectorResults = store.searchVector(queryEmbedding, 10);
|
|
309
|
+
const textResults = store.searchBM25("machine learning", 10);
|
|
310
|
+
|
|
311
|
+
// Or explicitly control the hybrid weights
|
|
312
|
+
const customResults = store.searchHybrid(
|
|
313
|
+
queryEmbedding,
|
|
314
|
+
"machine learning",
|
|
315
|
+
10,
|
|
316
|
+
0.3, // Vector weight (30%)
|
|
317
|
+
0.7 // BM25 weight (70%)
|
|
318
|
+
);
|
|
319
|
+
|
|
320
|
+
// Tune BM25 parameters for your corpus
|
|
321
|
+
store.setBM25Parameters(
|
|
322
|
+
1.2, // k1: Term frequency saturation (default: 1.2)
|
|
323
|
+
0.75, // b: Document length normalization (default: 0.75)
|
|
324
|
+
1.0 // delta: Smoothing parameter (default: 1.0)
|
|
325
|
+
);
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
Hybrid search is particularly effective for:
|
|
329
|
+
- **Question answering**: BM25 finds documents with exact terms while vectors capture semantic meaning
|
|
330
|
+
- **Knowledge retrieval**: Combines conceptual similarity with keyword matching
|
|
331
|
+
- **Multilingual search**: Vectors handle cross-language similarity while BM25 matches exact terms
|
|
332
|
+
|
|
281
333
|
## MCP Server Integration
|
|
282
334
|
|
|
283
335
|
Perfect for building local RAG capabilities in MCP servers:
|
package/binding.gyp
CHANGED
|
@@ -2,12 +2,13 @@
|
|
|
2
2
|
"targets": [
|
|
3
3
|
{
|
|
4
4
|
"target_name": "vector_store",
|
|
5
|
-
"sources": ["src/binding.cc", "src/vector_store.cpp", "src/vector_store_loader.cpp", "src/vector_store_loader_mmap.cpp", "src/vector_store_loader_adaptive.cpp", "deps/simdjson/simdjson.cpp"],
|
|
5
|
+
"sources": ["src/binding.cc", "src/vector_store.cpp", "src/simple_tokenizer.cpp", "src/vector_store_loader.cpp", "src/vector_store_loader_mmap.cpp", "src/vector_store_loader_adaptive.cpp", "deps/simdjson/simdjson.cpp"],
|
|
6
6
|
"include_dirs": [
|
|
7
7
|
"<!@(node -p \"require('node-addon-api').include\")",
|
|
8
8
|
"src",
|
|
9
9
|
"deps/simdjson",
|
|
10
|
-
"deps/atomic_queue"
|
|
10
|
+
"deps/atomic_queue",
|
|
11
|
+
"deps"
|
|
11
12
|
],
|
|
12
13
|
"dependencies": ["<!(node -p \"require('node-addon-api').gyp\")"],
|
|
13
14
|
"cflags_cc": [
|