native-vector-store 0.3.8 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (35)
  1. package/README.md +55 -3
  2. package/binding.gyp +3 -2
  3. package/deps/parallel_hashmap/btree.h +4076 -0
  4. package/deps/parallel_hashmap/meminfo.h +195 -0
  5. package/deps/parallel_hashmap/phmap.h +5236 -0
  6. package/deps/parallel_hashmap/phmap_base.h +5115 -0
  7. package/deps/parallel_hashmap/phmap_bits.h +665 -0
  8. package/deps/parallel_hashmap/phmap_config.h +790 -0
  9. package/deps/parallel_hashmap/phmap_dump.h +335 -0
  10. package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
  11. package/deps/parallel_hashmap/phmap_utils.h +407 -0
  12. package/docs/index.html +52 -3
  13. package/lib/index.d.ts +35 -1
  14. package/package.json +1 -1
  15. package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
  16. package/prebuilds/darwin-x64/native-vector-store.node +0 -0
  17. package/prebuilds/linux-arm64/native-vector-store.node +0 -0
  18. package/prebuilds/linux-x64/native-vector-store.node +0 -0
  19. package/src/Makefile +26 -6
  20. package/src/binding.cc +185 -2
  21. package/src/english_abbreviations.h +197 -0
  22. package/src/english_dictionary.h +25185 -0
  23. package/src/english_punctuations.h +42 -0
  24. package/src/english_stop_words.h +434 -0
  25. package/src/simple_sentence_splitter.h +218 -0
  26. package/src/simple_tokenizer.cpp +92 -0
  27. package/src/simple_tokenizer.h +30 -0
  28. package/src/test_bm25.cpp +357 -0
  29. package/src/test_hybrid_search.cpp +496 -0
  30. package/src/vector_store.cpp +239 -3
  31. package/src/vector_store.h +52 -1
  32. package/src/vector_store_loader.cpp +1 -1
  33. package/src/vector_store_loader_adaptive.cpp +1 -1
  34. package/src/vector_store_loader_mmap.cpp +2 -2
  35. package/prebuilds/win32-x64/native-vector-store.node +0 -0
package/README.md CHANGED
@@ -20,8 +20,9 @@ This design eliminates complex state management, ensures consistent performance,
20
20
  - **🚀 High Performance**: C++ implementation with OpenMP SIMD optimization
21
21
  - **📦 Arena Allocation**: Memory-efficient storage with 64MB chunks
22
22
  - **⚡ Fast Search**: Sub-10ms similarity search for large document collections
23
+ - **🔍 Hybrid Search**: Combines vector similarity (semantic) with BM25 text search (lexical)
23
24
  - **🔧 MCP Integration**: Built for Model Context Protocol servers
24
- - **🌐 Cross-Platform**: Works on Linux, macOS, and Windows
25
+ - **🌐 Cross-Platform**: Works on Linux and macOS (Windows users: use WSL)
25
26
  - **📊 TypeScript Support**: Full type definitions included
26
27
  - **🔄 Producer-Consumer Loading**: Parallel document loading at 178k+ docs/sec
27
28
 
@@ -48,12 +49,11 @@ npm install native-vector-store
48
49
  - **Linux**: `sudo apt-get install libgomp1` (Ubuntu/Debian) or `dnf install libgomp` (Fedora)
49
50
  - **Alpine**: `apk add libgomp`
50
51
  - **macOS**: `brew install libomp`
51
- - **Windows**: Included with Visual C++ runtime
52
+ - **Windows**: Use WSL (Windows Subsystem for Linux)
52
53
 
53
54
  Prebuilt binaries are included for:
54
55
  - Linux (x64, arm64, musl/Alpine) - x64 builds are AWS Lambda compatible (no AVX-512)
55
56
  - macOS (x64, arm64/Apple Silicon)
56
- - Windows (x64)
57
57
 
58
58
  If building from source, you'll need:
59
59
  - Node.js ≥14.0.0
@@ -86,8 +86,16 @@ store.finalize(); // Must call before searching!
86
86
 
87
87
  // Search for similar documents
88
88
  const queryEmbedding = new Float32Array(1536);
89
+
90
+ // Option 1: Vector-only search (traditional)
89
91
  const results = store.search(queryEmbedding, 5); // Top 5 results
90
92
 
93
+ // Option 2: Hybrid search (NEW - combines vector + BM25 text search)
94
+ const hybridResults = store.search(queryEmbedding, 5, "your search query text");
95
+
96
+ // Option 3: BM25 text-only search
97
+ const textResults = store.searchBM25("your search query", 5);
98
+
91
99
  // Results format - array of SearchResult objects, sorted by score (highest first):
92
100
  console.log(results);
93
101
  // [
@@ -278,6 +286,50 @@ if (process.env.NODE_ENV === 'development') {
278
286
  }
279
287
  ```
280
288
 
289
+ ## Hybrid Search
290
+
291
+ The vector store now supports hybrid search, combining semantic similarity (vector search) with lexical matching (BM25 text search) for improved retrieval accuracy:
292
+
293
+ ```javascript
294
+ const { VectorStore } = require('native-vector-store');
295
+
296
+ const store = new VectorStore(1536);
297
+ store.loadDir('./documents');
298
+
299
+ // Hybrid search automatically combines vector and text search
300
+ const queryEmbedding = new Float32Array(1536);
301
+ const results = store.search(
302
+ queryEmbedding,
303
+ 10, // Top 10 results
304
+ "machine learning algorithms" // Query text for BM25
305
+ );
306
+
307
+ // You can also use individual search methods
308
+ const vectorResults = store.searchVector(queryEmbedding, 10);
309
+ const textResults = store.searchBM25("machine learning", 10);
310
+
311
+ // Or explicitly control the hybrid weights
312
+ const customResults = store.searchHybrid(
313
+ queryEmbedding,
314
+ "machine learning",
315
+ 10,
316
+ 0.3, // Vector weight (30%)
317
+ 0.7 // BM25 weight (70%)
318
+ );
319
+
320
+ // Tune BM25 parameters for your corpus
321
+ store.setBM25Parameters(
322
+ 1.2, // k1: Term frequency saturation (default: 1.2)
323
+ 0.75, // b: Document length normalization (default: 0.75)
324
+ 1.0 // delta: Smoothing parameter (default: 1.0)
325
+ );
326
+ ```
327
+
328
+ Hybrid search is particularly effective for:
329
+ - **Question answering**: BM25 finds documents with exact terms while vectors capture semantic meaning
330
+ - **Knowledge retrieval**: Combines conceptual similarity with keyword matching
331
+ - **Multi-lingual search**: Vectors handle cross-language similarity while BM25 matches exact terms
332
+
281
333
  ## MCP Server Integration
282
334
 
283
335
  Perfect for building local RAG capabilities in MCP servers:
package/binding.gyp CHANGED
@@ -2,12 +2,13 @@
2
2
  "targets": [
3
3
  {
4
4
  "target_name": "vector_store",
5
- "sources": ["src/binding.cc", "src/vector_store.cpp", "src/vector_store_loader.cpp", "src/vector_store_loader_mmap.cpp", "src/vector_store_loader_adaptive.cpp", "deps/simdjson/simdjson.cpp"],
5
+ "sources": ["src/binding.cc", "src/vector_store.cpp", "src/simple_tokenizer.cpp", "src/vector_store_loader.cpp", "src/vector_store_loader_mmap.cpp", "src/vector_store_loader_adaptive.cpp", "deps/simdjson/simdjson.cpp"],
6
6
  "include_dirs": [
7
7
  "<!@(node -p \"require('node-addon-api').include\")",
8
8
  "src",
9
9
  "deps/simdjson",
10
- "deps/atomic_queue"
10
+ "deps/atomic_queue",
11
+ "deps"
11
12
  ],
12
13
  "dependencies": ["<!(node -p \"require('node-addon-api').gyp\")"],
13
14
  "cflags_cc": [