native-vector-store 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/README.md +55 -3
  2. package/binding.gyp +3 -2
  3. package/deps/parallel_hashmap/btree.h +4076 -0
  4. package/deps/parallel_hashmap/meminfo.h +195 -0
  5. package/deps/parallel_hashmap/phmap.h +5236 -0
  6. package/deps/parallel_hashmap/phmap_base.h +5115 -0
  7. package/deps/parallel_hashmap/phmap_bits.h +665 -0
  8. package/deps/parallel_hashmap/phmap_config.h +790 -0
  9. package/deps/parallel_hashmap/phmap_dump.h +335 -0
  10. package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
  11. package/deps/parallel_hashmap/phmap_utils.h +407 -0
  12. package/docs/index.html +52 -3
  13. package/lib/index.d.ts +35 -1
  14. package/package.json +1 -1
  15. package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
  16. package/prebuilds/darwin-x64/native-vector-store.node +0 -0
  17. package/prebuilds/linux-arm64/native-vector-store.node +0 -0
  18. package/prebuilds/linux-x64/native-vector-store.node +0 -0
  19. package/src/Makefile +26 -6
  20. package/src/binding.cc +185 -2
  21. package/src/english_abbreviations.h +197 -0
  22. package/src/english_dictionary.h +25185 -0
  23. package/src/english_punctuations.h +42 -0
  24. package/src/english_stop_words.h +434 -0
  25. package/src/simple_sentence_splitter.h +218 -0
  26. package/src/simple_tokenizer.cpp +92 -0
  27. package/src/simple_tokenizer.h +30 -0
  28. package/src/test_bm25.cpp +357 -0
  29. package/src/test_hybrid_search.cpp +496 -0
  30. package/src/vector_store.cpp +239 -3
  31. package/src/vector_store.h +52 -1
  32. package/src/vector_store_loader.cpp +1 -1
  33. package/src/vector_store_loader_adaptive.cpp +1 -1
  34. package/src/vector_store_loader_mmap.cpp +2 -2
  35. package/prebuilds/win32-x64/native-vector-store.node +0 -0
@@ -1,4 +1,7 @@
1
1
  #include "vector_store.h"
2
+ #include "simple_tokenizer.h"
3
+ #include <cctype>
4
+ #include <algorithm>
2
5
 
3
6
  // ArenaAllocator implementation
4
7
 
@@ -109,7 +112,7 @@ void TopK::merge(const TopK& other) {
109
112
 
110
113
  // VectorStore implementation
111
114
 
112
- VectorStore::VectorStore(size_t dim) : dim_(dim) {
115
+ VectorStore::VectorStore(size_t dim) : dim_(dim), postings_(), doc_freq_() {
113
116
  entries_.resize(1'000'000); // Pre-size with default-constructed entries
114
117
 
115
118
  // Prepare per-thread arena allocators for zero-contention parallel loading
@@ -314,6 +317,45 @@ VectorStoreError VectorStore::add_document(simdjson::ondemand::object& json_doc)
314
317
  entry.doc = doc;
315
318
  entry.embedding = emb_ptr;
316
319
 
320
+ // Process text for BM25 - tokenize and build term frequencies
321
+ SimpleTokenizer tokenizer;
322
+ std::vector<std::string> tokens = tokenizer.split(std::string(text));
323
+
324
+ // Build term frequency map
325
+ entry.tf.clear();
326
+ for (const std::string& token : tokens) {
327
+ // Convert to lowercase for case-insensitive matching
328
+ std::string lower_token = token;
329
+ std::transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
330
+ entry.tf[lower_token]++;
331
+ }
332
+
333
+ entry.length = tokens.size();
334
+
335
+ // Update BM25 index structures; the parallel hashmap's per-submap mutexes make these concurrent updates thread-safe (internally locked, not lock-free)
336
+ total_length_.fetch_add(entry.length, std::memory_order_relaxed);
337
+
338
+ // Update postings and document frequencies
339
+ for (const auto& tf_pair : entry.tf) {
340
+ const std::string& term = tf_pair.first;
341
+
342
+ // Update postings list using parallel hashmap's thread-safe lazy_emplace_l
343
+ postings_.lazy_emplace_l(term,
344
+ // If key exists, append to the vector
345
+ [&idx](auto& p) { p.second.push_back(idx); },
346
+ // If key doesn't exist, create new vector with this idx
347
+ [&term, &idx](const auto& ctor) { ctor(term, std::vector<size_t>{idx}); }
348
+ );
349
+
350
+ // Update document frequency - parallel hashmap provides thread safety
351
+ doc_freq_.lazy_emplace_l(term,
352
+ // If key exists, increment the count
353
+ [](auto& p) { p.second++; },
354
+ // If key doesn't exist, create with value 1
355
+ [&term](const auto& ctor) { ctor(term, 1); }
356
+ );
357
+ }
358
+
317
359
  entries_[idx] = entry;
318
360
 
319
361
  return VectorStoreError::SUCCESS;
@@ -360,7 +402,7 @@ void VectorStore::normalize_all() {
360
402
  }
361
403
 
362
404
  std::vector<std::pair<float, size_t>>
363
- VectorStore::search(const float* query, size_t k) const {
405
+ VectorStore::search(const float* __restrict__ query, size_t k) const {
364
406
  // Exclusive lock: prevent overlapping OpenMP teams
365
407
  // Since each search uses all threads via OpenMP, concurrent searches provide no benefit
366
408
  std::unique_lock<std::shared_mutex> lock(search_mutex_);
@@ -394,7 +436,7 @@ VectorStore::search(const float* query, size_t k) const {
394
436
  #pragma omp for // default barrier kept - ensures all threads finish before merge
395
437
  for (int i = 0; i < static_cast<int>(n); ++i) {
396
438
  float score = 0.0f;
397
- const float* emb = entries_[i].embedding;
439
+ const float* __restrict__ emb = entries_[i].embedding;
398
440
 
399
441
  #pragma omp simd reduction(+:score)
400
442
  for (size_t j = 0; j < dim_; ++j) {
@@ -430,3 +472,197 @@ size_t VectorStore::size() const {
430
472
  bool VectorStore::is_finalized() const {
431
473
  return is_finalized_.load(std::memory_order_acquire);
432
474
  }
475
+
476
+ double VectorStore::avg_doc_length() const {
477
+ size_t n = size();
478
+ return n > 0 ? static_cast<double>(total_length_.load(std::memory_order_relaxed)) / n : 0.0;
479
+ }
480
+
481
+ void VectorStore::set_bm25_parameters(double k1, double b, double delta) {
482
+ k1_ = k1;
483
+ b_ = b;
484
+ delta_ = delta;
485
+ }
486
+
487
+ std::vector<std::pair<size_t, double>>
488
+ VectorStore::search_bm25(const std::vector<std::string>& query_terms) const {
489
+ if (!is_finalized()) {
490
+ return {}; // Store must be finalized
491
+ }
492
+
493
+ std::unordered_map<size_t, double> scores;
494
+ size_t N = size();
495
+ double avg_len = avg_doc_length();
496
+
497
+ // Precompute IDF for each unique query term
498
+ std::unordered_map<std::string, double> idf_cache;
499
+ for (const auto& term : query_terms) {
500
+ if (idf_cache.find(term) == idf_cache.end()) {
501
+ auto df_it = doc_freq_.find(term);
502
+ int df = (df_it != doc_freq_.end()) ? df_it->second : 0;
503
+ idf_cache[term] = std::log((N - df + 0.5) / (df + 0.5) + 1.0);
504
+ }
505
+ }
506
+
507
+ // For each unique term in the query:
508
+ for (const auto& term : query_terms) {
509
+ auto postings_it = postings_.find(term);
510
+ if (postings_it == postings_.end()) {
511
+ continue; // Term not found in corpus
512
+ }
513
+
514
+ double idf_t = idf_cache[term];
515
+ for (size_t doc_id : postings_it->second) {
516
+ const Entry& entry = entries_[doc_id];
517
+ auto tf_it = entry.tf.find(term);
518
+ if (tf_it == entry.tf.end()) {
519
+ continue; // Should not happen if postings are consistent
520
+ }
521
+
522
+ int tf = tf_it->second;
523
+ double norm = 1.0 - b_ + b_ * (entry.length / avg_len);
524
+ double tf_weight = (k1_ + 1) * tf / (tf + k1_ * norm);
525
+ scores[doc_id] += (tf_weight + delta_) * idf_t;
526
+ }
527
+ }
528
+
529
+ // Collect and sort results
530
+ std::vector<std::pair<size_t, double>> results(scores.begin(), scores.end());
531
+ std::sort(results.begin(), results.end(),
532
+ [](const auto& a, const auto& b) { return a.second > b.second; });
533
+
534
+ return results;
535
+ }
536
+
537
+ std::vector<std::pair<size_t, double>>
538
+ VectorStore::search_hybrid(const float* __restrict__ query_vector, const std::vector<std::string>& query_terms,
539
+ double vector_weight, double bm25_weight, size_t k) const {
540
+ // Exclusive lock: prevent overlapping OpenMP teams
541
+ std::unique_lock<std::shared_mutex> lock(search_mutex_);
542
+
543
+ if (!is_finalized()) {
544
+ return {}; // Store must be finalized
545
+ }
546
+
547
+ size_t n = count_.load(std::memory_order_acquire);
548
+ if (n == 0 || k == 0) return {};
549
+ k = std::min(k, n);
550
+
551
+ // Precompute BM25 IDF scores for query terms
552
+ std::unordered_map<std::string, double> idf_cache;
553
+ double avg_len = avg_doc_length();
554
+
555
+ for (const auto& term : query_terms) {
556
+ auto df_it = doc_freq_.find(term);
557
+ int df = (df_it != doc_freq_.end()) ? df_it->second : 0;
558
+ idf_cache[term] = std::log((n - df + 0.5) / (df + 0.5) + 1.0);
559
+ }
560
+
561
+ const int num_threads = omp_get_max_threads();
562
+
563
+ // Each thread maintains TWO heaps - one for vector, one for BM25
564
+ struct DualTopK {
565
+ TopK vector_heap;
566
+ TopK bm25_heap;
567
+ DualTopK(size_t k) : vector_heap(k), bm25_heap(k) {}
568
+
569
+ // Make DualTopK move-only like TopK
570
+ DualTopK(const DualTopK&) = delete;
571
+ DualTopK& operator=(const DualTopK&) = delete;
572
+ DualTopK(DualTopK&&) = default;
573
+ DualTopK& operator=(DualTopK&&) = default;
574
+ };
575
+
576
+ std::vector<DualTopK> thread_heaps;
577
+ thread_heaps.reserve(num_threads);
578
+ for (int i = 0; i < num_threads; ++i) {
579
+ thread_heaps.emplace_back(k);
580
+ }
581
+
582
+ #pragma omp parallel
583
+ {
584
+ const int tid = omp_get_thread_num();
585
+ DualTopK& local = thread_heaps[tid];
586
+
587
+ #pragma omp for
588
+ for (int i = 0; i < static_cast<int>(n); ++i) {
589
+ // 1. Compute vector similarity score
590
+ float vector_score = 0.0f;
591
+ const float* __restrict__ emb = entries_[i].embedding;
592
+
593
+ #pragma omp simd reduction(+:vector_score)
594
+ for (size_t j = 0; j < dim_; ++j) {
595
+ vector_score += emb[j] * query_vector[j];
596
+ }
597
+
598
+ // 2. Compute BM25 score for this document
599
+ double bm25_score = 0.0;
600
+ const Entry& entry = entries_[i];
601
+
602
+ for (const auto& term : query_terms) {
603
+ auto tf_it = entry.tf.find(term);
604
+ if (tf_it != entry.tf.end()) {
605
+ int tf = tf_it->second;
606
+ double norm = 1.0 - b_ + b_ * (entry.length / avg_len);
607
+ double tf_weight = (k1_ + 1) * tf / (tf + k1_ * norm);
608
+ bm25_score += (tf_weight + delta_) * idf_cache.at(term);
609
+ }
610
+ }
611
+
612
+ // 3. Push to both heaps
613
+ local.vector_heap.push(vector_score, i);
614
+ local.bm25_heap.push(static_cast<float>(bm25_score), i);
615
+ }
616
+
617
+ #pragma omp barrier
618
+ }
619
+
620
+ // Merge thread-local heaps to get global top-k for each score type
621
+ TopK global_vector_heap(k);
622
+ TopK global_bm25_heap(k);
623
+
624
+ for (auto& th : thread_heaps) {
625
+ global_vector_heap.merge(th.vector_heap);
626
+ global_bm25_heap.merge(th.bm25_heap);
627
+ }
628
+
629
+ // Sort heaps to get ranking order
630
+ std::sort(global_vector_heap.heap.begin(), global_vector_heap.heap.end(),
631
+ [](const auto& a, const auto& b) { return a.first > b.first; });
632
+ std::sort(global_bm25_heap.heap.begin(), global_bm25_heap.heap.end(),
633
+ [](const auto& a, const auto& b) { return a.first > b.first; });
634
+
635
+ // Apply Reciprocal Rank Fusion (RRF) with constant k=60 (typical value)
636
+ const double rrf_k = 60.0;
637
+ std::unordered_map<size_t, double> rrf_scores;
638
+
639
+ // Add vector search rankings
640
+ for (size_t rank = 0; rank < global_vector_heap.heap.size(); ++rank) {
641
+ size_t doc_id = global_vector_heap.heap[rank].second;
642
+ // Weight the RRF contribution
643
+ rrf_scores[doc_id] += vector_weight * (1.0 / (rrf_k + rank + 1));
644
+ }
645
+
646
+ // Add BM25 rankings
647
+ for (size_t rank = 0; rank < global_bm25_heap.heap.size(); ++rank) {
648
+ size_t doc_id = global_bm25_heap.heap[rank].second;
649
+ // Weight the RRF contribution
650
+ rrf_scores[doc_id] += bm25_weight * (1.0 / (rrf_k + rank + 1));
651
+ }
652
+
653
+ // Sort by RRF score and return top-k
654
+ std::vector<std::pair<size_t, double>> results;
655
+ results.reserve(rrf_scores.size());
656
+ for (const auto& pair : rrf_scores) {
657
+ results.emplace_back(pair.first, pair.second);
658
+ }
659
+
660
+ std::sort(results.begin(), results.end(),
661
+ [](const auto& a, const auto& b) { return a.second > b.second; });
662
+
663
+ if (results.size() > k) {
664
+ results.resize(k);
665
+ }
666
+
667
+ return results;
668
+ }
@@ -12,6 +12,9 @@
12
12
  #include <cassert>
13
13
  #include <algorithm>
14
14
  #include <functional>
15
+ #include <unordered_map>
16
+ #include <string>
17
+ #include <parallel_hashmap/phmap.h>
15
18
 
16
19
  // Custom error codes for VectorStore
17
20
  enum class VectorStoreError {
@@ -196,6 +199,10 @@ public:
196
199
  struct Entry {
197
200
  Document doc;
198
201
  float* embedding; // Extracted pointer for fast access
202
+
203
+ // BM25 fields
204
+ size_t length; // Total number of tokens in doc.text
205
+ phmap::flat_hash_map<std::string, int> tf; // Term frequencies - better cache locality
199
206
  };
200
207
 
201
208
  private:
@@ -214,6 +221,35 @@ private:
214
221
  enum class TextFieldType { UNKNOWN, TEXT, CONTENT };
215
222
  std::atomic<TextFieldType> text_field_type_{TextFieldType::UNKNOWN};
216
223
 
224
+ // BM25 index structures - parallel hashmap sharded into submaps, each guarded by the std::mutex declared below (thread-safe concurrent updates; internally locked, not lock-free)
225
+ phmap::parallel_flat_hash_map<
226
+ std::string,
227
+ std::vector<size_t>,
228
+ phmap::priv::hash_default_hash<std::string>,
229
+ phmap::priv::hash_default_eq<std::string>,
230
+ std::allocator<std::pair<const std::string, std::vector<size_t>>>,
231
+ 4, // 2^4 = 16 submaps for parallelism
232
+ std::mutex // Use std::mutex for each submap
233
+ > postings_; // term -> list of doc indices
234
+
235
+ phmap::parallel_flat_hash_map<
236
+ std::string,
237
+ int, // Regular int - parallel hashmap provides synchronization
238
+ phmap::priv::hash_default_hash<std::string>,
239
+ phmap::priv::hash_default_eq<std::string>,
240
+ std::allocator<std::pair<const std::string, int>>,
241
+ 4, // 16 submaps
242
+ std::mutex
243
+ > doc_freq_; // document frequencies
244
+
245
+ std::atomic<size_t> total_length_{0}; // sum of all document lengths - now atomic
246
+ // Note: bm25_index_mutex_ removed - no longer needed with parallel hashmap!
247
+
248
+ // BM25 parameters
249
+ double k1_ = 1.2;
250
+ double b_ = 0.75;
251
+ double delta_ = 1.0;
252
+
217
253
  public:
218
254
  explicit VectorStore(size_t dim);
219
255
 
@@ -251,11 +287,26 @@ public:
251
287
  void normalize_all();
252
288
 
253
289
  std::vector<std::pair<float, size_t>>
254
- search(const float* query, size_t k) const;
290
+ search(const float* __restrict__ query, size_t k) const;
291
+
292
+ // BM25 search
293
+ std::vector<std::pair<size_t, double>>
294
+ search_bm25(const std::vector<std::string>& query_terms) const;
295
+
296
+ // Hybrid search combining vector similarity and BM25
297
+ std::vector<std::pair<size_t, double>>
298
+ search_hybrid(const float* __restrict__ query_vector, const std::vector<std::string>& query_terms,
299
+ double vector_weight = 0.7, double bm25_weight = 0.3, size_t k = 10) const;
300
+
301
+ // BM25 parameter setters
302
+ void set_bm25_parameters(double k1, double b, double delta);
255
303
 
256
304
  const Entry& get_entry(size_t idx) const;
257
305
 
258
306
  size_t size() const;
259
307
 
260
308
  bool is_finalized() const;
309
+
310
+ // Get average document length for BM25
311
+ double avg_doc_length() const;
261
312
  };
@@ -93,7 +93,7 @@ void VectorStoreLoader::loadDirectory(VectorStore* store, const std::string& pat
93
93
  for (size_t w = 0; w < num_workers; ++w) {
94
94
  consumers.emplace_back([&]() {
95
95
  // Each thread needs its own parser with initial capacity
96
- simdjson::ondemand::parser doc_parser(16 * 1024 * 1024); // 16MB initial capacity
96
+ simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity — NOTE(review): comment said "16MB" but the value is 1GB, exceeds the 512MB max set below, and is allocated per worker thread; confirm intentional
97
97
  // Set a larger maximum capacity for very large files (up to 512MB)
98
98
  doc_parser.allocate(512 * 1024 * 1024);
99
99
  FileData* data = nullptr;
@@ -133,7 +133,7 @@ void VectorStoreLoader::loadDirectoryAdaptive(VectorStore* store, const std::str
133
133
  for (size_t w = 0; w < num_workers; ++w) {
134
134
  consumers.emplace_back([&]() {
135
135
  // Each thread needs its own parser with initial capacity
136
- simdjson::ondemand::parser doc_parser(16 * 1024 * 1024); // 16MB initial capacity
136
+ simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity — NOTE(review): comment said "16MB" but the value is 1GB, exceeds the 512MB max set below, and is allocated per worker thread; confirm intentional
137
137
  // Set a larger maximum capacity for very large files (up to 512MB)
138
138
  doc_parser.allocate(512 * 1024 * 1024);
139
139
  MixedFileData* data = nullptr;
@@ -68,7 +68,7 @@ void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string&
68
68
  for (size_t w = 0; w < num_workers; ++w) {
69
69
  consumers.emplace_back([&]() {
70
70
  // Each thread needs its own parser with initial capacity
71
- simdjson::ondemand::parser doc_parser(16 * 1024 * 1024); // 16MB initial capacity
71
+ simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity — NOTE(review): comment said "16MB" but the value is 1GB, exceeds the 512MB max set below, and is allocated per worker thread; confirm intentional
72
72
  // Set a larger maximum capacity for very large files (up to 512MB)
73
73
  doc_parser.set_max_capacity(512 * 1024 * 1024);
74
74
  MMapFileData* data = nullptr;
@@ -154,4 +154,4 @@ void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string&
154
154
 
155
155
  // Finalize after batch load - normalize and switch to serving phase
156
156
  store->finalize();
157
- }
157
+ }