native-vector-store 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -3
- package/binding.gyp +3 -2
- package/deps/parallel_hashmap/btree.h +4076 -0
- package/deps/parallel_hashmap/meminfo.h +195 -0
- package/deps/parallel_hashmap/phmap.h +5236 -0
- package/deps/parallel_hashmap/phmap_base.h +5115 -0
- package/deps/parallel_hashmap/phmap_bits.h +665 -0
- package/deps/parallel_hashmap/phmap_config.h +790 -0
- package/deps/parallel_hashmap/phmap_dump.h +335 -0
- package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
- package/deps/parallel_hashmap/phmap_utils.h +407 -0
- package/docs/index.html +52 -3
- package/lib/index.d.ts +35 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/prebuilds/darwin-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-arm64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64/native-vector-store.node +0 -0
- package/src/Makefile +26 -6
- package/src/binding.cc +185 -2
- package/src/english_abbreviations.h +197 -0
- package/src/english_dictionary.h +25185 -0
- package/src/english_punctuations.h +42 -0
- package/src/english_stop_words.h +434 -0
- package/src/simple_sentence_splitter.h +218 -0
- package/src/simple_tokenizer.cpp +92 -0
- package/src/simple_tokenizer.h +30 -0
- package/src/test_bm25.cpp +357 -0
- package/src/test_hybrid_search.cpp +496 -0
- package/src/vector_store.cpp +239 -3
- package/src/vector_store.h +52 -1
- package/src/vector_store_loader.cpp +1 -1
- package/src/vector_store_loader_adaptive.cpp +1 -1
- package/src/vector_store_loader_mmap.cpp +2 -2
- package/prebuilds/win32-x64/native-vector-store.node +0 -0
package/src/vector_store.cpp
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
#include "vector_store.h"
|
|
2
|
+
#include "simple_tokenizer.h"
|
|
3
|
+
#include <cctype>
|
|
4
|
+
#include <algorithm>
|
|
2
5
|
|
|
3
6
|
// ArenaAllocator implementation
|
|
4
7
|
|
|
@@ -109,7 +112,7 @@ void TopK::merge(const TopK& other) {
|
|
|
109
112
|
|
|
110
113
|
// VectorStore implementation
|
|
111
114
|
|
|
112
|
-
VectorStore::VectorStore(size_t dim) : dim_(dim) {
|
|
115
|
+
VectorStore::VectorStore(size_t dim) : dim_(dim), postings_(), doc_freq_() {
|
|
113
116
|
entries_.resize(1'000'000); // Pre-size with default-constructed entries
|
|
114
117
|
|
|
115
118
|
// Prepare per-thread arena allocators for zero-contention parallel loading
|
|
@@ -314,6 +317,45 @@ VectorStoreError VectorStore::add_document(simdjson::ondemand::object& json_doc)
|
|
|
314
317
|
entry.doc = doc;
|
|
315
318
|
entry.embedding = emb_ptr;
|
|
316
319
|
|
|
320
|
+
// Process text for BM25 - tokenize and build term frequencies
|
|
321
|
+
SimpleTokenizer tokenizer;
|
|
322
|
+
std::vector<std::string> tokens = tokenizer.split(std::string(text));
|
|
323
|
+
|
|
324
|
+
// Build term frequency map
|
|
325
|
+
entry.tf.clear();
|
|
326
|
+
for (const std::string& token : tokens) {
|
|
327
|
+
// Convert to lowercase for case-insensitive matching
|
|
328
|
+
std::string lower_token = token;
|
|
329
|
+
std::transform(lower_token.begin(), lower_token.end(), lower_token.begin(), ::tolower);
|
|
330
|
+
entry.tf[lower_token]++;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
entry.length = tokens.size();
|
|
334
|
+
|
|
335
|
+
// Update BM25 index structures using lock-free parallel hashmap operations
|
|
336
|
+
total_length_.fetch_add(entry.length, std::memory_order_relaxed);
|
|
337
|
+
|
|
338
|
+
// Update postings and document frequencies
|
|
339
|
+
for (const auto& tf_pair : entry.tf) {
|
|
340
|
+
const std::string& term = tf_pair.first;
|
|
341
|
+
|
|
342
|
+
// Update postings list using parallel hashmap's thread-safe lazy_emplace_l
|
|
343
|
+
postings_.lazy_emplace_l(term,
|
|
344
|
+
// If key exists, append to the vector
|
|
345
|
+
[&idx](auto& p) { p.second.push_back(idx); },
|
|
346
|
+
// If key doesn't exist, create new vector with this idx
|
|
347
|
+
[&term, &idx](const auto& ctor) { ctor(term, std::vector<size_t>{idx}); }
|
|
348
|
+
);
|
|
349
|
+
|
|
350
|
+
// Update document frequency - parallel hashmap provides thread safety
|
|
351
|
+
doc_freq_.lazy_emplace_l(term,
|
|
352
|
+
// If key exists, increment the count
|
|
353
|
+
[](auto& p) { p.second++; },
|
|
354
|
+
// If key doesn't exist, create with value 1
|
|
355
|
+
[&term](const auto& ctor) { ctor(term, 1); }
|
|
356
|
+
);
|
|
357
|
+
}
|
|
358
|
+
|
|
317
359
|
entries_[idx] = entry;
|
|
318
360
|
|
|
319
361
|
return VectorStoreError::SUCCESS;
|
|
@@ -360,7 +402,7 @@ void VectorStore::normalize_all() {
|
|
|
360
402
|
}
|
|
361
403
|
|
|
362
404
|
std::vector<std::pair<float, size_t>>
|
|
363
|
-
VectorStore::search(const float* query, size_t k) const {
|
|
405
|
+
VectorStore::search(const float* __restrict__ query, size_t k) const {
|
|
364
406
|
// Exclusive lock: prevent overlapping OpenMP teams
|
|
365
407
|
// Since each search uses all threads via OpenMP, concurrent searches provide no benefit
|
|
366
408
|
std::unique_lock<std::shared_mutex> lock(search_mutex_);
|
|
@@ -394,7 +436,7 @@ VectorStore::search(const float* query, size_t k) const {
|
|
|
394
436
|
#pragma omp for // default barrier kept - ensures all threads finish before merge
|
|
395
437
|
for (int i = 0; i < static_cast<int>(n); ++i) {
|
|
396
438
|
float score = 0.0f;
|
|
397
|
-
const float* emb = entries_[i].embedding;
|
|
439
|
+
const float* __restrict__ emb = entries_[i].embedding;
|
|
398
440
|
|
|
399
441
|
#pragma omp simd reduction(+:score)
|
|
400
442
|
for (size_t j = 0; j < dim_; ++j) {
|
|
@@ -430,3 +472,197 @@ size_t VectorStore::size() const {
|
|
|
430
472
|
bool VectorStore::is_finalized() const {
|
|
431
473
|
return is_finalized_.load(std::memory_order_acquire);
|
|
432
474
|
}
|
|
475
|
+
|
|
476
|
+
double VectorStore::avg_doc_length() const {
|
|
477
|
+
size_t n = size();
|
|
478
|
+
return n > 0 ? static_cast<double>(total_length_.load(std::memory_order_relaxed)) / n : 0.0;
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
void VectorStore::set_bm25_parameters(double k1, double b, double delta) {
|
|
482
|
+
k1_ = k1;
|
|
483
|
+
b_ = b;
|
|
484
|
+
delta_ = delta;
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
std::vector<std::pair<size_t, double>>
|
|
488
|
+
VectorStore::search_bm25(const std::vector<std::string>& query_terms) const {
|
|
489
|
+
if (!is_finalized()) {
|
|
490
|
+
return {}; // Store must be finalized
|
|
491
|
+
}
|
|
492
|
+
|
|
493
|
+
std::unordered_map<size_t, double> scores;
|
|
494
|
+
size_t N = size();
|
|
495
|
+
double avg_len = avg_doc_length();
|
|
496
|
+
|
|
497
|
+
// Precompute IDF for each unique query term
|
|
498
|
+
std::unordered_map<std::string, double> idf_cache;
|
|
499
|
+
for (const auto& term : query_terms) {
|
|
500
|
+
if (idf_cache.find(term) == idf_cache.end()) {
|
|
501
|
+
auto df_it = doc_freq_.find(term);
|
|
502
|
+
int df = (df_it != doc_freq_.end()) ? df_it->second : 0;
|
|
503
|
+
idf_cache[term] = std::log((N - df + 0.5) / (df + 0.5) + 1.0);
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
// For each unique term in the query:
|
|
508
|
+
for (const auto& term : query_terms) {
|
|
509
|
+
auto postings_it = postings_.find(term);
|
|
510
|
+
if (postings_it == postings_.end()) {
|
|
511
|
+
continue; // Term not found in corpus
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
double idf_t = idf_cache[term];
|
|
515
|
+
for (size_t doc_id : postings_it->second) {
|
|
516
|
+
const Entry& entry = entries_[doc_id];
|
|
517
|
+
auto tf_it = entry.tf.find(term);
|
|
518
|
+
if (tf_it == entry.tf.end()) {
|
|
519
|
+
continue; // Should not happen if postings are consistent
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
int tf = tf_it->second;
|
|
523
|
+
double norm = 1.0 - b_ + b_ * (entry.length / avg_len);
|
|
524
|
+
double tf_weight = (k1_ + 1) * tf / (tf + k1_ * norm);
|
|
525
|
+
scores[doc_id] += (tf_weight + delta_) * idf_t;
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
// Collect and sort results
|
|
530
|
+
std::vector<std::pair<size_t, double>> results(scores.begin(), scores.end());
|
|
531
|
+
std::sort(results.begin(), results.end(),
|
|
532
|
+
[](const auto& a, const auto& b) { return a.second > b.second; });
|
|
533
|
+
|
|
534
|
+
return results;
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
std::vector<std::pair<size_t, double>>
|
|
538
|
+
VectorStore::search_hybrid(const float* __restrict__ query_vector, const std::vector<std::string>& query_terms,
|
|
539
|
+
double vector_weight, double bm25_weight, size_t k) const {
|
|
540
|
+
// Exclusive lock: prevent overlapping OpenMP teams
|
|
541
|
+
std::unique_lock<std::shared_mutex> lock(search_mutex_);
|
|
542
|
+
|
|
543
|
+
if (!is_finalized()) {
|
|
544
|
+
return {}; // Store must be finalized
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
size_t n = count_.load(std::memory_order_acquire);
|
|
548
|
+
if (n == 0 || k == 0) return {};
|
|
549
|
+
k = std::min(k, n);
|
|
550
|
+
|
|
551
|
+
// Precompute BM25 IDF scores for query terms
|
|
552
|
+
std::unordered_map<std::string, double> idf_cache;
|
|
553
|
+
double avg_len = avg_doc_length();
|
|
554
|
+
|
|
555
|
+
for (const auto& term : query_terms) {
|
|
556
|
+
auto df_it = doc_freq_.find(term);
|
|
557
|
+
int df = (df_it != doc_freq_.end()) ? df_it->second : 0;
|
|
558
|
+
idf_cache[term] = std::log((n - df + 0.5) / (df + 0.5) + 1.0);
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
const int num_threads = omp_get_max_threads();
|
|
562
|
+
|
|
563
|
+
// Each thread maintains TWO heaps - one for vector, one for BM25
|
|
564
|
+
struct DualTopK {
|
|
565
|
+
TopK vector_heap;
|
|
566
|
+
TopK bm25_heap;
|
|
567
|
+
DualTopK(size_t k) : vector_heap(k), bm25_heap(k) {}
|
|
568
|
+
|
|
569
|
+
// Make DualTopK move-only like TopK
|
|
570
|
+
DualTopK(const DualTopK&) = delete;
|
|
571
|
+
DualTopK& operator=(const DualTopK&) = delete;
|
|
572
|
+
DualTopK(DualTopK&&) = default;
|
|
573
|
+
DualTopK& operator=(DualTopK&&) = default;
|
|
574
|
+
};
|
|
575
|
+
|
|
576
|
+
std::vector<DualTopK> thread_heaps;
|
|
577
|
+
thread_heaps.reserve(num_threads);
|
|
578
|
+
for (int i = 0; i < num_threads; ++i) {
|
|
579
|
+
thread_heaps.emplace_back(k);
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
#pragma omp parallel
|
|
583
|
+
{
|
|
584
|
+
const int tid = omp_get_thread_num();
|
|
585
|
+
DualTopK& local = thread_heaps[tid];
|
|
586
|
+
|
|
587
|
+
#pragma omp for
|
|
588
|
+
for (int i = 0; i < static_cast<int>(n); ++i) {
|
|
589
|
+
// 1. Compute vector similarity score
|
|
590
|
+
float vector_score = 0.0f;
|
|
591
|
+
const float* __restrict__ emb = entries_[i].embedding;
|
|
592
|
+
|
|
593
|
+
#pragma omp simd reduction(+:vector_score)
|
|
594
|
+
for (size_t j = 0; j < dim_; ++j) {
|
|
595
|
+
vector_score += emb[j] * query_vector[j];
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
// 2. Compute BM25 score for this document
|
|
599
|
+
double bm25_score = 0.0;
|
|
600
|
+
const Entry& entry = entries_[i];
|
|
601
|
+
|
|
602
|
+
for (const auto& term : query_terms) {
|
|
603
|
+
auto tf_it = entry.tf.find(term);
|
|
604
|
+
if (tf_it != entry.tf.end()) {
|
|
605
|
+
int tf = tf_it->second;
|
|
606
|
+
double norm = 1.0 - b_ + b_ * (entry.length / avg_len);
|
|
607
|
+
double tf_weight = (k1_ + 1) * tf / (tf + k1_ * norm);
|
|
608
|
+
bm25_score += (tf_weight + delta_) * idf_cache.at(term);
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
// 3. Push to both heaps
|
|
613
|
+
local.vector_heap.push(vector_score, i);
|
|
614
|
+
local.bm25_heap.push(static_cast<float>(bm25_score), i);
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
#pragma omp barrier
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
// Merge thread-local heaps to get global top-k for each score type
|
|
621
|
+
TopK global_vector_heap(k);
|
|
622
|
+
TopK global_bm25_heap(k);
|
|
623
|
+
|
|
624
|
+
for (auto& th : thread_heaps) {
|
|
625
|
+
global_vector_heap.merge(th.vector_heap);
|
|
626
|
+
global_bm25_heap.merge(th.bm25_heap);
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
// Sort heaps to get ranking order
|
|
630
|
+
std::sort(global_vector_heap.heap.begin(), global_vector_heap.heap.end(),
|
|
631
|
+
[](const auto& a, const auto& b) { return a.first > b.first; });
|
|
632
|
+
std::sort(global_bm25_heap.heap.begin(), global_bm25_heap.heap.end(),
|
|
633
|
+
[](const auto& a, const auto& b) { return a.first > b.first; });
|
|
634
|
+
|
|
635
|
+
// Apply Reciprocal Rank Fusion (RRF) with constant k=60 (typical value)
|
|
636
|
+
const double rrf_k = 60.0;
|
|
637
|
+
std::unordered_map<size_t, double> rrf_scores;
|
|
638
|
+
|
|
639
|
+
// Add vector search rankings
|
|
640
|
+
for (size_t rank = 0; rank < global_vector_heap.heap.size(); ++rank) {
|
|
641
|
+
size_t doc_id = global_vector_heap.heap[rank].second;
|
|
642
|
+
// Weight the RRF contribution
|
|
643
|
+
rrf_scores[doc_id] += vector_weight * (1.0 / (rrf_k + rank + 1));
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// Add BM25 rankings
|
|
647
|
+
for (size_t rank = 0; rank < global_bm25_heap.heap.size(); ++rank) {
|
|
648
|
+
size_t doc_id = global_bm25_heap.heap[rank].second;
|
|
649
|
+
// Weight the RRF contribution
|
|
650
|
+
rrf_scores[doc_id] += bm25_weight * (1.0 / (rrf_k + rank + 1));
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
// Sort by RRF score and return top-k
|
|
654
|
+
std::vector<std::pair<size_t, double>> results;
|
|
655
|
+
results.reserve(rrf_scores.size());
|
|
656
|
+
for (const auto& pair : rrf_scores) {
|
|
657
|
+
results.emplace_back(pair.first, pair.second);
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
std::sort(results.begin(), results.end(),
|
|
661
|
+
[](const auto& a, const auto& b) { return a.second > b.second; });
|
|
662
|
+
|
|
663
|
+
if (results.size() > k) {
|
|
664
|
+
results.resize(k);
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
return results;
|
|
668
|
+
}
|
package/src/vector_store.h
CHANGED
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
#include <cassert>
|
|
13
13
|
#include <algorithm>
|
|
14
14
|
#include <functional>
|
|
15
|
+
#include <unordered_map>
|
|
16
|
+
#include <string>
|
|
17
|
+
#include <parallel_hashmap/phmap.h>
|
|
15
18
|
|
|
16
19
|
// Custom error codes for VectorStore
|
|
17
20
|
enum class VectorStoreError {
|
|
@@ -196,6 +199,10 @@ public:
|
|
|
196
199
|
struct Entry {
|
|
197
200
|
Document doc;
|
|
198
201
|
float* embedding; // Extracted pointer for fast access
|
|
202
|
+
|
|
203
|
+
// BM25 fields
|
|
204
|
+
size_t length; // Total number of tokens in doc.text
|
|
205
|
+
phmap::flat_hash_map<std::string, int> tf; // Term frequencies - better cache locality
|
|
199
206
|
};
|
|
200
207
|
|
|
201
208
|
private:
|
|
@@ -214,6 +221,35 @@ private:
|
|
|
214
221
|
enum class TextFieldType { UNKNOWN, TEXT, CONTENT };
|
|
215
222
|
std::atomic<TextFieldType> text_field_type_{TextFieldType::UNKNOWN};
|
|
216
223
|
|
|
224
|
+
// BM25 index structures - using parallel hashmap for lock-free concurrent updates
|
|
225
|
+
phmap::parallel_flat_hash_map<
|
|
226
|
+
std::string,
|
|
227
|
+
std::vector<size_t>,
|
|
228
|
+
phmap::priv::hash_default_hash<std::string>,
|
|
229
|
+
phmap::priv::hash_default_eq<std::string>,
|
|
230
|
+
std::allocator<std::pair<const std::string, std::vector<size_t>>>,
|
|
231
|
+
4, // 2^4 = 16 submaps for parallelism
|
|
232
|
+
std::mutex // Use std::mutex for each submap
|
|
233
|
+
> postings_; // term -> list of doc indices
|
|
234
|
+
|
|
235
|
+
phmap::parallel_flat_hash_map<
|
|
236
|
+
std::string,
|
|
237
|
+
int, // Regular int - parallel hashmap provides synchronization
|
|
238
|
+
phmap::priv::hash_default_hash<std::string>,
|
|
239
|
+
phmap::priv::hash_default_eq<std::string>,
|
|
240
|
+
std::allocator<std::pair<const std::string, int>>,
|
|
241
|
+
4, // 16 submaps
|
|
242
|
+
std::mutex
|
|
243
|
+
> doc_freq_; // document frequencies
|
|
244
|
+
|
|
245
|
+
std::atomic<size_t> total_length_{0}; // sum of all document lengths - now atomic
|
|
246
|
+
// Note: bm25_index_mutex_ removed - no longer needed with parallel hashmap!
|
|
247
|
+
|
|
248
|
+
// BM25 parameters
|
|
249
|
+
double k1_ = 1.2;
|
|
250
|
+
double b_ = 0.75;
|
|
251
|
+
double delta_ = 1.0;
|
|
252
|
+
|
|
217
253
|
public:
|
|
218
254
|
explicit VectorStore(size_t dim);
|
|
219
255
|
|
|
@@ -251,11 +287,26 @@ public:
|
|
|
251
287
|
void normalize_all();
|
|
252
288
|
|
|
253
289
|
std::vector<std::pair<float, size_t>>
|
|
254
|
-
search(const float* query, size_t k) const;
|
|
290
|
+
search(const float* __restrict__ query, size_t k) const;
|
|
291
|
+
|
|
292
|
+
// BM25 search
|
|
293
|
+
std::vector<std::pair<size_t, double>>
|
|
294
|
+
search_bm25(const std::vector<std::string>& query_terms) const;
|
|
295
|
+
|
|
296
|
+
// Hybrid search combining vector similarity and BM25
|
|
297
|
+
std::vector<std::pair<size_t, double>>
|
|
298
|
+
search_hybrid(const float* __restrict__ query_vector, const std::vector<std::string>& query_terms,
|
|
299
|
+
double vector_weight = 0.7, double bm25_weight = 0.3, size_t k = 10) const;
|
|
300
|
+
|
|
301
|
+
// BM25 parameter setters
|
|
302
|
+
void set_bm25_parameters(double k1, double b, double delta);
|
|
255
303
|
|
|
256
304
|
const Entry& get_entry(size_t idx) const;
|
|
257
305
|
|
|
258
306
|
size_t size() const;
|
|
259
307
|
|
|
260
308
|
bool is_finalized() const;
|
|
309
|
+
|
|
310
|
+
// Get average document length for BM25
|
|
311
|
+
double avg_doc_length() const;
|
|
261
312
|
};
|
|
@@ -93,7 +93,7 @@ void VectorStoreLoader::loadDirectory(VectorStore* store, const std::string& pat
|
|
|
93
93
|
for (size_t w = 0; w < num_workers; ++w) {
|
|
94
94
|
consumers.emplace_back([&]() {
|
|
95
95
|
// Each thread needs its own parser with initial capacity
|
|
96
|
-
simdjson::ondemand::parser doc_parser(
|
|
96
|
+
simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity
|
|
97
97
|
// Set a larger maximum capacity for very large files (up to 512MB)
|
|
98
98
|
doc_parser.allocate(512 * 1024 * 1024);
|
|
99
99
|
FileData* data = nullptr;
|
|
@@ -133,7 +133,7 @@ void VectorStoreLoader::loadDirectoryAdaptive(VectorStore* store, const std::str
|
|
|
133
133
|
for (size_t w = 0; w < num_workers; ++w) {
|
|
134
134
|
consumers.emplace_back([&]() {
|
|
135
135
|
// Each thread needs its own parser with initial capacity
|
|
136
|
-
simdjson::ondemand::parser doc_parser(
|
|
136
|
+
simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity
|
|
137
137
|
// Set a larger maximum capacity for very large files (up to 512MB)
|
|
138
138
|
doc_parser.allocate(512 * 1024 * 1024);
|
|
139
139
|
MixedFileData* data = nullptr;
|
|
@@ -68,7 +68,7 @@ void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string&
|
|
|
68
68
|
for (size_t w = 0; w < num_workers; ++w) {
|
|
69
69
|
consumers.emplace_back([&]() {
|
|
70
70
|
// Each thread needs its own parser with initial capacity
|
|
71
|
-
simdjson::ondemand::parser doc_parser(
|
|
71
|
+
simdjson::ondemand::parser doc_parser(1 * 1024 * 1024 * 1024); // 1GB initial capacity
|
|
72
72
|
// Set a larger maximum capacity for very large files (up to 512MB)
|
|
73
73
|
doc_parser.set_max_capacity(512 * 1024 * 1024);
|
|
74
74
|
MMapFileData* data = nullptr;
|
|
@@ -154,4 +154,4 @@ void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string&
|
|
|
154
154
|
|
|
155
155
|
// Finalize after batch load - normalize and switch to serving phase
|
|
156
156
|
store->finalize();
|
|
157
|
-
}
|
|
157
|
+
}
|
|
Binary file
|