native-vector-store 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/README.md +55 -3
  2. package/binding.gyp +3 -2
  3. package/deps/parallel_hashmap/btree.h +4076 -0
  4. package/deps/parallel_hashmap/meminfo.h +195 -0
  5. package/deps/parallel_hashmap/phmap.h +5236 -0
  6. package/deps/parallel_hashmap/phmap_base.h +5115 -0
  7. package/deps/parallel_hashmap/phmap_bits.h +665 -0
  8. package/deps/parallel_hashmap/phmap_config.h +790 -0
  9. package/deps/parallel_hashmap/phmap_dump.h +335 -0
  10. package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
  11. package/deps/parallel_hashmap/phmap_utils.h +407 -0
  12. package/docs/index.html +52 -3
  13. package/lib/index.d.ts +35 -1
  14. package/package.json +1 -1
  15. package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
  16. package/prebuilds/darwin-x64/native-vector-store.node +0 -0
  17. package/prebuilds/linux-arm64/native-vector-store.node +0 -0
  18. package/prebuilds/linux-x64/native-vector-store.node +0 -0
  19. package/src/Makefile +26 -6
  20. package/src/binding.cc +185 -2
  21. package/src/english_abbreviations.h +197 -0
  22. package/src/english_dictionary.h +25185 -0
  23. package/src/english_punctuations.h +42 -0
  24. package/src/english_stop_words.h +434 -0
  25. package/src/simple_sentence_splitter.h +218 -0
  26. package/src/simple_tokenizer.cpp +92 -0
  27. package/src/simple_tokenizer.h +30 -0
  28. package/src/test_bm25.cpp +357 -0
  29. package/src/test_hybrid_search.cpp +496 -0
  30. package/src/vector_store.cpp +239 -3
  31. package/src/vector_store.h +52 -1
  32. package/src/vector_store_loader.cpp +1 -1
  33. package/src/vector_store_loader_adaptive.cpp +1 -1
  34. package/src/vector_store_loader_mmap.cpp +2 -2
  35. package/prebuilds/win32-x64/native-vector-store.node +0 -0
@@ -0,0 +1,496 @@
1
#include "vector_store.h"
#include "simple_tokenizer.h"

#include <algorithm>  // std::min (result truncation in the query tests)
#include <cassert>
#include <chrono>
#include <cmath>      // std::sin/cos/tanh/sqrt used by generate_embedding
#include <iomanip>
#include <iostream>
#include <random>
#include <set>
#include <string>
#include <tuple>      // std::tuple corpus rows in the weight-sensitivity test
#include <utility>    // std::pair
#include <vector>
11
+
12
+ // Helper function to create test documents with embeddings
13
+ simdjson::error_code create_test_document(const std::string& id, const std::string& text,
14
+ const std::vector<float>& embedding,
15
+ std::string& json_out) {
16
+ json_out = R"({"id":")" + id + R"(","text":")" + text + R"(","metadata":{"embedding":[)";
17
+
18
+ for (size_t i = 0; i < embedding.size(); ++i) {
19
+ if (i > 0) json_out += ",";
20
+ json_out += std::to_string(embedding[i]);
21
+ }
22
+ json_out += R"(]}})";
23
+ return simdjson::SUCCESS;
24
+ }
25
+
26
// Generate a random embedding with some semantic structure.
// Each category gets a distinct deterministic base waveform (seeded RNG:
// 42 + category), perturbed by Gaussian noise of the given level, then
// L2-normalized (with a small epsilon guarding against a zero vector).
// Identical (dim, category, noise_level) inputs always yield the same vector.
std::vector<float> generate_embedding(size_t dim, int category, float noise_level = 0.1f) {
    std::mt19937 rng(42 + category);
    std::normal_distribution<float> jitter(0.0f, noise_level);

    // Category-specific base pattern at a given component index.
    auto base_at = [category](size_t idx) -> float {
        switch (category) {
            case 0:  return std::sin(idx * 0.5f) * 0.5f;                   // Machine learning
            case 1:  return std::cos(idx * 0.3f) * 0.4f;                   // Natural language
            case 2:  return std::sin(idx * 0.2f) * std::cos(idx * 0.1f);   // Computer vision
            case 3:  return std::tanh(idx * 0.1f - 2.0f);                  // Data science
            default: return 0.1f;                                          // General/other
        }
    };

    std::vector<float> vec;
    vec.reserve(dim);
    for (size_t idx = 0; idx < dim; ++idx) {
        vec.push_back(base_at(idx) + jitter(rng));
    }

    // L2-normalize; epsilon prevents division by zero for degenerate input.
    float sq_sum = 0.0f;
    for (float v : vec) sq_sum += v * v;
    const float scale = 1.0f / std::sqrt(sq_sum + 1e-10f);
    for (float& v : vec) v *= scale;

    return vec;
}
65
+
66
// Smoke test for store construction: builds a 20-document corpus spanning
// four semantic categories (ML, NLP, CV, data science, plus mixed docs),
// loads it into a VectorStore, finalizes it, and prints the document count
// and average token length. No assertions — output is inspected manually.
// NOTE(review): documents that fail to parse are silently skipped; the
// printed store.size() is the only visible signal that all 20 made it in.
void test_hybrid_search_basic() {
    std::cout << "\n=== Testing Basic Hybrid Search ===" << std::endl;

    const size_t dim = 128; // Realistic embedding dimension
    VectorStore store(dim);

    // Create test corpus with semantic categories
    struct TestDoc {
        std::string id;       // document identifier used in the JSON "id" field
        std::string text;     // document body (tokenized by the store for BM25)
        int category;         // drives the embedding pattern in generate_embedding
    };

    std::vector<TestDoc> docs = {
        // Machine learning cluster (category 0)
        {"ml1", "Machine learning algorithms use gradient descent for optimization", 0},
        {"ml2", "Deep learning neural networks require backpropagation training", 0},
        {"ml3", "Supervised learning uses labeled data for model training", 0},
        {"ml4", "Reinforcement learning agents maximize reward through exploration", 0},

        // NLP cluster (category 1)
        {"nlp1", "Natural language processing enables computers to understand text", 1},
        {"nlp2", "Transformer models revolutionized language understanding tasks", 1},
        {"nlp3", "Text embeddings capture semantic meaning in vector space", 1},
        {"nlp4", "Named entity recognition identifies people places and organizations", 1},

        // Computer vision cluster (category 2)
        {"cv1", "Computer vision algorithms detect objects in images", 2},
        {"cv2", "Convolutional neural networks excel at image classification", 2},
        {"cv3", "Image segmentation divides pictures into meaningful regions", 2},
        {"cv4", "Face recognition systems use biometric features for identification", 2},

        // Data science cluster (category 3)
        {"ds1", "Data science combines statistics and programming for insights", 3},
        {"ds2", "Feature engineering improves machine learning model performance", 3},
        {"ds3", "Data visualization helps communicate complex patterns effectively", 3},
        {"ds4", "Statistical analysis reveals trends and correlations in datasets", 3},

        // Mixed/ambiguous documents
        {"mix1", "Neural networks process images and text using deep learning", 0},
        {"mix2", "Computer algorithms analyze visual data for pattern recognition", 2},
        {"mix3", "Machine learning transforms data science and analytics workflows", 3},
        {"mix4", "Language models and vision systems share transformer architectures", 1},
    };

    // Add documents to store
    std::cout << "Adding " << docs.size() << " documents to store..." << std::endl;

    simdjson::ondemand::parser parser;
    // 1 MiB parser buffer — ample for these small documents.
    auto alloc_error = parser.allocate(1024 * 1024);
    if (alloc_error) {
        std::cerr << "Parser allocation failed: " << simdjson::error_message(alloc_error) << std::endl;
        return;
    }

    for (const auto& doc : docs) {
        auto embedding = generate_embedding(dim, doc.category);
        std::string json_str;
        create_test_document(doc.id, doc.text, embedding, json_str);

        // simdjson on-demand requires padded input; padded_string copies with padding.
        simdjson::padded_string padded(json_str);
        simdjson::ondemand::document json_doc;
        auto error = parser.iterate(padded).get(json_doc);
        if (!error) {
            store.add_document(json_doc);
        }
    }

    // finalize() must be called before querying size()/avg_doc_length() —
    // presumably it freezes the index; confirm against vector_store.h.
    store.finalize();
    std::cout << "Store finalized with " << store.size() << " documents" << std::endl;
    std::cout << "Average document length: " << std::fixed << std::setprecision(2)
              << store.avg_doc_length() << " tokens\n" << std::endl;
}
139
+
140
// Compares the three search modes (vector-only, BM25-only, hybrid 0.5/0.5)
// on a condensed 8-document corpus, printing the top-3 hits for each mode
// and checking (visually, via ✓/✗ marks) whether the expected documents
// appear in the hybrid top-3. No hard assertions.
// NOTE(review): result pair layouts differ by API — search() is used here as
// (score, index) pairs (.second indexed into get_entry), while search_bm25()
// and search_hybrid() are used as (index, score); confirm against vector_store.h.
void test_hybrid_search_queries() {
    std::cout << "\n=== Testing Hybrid Search Queries ===" << std::endl;

    const size_t dim = 128;
    VectorStore store(dim);

    // Build corpus (same as above but condensed)
    std::vector<std::pair<std::string, std::string>> corpus = {
        {"ml1", "Machine learning algorithms use gradient descent optimization"},
        {"ml2", "Deep neural networks require backpropagation for training"},
        {"nlp1", "Natural language processing transforms text into vectors"},
        {"nlp2", "Transformer models excel at language understanding tasks"},
        {"cv1", "Computer vision algorithms detect and classify objects"},
        {"cv2", "Convolutional networks process images through multiple layers"},
        {"ds1", "Data science combines statistics with machine learning"},
        {"ds2", "Statistical analysis reveals patterns in large datasets"},
    };

    simdjson::ondemand::parser parser;
    auto alloc_error = parser.allocate(1024 * 1024);
    if (alloc_error) {
        std::cerr << "Parser allocation failed: " << simdjson::error_message(alloc_error) << std::endl;
        return;
    }

    // Add documents with category-based embeddings
    for (size_t i = 0; i < corpus.size(); ++i) {
        int category = i / 2; // Group pairs into categories
        auto embedding = generate_embedding(dim, category);
        std::string json_str;
        create_test_document(corpus[i].first, corpus[i].second, embedding, json_str);

        simdjson::padded_string padded(json_str);
        simdjson::ondemand::document json_doc;
        // Parse errors are silently skipped — document simply isn't added.
        if (!parser.iterate(padded).get(json_doc)) {
            store.add_document(json_doc);
        }
    }

    store.finalize();

    // Test different query scenarios
    struct QueryTest {
        std::string description;
        std::vector<std::string> query_terms;
        int semantic_category; // For generating query vector
        std::vector<std::string> expected_top_docs; // Expected relevant docs
    };

    std::vector<QueryTest> test_queries = {
        {
            "Exact keyword match (neural networks)",
            {"neural", "networks"},
            0, // ML category
            {"ml2", "cv2"} // Both mention neural networks
        },
        {
            "Semantic similarity (AI learning)",
            {"artificial", "intelligence", "learning"},
            0, // ML category
            {"ml1", "ml2", "ds1"} // Semantically related to ML
        },
        {
            "Mixed query (vision + algorithms)",
            {"vision", "algorithms"},
            2, // CV category
            {"cv1", "cv2"} // Computer vision documents
        },
        {
            "Broad query (data analysis)",
            {"data", "analysis"},
            3, // DS category
            {"ds1", "ds2"} // Data science documents
        }
    };

    std::cout << "Testing query scenarios:\n" << std::endl;

    for (const auto& test : test_queries) {
        std::cout << "Query: " << test.description << std::endl;
        std::cout << "Terms: ";
        for (const auto& term : test.query_terms) {
            std::cout << "\"" << term << "\" ";
        }
        std::cout << std::endl;

        // Generate query vector based on semantic category
        // (low noise so it sits near the category's document cluster)
        auto query_vector = generate_embedding(dim, test.semantic_category, 0.05f);

        // Test different search modes
        std::cout << "\n1. Vector-only search:" << std::endl;
        auto vector_results = store.search(query_vector.data(), 5);
        for (size_t i = 0; i < std::min(size_t(3), vector_results.size()); ++i) {
            const auto& entry = store.get_entry(vector_results[i].second);
            std::cout << " " << entry.doc.id << " (score: "
                      << std::fixed << std::setprecision(4) << vector_results[i].first << ")" << std::endl;
        }

        std::cout << "\n2. BM25-only search:" << std::endl;
        auto bm25_results = store.search_bm25(test.query_terms);
        for (size_t i = 0; i < std::min(size_t(3), bm25_results.size()); ++i) {
            const auto& entry = store.get_entry(bm25_results[i].first);
            std::cout << " " << entry.doc.id << " (score: "
                      << std::fixed << std::setprecision(4) << bm25_results[i].second << ")" << std::endl;
        }

        std::cout << "\n3. Hybrid search (0.5/0.5):" << std::endl;
        auto hybrid_results = store.search_hybrid(query_vector.data(), test.query_terms, 0.5, 0.5, 5);
        for (size_t i = 0; i < std::min(size_t(3), hybrid_results.size()); ++i) {
            const auto& entry = store.get_entry(hybrid_results[i].first);
            std::cout << " " << entry.doc.id << " (score: "
                      << std::fixed << std::setprecision(4) << hybrid_results[i].second << ")" << std::endl;
        }

        // Check if expected documents appear in top results
        std::set<std::string> top_hybrid_docs;
        for (size_t i = 0; i < std::min(size_t(3), hybrid_results.size()); ++i) {
            const auto& entry = store.get_entry(hybrid_results[i].first);
            // entry.doc.id is converted to an owning std::string — presumably
            // it is a view type; confirm against vector_store.h.
            top_hybrid_docs.insert(std::string(entry.doc.id));
        }

        std::cout << "\nExpected docs found: ";
        for (const auto& expected : test.expected_top_docs) {
            if (top_hybrid_docs.count(expected)) {
                std::cout << expected << " ✓ ";
            } else {
                std::cout << expected << " ✗ ";
            }
        }
        std::cout << "\n" << std::endl;
        std::cout << "---" << std::endl;
    }
}
273
+
274
// Sweeps the hybrid-search weight pair from pure-vector (1.0/0.0) to
// pure-BM25 (0.0/1.0) on a 6-document corpus and prints which document
// wins at each setting, as a table. Documents are chosen to separate the
// modes: "exact*" match the query keywords, "similar*" are only
// semantically close (same embedding category), "different*" are neither.
void test_hybrid_search_weights() {
    std::cout << "\n=== Testing Hybrid Search Weight Sensitivity ===" << std::endl;

    const size_t dim = 64;
    VectorStore store(dim);

    // Create documents with varying keyword/semantic overlap
    // Tuple layout: {id, text, embedding category}
    std::vector<std::tuple<std::string, std::string, int>> docs = {
        {"exact1", "machine learning algorithms optimize neural network weights", 0},
        {"exact2", "neural network training requires gradient optimization", 0},
        {"similar1", "deep learning models use backpropagation", 0},
        {"similar2", "artificial intelligence systems learn from data", 0},
        {"different1", "database query processing improves performance", 2},
        {"different2", "web server handles HTTP requests efficiently", 2},
    };

    simdjson::ondemand::parser parser;
    auto alloc_error = parser.allocate(1024 * 1024);
    if (alloc_error) {
        std::cerr << "Parser allocation failed: " << simdjson::error_message(alloc_error) << std::endl;
        return;
    }

    for (const auto& [id, text, category] : docs) {
        auto embedding = generate_embedding(dim, category);
        std::string json_str;
        create_test_document(id, text, embedding, json_str);

        simdjson::padded_string padded(json_str);
        simdjson::ondemand::document json_doc;
        // Parse errors are silently skipped.
        if (!parser.iterate(padded).get(json_doc)) {
            store.add_document(json_doc);
        }
    }

    store.finalize();

    // Query that has both exact matches and semantic similarity
    std::vector<std::string> query_terms = {"neural", "network", "optimization"};
    auto query_vector = generate_embedding(dim, 0, 0.05f); // Category 0 with low noise

    std::cout << "Query terms: ";
    for (const auto& term : query_terms) std::cout << "\"" << term << "\" ";
    std::cout << "\n" << std::endl;

    // Test different weight combinations (vector weight, BM25 weight)
    std::vector<std::pair<double, double>> weight_configs = {
        {1.0, 0.0}, // Pure vector
        {0.8, 0.2}, // Vector-heavy
        {0.6, 0.4}, // Slightly vector-heavy
        {0.5, 0.5}, // Balanced
        {0.4, 0.6}, // Slightly BM25-heavy
        {0.2, 0.8}, // BM25-heavy
        {0.0, 1.0}, // Pure BM25
    };

    std::cout << "Weight sensitivity analysis:" << std::endl;
    std::cout << std::setw(15) << "Vector Weight" << std::setw(15) << "BM25 Weight"
              << std::setw(10) << "Top Doc" << std::setw(12) << "Score" << std::endl;
    std::cout << std::string(52, '-') << std::endl;

    for (const auto& [vector_weight, bm25_weight] : weight_configs) {
        auto results = store.search_hybrid(query_vector.data(), query_terms,
                                           vector_weight, bm25_weight, 3);

        // Only the top-ranked document is reported for each weight setting.
        if (!results.empty()) {
            const auto& entry = store.get_entry(results[0].first);
            std::cout << std::setw(15) << std::fixed << std::setprecision(1) << vector_weight
                      << std::setw(15) << bm25_weight
                      << std::setw(10) << entry.doc.id
                      << std::setw(12) << std::setprecision(4) << results[0].second
                      << std::endl;
        }
    }
}
349
+
350
// Benchmark: builds a 10k-document corpus of random word-salad text with
// 768-dim embeddings (BERT-like), then times 100 queries against each of
// the three search modes and reports µs/query plus the hybrid-vs-vector
// overhead percentage. Wall-clock only; results vary by machine.
// NOTE(review): the overhead computation divides by vector_time in µs —
// if vector search is fast enough to round to 0 µs total this divides by
// zero; consider guarding if corpus size is ever reduced.
void test_hybrid_search_performance() {
    std::cout << "\n=== Testing Hybrid Search Performance ===" << std::endl;

    const size_t dim = 768; // BERT-like dimension
    const size_t num_docs = 10000;
    const size_t num_queries = 100;

    VectorStore store(dim);

    // Generate a larger corpus
    std::cout << "Building corpus of " << num_docs << " documents..." << std::endl;

    simdjson::ondemand::parser parser;
    auto alloc_error = parser.allocate(1024 * 1024);
    if (alloc_error) {
        std::cerr << "Parser allocation failed: " << simdjson::error_message(alloc_error) << std::endl;
        return;
    }

    // Fixed seed so the benchmark corpus is reproducible run-to-run.
    std::mt19937 gen(42);
    std::uniform_int_distribution<> cat_dist(0, 9); // 10 categories

    // Small vocabulary — guarantees heavy term overlap so BM25 has work to do.
    std::vector<std::string> words = {
        "machine", "learning", "neural", "network", "deep", "data", "algorithm",
        "model", "training", "optimization", "gradient", "vector", "embedding",
        "classification", "regression", "clustering", "analysis", "processing",
        "computer", "vision", "language", "natural", "artificial", "intelligence"
    };

    std::uniform_int_distribution<> word_dist(0, words.size() - 1);
    std::uniform_int_distribution<> text_len_dist(5, 15);

    auto start = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < num_docs; ++i) {
        // Generate random text
        std::string text;
        int text_len = text_len_dist(gen);
        for (int j = 0; j < text_len; ++j) {
            if (j > 0) text += " ";
            text += words[word_dist(gen)];
        }

        // Generate embedding based on category
        int category = cat_dist(gen);
        auto embedding = generate_embedding(dim, category, 0.2f);

        std::string json_str;
        create_test_document("doc" + std::to_string(i), text, embedding, json_str);

        simdjson::padded_string padded(json_str);
        simdjson::ondemand::document json_doc;
        // Parse errors are silently skipped.
        if (!parser.iterate(padded).get(json_doc)) {
            store.add_document(json_doc);
        }
    }

    store.finalize();

    auto build_time = std::chrono::high_resolution_clock::now() - start;
    std::cout << "Corpus built in "
              << std::chrono::duration_cast<std::chrono::milliseconds>(build_time).count()
              << "ms" << std::endl;
    std::cout << "Average document length: " << store.avg_doc_length() << " tokens\n" << std::endl;

    // Benchmark different search types
    std::cout << "Benchmarking " << num_queries << " queries:" << std::endl;

    // Generate random queries up front so timing loops measure search only.
    std::vector<std::pair<std::vector<float>, std::vector<std::string>>> queries;
    for (size_t i = 0; i < num_queries; ++i) {
        // Random query vector
        auto query_vector = generate_embedding(dim, cat_dist(gen), 0.1f);

        // Random query terms (2-4 terms)
        std::uniform_int_distribution<> num_terms_dist(2, 4);
        int num_terms = num_terms_dist(gen);
        std::vector<std::string> query_terms;
        for (int j = 0; j < num_terms; ++j) {
            query_terms.push_back(words[word_dist(gen)]);
        }

        queries.push_back({query_vector, query_terms});
    }

    // Benchmark vector search
    start = std::chrono::high_resolution_clock::now();
    for (const auto& [query_vector, _] : queries) {
        auto results = store.search(query_vector.data(), 10);
    }
    auto vector_time = std::chrono::high_resolution_clock::now() - start;

    // Benchmark BM25 search
    start = std::chrono::high_resolution_clock::now();
    for (const auto& [_, query_terms] : queries) {
        auto results = store.search_bm25(query_terms);
    }
    auto bm25_time = std::chrono::high_resolution_clock::now() - start;

    // Benchmark hybrid search
    start = std::chrono::high_resolution_clock::now();
    for (const auto& [query_vector, query_terms] : queries) {
        auto results = store.search_hybrid(query_vector.data(), query_terms, 0.5, 0.5, 10);
    }
    auto hybrid_time = std::chrono::high_resolution_clock::now() - start;

    std::cout << "\nPerformance Results:" << std::endl;
    std::cout << "Vector search: "
              << std::chrono::duration_cast<std::chrono::microseconds>(vector_time).count() / num_queries
              << " µs/query" << std::endl;
    std::cout << "BM25 search: "
              << std::chrono::duration_cast<std::chrono::microseconds>(bm25_time).count() / num_queries
              << " µs/query" << std::endl;
    std::cout << "Hybrid search: "
              << std::chrono::duration_cast<std::chrono::microseconds>(hybrid_time).count() / num_queries
              << " µs/query" << std::endl;

    // Relative cost of hybrid over pure vector search, in percent.
    double overhead = (std::chrono::duration_cast<std::chrono::microseconds>(hybrid_time).count() -
                       std::chrono::duration_cast<std::chrono::microseconds>(vector_time).count()) * 100.0 /
                      std::chrono::duration_cast<std::chrono::microseconds>(vector_time).count();

    std::cout << "Hybrid overhead vs vector-only: "
              << std::fixed << std::setprecision(1) << overhead << "%" << std::endl;
}
474
+
475
+ int main() {
476
+ std::cout << "🔍 Hybrid Search Test Suite" << std::endl;
477
+ std::cout << "============================" << std::endl;
478
+
479
+ try {
480
+ test_hybrid_search_basic();
481
+ test_hybrid_search_queries();
482
+ test_hybrid_search_weights();
483
+ test_hybrid_search_performance();
484
+
485
+ std::cout << "\n✅ All hybrid search tests completed successfully!" << std::endl;
486
+
487
+ } catch (const std::exception& e) {
488
+ std::cerr << "❌ Test failed with exception: " << e.what() << std::endl;
489
+ return 1;
490
+ } catch (...) {
491
+ std::cerr << "❌ Test failed with unknown exception" << std::endl;
492
+ return 1;
493
+ }
494
+
495
+ return 0;
496
+ }