native-vector-store 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -12
- package/binding.gyp +22 -10
- package/deps/simdjson/simdjson.cpp +56403 -0
- package/deps/simdjson/simdjson.h +123534 -0
- package/docs/PERFORMANCE_CASE_STUDY.md +130 -0
- package/docs/PREBUILDS.md +69 -0
- package/docs/VectorStore.html +180 -0
- package/docs/VectorStoreWrapper.html +1356 -0
- package/docs/fonts/OpenSans-Bold-webfont.eot +0 -0
- package/docs/fonts/OpenSans-Bold-webfont.svg +1830 -0
- package/docs/fonts/OpenSans-Bold-webfont.woff +0 -0
- package/docs/fonts/OpenSans-BoldItalic-webfont.eot +0 -0
- package/docs/fonts/OpenSans-BoldItalic-webfont.svg +1830 -0
- package/docs/fonts/OpenSans-BoldItalic-webfont.woff +0 -0
- package/docs/fonts/OpenSans-Italic-webfont.eot +0 -0
- package/docs/fonts/OpenSans-Italic-webfont.svg +1830 -0
- package/docs/fonts/OpenSans-Italic-webfont.woff +0 -0
- package/docs/fonts/OpenSans-Light-webfont.eot +0 -0
- package/docs/fonts/OpenSans-Light-webfont.svg +1831 -0
- package/docs/fonts/OpenSans-Light-webfont.woff +0 -0
- package/docs/fonts/OpenSans-LightItalic-webfont.eot +0 -0
- package/docs/fonts/OpenSans-LightItalic-webfont.svg +1835 -0
- package/docs/fonts/OpenSans-LightItalic-webfont.woff +0 -0
- package/docs/fonts/OpenSans-Regular-webfont.eot +0 -0
- package/docs/fonts/OpenSans-Regular-webfont.svg +1831 -0
- package/docs/fonts/OpenSans-Regular-webfont.woff +0 -0
- package/docs/global.html +561 -0
- package/docs/index.html +570 -0
- package/docs/scripts/linenumber.js +25 -0
- package/docs/scripts/prettify/Apache-License-2.0.txt +202 -0
- package/docs/scripts/prettify/lang-css.js +2 -0
- package/docs/scripts/prettify/prettify.js +28 -0
- package/docs/styles/jsdoc-default.css +358 -0
- package/docs/styles/prettify-jsdoc.css +111 -0
- package/docs/styles/prettify-tomorrow.css +132 -0
- package/index.js +162 -0
- package/package.json +30 -7
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/prebuilds/darwin-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-arm64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64-musl/napi-v9/native-vector-store.node +0 -0
- package/prebuilds/linux-x64-musl/native-vector-store.node +0 -0
- package/prebuilds/win32-x64/native-vector-store.node +0 -0
- package/src/Makefile +87 -0
- package/src/test_main.cpp +173 -0
- package/src/test_stress.cpp +394 -0
- package/src/vector_store.cpp +344 -0
- package/src/vector_store.h +21 -323
- package/native-vector-store-0.1.0.tgz +0 -0
- package/scripts/build-prebuilds.sh +0 -23
- /package/{src → deps/atomic_queue}/atomic_queue.h +0 -0
- /package/{src → deps/atomic_queue}/defs.h +0 -0
@@ -0,0 +1,173 @@
|
|
1
|
+
#include "vector_store.h"
|
2
|
+
#include <iostream>
|
3
|
+
#include <filesystem>
|
4
|
+
#include <vector>
|
5
|
+
#include <simdjson.h>
|
6
|
+
#include <cctype>
|
7
|
+
|
8
|
+
void test_single_document() {
|
9
|
+
std::cout << "=== Testing Single Document ===" << std::endl;
|
10
|
+
|
11
|
+
VectorStore store(20);
|
12
|
+
|
13
|
+
// Create a test document manually
|
14
|
+
std::string json_str = R"({
|
15
|
+
"id": "test1",
|
16
|
+
"text": "Test document for debugging",
|
17
|
+
"metadata": {
|
18
|
+
"embedding": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
|
19
|
+
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
20
|
+
"category": "test"
|
21
|
+
}
|
22
|
+
})";
|
23
|
+
|
24
|
+
try {
|
25
|
+
simdjson::ondemand::parser parser;
|
26
|
+
simdjson::padded_string padded(json_str);
|
27
|
+
simdjson::ondemand::document doc;
|
28
|
+
auto parse_error = parser.iterate(padded).get(doc);
|
29
|
+
if (parse_error) {
|
30
|
+
std::cerr << "JSON parse error: " << simdjson::error_message(parse_error) << std::endl;
|
31
|
+
return;
|
32
|
+
}
|
33
|
+
|
34
|
+
std::cout << "Adding document..." << std::endl;
|
35
|
+
auto add_error = store.add_document(doc);
|
36
|
+
if (add_error) {
|
37
|
+
std::cerr << "Document add error: " << simdjson::error_message(add_error) << std::endl;
|
38
|
+
return;
|
39
|
+
}
|
40
|
+
std::cout << "Document added successfully. Store size: " << store.size() << std::endl;
|
41
|
+
|
42
|
+
// Test search
|
43
|
+
float query[20] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
|
44
|
+
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0};
|
45
|
+
|
46
|
+
auto results = store.search(query, 1);
|
47
|
+
std::cout << "Search completed. Found " << results.size() << " results" << std::endl;
|
48
|
+
if (!results.empty()) {
|
49
|
+
std::cout << "Top result score: " << results[0].first << std::endl;
|
50
|
+
}
|
51
|
+
|
52
|
+
} catch (const std::exception& e) {
|
53
|
+
std::cerr << "Error: " << e.what() << std::endl;
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
void test_load_directory(const std::string& path) {
|
58
|
+
std::cout << "\n=== Testing Load Directory ===" << std::endl;
|
59
|
+
std::cout << "Loading from: " << path << std::endl;
|
60
|
+
|
61
|
+
VectorStore store(20);
|
62
|
+
|
63
|
+
try {
|
64
|
+
// Collect JSON files
|
65
|
+
std::vector<std::filesystem::path> json_files;
|
66
|
+
for (const auto& entry : std::filesystem::directory_iterator(path)) {
|
67
|
+
if (entry.path().extension() == ".json") {
|
68
|
+
std::cout << "Found JSON file: " << entry.path() << std::endl;
|
69
|
+
json_files.push_back(entry.path());
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
std::cout << "Total JSON files found: " << json_files.size() << std::endl;
|
74
|
+
|
75
|
+
// Process files
|
76
|
+
simdjson::ondemand::parser file_parser;
|
77
|
+
for (size_t i = 0; i < json_files.size(); ++i) {
|
78
|
+
std::cout << "Processing file " << (i+1) << "/" << json_files.size()
|
79
|
+
<< ": " << json_files[i].filename() << std::endl;
|
80
|
+
|
81
|
+
try {
|
82
|
+
std::cout << " Loading file..." << std::endl;
|
83
|
+
auto json = simdjson::padded_string::load(json_files[i].string()).value_unsafe();
|
84
|
+
std::cout << " File loaded, size: " << json.size() << " bytes" << std::endl;
|
85
|
+
|
86
|
+
std::cout << " Parsing JSON..." << std::endl;
|
87
|
+
auto json_doc = file_parser.iterate(json).value_unsafe();
|
88
|
+
std::cout << " JSON parsed successfully" << std::endl;
|
89
|
+
|
90
|
+
// Check first character to determine if it's an array
|
91
|
+
const char* json_start = json.data();
|
92
|
+
while (json_start && *json_start && std::isspace(*json_start)) {
|
93
|
+
json_start++;
|
94
|
+
}
|
95
|
+
|
96
|
+
if (json_start && *json_start == '[') {
|
97
|
+
// It's an array
|
98
|
+
std::cout << " Detected array of documents" << std::endl;
|
99
|
+
simdjson::ondemand::array doc_array;
|
100
|
+
auto array_error = json_doc.get_array().get(doc_array);
|
101
|
+
if (array_error) {
|
102
|
+
std::cerr << " Error getting array: " << simdjson::error_message(array_error) << std::endl;
|
103
|
+
continue;
|
104
|
+
}
|
105
|
+
|
106
|
+
size_t doc_count = 0;
|
107
|
+
size_t error_count = 0;
|
108
|
+
for (auto doc_element : doc_array) {
|
109
|
+
simdjson::ondemand::object doc_obj;
|
110
|
+
auto obj_error = doc_element.get_object().get(doc_obj);
|
111
|
+
if (obj_error) {
|
112
|
+
std::cerr << " Error getting object from array element: " << simdjson::error_message(obj_error) << std::endl;
|
113
|
+
error_count++;
|
114
|
+
continue;
|
115
|
+
}
|
116
|
+
|
117
|
+
auto add_error = store.add_document(doc_obj);
|
118
|
+
if (add_error) {
|
119
|
+
std::cerr << " Error adding document: " << simdjson::error_message(add_error) << std::endl;
|
120
|
+
error_count++;
|
121
|
+
} else {
|
122
|
+
doc_count++;
|
123
|
+
}
|
124
|
+
}
|
125
|
+
std::cout << " Added " << doc_count << " documents";
|
126
|
+
if (error_count > 0) {
|
127
|
+
std::cout << " (with " << error_count << " errors)";
|
128
|
+
}
|
129
|
+
std::cout << ". Current store size: " << store.size() << std::endl;
|
130
|
+
} else {
|
131
|
+
// Single document
|
132
|
+
std::cout << " Detected single document" << std::endl;
|
133
|
+
std::cout << " Adding to store..." << std::endl;
|
134
|
+
auto add_error = store.add_document(json_doc);
|
135
|
+
if (add_error) {
|
136
|
+
std::cerr << " Error adding document: " << simdjson::error_message(add_error) << std::endl;
|
137
|
+
} else {
|
138
|
+
std::cout << " Document added successfully";
|
139
|
+
}
|
140
|
+
std::cout << ". Current store size: " << store.size() << std::endl;
|
141
|
+
}
|
142
|
+
|
143
|
+
} catch (const std::exception& e) {
|
144
|
+
std::cerr << " Error processing file: " << e.what() << std::endl;
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
std::cout << "\nAll files processed. Final store size: " << store.size() << std::endl;
|
149
|
+
|
150
|
+
// Test normalization
|
151
|
+
std::cout << "Testing normalization..." << std::endl;
|
152
|
+
store.normalize_all();
|
153
|
+
std::cout << "Normalization completed" << std::endl;
|
154
|
+
|
155
|
+
} catch (const std::exception& e) {
|
156
|
+
std::cerr << "Error in load directory: " << e.what() << std::endl;
|
157
|
+
}
|
158
|
+
}
|
159
|
+
|
160
|
+
int main(int argc, char** argv) {
|
161
|
+
std::cout << "Vector Store C++ Test Program" << std::endl;
|
162
|
+
std::cout << "=============================" << std::endl;
|
163
|
+
|
164
|
+
// Test single document first
|
165
|
+
test_single_document();
|
166
|
+
|
167
|
+
// Test directory loading
|
168
|
+
std::string test_dir = (argc > 1) ? argv[1] : "test";
|
169
|
+
test_load_directory(test_dir);
|
170
|
+
|
171
|
+
std::cout << "\nAll tests completed!" << std::endl;
|
172
|
+
return 0;
|
173
|
+
}
|
@@ -0,0 +1,394 @@
|
|
1
|
+
#include "vector_store.h"
|
2
|
+
#include "vector_store_loader.h"
|
3
|
+
#include <thread>
|
4
|
+
#include <random>
|
5
|
+
#include <chrono>
|
6
|
+
#include <iostream>
|
7
|
+
#include <atomic>
|
8
|
+
#include <cassert>
|
9
|
+
#include <sstream>
|
10
|
+
#include <iomanip>
|
11
|
+
#include <filesystem>
|
12
|
+
|
13
|
+
using namespace std::chrono;
|
14
|
+
|
15
|
+
// Test configuration
|
16
|
+
constexpr size_t DIM = 1536; // OpenAI embedding dimension
|
17
|
+
|
18
|
+
// Helper to generate random embedding
|
19
|
+
// Generate a pseudo-random unit-length embedding of `dim` components.
// Components are drawn uniformly from [-1, 1) and the vector is then
// normalized to length 1.  Deterministic for a given RNG state.
std::vector<float> generate_random_embedding(size_t dim, std::mt19937& rng) {
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
    std::vector<float> embedding(dim);
    float sum = 0.0f;

    for (size_t i = 0; i < dim; ++i) {
        embedding[i] = dist(rng);
        sum += embedding[i] * embedding[i];
    }

    // Normalize.  Guard against a zero squared-norm (dim == 0, or the
    // astronomically unlikely all-zero draw) so we never divide by zero
    // and hand back NaN/Inf components.
    if (sum > 0.0f) {
        float inv_norm = 1.0f / std::sqrt(sum);
        for (size_t i = 0; i < dim; ++i) {
            embedding[i] *= inv_norm;
        }
    } else if (dim > 0) {
        embedding[0] = 1.0f;  // fall back to a unit basis vector
    }

    return embedding;
}
|
37
|
+
|
38
|
+
// Helper to create JSON document
|
39
|
+
// Serialize a test document as the JSON shape the store expects:
// {"id":...,"text":...,"metadata":{"embedding":[...]}}.  Embedding values
// are written with fixed 6-digit precision.  (No string escaping is done;
// callers pass plain identifiers/text.)
std::string create_json_document(const std::string& id, const std::string& text,
                                 const std::vector<float>& embedding) {
    std::ostringstream out;
    out << "{\"id\":\"" << id << "\",\"text\":\"" << text << "\",\"metadata\":{\"embedding\":[";

    out << std::fixed << std::setprecision(6);
    const char* separator = "";
    for (float value : embedding) {
        out << separator << value;
        separator = ",";
    }

    out << "]}}";
    return out.str();
}
|
52
|
+
|
53
|
+
|
54
|
+
// Test 1: Producer-consumer loading performance
|
55
|
+
void test_loading_performance() {
|
56
|
+
std::cout << "\n📝 Test 1: Producer-consumer loadDir performance (1K documents)\n";
|
57
|
+
|
58
|
+
// Check if test data exists
|
59
|
+
const std::string test_data_dir = "../test_data";
|
60
|
+
if (!std::filesystem::exists(test_data_dir)) {
|
61
|
+
std::cout << "❌ Test data directory not found: " << test_data_dir << "\n";
|
62
|
+
std::cout << " Run: node test/generate_test_data.js\n";
|
63
|
+
std::exit(1);
|
64
|
+
}
|
65
|
+
|
66
|
+
VectorStore store(DIM);
|
67
|
+
auto start = high_resolution_clock::now();
|
68
|
+
|
69
|
+
// Load using the clean VectorStoreLoader interface
|
70
|
+
VectorStoreLoader::loadDirectory(&store, test_data_dir);
|
71
|
+
|
72
|
+
auto load_end = high_resolution_clock::now();
|
73
|
+
auto load_elapsed = duration_cast<milliseconds>(load_end - start).count();
|
74
|
+
|
75
|
+
std::cout << "✅ Loaded " << store.size() << " documents in " << load_elapsed << "ms\n";
|
76
|
+
std::cout << " Rate: " << (store.size() * 1000 / load_elapsed) << " docs/sec\n";
|
77
|
+
|
78
|
+
// Store should already be finalized by loadDir
|
79
|
+
assert(store.is_finalized());
|
80
|
+
std::cout << " Store finalized by loadDir\n";
|
81
|
+
}
|
82
|
+
|
83
|
+
// Test 2: Phase enforcement validation
|
84
|
+
void test_phase_enforcement() {
|
85
|
+
std::cout << "\n🚦 Test 2: Phase enforcement validation\n";
|
86
|
+
|
87
|
+
VectorStore store(DIM);
|
88
|
+
std::mt19937 rng(42);
|
89
|
+
simdjson::ondemand::parser parser;
|
90
|
+
|
91
|
+
// Verify search fails before finalization
|
92
|
+
auto query = generate_random_embedding(DIM, rng);
|
93
|
+
auto results = store.search(query.data(), 10);
|
94
|
+
assert(results.empty());
|
95
|
+
std::cout << " ✅ Search correctly blocked before finalization\n";
|
96
|
+
|
97
|
+
// Add some documents
|
98
|
+
for (size_t i = 0; i < 100; ++i) {
|
99
|
+
auto embedding = generate_random_embedding(DIM, rng);
|
100
|
+
std::string json_str = create_json_document(
|
101
|
+
"phase-" + std::to_string(i),
|
102
|
+
"Phase test document " + std::to_string(i),
|
103
|
+
embedding
|
104
|
+
);
|
105
|
+
|
106
|
+
simdjson::padded_string padded(json_str);
|
107
|
+
simdjson::ondemand::document doc;
|
108
|
+
if (!parser.iterate(padded).get(doc)) {
|
109
|
+
auto error = store.add_document(doc);
|
110
|
+
assert(error == simdjson::SUCCESS);
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
// Finalize the store
|
115
|
+
store.finalize();
|
116
|
+
assert(store.is_finalized());
|
117
|
+
|
118
|
+
// Verify we can search now
|
119
|
+
results = store.search(query.data(), 10);
|
120
|
+
assert(!results.empty());
|
121
|
+
std::cout << " ✅ Search works after finalization\n";
|
122
|
+
|
123
|
+
// Verify document addition fails after finalization
|
124
|
+
auto embedding = generate_random_embedding(DIM, rng);
|
125
|
+
std::string json_str = create_json_document("blocked", "Should fail", embedding);
|
126
|
+
simdjson::padded_string padded(json_str);
|
127
|
+
simdjson::ondemand::document doc;
|
128
|
+
parser.iterate(padded).get(doc);
|
129
|
+
auto error = store.add_document(doc);
|
130
|
+
assert(error == simdjson::INCORRECT_TYPE);
|
131
|
+
std::cout << " ✅ Document addition correctly blocked after finalization\n";
|
132
|
+
}
|
133
|
+
|
134
|
+
// Test 3: 64MB+1 allocation (expect fail)
|
135
|
+
void test_oversize_allocation() {
|
136
|
+
std::cout << "\n📏 Test 3: 64MB+1 allocation (expect fail)\n";
|
137
|
+
|
138
|
+
VectorStore store(10);
|
139
|
+
|
140
|
+
// Create a document with metadata that exceeds chunk size
|
141
|
+
std::stringstream huge_json;
|
142
|
+
huge_json << "{\"id\":\"huge\",\"text\":\"test\",\"metadata\":{\"embedding\":[";
|
143
|
+
for (int i = 0; i < 10; ++i) {
|
144
|
+
if (i > 0) huge_json << ",";
|
145
|
+
huge_json << "0.1";
|
146
|
+
}
|
147
|
+
huge_json << "],\"huge\":\"";
|
148
|
+
// Add 64MB + 1 byte of data
|
149
|
+
for (size_t i = 0; i < 67108865; ++i) {
|
150
|
+
huge_json << "x";
|
151
|
+
}
|
152
|
+
huge_json << "\"}}";
|
153
|
+
|
154
|
+
std::string json_str = huge_json.str();
|
155
|
+
simdjson::padded_string padded(json_str);
|
156
|
+
simdjson::ondemand::parser parser;
|
157
|
+
simdjson::ondemand::document doc;
|
158
|
+
|
159
|
+
auto error = parser.iterate(padded).get(doc);
|
160
|
+
if (!error) {
|
161
|
+
// This should fail in the allocator
|
162
|
+
error = store.add_document(doc);
|
163
|
+
if (error == simdjson::MEMALLOC) {
|
164
|
+
std::cout << "✅ Correctly rejected oversize allocation\n";
|
165
|
+
} else {
|
166
|
+
std::cout << "❌ Should have failed with MEMALLOC error, got: " << simdjson::error_message(error) << "\n";
|
167
|
+
std::exit(1);
|
168
|
+
}
|
169
|
+
} else {
|
170
|
+
std::cout << "❌ Failed to parse test JSON: " << simdjson::error_message(error) << "\n";
|
171
|
+
std::exit(1);
|
172
|
+
}
|
173
|
+
}
|
174
|
+
|
175
|
+
// Test 4: Alignment requests
|
176
|
+
void test_alignment_requests() {
|
177
|
+
std::cout << "\n🎯 Test 4: Various alignment requests\n";
|
178
|
+
|
179
|
+
class TestArenaAllocator : public ArenaAllocator {
|
180
|
+
public:
|
181
|
+
void test_alignments() {
|
182
|
+
// Test valid alignments
|
183
|
+
size_t valid_aligns[] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
|
184
|
+
|
185
|
+
for (size_t align : valid_aligns) {
|
186
|
+
void* ptr = allocate(128, align);
|
187
|
+
if (!ptr) {
|
188
|
+
std::cout << "❌ Failed to allocate with alignment " << align << "\n";
|
189
|
+
std::exit(1);
|
190
|
+
}
|
191
|
+
assert(((uintptr_t)ptr % align) == 0);
|
192
|
+
}
|
193
|
+
std::cout << "✅ All valid alignments handled correctly\n";
|
194
|
+
|
195
|
+
// Test invalid alignment (>4096)
|
196
|
+
void* ptr = allocate(128, 8192);
|
197
|
+
if (ptr) {
|
198
|
+
std::cout << "❌ Should have rejected alignment > 4096\n";
|
199
|
+
std::exit(1);
|
200
|
+
} else {
|
201
|
+
std::cout << "✅ Correctly rejected large alignment\n";
|
202
|
+
}
|
203
|
+
}
|
204
|
+
};
|
205
|
+
|
206
|
+
TestArenaAllocator allocator;
|
207
|
+
allocator.test_alignments();
|
208
|
+
}
|
209
|
+
|
210
|
+
// Test 5: Phase separation - load, finalize, then search
|
211
|
+
void test_phase_separation() {
|
212
|
+
std::cout << "\n🔄 Test 5: Phase separation - load, finalize, then search\n";
|
213
|
+
|
214
|
+
VectorStore store(DIM);
|
215
|
+
auto start = high_resolution_clock::now();
|
216
|
+
|
217
|
+
// Phase 1: Load documents (single-threaded for simplicity)
|
218
|
+
std::mt19937 rng(42);
|
219
|
+
simdjson::ondemand::parser parser;
|
220
|
+
size_t docs_loaded = 0;
|
221
|
+
|
222
|
+
for (size_t i = 0; i < 1000; ++i) {
|
223
|
+
auto embedding = generate_random_embedding(DIM, rng);
|
224
|
+
std::string json_str = create_json_document(
|
225
|
+
"doc-" + std::to_string(i),
|
226
|
+
"Document " + std::to_string(i),
|
227
|
+
embedding
|
228
|
+
);
|
229
|
+
|
230
|
+
simdjson::padded_string padded(json_str);
|
231
|
+
simdjson::ondemand::document doc;
|
232
|
+
if (!parser.iterate(padded).get(doc)) {
|
233
|
+
auto error = store.add_document(doc);
|
234
|
+
if (!error) {
|
235
|
+
docs_loaded++;
|
236
|
+
}
|
237
|
+
}
|
238
|
+
}
|
239
|
+
|
240
|
+
auto load_time = duration_cast<milliseconds>(high_resolution_clock::now() - start).count();
|
241
|
+
std::cout << " Loaded " << docs_loaded << " documents in " << load_time << "ms\n";
|
242
|
+
|
243
|
+
// Verify searches fail before finalization
|
244
|
+
auto query = generate_random_embedding(DIM, rng);
|
245
|
+
auto results = store.search(query.data(), 10);
|
246
|
+
assert(results.empty());
|
247
|
+
std::cout << " ✅ Searches correctly blocked before finalization\n";
|
248
|
+
|
249
|
+
// Phase 2: Finalize the store
|
250
|
+
auto finalize_start = high_resolution_clock::now();
|
251
|
+
store.finalize();
|
252
|
+
auto finalize_time = duration_cast<milliseconds>(high_resolution_clock::now() - finalize_start).count();
|
253
|
+
std::cout << " Finalized (normalized) in " << finalize_time << "ms\n";
|
254
|
+
|
255
|
+
// Verify no more documents can be added
|
256
|
+
{
|
257
|
+
auto embedding = generate_random_embedding(DIM, rng);
|
258
|
+
std::string json_str = create_json_document("blocked", "Should fail", embedding);
|
259
|
+
simdjson::padded_string padded(json_str);
|
260
|
+
simdjson::ondemand::document doc;
|
261
|
+
parser.iterate(padded).get(doc);
|
262
|
+
auto error = store.add_document(doc);
|
263
|
+
assert(error == simdjson::INCORRECT_TYPE);
|
264
|
+
std::cout << " ✅ Document additions correctly blocked after finalization\n";
|
265
|
+
}
|
266
|
+
|
267
|
+
// Phase 3: Concurrent searches (multiple threads)
|
268
|
+
std::atomic<size_t> total_searches{0};
|
269
|
+
auto search_start = high_resolution_clock::now();
|
270
|
+
|
271
|
+
std::vector<std::thread> searchers;
|
272
|
+
for (size_t t = 0; t < 4; ++t) {
|
273
|
+
searchers.emplace_back([&store, &total_searches, t]() {
|
274
|
+
std::mt19937 rng(t);
|
275
|
+
for (size_t i = 0; i < 25; ++i) {
|
276
|
+
auto query = generate_random_embedding(DIM, rng);
|
277
|
+
auto results = store.search(query.data(), 10);
|
278
|
+
assert(!results.empty() && results.size() <= 10);
|
279
|
+
total_searches++;
|
280
|
+
}
|
281
|
+
});
|
282
|
+
}
|
283
|
+
|
284
|
+
for (auto& t : searchers) {
|
285
|
+
t.join();
|
286
|
+
}
|
287
|
+
|
288
|
+
auto search_time = duration_cast<milliseconds>(high_resolution_clock::now() - search_start).count();
|
289
|
+
std::cout << " Performed " << total_searches.load() << " concurrent searches in " << search_time << "ms\n";
|
290
|
+
|
291
|
+
auto total_time = duration_cast<milliseconds>(high_resolution_clock::now() - start).count();
|
292
|
+
std::cout << "✅ Phase separation test completed in " << total_time << "ms\n";
|
293
|
+
}
|
294
|
+
|
295
|
+
// Test 6: Concurrent search performance after finalization
|
296
|
+
void test_concurrent_search_performance() {
|
297
|
+
std::cout << "\n🔍 Test 6: Concurrent search performance\n";
|
298
|
+
|
299
|
+
VectorStore store(DIM);
|
300
|
+
|
301
|
+
// Load test data
|
302
|
+
std::mt19937 rng(42);
|
303
|
+
simdjson::ondemand::parser parser;
|
304
|
+
|
305
|
+
for (size_t i = 0; i < 10000; ++i) {
|
306
|
+
auto embedding = generate_random_embedding(DIM, rng);
|
307
|
+
std::string json_str = create_json_document(
|
308
|
+
"search-" + std::to_string(i),
|
309
|
+
"Document for search testing " + std::to_string(i),
|
310
|
+
embedding
|
311
|
+
);
|
312
|
+
|
313
|
+
simdjson::padded_string padded(json_str);
|
314
|
+
simdjson::ondemand::document doc;
|
315
|
+
if (!parser.iterate(padded).get(doc)) {
|
316
|
+
store.add_document(doc);
|
317
|
+
}
|
318
|
+
}
|
319
|
+
|
320
|
+
std::cout << " Loaded " << store.size() << " documents\n";
|
321
|
+
|
322
|
+
// Finalize the store
|
323
|
+
auto finalize_start = high_resolution_clock::now();
|
324
|
+
store.finalize();
|
325
|
+
auto finalize_time = duration_cast<milliseconds>(high_resolution_clock::now() - finalize_start).count();
|
326
|
+
std::cout << " Finalized in " << finalize_time << "ms\n";
|
327
|
+
|
328
|
+
// Test concurrent searches
|
329
|
+
const size_t num_threads = 8;
|
330
|
+
const size_t searches_per_thread = 100;
|
331
|
+
std::atomic<size_t> total_searches{0};
|
332
|
+
std::atomic<size_t> total_results{0};
|
333
|
+
|
334
|
+
auto search_start = high_resolution_clock::now();
|
335
|
+
|
336
|
+
std::vector<std::thread> searchers;
|
337
|
+
for (size_t t = 0; t < num_threads; ++t) {
|
338
|
+
searchers.emplace_back([&store, &total_searches, &total_results, t]() {
|
339
|
+
std::mt19937 rng(t);
|
340
|
+
size_t local_results = 0;
|
341
|
+
|
342
|
+
for (size_t i = 0; i < searches_per_thread; ++i) {
|
343
|
+
auto query = generate_random_embedding(DIM, rng);
|
344
|
+
auto results = store.search(query.data(), 10);
|
345
|
+
assert(!results.empty() && results.size() <= 10);
|
346
|
+
local_results += results.size();
|
347
|
+
total_searches++;
|
348
|
+
}
|
349
|
+
|
350
|
+
total_results += local_results;
|
351
|
+
});
|
352
|
+
}
|
353
|
+
|
354
|
+
for (auto& t : searchers) {
|
355
|
+
t.join();
|
356
|
+
}
|
357
|
+
|
358
|
+
auto search_time = duration_cast<milliseconds>(high_resolution_clock::now() - search_start).count();
|
359
|
+
|
360
|
+
std::cout << "✅ Performed " << total_searches.load() << " concurrent searches in " << search_time << "ms\n";
|
361
|
+
std::cout << " Average results per search: " << (total_results.load() / total_searches.load()) << "\n";
|
362
|
+
std::cout << " Throughput: " << (total_searches.load() * 1000 / search_time) << " searches/sec\n";
|
363
|
+
}
|
364
|
+
|
365
|
+
// Entry point: announces which sanitizer (if any) the binary was built
// with, then runs the stress suite in order.  Any assert/exit aborts the
// run; reaching the end means all tests passed.
int main() {
    std::cout << "🔥 Starting concurrent stress tests...\n";

    // Detect which sanitizer is enabled.  Clang exposes __has_feature;
    // GCC instead defines __SANITIZE_ADDRESS__ / __SANITIZE_THREAD__.
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
    std::cout << " Running with AddressSanitizer (ASAN)\n";
#elif __has_feature(thread_sanitizer)
    std::cout << " Running with ThreadSanitizer (TSAN)\n";
#endif
#elif defined(__SANITIZE_ADDRESS__)
    std::cout << " Running with AddressSanitizer (ASAN)\n";
#elif defined(__SANITIZE_THREAD__)
    std::cout << " Running with ThreadSanitizer (TSAN)\n";
#else
    std::cout << " ⚠️ Running without sanitizers - use 'make stress' for ASAN by default\n";
    std::cout << " Or use: make stress SANITIZER=thread for TSAN\n";
    std::cout << " make stress SANITIZER=none to disable\n";
#endif

    test_loading_performance();
    test_phase_enforcement();
    // test_oversize_allocation();  // disabled — builds a 64MB+1 payload; presumably too heavy to run by default (confirm)
    test_alignment_requests();
    test_phase_separation();
    test_concurrent_search_performance();

    std::cout << "\n✅ All stress tests passed!\n";
    return 0;
}
|