native-vector-store 0.1.0

This diff shows the contents of publicly released package versions as they appear in the supported public registries. It is provided for informational purposes only.
@@ -0,0 +1,401 @@
+ #pragma once
+ #include <atomic>
+ #include <memory>
+ #include <cstring>
+ #include <cmath>
+ #include <vector>
+ #include <string_view>
+ #include <simdjson.h>
+ #include <omp.h>
+ #include <mutex>
+ #include <shared_mutex>
+ #include <cassert>
+ #include <algorithm>
+ #include <functional>
+
+ class ArenaAllocator {
+   static constexpr size_t CHUNK_SIZE = 1 << 26; // 64MB chunks
+   struct Chunk {
+     alignas(64) char data[CHUNK_SIZE];
+     std::atomic<size_t> offset{0};
+     std::atomic<Chunk*> next{nullptr};
+   };
+
+   std::unique_ptr<Chunk> head_;
+   std::atomic<Chunk*> current_;
+   std::mutex chunk_creation_mutex_;
+
+ public:
+   ArenaAllocator() : head_(std::make_unique<Chunk>()),
+                      current_(head_.get()) {}
+
+   void* allocate(size_t size, size_t align = 64) {
+     // Validate alignment is power of 2 and reasonable
+     assert(align > 0 && (align & (align - 1)) == 0);
+     if (align > 4096) {
+       return nullptr; // Alignment too large
+     }
+
+     // Validate size
+     if (size > CHUNK_SIZE) {
+       return nullptr; // Cannot allocate larger than chunk size
+     }
+
+     Chunk* chunk = current_.load(std::memory_order_acquire);
+     while (true) {
+       size_t old_offset = chunk->offset.load(std::memory_order_relaxed);
+
+       // Calculate the pointer that would result from current offset
+       void* ptr = chunk->data + old_offset;
+
+       // Calculate how much padding we need for alignment
+       size_t misalignment = (uintptr_t)ptr & (align - 1);
+       size_t padding = misalignment ? (align - misalignment) : 0;
+
+       size_t aligned_offset = old_offset + padding;
+       size_t new_offset = aligned_offset + size;
+
+       if (new_offset > CHUNK_SIZE) {
+         // Need new chunk
+         Chunk* next = chunk->next.load(std::memory_order_acquire);
+         if (!next) {
+           // Lock to prevent multiple threads creating chunks
+           std::lock_guard<std::mutex> lock(chunk_creation_mutex_);
+           // Double-check after acquiring lock
+           next = chunk->next.load(std::memory_order_acquire);
+           if (!next) {
+             auto new_chunk = std::make_unique<Chunk>();
+             next = new_chunk.get();
+             chunk->next.store(next, std::memory_order_release);
+             // Transfer ownership after setting atomic pointer
+             new_chunk.release();
+           }
+         }
+         // Update current to the new chunk
+         current_.store(next, std::memory_order_release);
+         chunk = next;
+         continue;
+       }
+
+       if (chunk->offset.compare_exchange_weak(old_offset, new_offset,
+                                               std::memory_order_release,
+                                               std::memory_order_relaxed)) {
+         return chunk->data + aligned_offset;
+       }
+     }
+   }
+
+   ~ArenaAllocator() {
+     // Clean up linked chunks
+     Chunk* chunk = head_->next.load(std::memory_order_acquire);
+     while (chunk) {
+       Chunk* next = chunk->next.load(std::memory_order_acquire);
+       delete chunk;
+       chunk = next;
+     }
+   }
+ };
+
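For orientation, here is a minimal usage sketch of the allocator above; it is not part of the package, and the function name and sizes are illustrative. Each call bump-allocates out of the current 64MB chunk, a new chunk is chained on when the current one is full, and individual allocations are never freed: everything lives until the allocator itself is destroyed.

void arena_usage_sketch() {
  ArenaAllocator arena;

  // Default 64-byte alignment suits SIMD loads over float embeddings.
  float* embedding = static_cast<float*>(arena.allocate(1536 * sizeof(float)));

  // Byte-aligned string storage can share the same chunk.
  char* id_buf = static_cast<char*>(arena.allocate(32, /*align=*/1));

  // Requests larger than one chunk (or with alignment above 4096) return nullptr.
  void* too_big = arena.allocate(size_t{1} << 27);

  (void)embedding; (void)id_buf; (void)too_big;
}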
+ struct Document {
+   std::string_view id;
+   std::string_view text;
+   std::string_view metadata_json; // Full JSON including embedding
+ };
+
+ // Per-thread top-k tracker for thread-safe parallel search
+ struct TopK {
+   size_t k;
+   std::vector<std::pair<float, size_t>> heap; // min-heap by score
+
+   explicit TopK(size_t k = 0) : k(k) {
+     heap.reserve(k + 1); // Reserve k+1 to avoid reallocation during push
+   }
+
+   // Make TopK move-only to prevent copy-construction races
+   TopK(const TopK&) = delete;
+   TopK& operator=(const TopK&) = delete;
+   TopK(TopK&&) = default;
+   TopK& operator=(TopK&&) = default;
+
+   void push(float score, size_t idx) {
+     if (heap.size() < k) {
+       heap.emplace_back(score, idx);
+       std::push_heap(heap.begin(), heap.end(), cmp);
+     } else if (k > 0 && score > heap.front().first) {
+       // Replace the minimum element
+       std::pop_heap(heap.begin(), heap.end(), cmp);
+       heap.back() = {score, idx};
+       std::push_heap(heap.begin(), heap.end(), cmp);
+     }
+   }
+
+   // Comparator for min-heap (greater than for min-heap behavior)
+   static bool cmp(const std::pair<float, size_t>& a, const std::pair<float, size_t>& b) {
+     return a.first > b.first;
+   }
+
+   void merge(const TopK& other) {
+     // More efficient: if we have space, bulk insert then re-heapify
+     if (heap.size() + other.heap.size() <= k) {
+       heap.insert(heap.end(), other.heap.begin(), other.heap.end());
+       std::make_heap(heap.begin(), heap.end(), cmp);
+     } else {
+       // Otherwise, insert one by one
+       for (const auto& [score, idx] : other.heap) {
+         push(score, idx);
+       }
+     }
+   }
+ };
+
+
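To make the TopK contract concrete, a small sketch (not from the package; the function name and scores are illustrative): heap.front() always holds the weakest retained score, a new candidate only displaces it when it scores higher, and merge() folds one tracker into another, which is how the per-thread results are combined after the parallel scan in search() below.

void topk_sketch() {
  TopK best(2);          // keep the two highest scores
  best.push(0.10f, 0);
  best.push(0.90f, 1);
  best.push(0.50f, 2);   // 0.50 > 0.10, so the 0.10 entry is evicted

  TopK rest(2);
  rest.push(0.70f, 3);
  best.merge(rest);      // best now holds (0.90, 1) and (0.70, 3)
}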
+ class VectorStore {
+   const size_t dim_;
+   ArenaAllocator arena_;
+
+   struct Entry {
+     Document doc;
+     float* embedding; // Extracted pointer for fast access
+   };
+
+   std::vector<Entry> entries_;
+   std::atomic<size_t> count_{0}; // Atomic for parallel loading
+   std::atomic<bool> is_finalized_{false}; // Simple flag: false = loading, true = serving
+   mutable std::shared_mutex search_mutex_; // Protects against overlapping OpenMP teams
+
+ public:
+   explicit VectorStore(size_t dim) : dim_(dim) {
+     entries_.resize(1'000'000); // Pre-size with default-constructed entries
+   }
+
+   // Overload for document type (used in test_main.cpp)
+   simdjson::error_code add_document(simdjson::ondemand::document& json_doc) {
+     simdjson::ondemand::object obj;
+     auto error = json_doc.get_object().get(obj);
+     if (error) {
+       return error;
+     }
+     return add_document(obj);
+   }
+
+   simdjson::error_code add_document(simdjson::ondemand::object& json_doc) {
+     // Cannot add documents after finalization
+     if (is_finalized_.load(std::memory_order_acquire)) {
+       return simdjson::INCORRECT_TYPE;
+     }
+
+     // Parse with error handling
+     std::string_view id, text;
+     auto error = json_doc["id"].get_string().get(id);
+     if (error) return error;
+
+     error = json_doc["text"].get_string().get(text);
+     if (error) return error;
+
+     // Calculate sizes
+     size_t emb_size = dim_ * sizeof(float);
+     size_t id_size = id.size() + 1;
+     size_t text_size = text.size() + 1;
+
+     // Allocate temporary buffer for embedding
+     std::vector<float> temp_embedding;
+     temp_embedding.reserve(dim_);
+
+     // Process metadata and embedding first
+     simdjson::ondemand::object metadata;
+     error = json_doc["metadata"].get_object().get(metadata);
+     if (error) return error;
+
+     simdjson::ondemand::array emb_array;
+     error = metadata["embedding"].get_array().get(emb_array);
+     if (error) return error;
+
+     // Consume the array before touching anything else
+     size_t i = 0;
+     for (auto value_result : emb_array) {
+       simdjson::ondemand::value v;
+       error = value_result.get(v);
+       if (error) return error;
+       double val;
+       error = v.get_double().get(val);
+       if (error) return error;
+
+       if (i >= dim_) {
+         return simdjson::CAPACITY; // Too many embedding values
+       }
+       temp_embedding.push_back(float(val));
+       i++;
+     }
+
+     // Verify we got the expected number of embedding values
+     if (i != dim_) {
+       return simdjson::INCORRECT_TYPE; // Wrong embedding dimension
+     }
+
+     // Now it is safe to take the raw metadata JSON
+     std::string_view raw_json;
+     error = metadata.raw_json().get(raw_json);
+     if (error) return error;
+     size_t meta_size = raw_json.size() + 1;
+
+     // Single arena allocation
+     char* base = (char*)arena_.allocate(emb_size + id_size + text_size + meta_size);
+     if (!base) {
+       return simdjson::MEMALLOC; // Allocation failed
+     }
+
+     // Layout: [embedding][id][text][metadata_json]
+     float* emb_ptr = (float*)base;
+     char* id_ptr = base + emb_size;
+     char* text_ptr = id_ptr + id_size;
+     char* meta_ptr = text_ptr + text_size;
+
+     // Copy embedding from temporary buffer
+     std::memcpy(emb_ptr, temp_embedding.data(), emb_size);
+
+     // Copy strings (adding null terminator)
+     std::memcpy(id_ptr, id.data(), id.size());
+     id_ptr[id.size()] = '\0';
+
+     std::memcpy(text_ptr, text.data(), text.size());
+     text_ptr[text.size()] = '\0';
+
+     std::memcpy(meta_ptr, raw_json.data(), raw_json.size());
+     meta_ptr[raw_json.size()] = '\0';
+
+     // Atomic increment for parallel loading
+     size_t idx = count_.fetch_add(1, std::memory_order_relaxed);
+
+     // Bounds check
+     if (idx >= entries_.size()) {
+       count_.fetch_sub(1, std::memory_order_relaxed);
+       return simdjson::CAPACITY;
+     }
+
+     // Construct entry directly - no synchronization needed
+     entries_[idx] = Entry{
+       .doc = Document{
+         .id = std::string_view(id_ptr, id.size()),
+         .text = std::string_view(text_ptr, text.size()),
+         .metadata_json = std::string_view(meta_ptr, raw_json.size())
+       },
+       .embedding = emb_ptr
+     };
+
+     return simdjson::SUCCESS;
+   }
+
+   // Finalize the store: normalize and switch to serving phase
+   void finalize() {
+     // If already finalized, do nothing
+     if (is_finalized_.load(std::memory_order_acquire)) {
+       return;
+     }
+
+     // Get final count
+     size_t final_count = count_.load(std::memory_order_acquire);
+
+     // Normalize all embeddings (single-threaded, no races)
+     for (size_t i = 0; i < final_count; ++i) {
+       float* emb = entries_[i].embedding;
+       if (!emb) continue; // Skip uninitialized entries
+
+       float sum = 0.0f;
+       #pragma omp simd reduction(+:sum)
+       for (size_t j = 0; j < dim_; ++j) {
+         sum += emb[j] * emb[j];
+       }
+
+       if (sum > 1e-10f) { // Avoid division by zero
+         float inv_norm = 1.0f / std::sqrt(sum);
+         #pragma omp simd
+         for (size_t j = 0; j < dim_; ++j) {
+           emb[j] *= inv_norm;
+         }
+       }
+     }
+
+     // Ensure all threads see the normalized data
+     #pragma omp barrier
+
+     // Mark as finalized - this is the ONLY place this flag is set
+     is_finalized_.store(true, std::memory_order_seq_cst);
+   }
+
+   // Deprecated: use finalize() instead
+   void normalize_all() {
+     finalize();
+   }
+
+   std::vector<std::pair<float, size_t>>
+   search(const float* query, size_t k) const {
+     // Exclusive lock: prevent overlapping OpenMP teams
+     // Since each search uses all threads via OpenMP, concurrent searches provide no benefit
+     std::unique_lock<std::shared_mutex> lock(search_mutex_);
+
+     // Search can ONLY run if finalized
+     if (!is_finalized_.load(std::memory_order_acquire)) {
+       return {};
+     }
+
+     size_t n = count_.load(std::memory_order_acquire);
+     if (n == 0 || k == 0) return {};
+
+     k = std::min(k, n); // Ensure k doesn't exceed count
+
+     // Always use per-thread heaps to avoid any shared memory races
+     const int num_threads = omp_get_max_threads();
+     std::vector<TopK> thread_heaps;
+     thread_heaps.reserve(num_threads);
+     for (int i = 0; i < num_threads; ++i) {
+       thread_heaps.emplace_back(k); // in-place construction, no copies
+     }
+
+     std::vector<std::pair<float, std::size_t>> result;
+
+     #pragma omp parallel
+     {
+       const int tid = omp_get_thread_num();
+       TopK& local_heap = thread_heaps[tid];
+
+       #pragma omp for // default barrier kept - ensures all threads finish before merge
+       for (size_t i = 0; i < n; ++i) {
+         float score = 0.0f;
+         const float* emb = entries_[i].embedding;
+
+         #pragma omp simd reduction(+:score)
+         for (size_t j = 0; j < dim_; ++j) {
+           score += emb[j] * query[j];
+         }
+
+         local_heap.push(score, i);
+       }
+
+       #pragma omp barrier
+       #pragma omp single
+       {
+         TopK final_heap(k);
+         for (auto& th : thread_heaps) final_heap.merge(th);
+         result = std::move(final_heap.heap);
+       }
+     }
+
+     std::sort(result.begin(), result.end(),
+               [](const auto& a, const auto& b) { return a.first > b.first; });
+
+     return result;
+   }
+
+   const Entry& get_entry(size_t idx) const {
+     return entries_[idx];
+   }
+
+   size_t size() const {
+     return count_.load(std::memory_order_acquire);
+   }
+
+   bool is_finalized() const {
+     return is_finalized_.load(std::memory_order_acquire);
+   }
+ };
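Pulling the pieces together, the following end-to-end sketch (not shipped with the package; the three-dimensional vectors and document values are illustrative) shows the two-phase life cycle: documents carry their embedding under metadata.embedding, finalize() L2-normalizes every stored vector, and search() ranks by dot product, so the returned scores are cosine similarities when the query is normalized as well. Leaving the query unnormalized scales all scores equally and does not change the ranking.

#include <cmath>
#include <cstdio>
#include <string>
#include "vector_store.h"

int main() {
  VectorStore store(3); // toy dimension; real embeddings are much wider

  // Shape expected by add_document(): "id", "text", and a "metadata" object
  // whose "embedding" array holds exactly dim values.
  simdjson::padded_string json(std::string(R"({
    "id": "doc-1",
    "text": "hello world",
    "metadata": { "embedding": [0.1, 0.2, 0.3], "source": "example" }
  })"));

  simdjson::ondemand::parser parser;
  simdjson::ondemand::document doc;
  if (parser.iterate(json).get(doc)) return 1;
  if (store.add_document(doc) != simdjson::SUCCESS) return 1;

  store.finalize(); // normalizes embeddings and switches the store to serving mode

  // Normalize the query so the returned scores are cosine similarities.
  float query[3] = {0.1f, 0.2f, 0.3f};
  float norm = std::sqrt(query[0] * query[0] + query[1] * query[1] + query[2] * query[2]);
  for (float& q : query) q /= norm;

  auto results = store.search(query, 5); // k is clamped to the number of documents
  for (const auto& [score, idx] : results) {
    const auto& entry = store.get_entry(idx);
    std::printf("%.3f  %.*s\n", score, (int)entry.doc.id.size(), entry.doc.id.data());
  }
  return 0;
}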
@@ -0,0 +1,176 @@
+ #include "vector_store_loader.h"
+ #include "atomic_queue.h"
+ #include <filesystem>
+ #include <fstream>
+ #include <thread>
+ #include <vector>
+ #include <atomic>
+ #include <cctype>
+ #include <cstdio> // fprintf
+
+ void VectorStoreLoader::loadDirectory(VectorStore* store, const std::string& path) {
+   // Cannot load if already finalized
+   if (store->is_finalized()) {
+     return;
+   }
+
+   // Collect all JSON files
+   std::vector<std::filesystem::path> json_files;
+   for (const auto& entry : std::filesystem::directory_iterator(path)) {
+     if (entry.path().extension() == ".json") {
+       json_files.push_back(entry.path());
+     }
+   }
+
+   if (json_files.empty()) {
+     store->finalize();
+     return;
+   }
+
+   // Producer-consumer queue for file data
+   struct FileData {
+     std::string filename;
+     std::string content;
+   };
+
+   // Queue with bounded capacity (at most 1024 file payloads buffered at once)
+   atomic_queue::AtomicQueue<FileData*, 1024> queue;
+
+   // Atomic flags for coordination
+   std::atomic<bool> producer_done{false};
+   std::atomic<size_t> files_processed{0};
+
+   // Producer thread - sequential file reading with optimizations
+   std::thread producer([&]() {
+     // Reusable buffer to avoid repeated allocations
+     std::vector<char> buffer;
+     // Start with reasonable capacity, will grow as needed
+     buffer.reserve(1024 * 1024); // 1MB initial capacity
+
+     for (const auto& filepath : json_files) {
+       // Get file size without opening (one syscall)
+       std::error_code ec;
+       auto size = std::filesystem::file_size(filepath, ec);
+       if (ec) {
+         fprintf(stderr, "Error getting size of %s: %s\n",
+                 filepath.c_str(), ec.message().c_str());
+         continue;
+       }
+
+       // Open file for reading
+       std::ifstream file(filepath, std::ios::binary);
+       if (!file) {
+         fprintf(stderr, "Error opening %s\n", filepath.c_str());
+         continue;
+       }
+
+       // Ensure buffer has enough capacity
+       if (size > buffer.capacity()) {
+         buffer.reserve(size);
+       }
+       // Resize buffer to exact size needed
+       buffer.resize(size);
+
+       // Read directly into buffer
+       if (!file.read(buffer.data(), size)) {
+         fprintf(stderr, "Error reading %s\n", filepath.c_str());
+         continue;
+       }
+
+       // Create file data, copying the buffer contents into an owned string
+       auto* data = new FileData{
+         filepath.string(),
+         std::string(buffer.begin(), buffer.end())
+       };
+       queue.push(data);
+     }
+     producer_done = true;
+   });
+
+   // Consumer threads - parallel JSON parsing
+   size_t num_workers = std::thread::hardware_concurrency();
+   std::vector<std::thread> consumers;
+
+   for (size_t w = 0; w < num_workers; ++w) {
+     consumers.emplace_back([&]() {
+       // Each thread needs its own parser
+       simdjson::ondemand::parser doc_parser;
+       FileData* data = nullptr;
+
+       while (true) {
+         // Snapshot the done flag before polling the queue: if the producer had
+         // already finished at this point, a failed pop below means the queue is
+         // truly drained and it is safe to exit.
+         bool done = producer_done.load(std::memory_order_acquire);
+
+         // Try to get work from queue
+         if (queue.try_pop(data)) {
+           // Process the file
+           simdjson::padded_string json(data->content);
+
+           // Check if it's an array or object
+           const char* json_start = json.data();
+           while (json_start && *json_start &&
+                  std::isspace(static_cast<unsigned char>(*json_start))) {
+             json_start++;
+           }
+           bool is_array = (json_start && *json_start == '[');
+
+           simdjson::ondemand::document doc;
+           auto error = doc_parser.iterate(json).get(doc);
+           if (error) {
+             fprintf(stderr, "Error parsing %s: %s\n",
+                     data->filename.c_str(), simdjson::error_message(error));
+             delete data;
+             continue;
+           }
+
+           if (is_array) {
+             // Process as array
+             simdjson::ondemand::array arr;
+             error = doc.get_array().get(arr);
+             if (error) {
+               fprintf(stderr, "Error getting array from %s: %s\n",
+                       data->filename.c_str(), simdjson::error_message(error));
+               delete data;
+               continue;
+             }
+
+             for (auto doc_element : arr) {
+               simdjson::ondemand::object obj;
+               error = doc_element.get_object().get(obj);
+               if (!error) {
+                 auto add_error = store->add_document(obj);
+                 if (add_error) {
+                   fprintf(stderr, "Error adding document from %s: %s\n",
+                           data->filename.c_str(), simdjson::error_message(add_error));
+                 }
+               }
+             }
+           } else {
+             // Process as single document
+             simdjson::ondemand::object obj;
+             error = doc.get_object().get(obj);
+             if (!error) {
+               auto add_error = store->add_document(obj);
+               if (add_error) {
+                 fprintf(stderr, "Error adding document from %s: %s\n",
+                         data->filename.c_str(), simdjson::error_message(add_error));
+               }
+             }
+           }
+
+           delete data;
+           files_processed++;
+
+         } else if (done) {
+           // Producer finished before the failed pop, so no more work is coming
+           break;
+         } else {
+           // Queue is empty but producer might add more
+           std::this_thread::yield();
+         }
+       }
+     });
+   }
+
+   // Wait for all threads to complete
+   producer.join();
+   for (auto& consumer : consumers) {
+     consumer.join();
+   }
+
+   // Finalize after batch load - normalize and switch to serving phase
+   store->finalize();
+ }
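For completeness, a short sketch of driving the loader above (not part of the package; the function name, directory, and dimension are placeholders): loadDirectory() reads every .json file in the directory on one producer thread, parses and inserts documents on hardware_concurrency() consumer threads, and finalizes the store before returning, so the store is immediately searchable.

#include <vector>
#include "vector_store.h"
#include "vector_store_loader.h"

int run_loader_sketch() {
  VectorStore store(1536); // must match the dimension of the stored embeddings
  VectorStoreLoader::loadDirectory(&store, "./corpus"); // placeholder directory of .json files

  if (store.size() == 0) {
    return 1; // nothing loaded (no .json files, parse errors, or wrong dimension)
  }

  std::vector<float> query(1536, 0.0f); // fill with a real query embedding in practice
  auto top = store.search(query.data(), 10);
  return top.empty() ? 1 : 0;
}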
@@ -0,0 +1,19 @@
+ #pragma once
+ #include "vector_store.h"
+ #include <string>
+
+ // Clean interface for loading documents from a directory
+ class VectorStoreLoader {
+ public:
+   // Load all JSON files from a directory into the vector store
+   // Automatically calls finalize() when complete
+   static void loadDirectory(VectorStore* store, const std::string& path);
+
+   // Memory-mapped version for better performance with large files
+   // Uses zero-copy I/O via mmap/MapViewOfFile
+   static void loadDirectoryMMap(VectorStore* store, const std::string& path);
+
+   // Adaptive loader that chooses the best method per file
+   // Uses mmap for files <5MB, standard loading for larger files
+   static void loadDirectoryAdaptive(VectorStore* store, const std::string& path);
+ };