native-vector-store 0.1.0

This diff shows the contents of publicly released package versions as they appear in the supported public registries. It is provided for informational purposes only.
@@ -0,0 +1,401 @@
+ #pragma once
+ #include <atomic>
+ #include <memory>
+ #include <cstring>
+ #include <cmath>
+ #include <vector>
+ #include <string_view>
+ #include <simdjson.h>
+ #include <omp.h>
+ #include <mutex>
+ #include <shared_mutex>
+ #include <cassert>
+ #include <algorithm>
+ #include <functional>
+
+ class ArenaAllocator {
+   static constexpr size_t CHUNK_SIZE = 1 << 26; // 64MB chunks
+   struct Chunk {
+     alignas(64) char data[CHUNK_SIZE];
+     std::atomic<size_t> offset{0};
+     std::atomic<Chunk*> next{nullptr};
+   };
+
+   std::unique_ptr<Chunk> head_;
+   std::atomic<Chunk*> current_;
+   std::mutex chunk_creation_mutex_;
+
+ public:
+   ArenaAllocator() : head_(std::make_unique<Chunk>()),
+                      current_(head_.get()) {}
+
+   void* allocate(size_t size, size_t align = 64) {
+     // Validate alignment is power of 2 and reasonable
+     assert(align > 0 && (align & (align - 1)) == 0);
+     if (align > 4096) {
+       return nullptr; // Alignment too large
+     }
+
+     // Validate size
+     if (size > CHUNK_SIZE) {
+       return nullptr; // Cannot allocate larger than chunk size
+     }
+
+     Chunk* chunk = current_.load(std::memory_order_acquire);
+     while (true) {
+       size_t old_offset = chunk->offset.load(std::memory_order_relaxed);
+
+       // Calculate the pointer that would result from current offset
+       void* ptr = chunk->data + old_offset;
+
+       // Calculate how much padding we need for alignment
+       size_t misalignment = (uintptr_t)ptr & (align - 1);
+       size_t padding = misalignment ? (align - misalignment) : 0;
+
+       size_t aligned_offset = old_offset + padding;
+       size_t new_offset = aligned_offset + size;
+
+       if (new_offset > CHUNK_SIZE) {
+         // Need new chunk
+         Chunk* next = chunk->next.load(std::memory_order_acquire);
+         if (!next) {
+           // Lock to prevent multiple threads creating chunks
+           std::lock_guard<std::mutex> lock(chunk_creation_mutex_);
+           // Double-check after acquiring lock
+           next = chunk->next.load(std::memory_order_acquire);
+           if (!next) {
+             auto new_chunk = std::make_unique<Chunk>();
+             next = new_chunk.get();
+             chunk->next.store(next, std::memory_order_release);
+             // Transfer ownership after setting atomic pointer
+             new_chunk.release();
+           }
+         }
+         // Update current to the new chunk
+         current_.store(next, std::memory_order_release);
+         chunk = next;
+         continue;
+       }
+
+       if (chunk->offset.compare_exchange_weak(old_offset, new_offset,
+                                               std::memory_order_release,
+                                               std::memory_order_relaxed)) {
+         return chunk->data + aligned_offset;
+       }
+     }
+   }
+
+   ~ArenaAllocator() {
+     // Clean up linked chunks
+     Chunk* chunk = head_->next.load(std::memory_order_acquire);
+     while (chunk) {
+       Chunk* next = chunk->next.load(std::memory_order_acquire);
+       delete chunk;
+       chunk = next;
+     }
+   }
+ };
+
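For orientation, here is a minimal usage sketch of the allocator above; it is not part of the package, and the function name and sizes are illustrative. Each call bump-allocates out of the current 64MB chunk, a new chunk is chained on when the current one is full, and individual allocations are never freed: everything lives until the allocator itself is destroyed.

void arena_usage_sketch() {
  ArenaAllocator arena;

  // Default 64-byte alignment suits SIMD loads over float embeddings.
  float* embedding = static_cast<float*>(arena.allocate(1536 * sizeof(float)));

  // Byte-aligned string storage can share the same chunk.
  char* id_buf = static_cast<char*>(arena.allocate(32, /*align=*/1));

  // Requests larger than one chunk (or with alignment above 4096) return nullptr.
  void* too_big = arena.allocate(size_t{1} << 27);

  (void)embedding; (void)id_buf; (void)too_big;
}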
+ struct Document {
+   std::string_view id;
+   std::string_view text;
+   std::string_view metadata_json; // Full JSON including embedding
+ };
+
+ // Per-thread top-k tracker for thread-safe parallel search
+ struct TopK {
+   size_t k;
+   std::vector<std::pair<float, size_t>> heap; // min-heap by score
+
+   explicit TopK(size_t k = 0) : k(k) {
+     heap.reserve(k + 1); // Reserve k+1 to avoid reallocation during push
+   }
+
+   // Make TopK move-only to prevent copy-construction races
+   TopK(const TopK&) = delete;
+   TopK& operator=(const TopK&) = delete;
+   TopK(TopK&&) = default;
+   TopK& operator=(TopK&&) = default;
+
+   void push(float score, size_t idx) {
+     if (heap.size() < k) {
+       heap.emplace_back(score, idx);
+       std::push_heap(heap.begin(), heap.end(), cmp);
+     } else if (k > 0 && score > heap.front().first) {
+       // Replace the minimum element
+       std::pop_heap(heap.begin(), heap.end(), cmp);
+       heap.back() = {score, idx};
+       std::push_heap(heap.begin(), heap.end(), cmp);
+     }
+   }
+
+   // Comparator for min-heap (greater than for min-heap behavior)
+   static bool cmp(const std::pair<float, size_t>& a, const std::pair<float, size_t>& b) {
+     return a.first > b.first;
+   }
+
+   void merge(const TopK& other) {
+     // More efficient: if we have space, bulk insert then re-heapify
+     if (heap.size() + other.heap.size() <= k) {
+       heap.insert(heap.end(), other.heap.begin(), other.heap.end());
+       std::make_heap(heap.begin(), heap.end(), cmp);
+     } else {
+       // Otherwise, insert one by one
+       for (const auto& [score, idx] : other.heap) {
+         push(score, idx);
+       }
+     }
+   }
+ };
+
+
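To make the TopK contract concrete, a small sketch (not from the package; the function name and scores are illustrative): heap.front() always holds the weakest retained score, a new candidate only displaces it when it scores higher, and merge() folds one tracker into another, which is how the per-thread results are combined after the parallel scan in search() below.

void topk_sketch() {
  TopK best(2);          // keep the two highest scores
  best.push(0.10f, 0);
  best.push(0.90f, 1);
  best.push(0.50f, 2);   // 0.50 > 0.10, so the 0.10 entry is evicted

  TopK rest(2);
  rest.push(0.70f, 3);
  best.merge(rest);      // best now holds (0.90, 1) and (0.70, 3)
}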
+ class VectorStore {
+   const size_t dim_;
+   ArenaAllocator arena_;
+
+   struct Entry {
+     Document doc;
+     float* embedding; // Extracted pointer for fast access
+   };
+
+   std::vector<Entry> entries_;
+   std::atomic<size_t> count_{0}; // Atomic for parallel loading
+   std::atomic<bool> is_finalized_{false}; // Simple flag: false = loading, true = serving
+   mutable std::shared_mutex search_mutex_; // Protects against overlapping OpenMP teams
+
+ public:
+   explicit VectorStore(size_t dim) : dim_(dim) {
+     entries_.resize(1'000'000); // Pre-size with default-constructed entries
+   }
+
+   // Overload for document type (used in test_main.cpp)
+   simdjson::error_code add_document(simdjson::ondemand::document& json_doc) {
+     simdjson::ondemand::object obj;
+     auto error = json_doc.get_object().get(obj);
+     if (error) {
+       return error;
+     }
+     return add_document(obj);
+   }
+
+   simdjson::error_code add_document(simdjson::ondemand::object& json_doc) {
+     // Cannot add documents after finalization
+     if (is_finalized_.load(std::memory_order_acquire)) {
+       return simdjson::INCORRECT_TYPE;
+     }
+
+     // Parse with error handling
+     std::string_view id, text;
+     auto error = json_doc["id"].get_string().get(id);
+     if (error) return error;
+
+     error = json_doc["text"].get_string().get(text);
+     if (error) return error;
+
+     // Calculate sizes
+     size_t emb_size = dim_ * sizeof(float);
+     size_t id_size = id.size() + 1;
+     size_t text_size = text.size() + 1;
+
+     // Allocate temporary buffer for embedding
+     std::vector<float> temp_embedding;
+     temp_embedding.reserve(dim_);
+
+     // Process metadata and embedding first
+     simdjson::ondemand::object metadata;
+     error = json_doc["metadata"].get_object().get(metadata);
+     if (error) return error;
+
+     simdjson::ondemand::array emb_array;
+     error = metadata["embedding"].get_array().get(emb_array);
+     if (error) return error;
+
+     // Consume the array before touching anything else
+     size_t i = 0;
+     for (auto value_result : emb_array) {
+       simdjson::ondemand::value v;
+       error = value_result.get(v);
+       if (error) return error;
+       double val;
+       error = v.get_double().get(val);
+       if (error) return error;
+
+       if (i >= dim_) {
+         return simdjson::CAPACITY; // Too many embedding values
+       }
+       temp_embedding.push_back(float(val));
+       i++;
+     }
+
+     // Verify we got the expected number of embedding values
+     if (i != dim_) {
+       return simdjson::INCORRECT_TYPE; // Wrong embedding dimension
+     }
+
+     // Now it is safe to take the raw metadata JSON
+     std::string_view raw_json;
+     error = metadata.raw_json().get(raw_json);
+     if (error) return error;
+     size_t meta_size = raw_json.size() + 1;
+
+     // Single arena allocation
+     char* base = (char*)arena_.allocate(emb_size + id_size + text_size + meta_size);
+     if (!base) {
+       return simdjson::MEMALLOC; // Allocation failed
+     }
+
+     // Layout: [embedding][id][text][metadata_json]
+     float* emb_ptr = (float*)base;
+     char* id_ptr = base + emb_size;
+     char* text_ptr = id_ptr + id_size;
+     char* meta_ptr = text_ptr + text_size;
+
+     // Copy embedding from temporary buffer
+     std::memcpy(emb_ptr, temp_embedding.data(), emb_size);
+
+     // Copy strings (adding null terminator)
+     std::memcpy(id_ptr, id.data(), id.size());
+     id_ptr[id.size()] = '\0';
+
+     std::memcpy(text_ptr, text.data(), text.size());
+     text_ptr[text.size()] = '\0';
+
+     std::memcpy(meta_ptr, raw_json.data(), raw_json.size());
+     meta_ptr[raw_json.size()] = '\0';
+
+     // Atomic increment for parallel loading
+     size_t idx = count_.fetch_add(1, std::memory_order_relaxed);
+
+     // Bounds check
+     if (idx >= entries_.size()) {
+       count_.fetch_sub(1, std::memory_order_relaxed);
+       return simdjson::CAPACITY;
+     }
+
+     // Construct entry directly - no synchronization needed
+     entries_[idx] = Entry{
+       .doc = Document{
+         .id = std::string_view(id_ptr, id.size()),
+         .text = std::string_view(text_ptr, text.size()),
+         .metadata_json = std::string_view(meta_ptr, raw_json.size())
+       },
+       .embedding = emb_ptr
+     };
+
+     return simdjson::SUCCESS;
+   }
+
+   // Finalize the store: normalize and switch to serving phase
+   void finalize() {
+     // If already finalized, do nothing
+     if (is_finalized_.load(std::memory_order_acquire)) {
+       return;
+     }
+
+     // Get final count
+     size_t final_count = count_.load(std::memory_order_acquire);
+
+     // Normalize all embeddings (single-threaded, no races)
+     for (size_t i = 0; i < final_count; ++i) {
+       float* emb = entries_[i].embedding;
+       if (!emb) continue; // Skip uninitialized entries
+
+       float sum = 0.0f;
+       #pragma omp simd reduction(+:sum)
+       for (size_t j = 0; j < dim_; ++j) {
+         sum += emb[j] * emb[j];
+       }
+
+       if (sum > 1e-10f) { // Avoid division by zero
+         float inv_norm = 1.0f / std::sqrt(sum);
+         #pragma omp simd
+         for (size_t j = 0; j < dim_; ++j) {
+           emb[j] *= inv_norm;
+         }
+       }
+     }
+
+     // Ensure all threads see the normalized data
+     #pragma omp barrier
+
+     // Mark as finalized - this is the ONLY place this flag is set
+     is_finalized_.store(true, std::memory_order_seq_cst);
+   }
+
+   // Deprecated: use finalize() instead
+   void normalize_all() {
+     finalize();
+   }
+
+   std::vector<std::pair<float, size_t>>
+   search(const float* query, size_t k) const {
+     // Exclusive lock: prevent overlapping OpenMP teams
+     // Since each search uses all threads via OpenMP, concurrent searches provide no benefit
+     std::unique_lock<std::shared_mutex> lock(search_mutex_);
+
+     // Search can ONLY run if finalized
+     if (!is_finalized_.load(std::memory_order_acquire)) {
+       return {};
+     }
+
+     size_t n = count_.load(std::memory_order_acquire);
+     if (n == 0 || k == 0) return {};
+
+     k = std::min(k, n); // Ensure k doesn't exceed count
+
+     // Always use per-thread heaps to avoid any shared memory races
+     const int num_threads = omp_get_max_threads();
+     std::vector<TopK> thread_heaps;
+     thread_heaps.reserve(num_threads);
+     for (int i = 0; i < num_threads; ++i) {
+       thread_heaps.emplace_back(k); // in-place construction, no copies
+     }
+
+     std::vector<std::pair<float, std::size_t>> result;
+
+     #pragma omp parallel
+     {
+       const int tid = omp_get_thread_num();
+       TopK& local_heap = thread_heaps[tid];
+
+       #pragma omp for // default barrier kept - ensures all threads finish before merge
+       for (size_t i = 0; i < n; ++i) {
+         float score = 0.0f;
+         const float* emb = entries_[i].embedding;
+
+         #pragma omp simd reduction(+:score)
+         for (size_t j = 0; j < dim_; ++j) {
+           score += emb[j] * query[j];
+         }
+
+         local_heap.push(score, i);
+       }
+
+       #pragma omp barrier
+       #pragma omp single
+       {
+         TopK final_heap(k);
+         for (auto& th : thread_heaps) final_heap.merge(th);
+         result = std::move(final_heap.heap);
+       }
+     }
+
+     std::sort(result.begin(), result.end(),
+               [](const auto& a, const auto& b) { return a.first > b.first; });
+
+     return result;
+   }
+
+   const Entry& get_entry(size_t idx) const {
+     return entries_[idx];
+   }
+
+   size_t size() const {
+     return count_.load(std::memory_order_acquire);
+   }
+
+   bool is_finalized() const {
+     return is_finalized_.load(std::memory_order_acquire);
+   }
+ };
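Pulling the pieces together, the following end-to-end sketch (not shipped with the package; the three-dimensional vectors and document values are illustrative) shows the two-phase life cycle: documents carry their embedding under metadata.embedding, finalize() L2-normalizes every stored vector, and search() ranks by dot product, so the returned scores are cosine similarities when the query is normalized as well. Leaving the query unnormalized scales all scores equally and does not change the ranking.

#include <cmath>
#include <cstdio>
#include <string>
#include "vector_store.h"

int main() {
  VectorStore store(3); // toy dimension; real embeddings are much wider

  // Shape expected by add_document(): "id", "text", and a "metadata" object
  // whose "embedding" array holds exactly dim values.
  simdjson::padded_string json(std::string(R"({
    "id": "doc-1",
    "text": "hello world",
    "metadata": { "embedding": [0.1, 0.2, 0.3], "source": "example" }
  })"));

  simdjson::ondemand::parser parser;
  simdjson::ondemand::document doc;
  if (parser.iterate(json).get(doc)) return 1;
  if (store.add_document(doc) != simdjson::SUCCESS) return 1;

  store.finalize(); // normalizes embeddings and switches the store to serving mode

  // Normalize the query so the returned scores are cosine similarities.
  float query[3] = {0.1f, 0.2f, 0.3f};
  float norm = std::sqrt(query[0] * query[0] + query[1] * query[1] + query[2] * query[2]);
  for (float& q : query) q /= norm;

  auto results = store.search(query, 5); // k is clamped to the number of documents
  for (const auto& [score, idx] : results) {
    const auto& entry = store.get_entry(idx);
    std::printf("%.3f  %.*s\n", score, (int)entry.doc.id.size(), entry.doc.id.data());
  }
  return 0;
}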
@@ -0,0 +1,176 @@
+ #include "vector_store_loader.h"
+ #include "atomic_queue.h"
+ #include <filesystem>
+ #include <fstream>
+ #include <thread>
+ #include <vector>
+ #include <atomic>
+ #include <cctype>
+ #include <cstdio> // fprintf
+
+ void VectorStoreLoader::loadDirectory(VectorStore* store, const std::string& path) {
+   // Cannot load if already finalized
+   if (store->is_finalized()) {
+     return;
+   }
+
+   // Collect all JSON files
+   std::vector<std::filesystem::path> json_files;
+   for (const auto& entry : std::filesystem::directory_iterator(path)) {
+     if (entry.path().extension() == ".json") {
+       json_files.push_back(entry.path());
+     }
+   }
+
+   if (json_files.empty()) {
+     store->finalize();
+     return;
+   }
+
+   // Producer-consumer queue for file data
+   struct FileData {
+     std::string filename;
+     std::string content;
+   };
+
+   // Queue with bounded capacity (at most 1024 file payloads buffered at once)
+   atomic_queue::AtomicQueue<FileData*, 1024> queue;
+
+   // Atomic flags for coordination
+   std::atomic<bool> producer_done{false};
+   std::atomic<size_t> files_processed{0};
+
+   // Producer thread - sequential file reading with optimizations
+   std::thread producer([&]() {
+     // Reusable buffer to avoid repeated allocations
+     std::vector<char> buffer;
+     // Start with reasonable capacity, will grow as needed
+     buffer.reserve(1024 * 1024); // 1MB initial capacity
+
+     for (const auto& filepath : json_files) {
+       // Get file size without opening (one syscall)
+       std::error_code ec;
+       auto size = std::filesystem::file_size(filepath, ec);
+       if (ec) {
+         fprintf(stderr, "Error getting size of %s: %s\n",
+                 filepath.c_str(), ec.message().c_str());
+         continue;
+       }
+
+       // Open file for reading
+       std::ifstream file(filepath, std::ios::binary);
+       if (!file) {
+         fprintf(stderr, "Error opening %s\n", filepath.c_str());
+         continue;
+       }
+
+       // Ensure buffer has enough capacity
+       if (size > buffer.capacity()) {
+         buffer.reserve(size);
+       }
+       // Resize buffer to exact size needed
+       buffer.resize(size);
+
+       // Read directly into buffer
+       if (!file.read(buffer.data(), size)) {
+         fprintf(stderr, "Error reading %s\n", filepath.c_str());
+         continue;
+       }
+
+       // Create file data, copying the buffer contents into an owned string
+       auto* data = new FileData{
+         filepath.string(),
+         std::string(buffer.begin(), buffer.end())
+       };
+       queue.push(data);
+     }
+     producer_done = true;
+   });
+
+   // Consumer threads - parallel JSON parsing
+   size_t num_workers = std::thread::hardware_concurrency();
+   std::vector<std::thread> consumers;
+
+   for (size_t w = 0; w < num_workers; ++w) {
+     consumers.emplace_back([&]() {
+       // Each thread needs its own parser
+       simdjson::ondemand::parser doc_parser;
+       FileData* data = nullptr;
+
+       while (true) {
+         // Snapshot the done flag before polling the queue: if the producer had
+         // already finished at this point, a failed pop below means the queue is
+         // truly drained and it is safe to exit.
+         bool done = producer_done.load(std::memory_order_acquire);
+
+         // Try to get work from queue
+         if (queue.try_pop(data)) {
+           // Process the file
+           simdjson::padded_string json(data->content);
+
+           // Check if it's an array or object
+           const char* json_start = json.data();
+           while (json_start && *json_start &&
+                  std::isspace(static_cast<unsigned char>(*json_start))) {
+             json_start++;
+           }
+           bool is_array = (json_start && *json_start == '[');
+
+           simdjson::ondemand::document doc;
+           auto error = doc_parser.iterate(json).get(doc);
+           if (error) {
+             fprintf(stderr, "Error parsing %s: %s\n",
+                     data->filename.c_str(), simdjson::error_message(error));
+             delete data;
+             continue;
+           }
+
+           if (is_array) {
+             // Process as array
+             simdjson::ondemand::array arr;
+             error = doc.get_array().get(arr);
+             if (error) {
+               fprintf(stderr, "Error getting array from %s: %s\n",
+                       data->filename.c_str(), simdjson::error_message(error));
+               delete data;
+               continue;
+             }
+
+             for (auto doc_element : arr) {
+               simdjson::ondemand::object obj;
+               error = doc_element.get_object().get(obj);
+               if (!error) {
+                 auto add_error = store->add_document(obj);
+                 if (add_error) {
+                   fprintf(stderr, "Error adding document from %s: %s\n",
+                           data->filename.c_str(), simdjson::error_message(add_error));
+                 }
+               }
+             }
+           } else {
+             // Process as single document
+             simdjson::ondemand::object obj;
+             error = doc.get_object().get(obj);
+             if (!error) {
+               auto add_error = store->add_document(obj);
+               if (add_error) {
+                 fprintf(stderr, "Error adding document from %s: %s\n",
+                         data->filename.c_str(), simdjson::error_message(add_error));
+               }
+             }
+           }
+
+           delete data;
+           files_processed++;
+
+         } else if (done) {
+           // Producer finished before the failed pop, so no more work is coming
+           break;
+         } else {
+           // Queue is empty but producer might add more
+           std::this_thread::yield();
+         }
+       }
+     });
+   }
+
+   // Wait for all threads to complete
+   producer.join();
+   for (auto& consumer : consumers) {
+     consumer.join();
+   }
+
+   // Finalize after batch load - normalize and switch to serving phase
+   store->finalize();
+ }
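For completeness, a short sketch of driving the loader above (not part of the package; the function name, directory, and dimension are placeholders): loadDirectory() reads every .json file in the directory on one producer thread, parses and inserts documents on hardware_concurrency() consumer threads, and finalizes the store before returning, so the store is immediately searchable.

#include <vector>
#include "vector_store.h"
#include "vector_store_loader.h"

int run_loader_sketch() {
  VectorStore store(1536); // must match the dimension of the stored embeddings
  VectorStoreLoader::loadDirectory(&store, "./corpus"); // placeholder directory of .json files

  if (store.size() == 0) {
    return 1; // nothing loaded (no .json files, parse errors, or wrong dimension)
  }

  std::vector<float> query(1536, 0.0f); // fill with a real query embedding in practice
  auto top = store.search(query.data(), 10);
  return top.empty() ? 1 : 0;
}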
@@ -0,0 +1,19 @@
+ #pragma once
+ #include "vector_store.h"
+ #include <string>
+
+ // Clean interface for loading documents from a directory
+ class VectorStoreLoader {
+ public:
+   // Load all JSON files from a directory into the vector store
+   // Automatically calls finalize() when complete
+   static void loadDirectory(VectorStore* store, const std::string& path);
+
+   // Memory-mapped version for better performance with large files
+   // Uses zero-copy I/O via mmap/MapViewOfFile
+   static void loadDirectoryMMap(VectorStore* store, const std::string& path);
+
+   // Adaptive loader that chooses the best method per file
+   // Uses mmap for files <5MB, standard loading for larger files
+   static void loadDirectoryAdaptive(VectorStore* store, const std::string& path);
+ };