native-vector-store 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +223 -0
- package/binding.gyp +45 -0
- package/index.js +3 -0
- package/lib/index.d.ts +60 -0
- package/native-vector-store-0.1.0.tgz +0 -0
- package/package.json +52 -0
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/scripts/build-prebuilds.sh +23 -0
- package/src/atomic_queue.h +646 -0
- package/src/binding.cc +151 -0
- package/src/defs.h +107 -0
- package/src/mmap_file.h +159 -0
- package/src/vector_store.h +401 -0
- package/src/vector_store_loader.cpp +176 -0
- package/src/vector_store_loader.h +19 -0
- package/src/vector_store_loader_adaptive.cpp +220 -0
- package/src/vector_store_loader_mmap.cpp +154 -0
@@ -0,0 +1,220 @@
|
|
1
|
+
#include "vector_store_loader.h"
|
2
|
+
#include "mmap_file.h"
|
3
|
+
#include "atomic_queue.h"
|
4
|
+
#include <filesystem>
|
5
|
+
#include <fstream>
|
6
|
+
#include <thread>
|
7
|
+
#include <vector>
|
8
|
+
#include <atomic>
|
9
|
+
#include <cctype>
|
10
|
+
#include <memory>
|
11
|
+
|
12
|
+
// Adaptive loader that chooses the best method per file
|
13
|
+
// Adaptive loader that chooses the best method per file: files below a size
// threshold are memory-mapped, larger files are read with ifstream into a
// reusable buffer; a pool of consumer threads parses the JSON in parallel
// and feeds documents into the store, which is finalized at the end.
//
// @param store  destination store; must not be finalized yet (no-op if it is)
// @param path   directory scanned (non-recursively) for *.json files
void VectorStoreLoader::loadDirectoryAdaptive(VectorStore* store, const std::string& path) {
    // Cannot load if already finalized
    if (store->is_finalized()) {
        return;
    }

    // File size threshold (5MB - files larger than this use standard loading)
    constexpr size_t SIZE_THRESHOLD = 5 * 1024 * 1024;

    // Collect and categorize files
    struct FileInfo {
        std::filesystem::path path;
        size_t size;
        bool use_mmap;
    };

    std::vector<FileInfo> file_infos;

    for (const auto& entry : std::filesystem::directory_iterator(path)) {
        if (entry.path().extension() == ".json") {
            std::error_code ec;
            auto size = std::filesystem::file_size(entry.path(), ec);
            if (!ec) {
                file_infos.push_back({
                    entry.path(),
                    size,
                    size < SIZE_THRESHOLD  // Use mmap for smaller files
                });
            }
        }
    }

    if (file_infos.empty()) {
        store->finalize();
        return;
    }

    // Producer-consumer queue payload for mixed file data
    struct MixedFileData {
        std::string filename;            // narrow path for logging
        std::unique_ptr<MMapFile> mmap;  // set when loaded via mmap
        std::string content;             // set when loaded via ifstream
        bool is_mmap;
    };

    // Queue with bounded capacity; ownership of the pointer transfers to
    // whichever consumer pops it.
    atomic_queue::AtomicQueue<MixedFileData*, 1024> queue;

    // Atomic flags for coordination
    std::atomic<bool> producer_done{false};
    std::atomic<size_t> files_processed{0};
    std::atomic<size_t> mmap_count{0};
    std::atomic<size_t> standard_count{0};

    // Producer thread - loads files using appropriate method
    std::thread producer([&]() {
        // Reusable buffer for standard loading (cleared capacity is kept)
        std::vector<char> buffer;
        buffer.reserve(1024 * 1024);  // Reserve 1MB initial capacity

        for (const auto& file_info : file_infos) {
            // unique_ptr so every error-path `continue` below frees the
            // payload automatically (the original leaked on manual-delete
            // slips; this makes leaks impossible by construction).
            auto data = std::make_unique<MixedFileData>();
            data->filename = file_info.path.string();
            data->is_mmap = file_info.use_mmap;

            if (file_info.use_mmap) {
                // Memory map smaller files
                auto mmap = std::make_unique<MMapFile>();

                // Log with the narrow string: path::c_str() yields wchar_t*
                // on Windows, which is UB with a %s format specifier.
                if (!mmap->open(data->filename)) {
                    fprintf(stderr, "Error mapping file %s\n", data->filename.c_str());
                    continue;
                }

                data->mmap = std::move(mmap);
                mmap_count++;

            } else {
                // Standard load for larger files
                // Ensure buffer has enough capacity
                if (file_info.size > buffer.capacity()) {
                    buffer.reserve(file_info.size);
                }
                buffer.resize(file_info.size);

                std::ifstream file(file_info.path, std::ios::binary);
                if (!file) {
                    fprintf(stderr, "Error opening %s\n", data->filename.c_str());
                    continue;
                }

                if (!file.read(buffer.data(), file_info.size)) {
                    fprintf(stderr, "Error reading %s\n", data->filename.c_str());
                    continue;
                }

                data->content = std::string(buffer.begin(), buffer.end());
                standard_count++;
            }

            // Transfer ownership to the queue (a consumer will delete it)
            queue.push(data.release());
        }

        // Publish completion AFTER the final push (release pairs with the
        // consumers' acquire load) so "flag set before a failed pop" proves
        // the queue is drained for good.
        producer_done.store(true, std::memory_order_release);

        // Log loading stats
        fprintf(stderr, "Adaptive loader: %zu files via mmap, %zu files via standard\n",
                mmap_count.load(), standard_count.load());
    });

    // Consumer threads - parallel JSON parsing.
    // hardware_concurrency() may legally return 0; with zero consumers the
    // producer would fill the queue and wedge, so clamp to at least one.
    size_t num_workers = std::thread::hardware_concurrency();
    if (num_workers == 0) {
        num_workers = 1;
    }
    std::vector<std::thread> consumers;

    for (size_t w = 0; w < num_workers; ++w) {
        consumers.emplace_back([&]() {
            // Each thread needs its own parser
            simdjson::ondemand::parser doc_parser;
            MixedFileData* raw = nullptr;

            while (true) {
                // Read the done-flag BEFORE attempting the pop. The original
                // checked it after a failed pop, which races: the producer
                // can push its last item and set the flag in between, making
                // every consumer break with that item still enqueued. If the
                // flag was set before a failed pop, no further push can ever
                // happen, so breaking is safe.
                const bool done = producer_done.load(std::memory_order_acquire);

                if (!queue.try_pop(raw)) {
                    if (done) {
                        break;  // queue drained and producer finished
                    }
                    // Queue is empty but producer might add more
                    std::this_thread::yield();
                    continue;
                }

                // Take ownership: every exit path below now frees the payload
                std::unique_ptr<MixedFileData> data(raw);

                // Copy into a padded_string so simdjson has its required
                // padding regardless of how the bytes were loaded
                simdjson::padded_string json = data->is_mmap
                    ? simdjson::padded_string(data->mmap->data(), data->mmap->size())
                    : simdjson::padded_string(data->content);

                // Peek at the first non-space byte to decide array vs object.
                // Cast to unsigned char: passing a negative char (non-ASCII
                // byte) to isspace() is undefined behavior.
                const char* json_start = json.data();
                while (json_start && *json_start &&
                       std::isspace(static_cast<unsigned char>(*json_start))) {
                    json_start++;
                }
                bool is_array = (json_start && *json_start == '[');

                simdjson::ondemand::document doc;
                auto error = doc_parser.iterate(json).get(doc);
                if (error) {
                    fprintf(stderr, "Error parsing %s: %s\n",
                            data->filename.c_str(), simdjson::error_message(error));
                    continue;
                }

                if (is_array) {
                    // Process as array of documents
                    simdjson::ondemand::array arr;
                    error = doc.get_array().get(arr);
                    if (error) {
                        fprintf(stderr, "Error getting array from %s: %s\n",
                                data->filename.c_str(), simdjson::error_message(error));
                        continue;
                    }

                    for (auto doc_element : arr) {
                        simdjson::ondemand::object obj;
                        error = doc_element.get_object().get(obj);
                        if (!error) {
                            auto add_error = store->add_document(obj);
                            if (add_error) {
                                fprintf(stderr, "Error adding document from %s: %s\n",
                                        data->filename.c_str(), simdjson::error_message(add_error));
                            }
                        }
                    }
                } else {
                    // Process as single document
                    simdjson::ondemand::object obj;
                    error = doc.get_object().get(obj);
                    if (!error) {
                        auto add_error = store->add_document(obj);
                        if (add_error) {
                            fprintf(stderr, "Error adding document from %s: %s\n",
                                    data->filename.c_str(), simdjson::error_message(add_error));
                        }
                    }
                }

                files_processed++;
            }
        });
    }

    // Wait for all threads to complete
    producer.join();
    for (auto& consumer : consumers) {
        consumer.join();
    }

    // Finalize after batch load - normalize and switch to serving phase
    store->finalize();
}
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#include "vector_store_loader.h"
|
2
|
+
#include "mmap_file.h"
|
3
|
+
#include "atomic_queue.h"
|
4
|
+
#include <filesystem>
|
5
|
+
#include <thread>
|
6
|
+
#include <vector>
|
7
|
+
#include <atomic>
|
8
|
+
#include <cctype>
|
9
|
+
#include <memory>
|
10
|
+
|
11
|
+
// Memory-mapped version of loadDirectory for better performance
|
12
|
+
// Memory-mapped version of loadDirectory for better performance: a producer
// thread mmaps every *.json file in the directory and hands them to a pool
// of consumer threads that parse the JSON in parallel and add documents to
// the store, which is finalized at the end.
//
// @param store  destination store; must not be finalized yet (no-op if it is)
// @param path   directory scanned (non-recursively) for *.json files
void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string& path) {
    // Cannot load if already finalized
    if (store->is_finalized()) {
        return;
    }

    // Collect all JSON files
    std::vector<std::filesystem::path> json_files;
    for (const auto& entry : std::filesystem::directory_iterator(path)) {
        if (entry.path().extension() == ".json") {
            json_files.push_back(entry.path());
        }
    }

    if (json_files.empty()) {
        store->finalize();
        return;
    }

    // Producer-consumer queue payload for memory-mapped files
    struct MMapFileData {
        std::string filename;  // narrow path for logging
        std::unique_ptr<MMapFile> mmap;
    };

    // Queue with bounded capacity; ownership of the pointer transfers to
    // whichever consumer pops it.
    atomic_queue::AtomicQueue<MMapFileData*, 1024> queue;

    // Atomic flags for coordination
    std::atomic<bool> producer_done{false};
    std::atomic<size_t> files_processed{0};

    // Producer thread - memory maps files
    std::thread producer([&]() {
        for (const auto& filepath : json_files) {
            // unique_ptr so the error-path `continue` frees the payload;
            // narrow string for logging (path::c_str() is wchar_t* on
            // Windows, which is UB with %s).
            auto data = std::make_unique<MMapFileData>();
            data->filename = filepath.string();

            auto mmap = std::make_unique<MMapFile>();
            if (!mmap->open(data->filename)) {
                fprintf(stderr, "Error mapping file %s\n", data->filename.c_str());
                continue;
            }
            data->mmap = std::move(mmap);

            // Transfer ownership to the queue (a consumer will delete it)
            queue.push(data.release());
        }
        // Publish completion AFTER the final push (release pairs with the
        // consumers' acquire load) so "flag set before a failed pop" proves
        // the queue is drained for good.
        producer_done.store(true, std::memory_order_release);
    });

    // Consumer threads - parallel JSON parsing.
    // hardware_concurrency() may legally return 0; with zero consumers the
    // producer would fill the queue and wedge, so clamp to at least one.
    size_t num_workers = std::thread::hardware_concurrency();
    if (num_workers == 0) {
        num_workers = 1;
    }
    std::vector<std::thread> consumers;

    for (size_t w = 0; w < num_workers; ++w) {
        consumers.emplace_back([&]() {
            // Each thread needs its own parser
            simdjson::ondemand::parser doc_parser;
            MMapFileData* raw = nullptr;

            while (true) {
                // Read the done-flag BEFORE attempting the pop. The original
                // checked it after a failed pop, which races: the producer
                // can push its last item and set the flag in between, making
                // every consumer break with that item still enqueued. If the
                // flag was set before a failed pop, no further push can ever
                // happen, so breaking is safe.
                const bool done = producer_done.load(std::memory_order_acquire);

                if (!queue.try_pop(raw)) {
                    if (done) {
                        break;  // queue drained and producer finished
                    }
                    // Queue is empty but producer might add more
                    std::this_thread::yield();
                    continue;
                }

                // Take ownership: every exit path below now frees the payload
                std::unique_ptr<MMapFileData> data(raw);

                // Process the memory-mapped file.
                // For mmap, we need to copy to ensure simdjson's padding.
                simdjson::padded_string json(data->mmap->data(), data->mmap->size());

                // Peek at the first non-space byte to decide array vs object.
                // Cast to unsigned char: passing a negative char (non-ASCII
                // byte) to isspace() is undefined behavior.
                const char* json_start = data->mmap->data();
                while (json_start && *json_start &&
                       std::isspace(static_cast<unsigned char>(*json_start))) {
                    json_start++;
                }
                bool is_array = (json_start && *json_start == '[');

                simdjson::ondemand::document doc;
                auto error = doc_parser.iterate(json).get(doc);
                if (error) {
                    fprintf(stderr, "Error parsing %s: %s\n",
                            data->filename.c_str(), simdjson::error_message(error));
                    continue;
                }

                if (is_array) {
                    // Process as array of documents
                    simdjson::ondemand::array arr;
                    error = doc.get_array().get(arr);
                    if (error) {
                        fprintf(stderr, "Error getting array from %s: %s\n",
                                data->filename.c_str(), simdjson::error_message(error));
                        continue;
                    }

                    for (auto doc_element : arr) {
                        simdjson::ondemand::object obj;
                        error = doc_element.get_object().get(obj);
                        if (!error) {
                            auto add_error = store->add_document(obj);
                            if (add_error) {
                                fprintf(stderr, "Error adding document from %s: %s\n",
                                        data->filename.c_str(), simdjson::error_message(add_error));
                            }
                        }
                    }
                } else {
                    // Process as single document
                    simdjson::ondemand::object obj;
                    error = doc.get_object().get(obj);
                    if (!error) {
                        auto add_error = store->add_document(obj);
                        if (add_error) {
                            fprintf(stderr, "Error adding document from %s: %s\n",
                                    data->filename.c_str(), simdjson::error_message(add_error));
                        }
                    }
                }

                files_processed++;
            }
        });
    }

    // Wait for all threads to complete
    producer.join();
    for (auto& consumer : consumers) {
        consumer.join();
    }

    // Finalize after batch load - normalize and switch to serving phase
    store->finalize();
}
|