native-vector-store 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,221 @@
+ #include "vector_store_loader.h"
+ #include "mmap_file.h"
+ #include "atomic_queue.h"
+ #include <filesystem>
+ #include <fstream>
+ #include <thread>
+ #include <vector>
+ #include <atomic>
+ #include <cctype>
+ #include <cstdio>
+ #include <memory>
+
+ // Adaptive loader that chooses the best method per file
+ void VectorStoreLoader::loadDirectoryAdaptive(VectorStore* store, const std::string& path) {
+     // Cannot load if already finalized
+     if (store->is_finalized()) {
+         return;
+     }
+
+     // File size threshold (5MB - files larger than this use standard loading)
+     constexpr size_t SIZE_THRESHOLD = 5 * 1024 * 1024;
+
+     // Collect and categorize files
+     struct FileInfo {
+         std::filesystem::path path;
+         size_t size;
+         bool use_mmap;
+     };
+
+     std::vector<FileInfo> file_infos;
+
+     for (const auto& entry : std::filesystem::directory_iterator(path)) {
+         if (entry.path().extension() == ".json") {
+             std::error_code ec;
+             auto size = std::filesystem::file_size(entry.path(), ec);
+             if (!ec) {
+                 file_infos.push_back({
+                     entry.path(),
+                     size,
+                     size < SIZE_THRESHOLD // Use mmap for smaller files
+                 });
+             }
+         }
+     }
+
+     if (file_infos.empty()) {
+         store->finalize();
+         return;
+     }
+
+     // Producer-consumer queue for mixed file data
+     struct MixedFileData {
+         std::string filename;
+         std::unique_ptr<MMapFile> mmap; // For mmap files
+         std::string content; // For standard loaded files
+         bool is_mmap;
+     };
+
+     // Queue with bounded capacity
+     atomic_queue::AtomicQueue<MixedFileData*, 1024> queue;
+
+     // Atomic flags for coordination
+     std::atomic<bool> producer_done{false};
+     std::atomic<size_t> files_processed{0};
+     std::atomic<size_t> mmap_count{0};
+     std::atomic<size_t> standard_count{0};
+
+     // Producer thread - loads files using appropriate method
+     std::thread producer([&]() {
+         // Reusable buffer for standard loading
+         std::vector<char> buffer;
+         buffer.reserve(1024 * 1024); // Reserve 1MB initial capacity
+
+         for (const auto& file_info : file_infos) {
+             auto* data = new MixedFileData{
+                 file_info.path.string(),
+                 nullptr,
+                 "",
+                 file_info.use_mmap
+             };
+
+             if (file_info.use_mmap) {
+                 // Memory map smaller files
+                 auto mmap = std::make_unique<MMapFile>();
+
+                 if (!mmap->open(file_info.path.string())) {
+                     fprintf(stderr, "Error mapping file %s\n", file_info.path.c_str());
+                     delete data;
+                     continue;
+                 }
+
+                 data->mmap = std::move(mmap);
+                 mmap_count++;
+
+             } else {
+                 // Standard load for larger files
+                 // Ensure buffer has enough capacity
+                 if (file_info.size > buffer.capacity()) {
+                     buffer.reserve(file_info.size);
+                 }
+                 buffer.resize(file_info.size);
+
+                 std::ifstream file(file_info.path, std::ios::binary);
+                 if (!file) {
+                     fprintf(stderr, "Error opening %s\n", file_info.path.c_str());
+                     delete data;
+                     continue;
+                 }
+
+                 if (!file.read(buffer.data(), file_info.size)) {
+                     fprintf(stderr, "Error reading %s\n", file_info.path.c_str());
+                     delete data;
+                     continue;
+                 }
+
+                 data->content = std::string(buffer.begin(), buffer.end());
+                 standard_count++;
+             }
+
+             queue.push(data);
+         }
+
+         producer_done = true;
+
+         // Log loading stats
+         fprintf(stderr, "Adaptive loader: %zu files via mmap, %zu files via standard\n",
+                 mmap_count.load(), standard_count.load());
+     });
+
+     // Consumer threads - parallel JSON parsing
+     size_t num_workers = std::thread::hardware_concurrency();
+     std::vector<std::thread> consumers;
+
+     for (size_t w = 0; w < num_workers; ++w) {
+         consumers.emplace_back([&]() {
+             // Each thread needs its own parser
+             simdjson::ondemand::parser doc_parser;
+             MixedFileData* data = nullptr;
+
+             while (true) {
+                 // Try to get work from queue
+                 if (queue.try_pop(data)) {
+                     // Get JSON content based on loading method
+                     simdjson::padded_string json = data->is_mmap
+                         ? simdjson::padded_string(data->mmap->data(), data->mmap->size())
+                         : simdjson::padded_string(data->content);
+
+                     // Check if it's an array or object
+                     const char* json_start = json.data();
+                     while (json_start && *json_start && std::isspace(static_cast<unsigned char>(*json_start))) {
+                         json_start++;
+                     }
+                     bool is_array = (json_start && *json_start == '[');
+
+                     simdjson::ondemand::document doc;
+                     auto error = doc_parser.iterate(json).get(doc);
+                     if (error) {
+                         fprintf(stderr, "Error parsing %s: %s\n",
+                                 data->filename.c_str(), simdjson::error_message(error));
+                         delete data;
+                         continue;
+                     }
+
+                     if (is_array) {
+                         // Process as array
+                         simdjson::ondemand::array arr;
+                         error = doc.get_array().get(arr);
+                         if (error) {
+                             fprintf(stderr, "Error getting array from %s: %s\n",
+                                     data->filename.c_str(), simdjson::error_message(error));
+                             delete data;
+                             continue;
+                         }
+
+                         for (auto doc_element : arr) {
+                             simdjson::ondemand::object obj;
+                             error = doc_element.get_object().get(obj);
+                             if (!error) {
+                                 auto add_error = store->add_document(obj);
+                                 if (add_error) {
+                                     fprintf(stderr, "Error adding document from %s: %s\n",
+                                             data->filename.c_str(), simdjson::error_message(add_error));
+                                 }
+                             }
+                         }
+                     } else {
+                         // Process as single document
+                         simdjson::ondemand::object obj;
+                         error = doc.get_object().get(obj);
+                         if (!error) {
+                             auto add_error = store->add_document(obj);
+                             if (add_error) {
+                                 fprintf(stderr, "Error adding document from %s: %s\n",
+                                         data->filename.c_str(), simdjson::error_message(add_error));
+                             }
+                         }
+                     }
+
+                     delete data;
+                     files_processed++;
+
+                 } else if (producer_done.load()) {
+                     // No more work and producer is done
+                     break;
+                 } else {
+                     // Queue is empty but producer might add more
+                     std::this_thread::yield();
+                 }
+             }
+         });
+     }
+
+     // Wait for all threads to complete
+     producer.join();
+     for (auto& consumer : consumers) {
+         consumer.join();
+     }
+
+     // Finalize after batch load - normalize and switch to serving phase
+     store->finalize();
+ }
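
Both loadDirectoryAdaptive above and the loadDirectoryMMap variant below coordinate shutdown the same way: the producer sets producer_done only after its final push, and a consumer treats a failed try_pop as final only once that flag reads true. The sketch below (illustration only, not code from the package) distills that handshake using the same atomic_queue.h header, with plain integers standing in for file data; the extra drain loop after the flag check is an addition of this sketch, a common way to close the window between a failed pop and the flag becoming visible.

#include "atomic_queue.h"
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    // Bounded lock-free queue, the same queue type the loaders use for handoff.
    atomic_queue::AtomicQueue<int, 1024> queue;
    std::atomic<bool> producer_done{false};
    std::atomic<int> consumed{0};

    std::thread producer([&]() {
        for (int i = 1; i <= 100000; ++i) {
            queue.push(i);        // blocks when the queue is full (backpressure)
        }
        producer_done = true;     // set only after the final push
    });

    unsigned num_workers = std::thread::hardware_concurrency();
    if (num_workers == 0) num_workers = 1;

    std::vector<std::thread> consumers;
    for (unsigned w = 0; w < num_workers; ++w) {
        consumers.emplace_back([&]() {
            int item = 0;
            while (true) {
                if (queue.try_pop(item)) {
                    consumed++;                   // stand-in for parsing a file
                } else if (producer_done.load()) {
                    // Sketch-only addition: drain once more so an item pushed
                    // between the failed try_pop and the flag check is not lost.
                    while (queue.try_pop(item)) {
                        consumed++;
                    }
                    break;
                } else {
                    std::this_thread::yield();    // empty for now; producer still running
                }
            }
        });
    }

    producer.join();
    for (auto& consumer : consumers) {
        consumer.join();
    }
    printf("consumed %d items\n", consumed.load());
    return 0;
}

The bounded capacity (1024 here, as in the loaders) also provides backpressure: push blocks when consumers fall behind, so at most 1024 files are in flight at any moment.
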
@@ -0,0 +1,155 @@
+ #include "vector_store_loader.h"
+ #include "mmap_file.h"
+ #include "atomic_queue.h"
+ #include <filesystem>
+ #include <thread>
+ #include <vector>
+ #include <atomic>
+ #include <cctype>
+ #include <cstdio>
+ #include <memory>
+
+ // Memory-mapped version of loadDirectory for better performance
+ void VectorStoreLoader::loadDirectoryMMap(VectorStore* store, const std::string& path) {
+     // Cannot load if already finalized
+     if (store->is_finalized()) {
+         return;
+     }
+
+     // Collect all JSON files
+     std::vector<std::filesystem::path> json_files;
+     for (const auto& entry : std::filesystem::directory_iterator(path)) {
+         if (entry.path().extension() == ".json") {
+             json_files.push_back(entry.path());
+         }
+     }
+
+     if (json_files.empty()) {
+         store->finalize();
+         return;
+     }
+
+     // Producer-consumer queue for memory-mapped files
+     struct MMapFileData {
+         std::string filename;
+         std::unique_ptr<MMapFile> mmap;
+     };
+
+     // Queue with bounded capacity
+     atomic_queue::AtomicQueue<MMapFileData*, 1024> queue;
+
+     // Atomic flags for coordination
+     std::atomic<bool> producer_done{false};
+     std::atomic<size_t> files_processed{0};
+
+     // Producer thread - memory maps files
+     std::thread producer([&]() {
+         for (const auto& filepath : json_files) {
+             auto mmap = std::make_unique<MMapFile>();
+
+             if (!mmap->open(filepath.string())) {
+                 fprintf(stderr, "Error mapping file %s\n", filepath.c_str());
+                 continue;
+             }
+
+             // Create file data and push to queue
+             auto* data = new MMapFileData{
+                 filepath.string(),
+                 std::move(mmap)
+             };
+             queue.push(data);
+         }
+         producer_done = true;
+     });
+
+     // Consumer threads - parallel JSON parsing
+     size_t num_workers = std::thread::hardware_concurrency();
+     std::vector<std::thread> consumers;
+
+     for (size_t w = 0; w < num_workers; ++w) {
+         consumers.emplace_back([&]() {
+             // Each thread needs its own parser
+             simdjson::ondemand::parser doc_parser;
+             MMapFileData* data = nullptr;
+
+             while (true) {
+                 // Try to get work from queue
+                 if (queue.try_pop(data)) {
+                     // Process the memory-mapped file
+                     // For mmap, we need to copy to ensure padding
+                     simdjson::padded_string json(data->mmap->data(), data->mmap->size());
+
+                     // Check if it's an array or object
+                     const char* json_start = data->mmap->data();
+                     while (json_start && *json_start && std::isspace(static_cast<unsigned char>(*json_start))) {
+                         json_start++;
+                     }
+                     bool is_array = (json_start && *json_start == '[');
+
+                     simdjson::ondemand::document doc;
+                     auto error = doc_parser.iterate(json).get(doc);
+                     if (error) {
+                         fprintf(stderr, "Error parsing %s: %s\n",
+                                 data->filename.c_str(), simdjson::error_message(error));
+                         delete data;
+                         continue;
+                     }
+
+                     if (is_array) {
+                         // Process as array
+                         simdjson::ondemand::array arr;
+                         error = doc.get_array().get(arr);
+                         if (error) {
+                             fprintf(stderr, "Error getting array from %s: %s\n",
+                                     data->filename.c_str(), simdjson::error_message(error));
+                             delete data;
+                             continue;
+                         }
+
+                         for (auto doc_element : arr) {
+                             simdjson::ondemand::object obj;
+                             error = doc_element.get_object().get(obj);
+                             if (!error) {
+                                 auto add_error = store->add_document(obj);
+                                 if (add_error) {
+                                     fprintf(stderr, "Error adding document from %s: %s\n",
+                                             data->filename.c_str(), simdjson::error_message(add_error));
+                                 }
+                             }
+                         }
+                     } else {
+                         // Process as single document
+                         simdjson::ondemand::object obj;
+                         error = doc.get_object().get(obj);
+                         if (!error) {
+                             auto add_error = store->add_document(obj);
+                             if (add_error) {
+                                 fprintf(stderr, "Error adding document from %s: %s\n",
+                                         data->filename.c_str(), simdjson::error_message(add_error));
+                             }
+                         }
+                     }
+
+                     delete data;
+                     files_processed++;
+
+                 } else if (producer_done.load()) {
+                     // No more work and producer is done
+                     break;
+                 } else {
+                     // Queue is empty but producer might add more
+                     std::this_thread::yield();
+                 }
+             }
+         });
+     }
+
+     // Wait for all threads to complete
+     producer.join();
+     for (auto& consumer : consumers) {
+         consumer.join();
+     }
+
+     // Finalize after batch load - normalize and switch to serving phase
+     store->finalize();
+ }
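
Both loaders depend on the MMapFile wrapper declared in mmap_file.h, but only through three calls: open(path), data(), and size(). The package's actual header is not part of this diff; a hypothetical, POSIX-only sketch with that surface is shown below (the real class may differ, for example in error reporting, Windows support, or advisory hints such as madvise).

#include <cstddef>
#include <string>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Hypothetical read-only memory-mapped file exposing the interface the loaders call.
class MMapFile {
public:
    MMapFile() = default;
    ~MMapFile() { close(); }

    MMapFile(const MMapFile&) = delete;
    MMapFile& operator=(const MMapFile&) = delete;

    // Maps the whole file read-only; returns false on any failure.
    bool open(const std::string& path) {
        close();
        int fd = ::open(path.c_str(), O_RDONLY);
        if (fd < 0) return false;

        struct stat st;
        if (fstat(fd, &st) != 0) { ::close(fd); return false; }
        size_ = static_cast<size_t>(st.st_size);

        if (size_ > 0) {
            void* p = mmap(nullptr, size_, PROT_READ, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED) { ::close(fd); size_ = 0; return false; }
            data_ = static_cast<const char*>(p);
        }
        ::close(fd); // the mapping remains valid after the descriptor is closed
        return true;
    }

    const char* data() const { return data_; }
    size_t size() const { return size_; }

    void close() {
        if (data_) {
            munmap(const_cast<char*>(data_), size_);
            data_ = nullptr;
            size_ = 0;
        }
    }

private:
    const char* data_ = nullptr;
    size_t size_ = 0;
};

An RAII wrapper along these lines keeps the mapping alive exactly as long as the MMapFileData entry that owns it, which is why a consumer can safely parse from data() after the producer thread has moved on to the next file.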