logosdb 0.7.8 → 0.7.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -8
- package/binding.gyp +9 -9
- package/deps/core/include/logosdb/logosdb.h +530 -0
- package/deps/core/src/hnsw_index.cpp +345 -0
- package/deps/core/src/hnsw_index.h +77 -0
- package/deps/core/src/logosdb.cpp +781 -0
- package/deps/core/src/metadata.cpp +266 -0
- package/deps/core/src/metadata.h +69 -0
- package/deps/core/src/platform.cpp +342 -0
- package/deps/core/src/platform.h +94 -0
- package/deps/core/src/storage.cpp +764 -0
- package/deps/core/src/storage.h +131 -0
- package/deps/core/src/wal.cpp +570 -0
- package/deps/core/src/wal.h +99 -0
- package/deps/core/third_party/hnswlib/hnswlib/bruteforce.h +163 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswalg.h +1411 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswlib.h +228 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_ip.h +400 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_l2.h +324 -0
- package/deps/core/third_party/hnswlib/hnswlib/stop_condition.h +276 -0
- package/deps/core/third_party/hnswlib/hnswlib/visited_list_pool.h +78 -0
- package/deps/core/third_party/nlohmann/json.hpp +24765 -0
- package/package.json +21 -5
- package/scripts/vendor-core.mjs +66 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
#include "hnsw_index.h"
|
|
2
|
+
|
|
3
|
+
#include <hnswlib/hnswlib/hnswlib.h>
|
|
4
|
+
|
|
5
|
+
#include <cstring>
|
|
6
|
+
#include <fstream>
|
|
7
|
+
|
|
8
|
+
namespace logosdb
|
|
9
|
+
{
|
|
10
|
+
namespace internal
|
|
11
|
+
{
|
|
12
|
+
|
|
13
|
+
/* Index file header for distance metric persistence */
|
|
14
|
+
struct IndexHeader
|
|
15
|
+
{
|
|
16
|
+
char magic[8]; /* "LOGOSDB\0" */
|
|
17
|
+
uint32_t version; /* 1 */
|
|
18
|
+
int32_t distance; /* DistanceMetric value */
|
|
19
|
+
int32_t dim; /* vector dimension */
|
|
20
|
+
uint8_t reserved[12]; /* padding to 32 bytes */
|
|
21
|
+
|
|
22
|
+
IndexHeader()
|
|
23
|
+
{
|
|
24
|
+
std::memset(this, 0, sizeof(*this));
|
|
25
|
+
std::memcpy(magic, "LOGOSDB\0", 8);
|
|
26
|
+
version = 1;
|
|
27
|
+
distance = DIST_IP;
|
|
28
|
+
dim = 0;
|
|
29
|
+
}
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
struct HnswIndex::Impl
|
|
33
|
+
{
|
|
34
|
+
hnswlib::SpaceInterface<float>* space = nullptr;
|
|
35
|
+
hnswlib::HierarchicalNSW<float>* alg = nullptr;
|
|
36
|
+
HnswParams params;
|
|
37
|
+
DistanceMetric distance_metric = DIST_IP;
|
|
38
|
+
|
|
39
|
+
~Impl()
|
|
40
|
+
{
|
|
41
|
+
delete alg;
|
|
42
|
+
delete space;
|
|
43
|
+
}
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
/* Destroying the wrapper releases whatever index is currently open. */
HnswIndex::~HnswIndex()
{
    this->close();
}
|
|
50
|
+
|
|
51
|
+
/*
 * Allocate the hnswlib space matching `dist` for vectors of `dim` floats.
 * DIST_L2 maps to the L2 space; every other value (DIST_IP, DIST_COSINE, or
 * anything unrecognised) maps to the inner-product space.
 * The caller takes ownership of the returned pointer.
 */
static hnswlib::SpaceInterface<float>* create_space(DistanceMetric dist, int dim)
{
    if (dist == DIST_L2)
    {
        return new hnswlib::L2Space(dim);
    }
    return new hnswlib::InnerProductSpace(dim);
}
|
|
63
|
+
|
|
64
|
+
/* Derive the sidecar metadata file name by appending ".meta" to the index path. */
static std::string get_meta_path(const std::string& index_path)
{
    std::string meta_path(index_path);
    meta_path.append(".meta");
    return meta_path;
}
|
|
69
|
+
|
|
70
|
+
bool HnswIndex::open(const std::string& path, const HnswParams& params, std::string& err)
|
|
71
|
+
{
|
|
72
|
+
close();
|
|
73
|
+
path_ = path;
|
|
74
|
+
|
|
75
|
+
impl_ = new Impl();
|
|
76
|
+
impl_->params = params;
|
|
77
|
+
impl_->distance_metric = static_cast<DistanceMetric>(params.distance);
|
|
78
|
+
|
|
79
|
+
// Check for metadata file with stored distance metric
|
|
80
|
+
std::string meta_path = get_meta_path(path);
|
|
81
|
+
std::ifstream meta_in(meta_path, std::ios::binary);
|
|
82
|
+
if (meta_in.good())
|
|
83
|
+
{
|
|
84
|
+
IndexHeader header;
|
|
85
|
+
meta_in.read(reinterpret_cast<char*>(&header), sizeof(header));
|
|
86
|
+
meta_in.close();
|
|
87
|
+
|
|
88
|
+
if (std::memcmp(header.magic, "LOGOSDB\0", 8) == 0 && header.version == 1)
|
|
89
|
+
{
|
|
90
|
+
// Check distance metric compatibility
|
|
91
|
+
DistanceMetric stored_dist = static_cast<DistanceMetric>(header.distance);
|
|
92
|
+
if (stored_dist != impl_->distance_metric)
|
|
93
|
+
{
|
|
94
|
+
err = "distance metric mismatch: stored=" + std::to_string(header.distance) +
|
|
95
|
+
", requested=" + std::to_string(params.distance);
|
|
96
|
+
close();
|
|
97
|
+
return false;
|
|
98
|
+
}
|
|
99
|
+
// Check dimension
|
|
100
|
+
if (header.dim != params.dim)
|
|
101
|
+
{
|
|
102
|
+
err = "dimension mismatch: stored=" + std::to_string(header.dim) +
|
|
103
|
+
", requested=" + std::to_string(params.dim);
|
|
104
|
+
close();
|
|
105
|
+
return false;
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Try loading existing index.
|
|
111
|
+
std::ifstream in(path, std::ios::binary);
|
|
112
|
+
if (in.good() && in.peek() != std::ifstream::traits_type::eof())
|
|
113
|
+
{
|
|
114
|
+
in.close();
|
|
115
|
+
try
|
|
116
|
+
{
|
|
117
|
+
impl_->space = create_space(impl_->distance_metric, params.dim);
|
|
118
|
+
impl_->alg = new hnswlib::HierarchicalNSW<float>(
|
|
119
|
+
impl_->space, path, false, params.max_elements, true);
|
|
120
|
+
impl_->alg->setEf(params.ef_search);
|
|
121
|
+
return true;
|
|
122
|
+
}
|
|
123
|
+
catch (const std::exception& e)
|
|
124
|
+
{
|
|
125
|
+
err = std::string("hnsw load: ") + e.what();
|
|
126
|
+
close();
|
|
127
|
+
return false;
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
in.close();
|
|
131
|
+
|
|
132
|
+
// Create new index
|
|
133
|
+
try
|
|
134
|
+
{
|
|
135
|
+
impl_->space = create_space(impl_->distance_metric, params.dim);
|
|
136
|
+
impl_->alg = new hnswlib::HierarchicalNSW<float>(
|
|
137
|
+
impl_->space, params.max_elements, params.M, params.ef_construction);
|
|
138
|
+
impl_->alg->setEf(params.ef_search);
|
|
139
|
+
}
|
|
140
|
+
catch (const std::exception& e)
|
|
141
|
+
{
|
|
142
|
+
err = std::string("hnsw init: ") + e.what();
|
|
143
|
+
close();
|
|
144
|
+
return false;
|
|
145
|
+
}
|
|
146
|
+
return true;
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
void HnswIndex::close()
|
|
150
|
+
{
|
|
151
|
+
delete impl_;
|
|
152
|
+
impl_ = nullptr;
|
|
153
|
+
path_.clear();
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/*
 * Scale `vec` (an array of `dim` floats) in place to unit Euclidean length,
 * as needed for cosine similarity.
 *
 * The sum of squares is accumulated in double precision: squaring in float
 * overflows to infinity once a component exceeds roughly 1.8e19, which
 * would collapse the whole vector to zeros, and a float accumulator also
 * loses precision for high-dimensional inputs.
 *
 * A vector whose norm is zero (or not greater than zero, e.g. NaN) is left
 * untouched. Nothing happens when `dim` is zero or negative.
 */
static void l2_normalize(float* vec, int dim)
{
    double sum_sq = 0.0;
    for (int i = 0; i < dim; ++i)
    {
        const double x = vec[i];
        sum_sq += x * x;
    }

    const double norm = std::sqrt(sum_sq);
    if (norm > 0.0)
    {
        for (int i = 0; i < dim; ++i)
        {
            // Divide in double so a norm beyond float range cannot overflow.
            vec[i] = static_cast<float>(vec[i] / norm);
        }
    }
}
|
|
173
|
+
|
|
174
|
+
bool HnswIndex::add(uint64_t label, const float* vec, std::string& err)
|
|
175
|
+
{
|
|
176
|
+
if (!impl_ || !impl_->alg)
|
|
177
|
+
{
|
|
178
|
+
err = "index not open";
|
|
179
|
+
return false;
|
|
180
|
+
}
|
|
181
|
+
try
|
|
182
|
+
{
|
|
183
|
+
if (impl_->distance_metric == DIST_COSINE)
|
|
184
|
+
{
|
|
185
|
+
// Make a copy and normalize for cosine similarity
|
|
186
|
+
std::vector<float> normalized(vec, vec + impl_->params.dim);
|
|
187
|
+
l2_normalize(normalized.data(), impl_->params.dim);
|
|
188
|
+
impl_->alg->addPoint(normalized.data(), (hnswlib::labeltype)label);
|
|
189
|
+
}
|
|
190
|
+
else
|
|
191
|
+
{
|
|
192
|
+
impl_->alg->addPoint(vec, (hnswlib::labeltype)label);
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
catch (const std::exception& e)
|
|
196
|
+
{
|
|
197
|
+
err = std::string("hnsw add: ") + e.what();
|
|
198
|
+
return false;
|
|
199
|
+
}
|
|
200
|
+
return true;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
bool HnswIndex::mark_deleted(uint64_t label, std::string& err)
|
|
204
|
+
{
|
|
205
|
+
if (!impl_ || !impl_->alg)
|
|
206
|
+
{
|
|
207
|
+
err = "index not open";
|
|
208
|
+
return false;
|
|
209
|
+
}
|
|
210
|
+
try
|
|
211
|
+
{
|
|
212
|
+
impl_->alg->markDelete((hnswlib::labeltype)label);
|
|
213
|
+
}
|
|
214
|
+
catch (const std::exception& e)
|
|
215
|
+
{
|
|
216
|
+
err = std::string("hnsw markDelete: ") + e.what();
|
|
217
|
+
return false;
|
|
218
|
+
}
|
|
219
|
+
return true;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
bool HnswIndex::is_deleted(uint64_t label) const
|
|
223
|
+
{
|
|
224
|
+
if (!impl_ || !impl_->alg)
|
|
225
|
+
return false;
|
|
226
|
+
auto& lookup = impl_->alg->label_lookup_;
|
|
227
|
+
std::unique_lock<std::mutex> lk(impl_->alg->label_lookup_lock);
|
|
228
|
+
auto it = lookup.find((hnswlib::labeltype)label);
|
|
229
|
+
if (it == lookup.end())
|
|
230
|
+
return false;
|
|
231
|
+
auto internal_id = it->second;
|
|
232
|
+
lk.unlock();
|
|
233
|
+
return impl_->alg->isMarkedDeleted(internal_id);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
bool HnswIndex::has_label(uint64_t label) const
|
|
237
|
+
{
|
|
238
|
+
if (!impl_ || !impl_->alg)
|
|
239
|
+
return false;
|
|
240
|
+
auto& lookup = impl_->alg->label_lookup_;
|
|
241
|
+
std::unique_lock<std::mutex> lk(impl_->alg->label_lookup_lock);
|
|
242
|
+
return lookup.find((hnswlib::labeltype)label) != lookup.end();
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
std::vector<std::pair<uint64_t, float>>
|
|
246
|
+
HnswIndex::search(const float* query, int top_k, std::string& err) const
|
|
247
|
+
{
|
|
248
|
+
std::vector<std::pair<uint64_t, float>> results;
|
|
249
|
+
if (!impl_ || !impl_->alg)
|
|
250
|
+
{
|
|
251
|
+
err = "index not open";
|
|
252
|
+
return results;
|
|
253
|
+
}
|
|
254
|
+
if (impl_->alg->cur_element_count == 0)
|
|
255
|
+
return results;
|
|
256
|
+
if (top_k <= 0)
|
|
257
|
+
return results;
|
|
258
|
+
|
|
259
|
+
size_t k = std::min((size_t)top_k, (size_t)impl_->alg->cur_element_count);
|
|
260
|
+
|
|
261
|
+
// Prepare query (normalize for cosine)
|
|
262
|
+
std::vector<float> normalized_query;
|
|
263
|
+
const float* search_vec = query;
|
|
264
|
+
if (impl_->distance_metric == DIST_COSINE)
|
|
265
|
+
{
|
|
266
|
+
normalized_query.assign(query, query + impl_->params.dim);
|
|
267
|
+
l2_normalize(normalized_query.data(), impl_->params.dim);
|
|
268
|
+
search_vec = normalized_query.data();
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
try
|
|
272
|
+
{
|
|
273
|
+
auto pq = impl_->alg->searchKnn(search_vec, k);
|
|
274
|
+
results.reserve(pq.size());
|
|
275
|
+
while (!pq.empty())
|
|
276
|
+
{
|
|
277
|
+
auto& top = pq.top();
|
|
278
|
+
float score;
|
|
279
|
+
if (impl_->distance_metric == DIST_L2)
|
|
280
|
+
{
|
|
281
|
+
// L2: distance is already correct, invert so higher = more similar
|
|
282
|
+
score = 1.0f / (1.0f + top.first);
|
|
283
|
+
}
|
|
284
|
+
else
|
|
285
|
+
{
|
|
286
|
+
// IP/Cosine: hnswlib returns (1 - ip) as distance; convert back
|
|
287
|
+
score = 1.0f - top.first;
|
|
288
|
+
}
|
|
289
|
+
results.push_back({(uint64_t)top.second, score});
|
|
290
|
+
pq.pop();
|
|
291
|
+
}
|
|
292
|
+
// Reverse so highest score is first.
|
|
293
|
+
std::reverse(results.begin(), results.end());
|
|
294
|
+
}
|
|
295
|
+
catch (const std::exception& e)
|
|
296
|
+
{
|
|
297
|
+
err = std::string("hnsw search: ") + e.what();
|
|
298
|
+
}
|
|
299
|
+
return results;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
size_t HnswIndex::count() const
|
|
303
|
+
{
|
|
304
|
+
if (!impl_ || !impl_->alg)
|
|
305
|
+
return 0;
|
|
306
|
+
return impl_->alg->cur_element_count;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
bool HnswIndex::save(std::string& err) const
|
|
310
|
+
{
|
|
311
|
+
if (!impl_ || !impl_->alg)
|
|
312
|
+
{
|
|
313
|
+
err = "index not open";
|
|
314
|
+
return false;
|
|
315
|
+
}
|
|
316
|
+
try
|
|
317
|
+
{
|
|
318
|
+
impl_->alg->saveIndex(path_);
|
|
319
|
+
|
|
320
|
+
// Write metadata file with distance metric
|
|
321
|
+
std::string meta_path = get_meta_path(path_);
|
|
322
|
+
std::ofstream meta_out(meta_path, std::ios::binary | std::ios::trunc);
|
|
323
|
+
if (!meta_out.good())
|
|
324
|
+
{
|
|
325
|
+
err = "cannot write index metadata";
|
|
326
|
+
return false;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
IndexHeader header;
|
|
330
|
+
header.distance = impl_->distance_metric;
|
|
331
|
+
header.dim = impl_->params.dim;
|
|
332
|
+
|
|
333
|
+
meta_out.write(reinterpret_cast<const char*>(&header), sizeof(header));
|
|
334
|
+
meta_out.close();
|
|
335
|
+
}
|
|
336
|
+
catch (const std::exception& e)
|
|
337
|
+
{
|
|
338
|
+
err = std::string("hnsw save: ") + e.what();
|
|
339
|
+
return false;
|
|
340
|
+
}
|
|
341
|
+
return true;
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
} // namespace internal
|
|
345
|
+
} // namespace logosdb
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <cstddef>
|
|
4
|
+
#include <cstdint>
|
|
5
|
+
#include <string>
|
|
6
|
+
#include <utility>
|
|
7
|
+
#include <vector>
|
|
8
|
+
|
|
9
|
+
namespace logosdb
|
|
10
|
+
{
|
|
11
|
+
namespace internal
|
|
12
|
+
{
|
|
13
|
+
|
|
14
|
+
/*
 * Distance metrics understood by the index. The numeric values are meant to
 * match the public API constants, so they are spelled out explicitly.
 */
enum DistanceMetric
{
    DIST_IP = 0,     /* inner product */
    DIST_COSINE = 1, /* cosine: inner product on normalized vectors */
    DIST_L2 = 2      /* Euclidean distance */
};
|
|
21
|
+
|
|
22
|
+
struct HnswParams
|
|
23
|
+
{
|
|
24
|
+
int dim = 0;
|
|
25
|
+
size_t max_elements = 1000000;
|
|
26
|
+
int ef_construction = 200;
|
|
27
|
+
int M = 16;
|
|
28
|
+
int ef_search = 50;
|
|
29
|
+
int distance = DIST_IP; /* DIST_IP, DIST_COSINE, or DIST_L2 */
|
|
30
|
+
};
|
|
31
|
+
|
|
32
|
+
// Thin wrapper around hnswlib for inner-product search on L2-normalized vectors.
|
|
33
|
+
class HnswIndex
|
|
34
|
+
{
|
|
35
|
+
public:
|
|
36
|
+
HnswIndex() = default;
|
|
37
|
+
~HnswIndex();
|
|
38
|
+
|
|
39
|
+
HnswIndex(const HnswIndex&) = delete;
|
|
40
|
+
HnswIndex& operator=(const HnswIndex&) = delete;
|
|
41
|
+
|
|
42
|
+
// Create a new index or load an existing one from disk.
|
|
43
|
+
bool open(const std::string& path, const HnswParams& params, std::string& err);
|
|
44
|
+
void close();
|
|
45
|
+
|
|
46
|
+
// Add a vector with the given internal label (row index).
|
|
47
|
+
bool add(uint64_t label, const float* vec, std::string& err);
|
|
48
|
+
|
|
49
|
+
// Mark `label` as deleted. Excluded from subsequent search results.
|
|
50
|
+
// Returns false if the label is not in the index.
|
|
51
|
+
bool mark_deleted(uint64_t label, std::string& err);
|
|
52
|
+
|
|
53
|
+
// True if `label` exists in the index and is currently marked deleted.
|
|
54
|
+
// Returns false if the label is missing.
|
|
55
|
+
bool is_deleted(uint64_t label) const;
|
|
56
|
+
|
|
57
|
+
// True if `label` is currently present (known) in the index, regardless
|
|
58
|
+
// of its deletion state.
|
|
59
|
+
bool has_label(uint64_t label) const;
|
|
60
|
+
|
|
61
|
+
// Search for top-k nearest. Returns (label, distance) pairs sorted by distance desc
|
|
62
|
+
// (inner product — higher is more similar).
|
|
63
|
+
std::vector<std::pair<uint64_t, float>>
|
|
64
|
+
search(const float* query, int top_k, std::string& err) const;
|
|
65
|
+
|
|
66
|
+
size_t count() const;
|
|
67
|
+
|
|
68
|
+
bool save(std::string& err) const;
|
|
69
|
+
|
|
70
|
+
private:
|
|
71
|
+
struct Impl;
|
|
72
|
+
Impl* impl_ = nullptr;
|
|
73
|
+
std::string path_;
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
} // namespace internal
|
|
77
|
+
} // namespace logosdb
|