logosdb 0.7.8 → 0.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,345 @@
1
+ #include "hnsw_index.h"
2
+
3
+ #include <hnswlib/hnswlib/hnswlib.h>
4
+
5
+ #include <cstring>
6
+ #include <fstream>
7
+
8
+ namespace logosdb
9
+ {
10
+ namespace internal
11
+ {
12
+
13
+ /* Index file header for distance metric persistence */
14
+ struct IndexHeader
15
+ {
16
+ char magic[8]; /* "LOGOSDB\0" */
17
+ uint32_t version; /* 1 */
18
+ int32_t distance; /* DistanceMetric value */
19
+ int32_t dim; /* vector dimension */
20
+ uint8_t reserved[12]; /* padding to 32 bytes */
21
+
22
+ IndexHeader()
23
+ {
24
+ std::memset(this, 0, sizeof(*this));
25
+ std::memcpy(magic, "LOGOSDB\0", 8);
26
+ version = 1;
27
+ distance = DIST_IP;
28
+ dim = 0;
29
+ }
30
+ };
31
+
32
+ struct HnswIndex::Impl
33
+ {
34
+ hnswlib::SpaceInterface<float>* space = nullptr;
35
+ hnswlib::HierarchicalNSW<float>* alg = nullptr;
36
+ HnswParams params;
37
+ DistanceMetric distance_metric = DIST_IP;
38
+
39
+ ~Impl()
40
+ {
41
+ delete alg;
42
+ delete space;
43
+ }
44
+ };
45
+
46
+ HnswIndex::~HnswIndex()
47
+ {
48
+ close();
49
+ }
50
+
51
+ static hnswlib::SpaceInterface<float>* create_space(DistanceMetric dist, int dim)
52
+ {
53
+ switch (dist)
54
+ {
55
+ case DIST_L2:
56
+ return new hnswlib::L2Space(dim);
57
+ case DIST_IP:
58
+ case DIST_COSINE:
59
+ default:
60
+ return new hnswlib::InnerProductSpace(dim);
61
+ }
62
+ }
63
+
64
/* Derive the metadata sidecar path: the index path with ".meta" appended. */
static std::string get_meta_path(const std::string& index_path)
{
    std::string meta_path(index_path);
    meta_path += ".meta";
    return meta_path;
}
69
+
70
+ bool HnswIndex::open(const std::string& path, const HnswParams& params, std::string& err)
71
+ {
72
+ close();
73
+ path_ = path;
74
+
75
+ impl_ = new Impl();
76
+ impl_->params = params;
77
+ impl_->distance_metric = static_cast<DistanceMetric>(params.distance);
78
+
79
+ // Check for metadata file with stored distance metric
80
+ std::string meta_path = get_meta_path(path);
81
+ std::ifstream meta_in(meta_path, std::ios::binary);
82
+ if (meta_in.good())
83
+ {
84
+ IndexHeader header;
85
+ meta_in.read(reinterpret_cast<char*>(&header), sizeof(header));
86
+ meta_in.close();
87
+
88
+ if (std::memcmp(header.magic, "LOGOSDB\0", 8) == 0 && header.version == 1)
89
+ {
90
+ // Check distance metric compatibility
91
+ DistanceMetric stored_dist = static_cast<DistanceMetric>(header.distance);
92
+ if (stored_dist != impl_->distance_metric)
93
+ {
94
+ err = "distance metric mismatch: stored=" + std::to_string(header.distance) +
95
+ ", requested=" + std::to_string(params.distance);
96
+ close();
97
+ return false;
98
+ }
99
+ // Check dimension
100
+ if (header.dim != params.dim)
101
+ {
102
+ err = "dimension mismatch: stored=" + std::to_string(header.dim) +
103
+ ", requested=" + std::to_string(params.dim);
104
+ close();
105
+ return false;
106
+ }
107
+ }
108
+ }
109
+
110
+ // Try loading existing index.
111
+ std::ifstream in(path, std::ios::binary);
112
+ if (in.good() && in.peek() != std::ifstream::traits_type::eof())
113
+ {
114
+ in.close();
115
+ try
116
+ {
117
+ impl_->space = create_space(impl_->distance_metric, params.dim);
118
+ impl_->alg = new hnswlib::HierarchicalNSW<float>(
119
+ impl_->space, path, false, params.max_elements, true);
120
+ impl_->alg->setEf(params.ef_search);
121
+ return true;
122
+ }
123
+ catch (const std::exception& e)
124
+ {
125
+ err = std::string("hnsw load: ") + e.what();
126
+ close();
127
+ return false;
128
+ }
129
+ }
130
+ in.close();
131
+
132
+ // Create new index
133
+ try
134
+ {
135
+ impl_->space = create_space(impl_->distance_metric, params.dim);
136
+ impl_->alg = new hnswlib::HierarchicalNSW<float>(
137
+ impl_->space, params.max_elements, params.M, params.ef_construction);
138
+ impl_->alg->setEf(params.ef_search);
139
+ }
140
+ catch (const std::exception& e)
141
+ {
142
+ err = std::string("hnsw init: ") + e.what();
143
+ close();
144
+ return false;
145
+ }
146
+ return true;
147
+ }
148
+
149
+ void HnswIndex::close()
150
+ {
151
+ delete impl_;
152
+ impl_ = nullptr;
153
+ path_.clear();
154
+ }
155
+
156
/*
 * Scale `vec` (length `dim`) in place to unit Euclidean length, as used for
 * cosine similarity. A vector whose norm is not strictly positive (all zeros,
 * or a NaN norm) is left untouched.
 */
static void l2_normalize(float* vec, int dim)
{
    float sum_of_squares = 0.0f;
    int idx = 0;
    while (idx < dim)
    {
        const float component = vec[idx];
        sum_of_squares += component * component;
        ++idx;
    }

    const float length = std::sqrt(sum_of_squares);
    if (!(length > 0.0f))
    {
        return;
    }

    for (int pos = 0; pos < dim; ++pos)
    {
        vec[pos] = vec[pos] / length;
    }
}
173
+
174
+ bool HnswIndex::add(uint64_t label, const float* vec, std::string& err)
175
+ {
176
+ if (!impl_ || !impl_->alg)
177
+ {
178
+ err = "index not open";
179
+ return false;
180
+ }
181
+ try
182
+ {
183
+ if (impl_->distance_metric == DIST_COSINE)
184
+ {
185
+ // Make a copy and normalize for cosine similarity
186
+ std::vector<float> normalized(vec, vec + impl_->params.dim);
187
+ l2_normalize(normalized.data(), impl_->params.dim);
188
+ impl_->alg->addPoint(normalized.data(), (hnswlib::labeltype)label);
189
+ }
190
+ else
191
+ {
192
+ impl_->alg->addPoint(vec, (hnswlib::labeltype)label);
193
+ }
194
+ }
195
+ catch (const std::exception& e)
196
+ {
197
+ err = std::string("hnsw add: ") + e.what();
198
+ return false;
199
+ }
200
+ return true;
201
+ }
202
+
203
+ bool HnswIndex::mark_deleted(uint64_t label, std::string& err)
204
+ {
205
+ if (!impl_ || !impl_->alg)
206
+ {
207
+ err = "index not open";
208
+ return false;
209
+ }
210
+ try
211
+ {
212
+ impl_->alg->markDelete((hnswlib::labeltype)label);
213
+ }
214
+ catch (const std::exception& e)
215
+ {
216
+ err = std::string("hnsw markDelete: ") + e.what();
217
+ return false;
218
+ }
219
+ return true;
220
+ }
221
+
222
+ bool HnswIndex::is_deleted(uint64_t label) const
223
+ {
224
+ if (!impl_ || !impl_->alg)
225
+ return false;
226
+ auto& lookup = impl_->alg->label_lookup_;
227
+ std::unique_lock<std::mutex> lk(impl_->alg->label_lookup_lock);
228
+ auto it = lookup.find((hnswlib::labeltype)label);
229
+ if (it == lookup.end())
230
+ return false;
231
+ auto internal_id = it->second;
232
+ lk.unlock();
233
+ return impl_->alg->isMarkedDeleted(internal_id);
234
+ }
235
+
236
+ bool HnswIndex::has_label(uint64_t label) const
237
+ {
238
+ if (!impl_ || !impl_->alg)
239
+ return false;
240
+ auto& lookup = impl_->alg->label_lookup_;
241
+ std::unique_lock<std::mutex> lk(impl_->alg->label_lookup_lock);
242
+ return lookup.find((hnswlib::labeltype)label) != lookup.end();
243
+ }
244
+
245
+ std::vector<std::pair<uint64_t, float>>
246
+ HnswIndex::search(const float* query, int top_k, std::string& err) const
247
+ {
248
+ std::vector<std::pair<uint64_t, float>> results;
249
+ if (!impl_ || !impl_->alg)
250
+ {
251
+ err = "index not open";
252
+ return results;
253
+ }
254
+ if (impl_->alg->cur_element_count == 0)
255
+ return results;
256
+ if (top_k <= 0)
257
+ return results;
258
+
259
+ size_t k = std::min((size_t)top_k, (size_t)impl_->alg->cur_element_count);
260
+
261
+ // Prepare query (normalize for cosine)
262
+ std::vector<float> normalized_query;
263
+ const float* search_vec = query;
264
+ if (impl_->distance_metric == DIST_COSINE)
265
+ {
266
+ normalized_query.assign(query, query + impl_->params.dim);
267
+ l2_normalize(normalized_query.data(), impl_->params.dim);
268
+ search_vec = normalized_query.data();
269
+ }
270
+
271
+ try
272
+ {
273
+ auto pq = impl_->alg->searchKnn(search_vec, k);
274
+ results.reserve(pq.size());
275
+ while (!pq.empty())
276
+ {
277
+ auto& top = pq.top();
278
+ float score;
279
+ if (impl_->distance_metric == DIST_L2)
280
+ {
281
+ // L2: distance is already correct, invert so higher = more similar
282
+ score = 1.0f / (1.0f + top.first);
283
+ }
284
+ else
285
+ {
286
+ // IP/Cosine: hnswlib returns (1 - ip) as distance; convert back
287
+ score = 1.0f - top.first;
288
+ }
289
+ results.push_back({(uint64_t)top.second, score});
290
+ pq.pop();
291
+ }
292
+ // Reverse so highest score is first.
293
+ std::reverse(results.begin(), results.end());
294
+ }
295
+ catch (const std::exception& e)
296
+ {
297
+ err = std::string("hnsw search: ") + e.what();
298
+ }
299
+ return results;
300
+ }
301
+
302
+ size_t HnswIndex::count() const
303
+ {
304
+ if (!impl_ || !impl_->alg)
305
+ return 0;
306
+ return impl_->alg->cur_element_count;
307
+ }
308
+
309
+ bool HnswIndex::save(std::string& err) const
310
+ {
311
+ if (!impl_ || !impl_->alg)
312
+ {
313
+ err = "index not open";
314
+ return false;
315
+ }
316
+ try
317
+ {
318
+ impl_->alg->saveIndex(path_);
319
+
320
+ // Write metadata file with distance metric
321
+ std::string meta_path = get_meta_path(path_);
322
+ std::ofstream meta_out(meta_path, std::ios::binary | std::ios::trunc);
323
+ if (!meta_out.good())
324
+ {
325
+ err = "cannot write index metadata";
326
+ return false;
327
+ }
328
+
329
+ IndexHeader header;
330
+ header.distance = impl_->distance_metric;
331
+ header.dim = impl_->params.dim;
332
+
333
+ meta_out.write(reinterpret_cast<const char*>(&header), sizeof(header));
334
+ meta_out.close();
335
+ }
336
+ catch (const std::exception& e)
337
+ {
338
+ err = std::string("hnsw save: ") + e.what();
339
+ return false;
340
+ }
341
+ return true;
342
+ }
343
+
344
+ } // namespace internal
345
+ } // namespace logosdb
@@ -0,0 +1,77 @@
1
+ #pragma once
2
+
3
+ #include <cstddef>
4
+ #include <cstdint>
5
+ #include <string>
6
+ #include <utility>
7
+ #include <vector>
8
+
9
+ namespace logosdb
10
+ {
11
+ namespace internal
12
+ {
13
+
14
// Distance metrics; the numeric values are meant to match the public API
// constants and are also what gets stored in HnswParams::distance.
enum DistanceMetric
{
    // Inner product.
    DIST_IP = 0,
    // Cosine (inner product on normalized vectors).
    DIST_COSINE = 1,
    // Euclidean distance.
    DIST_L2 = 2
};
21
+
22
+ struct HnswParams
23
+ {
24
+ int dim = 0;
25
+ size_t max_elements = 1000000;
26
+ int ef_construction = 200;
27
+ int M = 16;
28
+ int ef_search = 50;
29
+ int distance = DIST_IP; /* DIST_IP, DIST_COSINE, or DIST_L2 */
30
+ };
31
+
32
+ // Thin wrapper around hnswlib for inner-product search on L2-normalized vectors.
33
+ class HnswIndex
34
+ {
35
+ public:
36
+ HnswIndex() = default;
37
+ ~HnswIndex();
38
+
39
+ HnswIndex(const HnswIndex&) = delete;
40
+ HnswIndex& operator=(const HnswIndex&) = delete;
41
+
42
+ // Create a new index or load an existing one from disk.
43
+ bool open(const std::string& path, const HnswParams& params, std::string& err);
44
+ void close();
45
+
46
+ // Add a vector with the given internal label (row index).
47
+ bool add(uint64_t label, const float* vec, std::string& err);
48
+
49
+ // Mark `label` as deleted. Excluded from subsequent search results.
50
+ // Returns false if the label is not in the index.
51
+ bool mark_deleted(uint64_t label, std::string& err);
52
+
53
+ // True if `label` exists in the index and is currently marked deleted.
54
+ // Returns false if the label is missing.
55
+ bool is_deleted(uint64_t label) const;
56
+
57
+ // True if `label` is currently present (known) in the index, regardless
58
+ // of its deletion state.
59
+ bool has_label(uint64_t label) const;
60
+
61
+ // Search for top-k nearest. Returns (label, distance) pairs sorted by distance desc
62
+ // (inner product — higher is more similar).
63
+ std::vector<std::pair<uint64_t, float>>
64
+ search(const float* query, int top_k, std::string& err) const;
65
+
66
+ size_t count() const;
67
+
68
+ bool save(std::string& err) const;
69
+
70
+ private:
71
+ struct Impl;
72
+ Impl* impl_ = nullptr;
73
+ std::string path_;
74
+ };
75
+
76
+ } // namespace internal
77
+ } // namespace logosdb