logosdb 0.7.8 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -8
- package/binding.gyp +9 -9
- package/deps/core/include/logosdb/logosdb.h +530 -0
- package/deps/core/src/hnsw_index.cpp +345 -0
- package/deps/core/src/hnsw_index.h +77 -0
- package/deps/core/src/logosdb.cpp +781 -0
- package/deps/core/src/metadata.cpp +266 -0
- package/deps/core/src/metadata.h +69 -0
- package/deps/core/src/platform.cpp +342 -0
- package/deps/core/src/platform.h +94 -0
- package/deps/core/src/storage.cpp +764 -0
- package/deps/core/src/storage.h +131 -0
- package/deps/core/src/wal.cpp +570 -0
- package/deps/core/src/wal.h +99 -0
- package/deps/core/third_party/hnswlib/hnswlib/bruteforce.h +163 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswalg.h +1411 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswlib.h +228 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_ip.h +400 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_l2.h +324 -0
- package/deps/core/third_party/hnswlib/hnswlib/stop_condition.h +276 -0
- package/deps/core/third_party/hnswlib/hnswlib/visited_list_pool.h +78 -0
- package/deps/core/third_party/nlohmann/json.hpp +24765 -0
- package/package.json +21 -5
- package/scripts/vendor-core.mjs +66 -0
|
@@ -0,0 +1,781 @@
|
|
|
1
|
+
#include "hnsw_index.h"
#include "metadata.h"
#include "platform.h"
#include "storage.h"
#include "wal.h"

#include <logosdb/logosdb.h>

#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <system_error>
#include <vector>
|
|
16
|
+
|
|
17
|
+
using namespace logosdb::internal;
|
|
18
|
+
|
|
19
|
+
/* ── Internal DB struct ────────────────────────────────────────────── */
|
|
20
|
+
|
|
21
|
+
struct logosdb_t
|
|
22
|
+
{
|
|
23
|
+
VectorStorage vectors;
|
|
24
|
+
MetadataStore meta;
|
|
25
|
+
HnswIndex index;
|
|
26
|
+
WriteAheadLog wal;
|
|
27
|
+
std::mutex mu;
|
|
28
|
+
int dim = 0;
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
/*
 * Definition of the options object handed to logosdb_open().
 *
 * Every field carries an in-class default, so a freshly created options
 * object is usable once `dim` has been set to a positive value (logosdb_open
 * rejects dim <= 0).
 */
struct logosdb_options_t {
    int dim{0};                    // vector dimension; must be set before open
    size_t max_elements{1000000};  // forwarded to HnswParams::max_elements
    int ef_construction{200};      // forwarded to HnswParams::ef_construction
    int M{16};                     // forwarded to HnswParams::M
    int ef_search{50};             // forwarded to HnswParams::ef_search
    int distance{0};               // 0 == LOGOSDB_DIST_IP (default)
    int dtype{0};                  // 0 == LOGOSDB_DTYPE_FLOAT32 (default)
};
|
|
41
|
+
|
|
42
|
+
/*
 * Definition of the search-result handle returned by logosdb_search() and
 * logosdb_search_ts_range(). It owns copies of the metadata strings, so the
 * pointers handed out by the logosdb_result_* accessors point into this
 * object.
 */
struct logosdb_search_result_t {
    /* One matched row. */
    struct Hit {
        uint64_t id;           // label reported by the index
        float score;           // score reported by the index
        std::string text;      // copied from the metadata row; may be empty
        std::string timestamp; // copied from the metadata row; may be empty
    };

    std::vector<Hit> hits; // in the order the index search returned them
};
|
|
53
|
+
|
|
54
|
+
/* ── Helpers ───────────────────────────────────────────────────────── */
|
|
55
|
+
|
|
56
|
+
static void set_err(char** errptr, const std::string& msg)
|
|
57
|
+
{
|
|
58
|
+
if (errptr)
|
|
59
|
+
{
|
|
60
|
+
*errptr = platform::string_duplicate(msg.c_str());
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/* ── Options ───────────────────────────────────────────────────────── */
|
|
65
|
+
|
|
66
|
+
/*
 * Allocate an options object populated with the in-class defaults of
 * logosdb_options_t. Release it with logosdb_options_destroy().
 */
logosdb_options_t* logosdb_options_create(void)
{
    logosdb_options_t* fresh = new logosdb_options_t();
    return fresh;
}
|
|
70
|
+
|
|
71
|
+
/*
 * Free an options object obtained from logosdb_options_create().
 * Passing nullptr is harmless (deleting a null pointer is a no-op).
 */
void logosdb_options_destroy(logosdb_options_t* opts)
{
    if (opts != nullptr)
        delete opts;
}
|
|
75
|
+
|
|
76
|
+
/*
 * Set the vector dimension. A null options pointer or a non-positive `d` is
 * silently ignored and the previous value is kept.
 */
void logosdb_options_set_dim(logosdb_options_t* o, int d)
{
    if (o == nullptr || d <= 0)
        return;
    o->dim = d;
}
|
|
81
|
+
/*
 * Set the element capacity later forwarded to the HNSW index parameters.
 * A null options pointer or n == 0 is silently ignored.
 */
void logosdb_options_set_max_elements(logosdb_options_t* o, size_t n)
{
    if (o == nullptr || n == 0)
        return;
    o->max_elements = n;
}
|
|
86
|
+
/*
 * Set the ef_construction value forwarded to the HNSW index parameters.
 * A null options pointer or a non-positive `e` is silently ignored.
 */
void logosdb_options_set_ef_construction(logosdb_options_t* o, int e)
{
    if (o == nullptr || e <= 0)
        return;
    o->ef_construction = e;
}
|
|
91
|
+
/*
 * Set the M value forwarded to the HNSW index parameters.
 * A null options pointer or a non-positive `m` is silently ignored.
 */
void logosdb_options_set_M(logosdb_options_t* o, int m)
{
    if (o == nullptr || m <= 0)
        return;
    o->M = m;
}
|
|
96
|
+
/*
 * Set the ef_search value forwarded to the HNSW index parameters.
 * A null options pointer or a non-positive `e` is silently ignored.
 */
void logosdb_options_set_ef_search(logosdb_options_t* o, int e)
{
    if (o == nullptr || e <= 0)
        return;
    o->ef_search = e;
}
|
|
101
|
+
|
|
102
|
+
/*
 * Select the distance metric.
 *
 * Accepts the integer codes 0..2 (0 is the LOGOSDB_DIST_IP default; the names
 * of the other codes live in the public header, not in this file).
 *
 * Returns 0 on success, -1 when `o` is null or `metric` is outside 0..2; on
 * failure the stored value is left unchanged.
 */
int logosdb_options_set_distance(logosdb_options_t* o, int metric)
{
    const bool metric_known = (metric >= 0 && metric <= 2);
    if (o == nullptr || !metric_known)
        return -1;

    o->distance = metric;
    return 0;
}
|
|
111
|
+
|
|
112
|
+
/*
 * Select the storage element type.
 *
 * Accepts the integer codes 0..2 (0 is the LOGOSDB_DTYPE_FLOAT32 default; the
 * names of the other codes live in the public header, not in this file).
 *
 * Returns 0 on success, -1 when `o` is null or `dtype` is outside 0..2; on
 * failure the stored value is left unchanged.
 */
int logosdb_options_set_dtype(logosdb_options_t* o, int dtype)
{
    const bool dtype_known = (dtype >= 0 && dtype <= 2);
    if (o == nullptr || !dtype_known)
        return -1;

    o->dtype = dtype;
    return 0;
}
|
|
121
|
+
|
|
122
|
+
/* ── Lifecycle ─────────────────────────────────────────────────────── */
|
|
123
|
+
|
|
124
|
+
/*
 * Open (creating it if necessary) the database stored in directory `path`.
 *
 * Steps, in order:
 *   1. create the directory tree;
 *   2. open vectors.bin, meta.jsonl, hnsw.idx and wal.log inside it;
 *   3. replay pending WAL entries into the three stores;
 *   4. add to the index any vector rows it does not contain yet;
 *   5. re-apply metadata tombstones onto the index;
 *   6. save the index if step 4 added anything.
 *
 * path   - database directory; must be non-null.
 * opts   - options; must be non-null with dim > 0.
 * errptr - optional; receives an allocated message on failure (see set_err).
 *
 * Returns a handle to be released with logosdb_close(), or nullptr on
 * failure. The handle is held in a std::unique_ptr until the very end, so
 * every failure path frees it without a hand-written delete.
 */
logosdb_t* logosdb_open(const char* path, const logosdb_options_t* opts, char** errptr)
{
    if (!path || !opts || opts->dim <= 0)
    {
        set_err(errptr, "invalid arguments: path and dim > 0 required");
        return nullptr;
    }

    // Use the error_code overload: this API reports failures through errptr,
    // so a filesystem error must not escape as a C++ exception.
    std::error_code dir_ec;
    std::filesystem::create_directories(path, dir_ec);
    if (dir_ec)
    {
        set_err(errptr, "create directory '" + std::string(path) + "': " + dir_ec.message());
        return nullptr;
    }

    auto db = std::make_unique<logosdb_t>();
    db->dim = opts->dim;
    std::string err;

    const std::string dir(path);
    const std::string vec_path = dir + "/vectors.bin";
    const std::string meta_path = dir + "/meta.jsonl";
    const std::string idx_path = dir + "/hnsw.idx";
    const std::string wal_path = dir + "/wal.log";

    const StorageDtype dtype = static_cast<StorageDtype>(opts->dtype);
    if (!db->vectors.open(vec_path, opts->dim, dtype, err))
    {
        set_err(errptr, err);
        return nullptr;
    }
    if (!db->meta.open(meta_path, err))
    {
        set_err(errptr, err);
        return nullptr;
    }

    HnswParams hp;
    hp.dim = opts->dim;
    hp.max_elements = opts->max_elements;
    hp.ef_construction = opts->ef_construction;
    hp.M = opts->M;
    hp.ef_search = opts->ef_search;
    hp.distance = opts->distance;

    if (!db->index.open(idx_path, hp, err))
    {
        set_err(errptr, err);
        return nullptr;
    }

    // Open the WAL, then replay any pending entries.
    if (!db->wal.open(wal_path, err))
    {
        set_err(errptr, err);
        return nullptr;
    }

    // The callback only runs during replay_pending(), while `db` is alive.
    logosdb_t* const handle = db.get();
    const int replayed = db->wal.replay_pending(
        [handle](const WALEntry& entry, std::string& replay_err) -> bool
        {
            // The entry must target exactly the next free row.
            // NOTE(review): logosdb_put treats a failed mark_committed() as
            // non-fatal, which would leave an already-applied entry pending;
            // this check would then fail the next open - confirm how
            // WriteAheadLog handles that case.
            if (entry.expected_id != handle->vectors.n_rows())
            {
                replay_err = "wal replay: expected_id mismatch (" +
                             std::to_string(entry.expected_id) + " vs " +
                             std::to_string(handle->vectors.n_rows()) + ")";
                return false;
            }

            // Same order as logosdb_put: vector, metadata, index.
            const uint64_t vid = handle->vectors.append(entry.vector.data(),
                                                        static_cast<int>(entry.dim),
                                                        replay_err);
            if (vid == UINT64_MAX)
                return false;

            const uint64_t mid = handle->meta.append(entry.text.c_str(),
                                                     entry.timestamp.c_str(),
                                                     replay_err);
            if (mid == UINT64_MAX)
                return false;

            return handle->index.add(vid, entry.vector.data(), replay_err);
        },
        err);

    if (replayed < 0)
    {
        set_err(errptr, "wal replay: " + err);
        return nullptr;
    }

    // Backfill the index if vector storage has more rows than the index
    // (e.g. crash recovery).
    const size_t n_vec = db->vectors.n_rows();
    const size_t n_idx = db->index.count();
    const bool backfilled = n_vec > n_idx;
    if (backfilled)
    {
        std::vector<float> float32_row(db->dim);
        for (size_t i = n_idx; i < n_vec; ++i)
        {
            db->vectors.row_to_float32(i, float32_row.data());
            if (!db->index.add(i, float32_row.data(), err))
            {
                set_err(errptr, "backfill index: " + err);
                return nullptr;
            }
        }
    }

    // Re-apply deletion marks: after a backfill, or when the index file is
    // out of sync with the metadata tombstones, the index may hold live slots
    // for rows the metadata says are deleted. Labels the index does not hold,
    // and labels already marked, are skipped so a normal reopen is a no-op.
    for (uint64_t id : db->meta.deleted_ids())
    {
        if (!db->index.has_label(id) || db->index.is_deleted(id))
            continue;
        if (!db->index.mark_deleted(id, err))
        {
            set_err(errptr, "replay tombstone: " + err);
            return nullptr;
        }
    }

    // Best effort: a failed save is ignored here (as in the original code).
    if (backfilled)
        db->index.save(err);

    return db.release();
}
|
|
264
|
+
|
|
265
|
+
/*
 * Flush and release a handle obtained from logosdb_open().
 *
 * Order: sync the WAL, save the index, sync vectors, sync metadata, close the
 * WAL, then destroy the handle. The function returns void, so any error text
 * the individual steps produce is collected into a scratch string and
 * dropped. A null `db` is a no-op.
 */
void logosdb_close(logosdb_t* db)
{
    if (db == nullptr)
        return;

    std::string discarded; // error text from the steps below is not reported

    db->wal.sync(discarded); // WAL first, before the other stores
    db->index.save(discarded);
    db->vectors.sync(discarded);
    db->meta.sync(discarded);
    db->wal.close();

    delete db;
}
|
|
277
|
+
|
|
278
|
+
/* ── Write ─────────────────────────────────────────────────────────── */
|
|
279
|
+
|
|
280
|
+
/*
 * Append one row (embedding + text + timestamp) to the database.
 *
 * Under db->mu the row is written in this order: pending WAL entry, vector
 * storage, metadata storage, HNSW index, then the WAL entry is marked
 * committed.
 *
 * Returns the id assigned by vector storage, or UINT64_MAX on failure with a
 * message placed in *errptr (when errptr is non-null). If a store fails after
 * the WAL entry was written, that entry stays pending in the WAL.
 */
uint64_t logosdb_put(logosdb_t* db,
                     const float* embedding,
                     int dim,
                     const char* text,
                     const char* timestamp,
                     char** errptr)
{
    // Single failure exit: record the message, hand back the sentinel.
    const auto fail = [errptr](const std::string& why) -> uint64_t
    {
        set_err(errptr, why);
        return UINT64_MAX;
    };

    if (!db || !embedding)
        return fail("null db or embedding");
    if (dim != db->dim)
        return fail("dimension mismatch");

    std::lock_guard<std::mutex> guard(db->mu);
    std::string err;

    // The row id this put is expected to receive, recorded in the WAL entry.
    const uint64_t next_row = db->vectors.n_rows();

    // 1. WAL entry first.
    const int64_t wal_pos =
        db->wal.append_pending(embedding, dim, text, timestamp, next_row, err);
    if (wal_pos < 0)
        return fail(err);

    // 2. Vector storage.
    const uint64_t row_id = db->vectors.append(embedding, dim, err);
    if (row_id == UINT64_MAX)
        return fail(err);

    // 3. Metadata storage. On failure the WAL entry is left pending.
    const uint64_t meta_id = db->meta.append(text, timestamp, err);
    if (meta_id == UINT64_MAX)
        return fail(err);

    // 4. HNSW index. On failure the WAL entry is left pending.
    if (!db->index.add(row_id, embedding, err))
        return fail(err);

    // 5. Mark the WAL entry committed. A failure here is deliberately
    //    ignored: the put itself has already succeeded.
    (void)db->wal.mark_committed(wal_pos, err);

    return row_id;
}
|
|
346
|
+
|
|
347
|
+
/*
 * Append `n` rows in one call.
 *
 * embeddings - n * dim floats, row after row.
 * texts      - per-row texts, forwarded unchanged to MetadataStore::append_batch.
 * timestamps - per-row timestamps, forwarded the same way.
 * out_ids    - receives the n assigned ids (first_id + i); written only when
 *              every step succeeded.
 *
 * Returns 0 on success (n <= 0 is a successful no-op) and -1 on failure with
 * a message in *errptr when errptr is non-null.
 *
 * NOTE: unlike logosdb_put, this path writes no WAL entry; the batch goes
 * straight to vector storage, metadata storage and the index.
 */
int logosdb_put_batch(logosdb_t* db,
                      const float* embeddings,
                      int n,
                      int dim,
                      const char* const* texts,
                      const char* const* timestamps,
                      uint64_t* out_ids,
                      char** errptr)
{
    if (!db || !embeddings || !out_ids)
    {
        set_err(errptr, "null db, embeddings, or out_ids");
        return -1;
    }
    if (n <= 0)
    {
        return 0; // Empty batch is a no-op success
    }
    if (dim != db->dim)
    {
        set_err(errptr, "dimension mismatch");
        return -1;
    }

    std::lock_guard<std::mutex> lock(db->mu);
    std::string err;

    // Step 1: batch write to vector storage; returns the id of the first row.
    const uint64_t first_id = db->vectors.append_batch(embeddings, n, dim, err);
    if (first_id == UINT64_MAX)
    {
        set_err(errptr, err);
        return -1;
    }

    // Step 2: batch write to metadata storage.
    const uint64_t mid = db->meta.append_batch(texts, timestamps, n, err);
    if (mid == UINT64_MAX)
    {
        set_err(errptr, err);
        return -1;
    }

    // Step 3: the index is filled one vector at a time. Row i starts
    // i * dim floats into the input array.
    for (int i = 0; i < n; ++i)
    {
        const float* row = embeddings + static_cast<size_t>(i) * static_cast<size_t>(dim);
        if (!db->index.add(first_id + static_cast<uint64_t>(i), row, err))
        {
            set_err(errptr, err);
            return -1;
        }
    }

    // Report the ids only after everything succeeded.
    for (int i = 0; i < n; ++i)
    {
        out_ids[i] = first_id + static_cast<uint64_t>(i);
    }

    return 0;
}
|
|
415
|
+
|
|
416
|
+
/* ── Delete / Update ───────────────────────────────────────────────── */
|
|
417
|
+
|
|
418
|
+
/*
 * Mark row `id` as deleted in both the HNSW index and the metadata store.
 *
 * Returns 0 on success and -1 on failure (null db, id >= row count, row
 * already deleted, or one of the two stores rejecting the mark) with a
 * message in *errptr when errptr is non-null.
 */
int logosdb_delete(logosdb_t* db, uint64_t id, char** errptr)
{
    if (!db)
    {
        set_err(errptr, "null db");
        return -1;
    }

    std::lock_guard<std::mutex> lock(db->mu);
    std::string err;

    if (id >= db->vectors.n_rows())
    {
        set_err(errptr, "delete: id out of range");
        return -1;
    }
    if (db->meta.is_deleted(id))
    {
        set_err(errptr, "delete: id already deleted");
        return -1;
    }

    // Mark in the index first. If the index rejects (e.g. label missing
    // because the row was never indexed), bail out before touching the
    // metadata log.
    if (!db->index.mark_deleted(id, err))
    {
        set_err(errptr, err);
        return -1;
    }

    if (!db->meta.mark_deleted(id, err))
    {
        // The index mark made above is NOT undone here, so on this path the
        // index considers the row deleted while the metadata does not.
        // NOTE(review): the previous comment promised a best-effort rollback
        // but no rollback call existed; add one if HnswIndex exposes a way
        // to clear a deletion mark.
        set_err(errptr, err);
        return -1;
    }
    return 0;
}
|
|
459
|
+
|
|
460
|
+
/*
 * Replace row `id`: the old row is marked deleted (index, then metadata) and
 * a new row is appended (vectors, metadata, index).
 *
 * Returns the id of the new row, or UINT64_MAX on failure with a message in
 * *errptr when errptr is non-null.
 *
 * The whole sequence runs under db->mu, which serialises it against the other
 * write paths in this file.
 * NOTE(review): the steps are not rolled back on failure - if an append fails
 * after the tombstones were written, the old row stays deleted and no new row
 * exists. No WAL entry is written on this path either.
 * NOTE(review): the search functions in this file do not lock db->mu, so the
 * mutex alone does not hide the intermediate state from them - confirm
 * whether the stores synchronise readers internally.
 */
uint64_t logosdb_update(logosdb_t* db,
                        uint64_t id,
                        const float* embedding,
                        int dim,
                        const char* text,
                        const char* timestamp,
                        char** errptr)
{
    // Single failure exit: record the message, hand back the sentinel.
    const auto fail = [errptr](const std::string& why) -> uint64_t
    {
        set_err(errptr, why);
        return UINT64_MAX;
    };

    if (!db || !embedding)
        return fail("null db or embedding");
    if (dim != db->dim)
        return fail("dimension mismatch");

    std::lock_guard<std::mutex> guard(db->mu);
    std::string err;

    if (id >= db->vectors.n_rows())
        return fail("update: id out of range");
    if (db->meta.is_deleted(id))
        return fail("update: id already deleted");

    // Retire the old row: index first, then metadata.
    if (!db->index.mark_deleted(id, err))
        return fail(err);
    if (!db->meta.mark_deleted(id, err))
        return fail(err);

    // Append the replacement row: vectors, metadata, index.
    const uint64_t new_id = db->vectors.append(embedding, dim, err);
    if (new_id == UINT64_MAX)
        return fail(err);

    const uint64_t meta_id = db->meta.append(text, timestamp, err);
    if (meta_id == UINT64_MAX)
        return fail(err);

    if (!db->index.add(new_id, embedding, err))
        return fail(err);

    return new_id;
}
|
|
527
|
+
|
|
528
|
+
/* ── Search ────────────────────────────────────────────────────────── */
|
|
529
|
+
|
|
530
|
+
/*
 * Run a top-k search against the HNSW index and attach metadata to each hit.
 *
 * Returns a result object to be released with logosdb_result_free(), or
 * nullptr on failure (null db/query, dimension mismatch, top_k <= 0, or an
 * error reported by the index) with a message in *errptr when errptr is
 * non-null. A hit whose metadata row cannot be found is still returned, with
 * empty text and timestamp.
 */
logosdb_search_result_t*
logosdb_search(logosdb_t* db, const float* query, int dim, int top_k, char** errptr)
{
    // Single failure exit: record the message, return no result.
    const auto fail = [errptr](const std::string& why) -> logosdb_search_result_t*
    {
        set_err(errptr, why);
        return nullptr;
    };

    if (!db || !query)
        return fail("null db or query");
    if (dim != db->dim)
        return fail("dimension mismatch in search");
    if (top_k <= 0)
        return fail("top_k must be > 0");

    std::string err;
    auto matches = db->index.search(query, top_k, err);
    // The index signals failure by filling in the error string.
    if (!err.empty())
        return fail(err);

    auto* result = new logosdb_search_result_t();
    result->hits.reserve(matches.size());

    for (auto& [row_id, row_score] : matches)
    {
        logosdb_search_result_t::Hit hit;
        hit.id = row_id;
        hit.score = row_score;

        // Metadata is optional: keep the hit even when the row is missing.
        if (auto* meta_row = db->meta.row(row_id))
        {
            hit.text = meta_row->text;
            hit.timestamp = meta_row->timestamp;
        }
        result->hits.push_back(std::move(hit));
    }
    return result;
}
|
|
574
|
+
|
|
575
|
+
/*
 * Top-k search restricted to rows whose timestamp lies in
 * [ts_from_iso8601, ts_to_iso8601].
 *
 * The index is asked for `candidate_k` neighbours (raised to `top_k` when
 * smaller); the candidates are then filtered in index order until `top_k`
 * hits are collected. Either bound may be null to leave that side open.
 * Timestamps are compared as plain strings. Rows with an empty timestamp
 * pass the filter; rows without a metadata record are dropped.
 *
 * Returns a result object to be released with logosdb_result_free(), or
 * nullptr on failure with a message in *errptr when errptr is non-null.
 */
logosdb_search_result_t* logosdb_search_ts_range(logosdb_t* db,
                                                 const float* query,
                                                 int dim,
                                                 int top_k,
                                                 const char* ts_from_iso8601,
                                                 const char* ts_to_iso8601,
                                                 int candidate_k,
                                                 char** errptr)
{
    // Single failure exit: record the message, return no result.
    const auto fail = [errptr](const std::string& why) -> logosdb_search_result_t*
    {
        set_err(errptr, why);
        return nullptr;
    };

    if (!db || !query)
        return fail("null db or query");
    if (dim != db->dim)
        return fail("dimension mismatch in search");
    if (top_k <= 0)
        return fail("top_k must be > 0");

    // Never fetch fewer candidates than the number of results wanted.
    if (candidate_k < top_k)
        candidate_k = top_k;

    std::string err;
    auto candidates = db->index.search(query, candidate_k, err);
    if (!err.empty())
        return fail(err);

    auto* result = new logosdb_search_result_t();
    result->hits.reserve(top_k);

    for (auto& [row_id, row_score] : candidates)
    {
        // Stop as soon as enough rows passed the filter.
        if ((int)result->hits.size() >= top_k)
            break;

        auto* meta_row = db->meta.row(row_id);
        if (!meta_row)
            continue;

        // An empty timestamp is never filtered out.
        const auto& ts = meta_row->timestamp;
        if (!ts.empty())
        {
            if (ts_from_iso8601 && ts < ts_from_iso8601)
                continue; // before the window
            if (ts_to_iso8601 && ts > ts_to_iso8601)
                continue; // after the window
        }

        logosdb_search_result_t::Hit hit;
        hit.id = row_id;
        hit.score = row_score;
        hit.text = meta_row->text;
        hit.timestamp = meta_row->timestamp;
        result->hits.push_back(std::move(hit));
    }

    return result;
}
|
|
650
|
+
|
|
651
|
+
int logosdb_result_count(const logosdb_search_result_t* r)
|
|
652
|
+
{
|
|
653
|
+
return r ? (int)r->hits.size() : 0;
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
uint64_t logosdb_result_id(const logosdb_search_result_t* r, int i)
|
|
657
|
+
{
|
|
658
|
+
return (r && i >= 0 && i < (int)r->hits.size()) ? r->hits[i].id : UINT64_MAX;
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
float logosdb_result_score(const logosdb_search_result_t* r, int i)
|
|
662
|
+
{
|
|
663
|
+
return (r && i >= 0 && i < (int)r->hits.size()) ? r->hits[i].score : 0.0f;
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
const char* logosdb_result_text(const logosdb_search_result_t* r, int i)
|
|
667
|
+
{
|
|
668
|
+
if (!r || i < 0 || i >= (int)r->hits.size())
|
|
669
|
+
return nullptr;
|
|
670
|
+
return r->hits[i].text.empty() ? nullptr : r->hits[i].text.c_str();
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
const char* logosdb_result_timestamp(const logosdb_search_result_t* r, int i)
|
|
674
|
+
{
|
|
675
|
+
if (!r || i < 0 || i >= (int)r->hits.size())
|
|
676
|
+
return nullptr;
|
|
677
|
+
return r->hits[i].timestamp.empty() ? nullptr : r->hits[i].timestamp.c_str();
|
|
678
|
+
}
|
|
679
|
+
|
|
680
|
+
/*
 * Release a result object returned by one of the search functions, together
 * with the strings its accessors handed out. A null `r` is a no-op.
 */
void logosdb_result_free(logosdb_search_result_t* r)
{
    if (r != nullptr)
        delete r;
}
|
|
684
|
+
|
|
685
|
+
/* ── Info ──────────────────────────────────────────────────────────── */
|
|
686
|
+
|
|
687
|
+
/*
 * Total number of rows in vector storage, including rows that were later
 * marked deleted. Returns 0 for a null handle.
 */
size_t logosdb_count(logosdb_t* db)
{
    if (db == nullptr)
        return 0;
    return db->vectors.n_rows();
}
|
|
691
|
+
|
|
692
|
+
/*
 * Number of rows not marked deleted: vector row count minus the metadata
 * store's deleted count, clamped at 0. Returns 0 for a null handle.
 */
size_t logosdb_count_live(logosdb_t* db)
{
    if (db == nullptr)
        return 0;

    const size_t all_rows = db->vectors.n_rows();
    const size_t tombstones = db->meta.deleted_count();

    // Guard the unsigned subtraction against wrapping.
    if (tombstones > all_rows)
        return 0;
    return all_rows - tombstones;
}
|
|
700
|
+
|
|
701
|
+
/* Vector dimension the handle was opened with; 0 for a null handle. */
int logosdb_dim(logosdb_t* db)
{
    if (db == nullptr)
        return 0;
    return db->dim;
}
|
|
705
|
+
|
|
706
|
+
/*
 * Expose the vector store as one float array.
 *
 * n_rows - optional out: number of rows, or 0 when nullptr is returned.
 * dim    - optional out: floats per row, or 0 when nullptr is returned.
 *
 * Returns the store's raw data pointer when the storage dtype is
 * DTYPE_FLOAT32. For a null handle, or for any other dtype (the stored bytes
 * are then not float32), returns nullptr and reports 0 rows / 0 dim.
 */
const float* logosdb_raw_vectors(logosdb_t* db, size_t* n_rows, int* dim)
{
    // Start from the "unavailable" answer; overwrite it only for float32.
    size_t rows_out = 0;
    int dim_out = 0;
    const float* data_out = nullptr;

    if (db && db->vectors.dtype() == DTYPE_FLOAT32)
    {
        rows_out = db->vectors.n_rows();
        dim_out = db->dim;
        data_out = static_cast<const float*>(db->vectors.data_raw());
    }

    if (n_rows)
        *n_rows = rows_out;
    if (dim)
        *dim = dim_out;
    return data_out;
}
|
|
736
|
+
|
|
737
|
+
/* ── Vector utilities ──────────────────────────────────────────────── */
|
|
738
|
+
|
|
739
|
+
#include <cmath>
|
|
740
|
+
|
|
741
|
+
/*
 * Scale `vec` in place to unit L2 norm.
 *
 * vec - array of `dim` floats; must be non-null.
 * dim - number of elements; must be > 0.
 *
 * Returns 0 on success. Returns -1 and leaves `vec` untouched when the
 * arguments are invalid, when the norm is zero, or when the norm is not
 * finite (an Inf or NaN component) - scaling by such a norm would only fill
 * the vector with NaN.
 */
int logosdb_l2_normalize(float* vec, int dim)
{
    if (!vec || dim <= 0)
        return -1;

    // Accumulate in double: squares of large floats overflow float range.
    double sq_sum = 0.0;
    for (int i = 0; i < dim; ++i)
    {
        const double x = vec[i];
        sq_sum += x * x;
    }

    const double norm = std::sqrt(sq_sum);
    if (norm == 0.0 || !std::isfinite(norm))
        return -1; // Zero or non-finite norm - cannot normalize

    // Keep the reciprocal in double: for a very small norm 1/norm exceeds
    // the float range, whereas each scaled component has magnitude <= 1 and
    // converts back to float safely.
    const double inv_norm = 1.0 / norm;
    for (int i = 0; i < dim; ++i)
    {
        vec[i] = static_cast<float>(static_cast<double>(vec[i]) * inv_norm);
    }
    return 0;
}
|
|
763
|
+
|
|
764
|
+
/* C++ convenience wrappers */
|
|
765
|
+
namespace logosdb
|
|
766
|
+
{
|
|
767
|
+
|
|
768
|
+
bool l2_normalize(std::vector<float>& v)
|
|
769
|
+
{
|
|
770
|
+
if (v.empty())
|
|
771
|
+
return false;
|
|
772
|
+
return logosdb_l2_normalize(v.data(), (int)v.size()) == 0;
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
std::vector<float> l2_normalized(std::vector<float> v)
|
|
776
|
+
{
|
|
777
|
+
l2_normalize(v);
|
|
778
|
+
return v; // NRVO should elide the copy
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
} // namespace logosdb
|