logosdb 0.7.8 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -8
- package/binding.gyp +9 -9
- package/deps/core/include/logosdb/logosdb.h +530 -0
- package/deps/core/src/hnsw_index.cpp +345 -0
- package/deps/core/src/hnsw_index.h +77 -0
- package/deps/core/src/logosdb.cpp +781 -0
- package/deps/core/src/metadata.cpp +266 -0
- package/deps/core/src/metadata.h +69 -0
- package/deps/core/src/platform.cpp +342 -0
- package/deps/core/src/platform.h +94 -0
- package/deps/core/src/storage.cpp +764 -0
- package/deps/core/src/storage.h +131 -0
- package/deps/core/src/wal.cpp +570 -0
- package/deps/core/src/wal.h +99 -0
- package/deps/core/third_party/hnswlib/hnswlib/bruteforce.h +163 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswalg.h +1411 -0
- package/deps/core/third_party/hnswlib/hnswlib/hnswlib.h +228 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_ip.h +400 -0
- package/deps/core/third_party/hnswlib/hnswlib/space_l2.h +324 -0
- package/deps/core/third_party/hnswlib/hnswlib/stop_condition.h +276 -0
- package/deps/core/third_party/hnswlib/hnswlib/visited_list_pool.h +78 -0
- package/deps/core/third_party/nlohmann/json.hpp +24765 -0
- package/package.json +21 -5
- package/scripts/vendor-core.mjs +66 -0
package/README.md
CHANGED
|
@@ -16,7 +16,18 @@ Fast semantic vector database (HNSW + mmap) for Node.js. Zero-copy memory-mapped
|
|
|
16
16
|
npm install logosdb
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
Prebuilt binaries
|
|
19
|
+
Prebuilt binaries may be attached to GitHub releases as **N-API** tarballs (`napi-v8-*`). The install script runs **`prebuild-install`** from the maintained **`@mmomtchev/prebuild-install`** fork (not the deprecated `prebuild-install@7` package) with **`--runtime napi --target 8`**. If no binary matches, npm falls back to **`node-gyp rebuild`** using vendored C++ under **`deps/core/`** (requires Python + a C++17 toolchain; no CMake for this addon).
|
|
20
|
+
|
|
21
|
+
## Publishing (maintainers)
|
|
22
|
+
|
|
23
|
+
Run **`npm publish` from this directory** (`nodejs/` in the git repo), not from the repository root. The root `package.json` is a **private** workspace (`logosdb-workspace`) for the MCP subpackage; publishing there hits `EPRIVATE` and can pack the wrong tree.
|
|
24
|
+
|
|
25
|
+
**`prepublishOnly`** runs **`npm run vendor-core`**, which refreshes **`deps/core/`** from the monorepo parent (`../include`, `../src`, `../third_party/...`) so the published tarball stays self-contained. Maintainer prebuild tarballs: **`npm run native:prebuild`** (not `prebuild` — that name is reserved as an npm lifecycle hook and would run before every `npm run build`).
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
cd nodejs
|
|
29
|
+
npm publish
|
|
30
|
+
```
|
|
20
31
|
|
|
21
32
|
## Quick Start
|
|
22
33
|
|
|
@@ -122,17 +133,18 @@ const db = new DB('/tmp/mydb', { dim: 384, distance: DIST_COSINE });
|
|
|
122
133
|
const hits: SearchHit[] = db.search(embedding, 5);
|
|
123
134
|
```
|
|
124
135
|
|
|
125
|
-
## Building from
|
|
136
|
+
## Building from source
|
|
137
|
+
|
|
138
|
+
From a **full monorepo** checkout, regenerate vendored core C++ (same step as publish):
|
|
126
139
|
|
|
127
140
|
```bash
|
|
128
|
-
|
|
141
|
+
cd nodejs
|
|
142
|
+
npm run vendor-core
|
|
143
|
+
npm install
|
|
144
|
+
npm run build
|
|
129
145
|
```
|
|
130
146
|
|
|
131
|
-
Requirements:
|
|
132
|
-
- Python 3.x
|
|
133
|
-
- C++17 compiler (GCC, Clang, MSVC)
|
|
134
|
-
- CMake 3.15+
|
|
135
|
-
- Node.js 16+
|
|
147
|
+
Requirements: Python usable by **node-gyp** (often 3.10–3.13), C++17 (Clang/GCC/MSVC). The Node addon does **not** use CMake.
|
|
136
148
|
|
|
137
149
|
## License
|
|
138
150
|
|
package/binding.gyp
CHANGED
|
@@ -7,18 +7,18 @@
|
|
|
7
7
|
"target_name": "logosdb",
|
|
8
8
|
"sources": [
|
|
9
9
|
"src/node_logosdb.cpp",
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
10
|
+
"deps/core/src/logosdb.cpp",
|
|
11
|
+
"deps/core/src/storage.cpp",
|
|
12
|
+
"deps/core/src/metadata.cpp",
|
|
13
|
+
"deps/core/src/hnsw_index.cpp",
|
|
14
|
+
"deps/core/src/wal.cpp",
|
|
15
|
+
"deps/core/src/platform.cpp"
|
|
16
16
|
],
|
|
17
17
|
"include_dirs": [
|
|
18
18
|
"<!@(node -p \"require('node-addon-api').include\")",
|
|
19
|
-
"
|
|
20
|
-
"
|
|
21
|
-
"
|
|
19
|
+
"deps/core/include",
|
|
20
|
+
"deps/core/src",
|
|
21
|
+
"deps/core/third_party"
|
|
22
22
|
],
|
|
23
23
|
"cflags!": ["-fno-exceptions"],
|
|
24
24
|
"cflags_cc!": ["-fno-exceptions"],
|
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
#ifndef LOGOSDB_H
|
|
2
|
+
#define LOGOSDB_H
|
|
3
|
+
|
|
4
|
+
#include <stddef.h>
|
|
5
|
+
#include <stdint.h>
|
|
6
|
+
|
|
7
|
+
#define LOGOSDB_VERSION_MAJOR 0
|
|
8
|
+
#define LOGOSDB_VERSION_MINOR 7
|
|
9
|
+
#define LOGOSDB_VERSION_PATCH 11 /* must agree with LOGOSDB_VERSION_STRING ("0.7.11") */
|
|
10
|
+
#define LOGOSDB_VERSION_STRING "0.7.11"
|
|
11
|
+
|
|
12
|
+
/* Distance metrics for vector similarity search */
|
|
13
|
+
#define LOGOSDB_DIST_IP 0 /* Inner product (default, requires L2-normalized vectors) */
|
|
14
|
+
#define LOGOSDB_DIST_COSINE 1 /* Cosine similarity (auto-normalizes vectors) */
|
|
15
|
+
#define LOGOSDB_DIST_L2 2 /* Euclidean distance (L2 space) */
|
|
16
|
+
|
|
17
|
+
/* Storage data types for reduced-precision vector storage */
|
|
18
|
+
#define LOGOSDB_DTYPE_FLOAT32 0 /* 4 bytes per dimension (default) */
|
|
19
|
+
#define LOGOSDB_DTYPE_FLOAT16 1 /* 2 bytes per dimension, ~50% smaller */
|
|
20
|
+
#define LOGOSDB_DTYPE_INT8 2 /* 1 byte per dimension, ~75% smaller */
|
|
21
|
+
|
|
22
|
+
#ifdef __cplusplus
|
|
23
|
+
extern "C"
|
|
24
|
+
{
|
|
25
|
+
#endif
|
|
26
|
+
|
|
27
|
+
/* ── Opaque handles ────────────────────────────────────────────────── */
|
|
28
|
+
|
|
29
|
+
typedef struct logosdb_t logosdb_t;
|
|
30
|
+
typedef struct logosdb_options_t logosdb_options_t;
|
|
31
|
+
typedef struct logosdb_search_result_t logosdb_search_result_t;
|
|
32
|
+
|
|
33
|
+
/* ── Options ───────────────────────────────────────────────────────── */
|
|
34
|
+
|
|
35
|
+
/* Allocate / release an options object.
 * Usage shown by the C++ wrapper in this header (logosdb::DB constructor):
 * create one object, apply setters, pass it to logosdb_open(), and destroy it
 * once logosdb_open() has returned. */
logosdb_options_t* logosdb_options_create(void);
|
|
36
|
+
void logosdb_options_destroy(logosdb_options_t* opts);
|
|
37
|
+
|
|
38
|
+
/* Individual tuning setters (no return value, so no error reporting).
 * The C++ wrapper only forwards values that are > 0; its own defaults are
 * max_elements = 1000000, ef_construction = 200, M = 16, ef_search = 50.
 * NOTE(review): the names match the usual HNSW parameters
 * (M / ef_construction / ef_search); the exact semantics and the C-side
 * defaults live in the implementation, not in this header - confirm there. */
void logosdb_options_set_dim(logosdb_options_t* opts, int dim);
|
|
39
|
+
void logosdb_options_set_max_elements(logosdb_options_t* opts, size_t n);
|
|
40
|
+
void logosdb_options_set_ef_construction(logosdb_options_t* opts, int ef);
|
|
41
|
+
void logosdb_options_set_M(logosdb_options_t* opts, int M);
|
|
42
|
+
void logosdb_options_set_ef_search(logosdb_options_t* opts, int ef);
|
|
43
|
+
|
|
44
|
+
/* Set distance metric for vector similarity.
|
|
45
|
+
* metric: one of LOGOSDB_DIST_IP, LOGOSDB_DIST_COSINE, or LOGOSDB_DIST_L2
|
|
46
|
+
* (default is LOGOSDB_DIST_IP)
|
|
47
|
+
*
|
|
48
|
+
* For LOGOSDB_DIST_IP: vectors must be L2-normalized by caller.
|
|
49
|
+
* For LOGOSDB_DIST_COSINE: vectors are automatically L2-normalized on put/search.
|
|
50
|
+
* For LOGOSDB_DIST_L2: Euclidean distance is used (lower is more similar).
|
|
51
|
+
*
|
|
52
|
+
* The distance metric is persisted in the index file and must match on reopen.
|
|
53
|
+
* Returns 0 on success, -1 on invalid metric. */
|
|
54
|
+
int logosdb_options_set_distance(logosdb_options_t* opts, int metric);
|
|
55
|
+
|
|
56
|
+
/* Set storage data type for reduced-precision vector storage.
|
|
57
|
+
* dtype: one of LOGOSDB_DTYPE_FLOAT32, LOGOSDB_DTYPE_FLOAT16, or LOGOSDB_DTYPE_INT8
|
|
58
|
+
* (default is LOGOSDB_DTYPE_FLOAT32)
|
|
59
|
+
*
|
|
60
|
+
* This affects how vectors are stored on disk:
|
|
61
|
+
* - FLOAT32: Full precision, 4 bytes per dimension
|
|
62
|
+
* - FLOAT16: Half precision, 2 bytes per dimension (~50% smaller)
|
|
63
|
+
* - INT8: 8-bit quantized, 1 byte per dimension (~75% smaller)
|
|
64
|
+
*
|
|
65
|
+
* Vectors are always dequantized to float32 for HNSW search.
|
|
66
|
+
* The dtype is persisted in the storage file and must match on reopen.
|
|
67
|
+
* Returns 0 on success, -1 on invalid dtype. */
|
|
68
|
+
int logosdb_options_set_dtype(logosdb_options_t* opts, int dtype);
|
|
69
|
+
|
|
70
|
+
/* ── Lifecycle ─────────────────────────────────────────────────────── */
|
|
71
|
+
|
|
72
|
+
/* Open the database at `path`, configured by `opts`.
 * On failure `*errptr` receives an error message; the C++ wrapper in this
 * header releases that message with free().
 * NOTE(review): presumably returns NULL on failure - confirm against the
 * implementation; this header does not state it. */
logosdb_t* logosdb_open(const char* path, const logosdb_options_t* opts, char** errptr);
|
|
73
|
+
/* Release a handle obtained from logosdb_open(). The C++ wrapper only calls
 * this with a non-NULL handle. */
void logosdb_close(logosdb_t* db);
|
|
74
|
+
|
|
75
|
+
/* ── Write ─────────────────────────────────────────────────────────── */
|
|
76
|
+
|
|
77
|
+
/* Insert a single embedding of `dim` floats together with optional `text`
 * and `timestamp` strings; returns a 64-bit id for the new row.
 * `text` / `timestamp` may be NULL (the C++ wrapper passes NULL for empty
 * strings). On failure `*errptr` receives an error message that the C++
 * wrapper releases with free().
 * NOTE(review): the value returned on failure is not documented here
 * (logosdb_update documents UINT64_MAX) - confirm against the implementation. */
uint64_t logosdb_put(logosdb_t* db,
|
|
78
|
+
const float* embedding,
|
|
79
|
+
int dim,
|
|
80
|
+
const char* text,
|
|
81
|
+
const char* timestamp,
|
|
82
|
+
char** errptr);
|
|
83
|
+
|
|
84
|
+
/* Batch insert of N vectors. More efficient than N separate logosdb_put calls.
|
|
85
|
+
* embeddings: flattened (n x dim) float array
|
|
86
|
+
* texts, timestamps: arrays of n pointers (may be NULL)
|
|
87
|
+
* out_ids: pre-allocated array of size n to receive assigned row ids
|
|
88
|
+
*
|
|
89
|
+
* Returns 0 on success, -1 on error (partial insert may have occurred).
|
|
90
|
+
* On error, *errptr is set to a newly-allocated message that the caller
|
|
91
|
+
* must free(). */
|
|
92
|
+
int logosdb_put_batch(logosdb_t* db,
|
|
93
|
+
const float* embeddings,
|
|
94
|
+
int n,
|
|
95
|
+
int dim,
|
|
96
|
+
const char* const* texts,
|
|
97
|
+
const char* const* timestamps,
|
|
98
|
+
uint64_t* out_ids,
|
|
99
|
+
char** errptr);
|
|
100
|
+
|
|
101
|
+
/* Mark the row with the given id as deleted. The vector bytes remain on
|
|
102
|
+
* disk but the row is excluded from search results and future `raw_vectors`
|
|
103
|
+
* bulk views (logosdb_count_live reflects live rows only).
|
|
104
|
+
*
|
|
105
|
+
* Returns 0 on success, -1 on error (id out of range, already deleted,
|
|
106
|
+
* db not open). On error `*errptr` is set to a newly-allocated message
|
|
107
|
+
* that the caller must free(). */
|
|
108
|
+
int logosdb_delete(logosdb_t* db, uint64_t id, char** errptr);
|
|
109
|
+
|
|
110
|
+
/* Replace the row with the given id. This is implemented as a
|
|
111
|
+
* delete-then-put, so the returned id is NEW (not the original `id`).
|
|
112
|
+
* Callers must update any stored references.
|
|
113
|
+
*
|
|
114
|
+
* Returns the new id on success, UINT64_MAX on error. */
|
|
115
|
+
uint64_t logosdb_update(logosdb_t* db,
|
|
116
|
+
uint64_t id,
|
|
117
|
+
const float* embedding,
|
|
118
|
+
int dim,
|
|
119
|
+
const char* text,
|
|
120
|
+
const char* timestamp,
|
|
121
|
+
char** errptr);
|
|
122
|
+
|
|
123
|
+
/* ── Search ────────────────────────────────────────────────────────── */
|
|
124
|
+
|
|
125
|
+
/* Similarity search for `query` (`dim` floats), asking for `top_k` hits.
 * The returned handle is read with the logosdb_result_* accessors and
 * released with logosdb_result_free().
 * On failure `*errptr` receives an error message that the C++ wrapper
 * releases with free().
 * NOTE(review): whether the return value is NULL on failure is not stated
 * in this header - confirm against the implementation. */
logosdb_search_result_t*
|
|
126
|
+
logosdb_search(logosdb_t* db, const float* query, int dim, int top_k, char** errptr);
|
|
127
|
+
|
|
128
|
+
/* Search with timestamp range filter.
|
|
129
|
+
* ts_from_iso8601: optional start timestamp (inclusive), NULL for no lower bound
|
|
130
|
+
* ts_to_iso8601: optional end timestamp (inclusive), NULL for no upper bound
|
|
131
|
+
* candidate_k: number of nearest-neighbor candidates fetched before timestamp filtering (e.g., 10 * top_k)
|
|
132
|
+
*
|
|
133
|
+
* Returns top_k results that match both the vector similarity and timestamp range.
|
|
134
|
+
* Uses post-filtering: fetches candidate_k results internally, filters by timestamp,
|
|
135
|
+
* then returns the top_k that pass. Higher candidate_k improves recall but costs more.
|
|
136
|
+
*
|
|
137
|
+
* Timestamp format: ISO 8601 strings (e.g., "2025-01-01T00:00:00Z").
|
|
138
|
+
* Comparison is lexicographic (string compare), which works correctly for ISO 8601.
|
|
139
|
+
*/
|
|
140
|
+
logosdb_search_result_t* logosdb_search_ts_range(logosdb_t* db,
|
|
141
|
+
const float* query,
|
|
142
|
+
int dim,
|
|
143
|
+
int top_k,
|
|
144
|
+
const char* ts_from_iso8601,
|
|
145
|
+
const char* ts_to_iso8601,
|
|
146
|
+
int candidate_k,
|
|
147
|
+
char** errptr);
|
|
148
|
+
|
|
149
|
+
/* Accessors for a search result handle.
 * The C++ wrapper reads hits with `i` running from 0 to
 * logosdb_result_count(r) - 1. */
int logosdb_result_count(const logosdb_search_result_t* r);
|
|
150
|
+
uint64_t logosdb_result_id(const logosdb_search_result_t* r, int i);
|
|
151
|
+
float logosdb_result_score(const logosdb_search_result_t* r, int i);
|
|
152
|
+
/* May return NULL (the C++ wrapper checks before copying).
 * NOTE(review): the string is presumably owned by `r` and valid until
 * logosdb_result_free(r); the wrapper copies it before freeing - confirm. */
const char* logosdb_result_text(const logosdb_search_result_t* r, int i);
|
|
153
|
+
/* May return NULL; same lifetime caveat as logosdb_result_text. */
const char* logosdb_result_timestamp(const logosdb_search_result_t* r, int i);
|
|
154
|
+
/* Release a result handle returned by logosdb_search / logosdb_search_ts_range. */
void logosdb_result_free(logosdb_search_result_t* r);
|
|
155
|
+
|
|
156
|
+
/* ── Info ──────────────────────────────────────────────────────────── */
|
|
157
|
+
|
|
158
|
+
/* Total row count, including rows that have been marked deleted. */
|
|
159
|
+
size_t logosdb_count(logosdb_t* db);
|
|
160
|
+
|
|
161
|
+
/* Live row count: total rows minus rows marked deleted. */
|
|
162
|
+
size_t logosdb_count_live(logosdb_t* db);
|
|
163
|
+
|
|
164
|
+
int logosdb_dim(logosdb_t* db);
|
|
165
|
+
|
|
166
|
+
/* ── Bulk read (for tensor construction) ───────────────────────────── */
|
|
167
|
+
|
|
168
|
+
/* Bulk float view of the stored vectors: returns a pointer to the data and
 * reports the shape through `n_rows` and `dim`.
 * May return NULL; the C++ wrapper's comments attribute a NULL return to
 * quantized (non-float32) storage.
 * NOTE(review): lifetime of the returned pointer and whether deleted rows are
 * included are not stated in this header - confirm against the implementation. */
const float* logosdb_raw_vectors(logosdb_t* db, size_t* n_rows, int* dim);
|
|
169
|
+
|
|
170
|
+
/* ── Vector utilities ──────────────────────────────────────────────── */
|
|
171
|
+
|
|
172
|
+
/* Normalize `vec` in-place using L2 (Euclidean) norm.
|
|
173
|
+
* vec: pointer to float array (will be modified in place)
|
|
174
|
+
* dim: number of elements in vec
|
|
175
|
+
*
|
|
176
|
+
* Returns 0 on success, -1 if the input has zero norm (cannot normalize).
|
|
177
|
+
* On zero norm, vec is left unchanged.
|
|
178
|
+
*
|
|
179
|
+
* Example:
|
|
180
|
+
* float vec[128] = { ... };
|
|
181
|
+
* if (logosdb_l2_normalize(vec, 128) == 0) {
|
|
182
|
+
* logosdb_put(db, vec, 128, "text", NULL, &err);
|
|
183
|
+
* }
|
|
184
|
+
*/
|
|
185
|
+
int logosdb_l2_normalize(float* vec, int dim);
|
|
186
|
+
|
|
187
|
+
#ifdef __cplusplus
|
|
188
|
+
} /* extern "C" */
|
|
189
|
+
#endif
|
|
190
|
+
|
|
191
|
+
/* ── C++ convenience wrapper ───────────────────────────────────────── */
|
|
192
|
+
|
|
193
|
+
#ifdef __cplusplus
|
|
194
|
+
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
|
|
198
|
+
|
|
199
|
+
namespace logosdb
|
|
200
|
+
{
|
|
201
|
+
|
|
202
|
+
struct Options
|
|
203
|
+
{
|
|
204
|
+
int dim = 0;
|
|
205
|
+
size_t max_elements = 1000000;
|
|
206
|
+
int ef_construction = 200;
|
|
207
|
+
int M = 16;
|
|
208
|
+
int ef_search = 50;
|
|
209
|
+
int distance = LOGOSDB_DIST_IP; /* IP, COSINE, or L2 */
|
|
210
|
+
int dtype = LOGOSDB_DTYPE_FLOAT32; /* FLOAT32, FLOAT16, or INT8 */
|
|
211
|
+
};
|
|
212
|
+
|
|
213
|
+
/* One row of a search result, copied out of the C result handle.
 *
 * id and score are value-initialised so a default-constructed SearchHit never
 * exposes indeterminate values; aggregate initialisation still works.
 */
struct SearchHit
{
    uint64_t id = 0;       /* value of logosdb_result_id for this hit */
    float score = 0.0f;    /* value of logosdb_result_score for this hit */
    std::string text;      /* empty when the C API returned NULL */
    std::string timestamp; /* empty when the C API returned NULL */
};
|
|
220
|
+
|
|
221
|
+
class DB
|
|
222
|
+
{
|
|
223
|
+
public:
|
|
224
|
+
explicit DB(const std::string& path, const Options& opts = {})
|
|
225
|
+
{
|
|
226
|
+
logosdb_options_t* o = logosdb_options_create();
|
|
227
|
+
if (opts.dim > 0)
|
|
228
|
+
logosdb_options_set_dim(o, opts.dim);
|
|
229
|
+
if (opts.max_elements > 0)
|
|
230
|
+
logosdb_options_set_max_elements(o, opts.max_elements);
|
|
231
|
+
if (opts.ef_construction > 0)
|
|
232
|
+
logosdb_options_set_ef_construction(o, opts.ef_construction);
|
|
233
|
+
if (opts.M > 0)
|
|
234
|
+
logosdb_options_set_M(o, opts.M);
|
|
235
|
+
if (opts.ef_search > 0)
|
|
236
|
+
logosdb_options_set_ef_search(o, opts.ef_search);
|
|
237
|
+
logosdb_options_set_distance(o, opts.distance);
|
|
238
|
+
logosdb_options_set_dtype(o, opts.dtype);
|
|
239
|
+
char* err = nullptr;
|
|
240
|
+
db_ = logosdb_open(path.c_str(), o, &err);
|
|
241
|
+
logosdb_options_destroy(o);
|
|
242
|
+
if (err)
|
|
243
|
+
{
|
|
244
|
+
std::string msg(err);
|
|
245
|
+
free(err);
|
|
246
|
+
throw std::runtime_error("logosdb_open: " + msg);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
~DB()
|
|
251
|
+
{
|
|
252
|
+
if (db_)
|
|
253
|
+
logosdb_close(db_);
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
DB(const DB&) = delete;
|
|
257
|
+
DB& operator=(const DB&) = delete;
|
|
258
|
+
DB(DB&& o) noexcept
|
|
259
|
+
: db_(o.db_)
|
|
260
|
+
{
|
|
261
|
+
o.db_ = nullptr;
|
|
262
|
+
}
|
|
263
|
+
DB& operator=(DB&& o) noexcept
|
|
264
|
+
{
|
|
265
|
+
if (this != &o)
|
|
266
|
+
{
|
|
267
|
+
if (db_)
|
|
268
|
+
logosdb_close(db_);
|
|
269
|
+
db_ = o.db_;
|
|
270
|
+
o.db_ = nullptr;
|
|
271
|
+
}
|
|
272
|
+
return *this;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
uint64_t put(const std::vector<float>& embedding,
|
|
276
|
+
const std::string& text = {},
|
|
277
|
+
const std::string& timestamp = {})
|
|
278
|
+
{
|
|
279
|
+
char* err = nullptr;
|
|
280
|
+
uint64_t id = logosdb_put(db_,
|
|
281
|
+
embedding.data(),
|
|
282
|
+
(int)embedding.size(),
|
|
283
|
+
text.empty() ? nullptr : text.c_str(),
|
|
284
|
+
timestamp.empty() ? nullptr : timestamp.c_str(),
|
|
285
|
+
&err);
|
|
286
|
+
if (err)
|
|
287
|
+
{
|
|
288
|
+
std::string msg(err);
|
|
289
|
+
free(err);
|
|
290
|
+
throw std::runtime_error("logosdb_put: " + msg);
|
|
291
|
+
}
|
|
292
|
+
return id;
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/* Batch insert of n vectors. 'embeddings' must contain n*dim floats.
|
|
296
|
+
* Returns vector of assigned row ids. Throws on error. */
|
|
297
|
+
std::vector<uint64_t> put_batch(const std::vector<float>& embeddings,
|
|
298
|
+
int n,
|
|
299
|
+
const std::vector<std::string>& texts = {},
|
|
300
|
+
const std::vector<std::string>& timestamps = {})
|
|
301
|
+
{
|
|
302
|
+
if (n <= 0)
|
|
303
|
+
return {};
|
|
304
|
+
if ((int)embeddings.size() < n * dim())
|
|
305
|
+
{
|
|
306
|
+
throw std::runtime_error("put_batch: embeddings size too small");
|
|
307
|
+
}
|
|
308
|
+
std::vector<uint64_t> ids(n);
|
|
309
|
+
std::vector<const char*> text_ptrs;
|
|
310
|
+
std::vector<const char*> ts_ptrs;
|
|
311
|
+
if (!texts.empty())
|
|
312
|
+
{
|
|
313
|
+
text_ptrs.reserve(n);
|
|
314
|
+
for (int i = 0; i < n; ++i)
|
|
315
|
+
{
|
|
316
|
+
text_ptrs.push_back(i < (int)texts.size() && !texts[i].empty() ? texts[i].c_str()
|
|
317
|
+
: nullptr);
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
if (!timestamps.empty())
|
|
321
|
+
{
|
|
322
|
+
ts_ptrs.reserve(n);
|
|
323
|
+
for (int i = 0; i < n; ++i)
|
|
324
|
+
{
|
|
325
|
+
ts_ptrs.push_back(i < (int)timestamps.size() && !timestamps[i].empty()
|
|
326
|
+
? timestamps[i].c_str()
|
|
327
|
+
: nullptr);
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
char* err = nullptr;
|
|
331
|
+
int rc = logosdb_put_batch(db_,
|
|
332
|
+
embeddings.data(),
|
|
333
|
+
n,
|
|
334
|
+
dim(),
|
|
335
|
+
texts.empty() ? nullptr : text_ptrs.data(),
|
|
336
|
+
timestamps.empty() ? nullptr : ts_ptrs.data(),
|
|
337
|
+
ids.data(),
|
|
338
|
+
&err);
|
|
339
|
+
if (rc != 0)
|
|
340
|
+
{
|
|
341
|
+
std::string msg = err ? err : "unknown";
|
|
342
|
+
free(err);
|
|
343
|
+
throw std::runtime_error("logosdb_put_batch: " + msg);
|
|
344
|
+
}
|
|
345
|
+
return ids;
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
void del(uint64_t id)
|
|
349
|
+
{
|
|
350
|
+
char* err = nullptr;
|
|
351
|
+
int rc = logosdb_delete(db_, id, &err);
|
|
352
|
+
if (rc != 0)
|
|
353
|
+
{
|
|
354
|
+
std::string msg = err ? err : "unknown";
|
|
355
|
+
free(err);
|
|
356
|
+
throw std::runtime_error("logosdb_delete: " + msg);
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
uint64_t update(uint64_t id,
|
|
361
|
+
const std::vector<float>& embedding,
|
|
362
|
+
const std::string& text = {},
|
|
363
|
+
const std::string& timestamp = {})
|
|
364
|
+
{
|
|
365
|
+
char* err = nullptr;
|
|
366
|
+
uint64_t new_id = logosdb_update(db_,
|
|
367
|
+
id,
|
|
368
|
+
embedding.data(),
|
|
369
|
+
(int)embedding.size(),
|
|
370
|
+
text.empty() ? nullptr : text.c_str(),
|
|
371
|
+
timestamp.empty() ? nullptr : timestamp.c_str(),
|
|
372
|
+
&err);
|
|
373
|
+
if (err)
|
|
374
|
+
{
|
|
375
|
+
std::string msg(err);
|
|
376
|
+
free(err);
|
|
377
|
+
throw std::runtime_error("logosdb_update: " + msg);
|
|
378
|
+
}
|
|
379
|
+
return new_id;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
std::vector<SearchHit> search(const std::vector<float>& query, int top_k)
|
|
383
|
+
{
|
|
384
|
+
char* err = nullptr;
|
|
385
|
+
logosdb_search_result_t* r =
|
|
386
|
+
logosdb_search(db_, query.data(), (int)query.size(), top_k, &err);
|
|
387
|
+
if (err)
|
|
388
|
+
{
|
|
389
|
+
std::string msg(err);
|
|
390
|
+
free(err);
|
|
391
|
+
throw std::runtime_error("logosdb_search: " + msg);
|
|
392
|
+
}
|
|
393
|
+
std::vector<SearchHit> hits;
|
|
394
|
+
int n = logosdb_result_count(r);
|
|
395
|
+
hits.reserve(n);
|
|
396
|
+
for (int i = 0; i < n; ++i)
|
|
397
|
+
{
|
|
398
|
+
SearchHit h;
|
|
399
|
+
h.id = logosdb_result_id(r, i);
|
|
400
|
+
h.score = logosdb_result_score(r, i);
|
|
401
|
+
const char* t = logosdb_result_text(r, i);
|
|
402
|
+
if (t)
|
|
403
|
+
h.text = t;
|
|
404
|
+
const char* ts = logosdb_result_timestamp(r, i);
|
|
405
|
+
if (ts)
|
|
406
|
+
h.timestamp = ts;
|
|
407
|
+
hits.push_back(std::move(h));
|
|
408
|
+
}
|
|
409
|
+
logosdb_result_free(r);
|
|
410
|
+
return hits;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
/* Search with timestamp range filter.
|
|
414
|
+
* ts_from: optional start timestamp (inclusive), empty for no lower bound
|
|
415
|
+
* ts_to: optional end timestamp (inclusive), empty for no upper bound
|
|
416
|
+
* candidate_k: internal multiplier for post-filtering (default 10x top_k)
|
|
417
|
+
*
|
|
418
|
+
* Timestamp format: ISO 8601 strings (e.g., "2025-01-01T00:00:00Z").
|
|
419
|
+
*/
|
|
420
|
+
std::vector<SearchHit> search_ts_range(const std::vector<float>& query,
|
|
421
|
+
int top_k,
|
|
422
|
+
const std::string& ts_from,
|
|
423
|
+
const std::string& ts_to,
|
|
424
|
+
int candidate_k = 0)
|
|
425
|
+
{
|
|
426
|
+
char* err = nullptr;
|
|
427
|
+
int ck = candidate_k > 0 ? candidate_k : top_k * 10;
|
|
428
|
+
logosdb_search_result_t* r =
|
|
429
|
+
logosdb_search_ts_range(db_,
|
|
430
|
+
query.data(),
|
|
431
|
+
(int)query.size(),
|
|
432
|
+
top_k,
|
|
433
|
+
ts_from.empty() ? nullptr : ts_from.c_str(),
|
|
434
|
+
ts_to.empty() ? nullptr : ts_to.c_str(),
|
|
435
|
+
ck,
|
|
436
|
+
&err);
|
|
437
|
+
if (err)
|
|
438
|
+
{
|
|
439
|
+
std::string msg(err);
|
|
440
|
+
free(err);
|
|
441
|
+
throw std::runtime_error("logosdb_search_ts_range: " + msg);
|
|
442
|
+
}
|
|
443
|
+
std::vector<SearchHit> hits;
|
|
444
|
+
int n = logosdb_result_count(r);
|
|
445
|
+
hits.reserve(n);
|
|
446
|
+
for (int i = 0; i < n; ++i)
|
|
447
|
+
{
|
|
448
|
+
SearchHit h;
|
|
449
|
+
h.id = logosdb_result_id(r, i);
|
|
450
|
+
h.score = logosdb_result_score(r, i);
|
|
451
|
+
const char* t = logosdb_result_text(r, i);
|
|
452
|
+
if (t)
|
|
453
|
+
h.text = t;
|
|
454
|
+
const char* ts = logosdb_result_timestamp(r, i);
|
|
455
|
+
if (ts)
|
|
456
|
+
h.timestamp = ts;
|
|
457
|
+
hits.push_back(std::move(h));
|
|
458
|
+
}
|
|
459
|
+
logosdb_result_free(r);
|
|
460
|
+
return hits;
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
size_t count() const { return logosdb_count(db_); }
|
|
464
|
+
size_t count_live() const { return logosdb_count_live(db_); }
|
|
465
|
+
int dim() const { return logosdb_dim(db_); }
|
|
466
|
+
|
|
467
|
+
// Get raw vectors as float32. For quantized storage, this allocates and dequantizes.
|
|
468
|
+
// The caller receives a vector they own (copy for quantized, view for float32).
|
|
469
|
+
std::vector<float> raw_vectors(size_t& n_rows, int& d) const
|
|
470
|
+
{
|
|
471
|
+
const float* raw = logosdb_raw_vectors(db_, &n_rows, &d);
|
|
472
|
+
if (raw)
|
|
473
|
+
{
|
|
474
|
+
// Float32 storage - return a copy for API consistency
|
|
475
|
+
return std::vector<float>(raw, raw + n_rows * d);
|
|
476
|
+
}
|
|
477
|
+
// Quantized storage - need to dequantize
|
|
478
|
+
n_rows = count();
|
|
479
|
+
d = dim();
|
|
480
|
+
if (n_rows == 0 || d == 0)
|
|
481
|
+
return {};
|
|
482
|
+
|
|
483
|
+
std::vector<float> result(n_rows * d);
|
|
484
|
+
// Note: This is a simplified version - actual implementation would
|
|
485
|
+
// access the storage layer directly. For now, we return empty.
|
|
486
|
+
// Full implementation requires exposing row_to_float32 to C API.
|
|
487
|
+
return result;
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
logosdb_t* handle() { return db_; }
|
|
491
|
+
|
|
492
|
+
private:
|
|
493
|
+
logosdb_t* db_ = nullptr;
|
|
494
|
+
};
|
|
495
|
+
|
|
496
|
+
} // namespace logosdb
|
|
497
|
+
|
|
498
|
+
/* L2-normalization helpers for C++
|
|
499
|
+
*
|
|
500
|
+
* These functions simplify preparing vectors for use with LOGOSDB_DIST_IP
|
|
501
|
+
* (inner product distance), which requires L2-normalized vectors.
|
|
502
|
+
*
|
|
503
|
+
* Example:
|
|
504
|
+
* std::vector<float> vec = load_some_vector(); // unnormalized
|
|
505
|
+
* if (logosdb::l2_normalize(vec)) {
|
|
506
|
+
* db.put(vec, "text", "2025-04-28T10:00:00Z");
|
|
507
|
+
* }
|
|
508
|
+
*
|
|
509
|
+
* Or use l2_normalized() to get a normalized copy:
|
|
510
|
+
* auto normalized = logosdb::l2_normalized(vec);
|
|
511
|
+
* db.put(normalized, "text");
|
|
512
|
+
*/
|
|
513
|
+
namespace logosdb
|
|
514
|
+
{
|
|
515
|
+
|
|
516
|
+
/* Normalize `v` in-place using L2 norm.
|
|
517
|
+
* Returns true on success, false if the vector has zero norm.
|
|
518
|
+
* On zero norm, v is left unchanged. */
|
|
519
|
+
bool l2_normalize(std::vector<float>& v);
|
|
520
|
+
|
|
521
|
+
/* Return an L2-normalized copy of `v`.
|
|
522
|
+
* The input `v` is not modified.
|
|
523
|
+
* Returns a zero vector if input has zero norm (caller should check). */
|
|
524
|
+
std::vector<float> l2_normalized(std::vector<float> v);
|
|
525
|
+
|
|
526
|
+
} // namespace logosdb
|
|
527
|
+
|
|
528
|
+
#endif
|
|
529
|
+
|
|
530
|
+
#endif // LOGOSDB_H
|