native-vector-store 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -3
- package/binding.gyp +3 -2
- package/deps/parallel_hashmap/btree.h +4076 -0
- package/deps/parallel_hashmap/meminfo.h +195 -0
- package/deps/parallel_hashmap/phmap.h +5236 -0
- package/deps/parallel_hashmap/phmap_base.h +5115 -0
- package/deps/parallel_hashmap/phmap_bits.h +665 -0
- package/deps/parallel_hashmap/phmap_config.h +790 -0
- package/deps/parallel_hashmap/phmap_dump.h +335 -0
- package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
- package/deps/parallel_hashmap/phmap_utils.h +407 -0
- package/docs/index.html +52 -3
- package/lib/index.d.ts +35 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/prebuilds/darwin-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-arm64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64/native-vector-store.node +0 -0
- package/src/Makefile +26 -6
- package/src/binding.cc +185 -2
- package/src/english_abbreviations.h +197 -0
- package/src/english_dictionary.h +25185 -0
- package/src/english_punctuations.h +42 -0
- package/src/english_stop_words.h +434 -0
- package/src/simple_sentence_splitter.h +218 -0
- package/src/simple_tokenizer.cpp +92 -0
- package/src/simple_tokenizer.h +30 -0
- package/src/test_bm25.cpp +357 -0
- package/src/test_hybrid_search.cpp +496 -0
- package/src/vector_store.cpp +239 -3
- package/src/vector_store.h +52 -1
- package/prebuilds/win32-x64/native-vector-store.node +0 -0
|
@@ -0,0 +1,407 @@
|
|
|
1
|
+
#if !defined(phmap_utils_h_guard_)
|
|
2
|
+
#define phmap_utils_h_guard_
|
|
3
|
+
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// Copyright (c) 2019, Gregory Popovitch - greg7mdp@gmail.com
|
|
6
|
+
//
|
|
7
|
+
// minimal header providing phmap::HashState
|
|
8
|
+
//
|
|
9
|
+
// use as: phmap::HashState().combine(0, _first_name, _last_name, _age);
|
|
10
|
+
//
|
|
11
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
12
|
+
// you may not use this file except in compliance with the License.
|
|
13
|
+
// You may obtain a copy of the License at
|
|
14
|
+
//
|
|
15
|
+
// https://www.apache.org/licenses/LICENSE-2.0
|
|
16
|
+
//
|
|
17
|
+
// Unless required by applicable law or agreed to in writing, software
|
|
18
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
19
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
20
|
+
// See the License for the specific language governing permissions and
|
|
21
|
+
// limitations under the License.
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
#ifdef _MSC_VER
|
|
25
|
+
#pragma warning(push)
|
|
26
|
+
#pragma warning(disable : 4514) // unreferenced inline function has been removed
|
|
27
|
+
#pragma warning(disable : 4710) // function not inlined
|
|
28
|
+
#pragma warning(disable : 4711) // selected for automatic inline expansion
|
|
29
|
+
#endif
|
|
30
|
+
|
|
31
|
+
#include <cstdint>
#include <cstring>
|
|
32
|
+
#include <functional>
|
|
33
|
+
#include <tuple>
|
|
34
|
+
#include "phmap_bits.h"
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------
|
|
37
|
+
// Absl forward declaration requires global scope.
|
|
38
|
+
// ---------------------------------------------------------------
|
|
39
|
+
#if defined(PHMAP_USE_ABSL_HASH) && !defined(phmap_fwd_decl_h_guard_) && !defined(ABSL_HASH_HASH_H_)
|
|
40
|
+
namespace absl { template <class T> struct Hash; };
|
|
41
|
+
#endif
|
|
42
|
+
|
|
43
|
+
namespace phmap
|
|
44
|
+
{
|
|
45
|
+
|
|
46
|
+
// ---------------------------------------------------------------
|
|
47
|
+
// ---------------------------------------------------------------
|
|
48
|
+
// phmap_mix<N>: functor that scrambles the bits of a size_t value.
// Only N == 4 and N == 8 are specialized below; the primary template
// declares operator() without defining it.
template<int n>
struct phmap_mix
{
    inline size_t operator()(size_t) const;
};

// N == 4: multiply by a 32-bit constant in 64-bit arithmetic, then XOR the
// upper 32 bits of the product into the lower 32 bits.
template<>
struct phmap_mix<4>
{
    inline size_t operator()(size_t a) const
    {
        static constexpr uint64_t kmul = 0xcc9e2d51UL;
        const uint64_t product = a * kmul;
        const uint64_t folded  = product ^ (product >> 32);
        return static_cast<size_t>(folded);
    }
};

#if defined(PHMAP_HAS_UMUL128)
// N == 8, umul128 available: multiply by a 64-bit constant via umul128 and
// add the value it returns to the value it stores through its third argument.
template<>
struct phmap_mix<8>
{
    // Very fast mixing (similar to Abseil)
    inline size_t operator()(size_t a) const
    {
        static constexpr uint64_t k = 0xde5fb9d2630458e9ULL;
        uint64_t stored;
        const uint64_t returned = umul128(a, k, &stored);
        return static_cast<size_t>(stored + returned);
    }
};
#else
// N == 8, no umul128: a fixed sequence of shift/add/xor rounds.
// The order of the rounds determines the result; do not reorder them.
template<>
struct phmap_mix<8>
{
    inline size_t operator()(size_t a) const
    {
        a = ~a + (a << 21);          // a = (a << 21) - a - 1;
        a ^= a >> 24;
        a += (a << 3) + (a << 8);    // a * 265
        a ^= a >> 14;
        a += (a << 2) + (a << 4);    // a * 21
        a ^= a >> 28;
        a += a << 31;
        return a;
    }
};
#endif
|
|
95
|
+
|
|
96
|
+
// --------------------------------------------
|
|
97
|
+
// fold_if_needed<N>: converts a 64-bit value to size_t.
// Only N == 4 and N == 8 are specialized; the primary template declares
// operator() without defining it.
template<int n>
struct fold_if_needed
{
    inline size_t operator()(uint64_t) const;
};

// N == 4: XOR the upper 32 bits into the lower 32 bits before the cast, so
// the upper half still influences the result.
template<>
struct fold_if_needed<4>
{
    inline size_t operator()(uint64_t a) const
    {
        const uint64_t upper_half = a >> 32;
        return static_cast<size_t>(a ^ upper_half);
    }
};

// N == 8: the value is passed through the cast unchanged.
template<>
struct fold_if_needed<8>
{
    inline size_t operator()(uint64_t a) const
    {
        const size_t as_size = static_cast<size_t>(a);
        return as_size;
    }
};
|
|
120
|
+
|
|
121
|
+
// ---------------------------------------------------------------
|
|
122
|
+
// see if class T has a hash_value() friend method
|
|
123
|
+
// ---------------------------------------------------------------
|
|
124
|
+
// Detection trait: has_hash_value<T>::value is true when the unqualified call
// hash_value(const T&) is well-formed and its result can be compared with an
// int; it is false otherwise.
template<typename T>
struct has_hash_value
{
private:
    using yes = std::true_type;
    using no  = std::false_type;

    // Preferred overload (an int argument beats the ellipsis). SFINAE drops
    // it when the hash_value(...) == 1 expression is ill-formed for U.
    template<typename U>
    static auto test(int) -> decltype(hash_value(std::declval<const U&>()) == 1, yes());

    // Fallback overload, selected only when the one above is not viable.
    template<typename>
    static no test(...);

public:
    static constexpr bool value = decltype(test<T>(0))::value;
};
|
|
138
|
+
|
|
139
|
+
#if defined(PHMAP_USE_ABSL_HASH) && !defined(phmap_fwd_decl_h_guard_)
|
|
140
|
+
template <class T> using Hash = ::absl::Hash<T>;
|
|
141
|
+
#elif !defined(PHMAP_USE_ABSL_HASH)
|
|
142
|
+
// ---------------------------------------------------------------
|
|
143
|
+
// phmap::Hash
|
|
144
|
+
// ---------------------------------------------------------------
|
|
145
|
+
template <class T>
|
|
146
|
+
struct Hash
|
|
147
|
+
{
|
|
148
|
+
template <class U, typename std::enable_if<has_hash_value<U>::value, int>::type = 0>
|
|
149
|
+
size_t _hash(const T& val) const
|
|
150
|
+
{
|
|
151
|
+
return hash_value(val);
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
template <class U, typename std::enable_if<!has_hash_value<U>::value, int>::type = 0>
|
|
155
|
+
size_t _hash(const T& val) const
|
|
156
|
+
{
|
|
157
|
+
return std::hash<T>()(val);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
inline size_t operator()(const T& val) const
|
|
161
|
+
{
|
|
162
|
+
return _hash<T>(val);
|
|
163
|
+
}
|
|
164
|
+
};
|
|
165
|
+
|
|
166
|
+
// Empty base that only publishes the argument and result types of a unary
// functor as the member aliases argument_type and result_type.
template<class ArgumentType, class ResultType>
struct phmap_unary_function
{
    using argument_type = ArgumentType;
    using result_type   = ResultType;
};
|
|
172
|
+
|
|
173
|
+
template <>
|
|
174
|
+
struct Hash<bool> : public phmap_unary_function<bool, size_t>
|
|
175
|
+
{
|
|
176
|
+
inline size_t operator()(bool val) const noexcept
|
|
177
|
+
{ return static_cast<size_t>(val); }
|
|
178
|
+
};
|
|
179
|
+
|
|
180
|
+
template <>
|
|
181
|
+
struct Hash<char> : public phmap_unary_function<char, size_t>
|
|
182
|
+
{
|
|
183
|
+
inline size_t operator()(char val) const noexcept
|
|
184
|
+
{ return static_cast<size_t>(val); }
|
|
185
|
+
};
|
|
186
|
+
|
|
187
|
+
template <>
|
|
188
|
+
struct Hash<signed char> : public phmap_unary_function<signed char, size_t>
|
|
189
|
+
{
|
|
190
|
+
inline size_t operator()(signed char val) const noexcept
|
|
191
|
+
{ return static_cast<size_t>(val); }
|
|
192
|
+
};
|
|
193
|
+
|
|
194
|
+
template <>
|
|
195
|
+
struct Hash<unsigned char> : public phmap_unary_function<unsigned char, size_t>
|
|
196
|
+
{
|
|
197
|
+
inline size_t operator()(unsigned char val) const noexcept
|
|
198
|
+
{ return static_cast<size_t>(val); }
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
#ifdef PHMAP_HAS_NATIVE_WCHAR_T
|
|
202
|
+
template <>
|
|
203
|
+
struct Hash<wchar_t> : public phmap_unary_function<wchar_t, size_t>
|
|
204
|
+
{
|
|
205
|
+
inline size_t operator()(wchar_t val) const noexcept
|
|
206
|
+
{ return static_cast<size_t>(val); }
|
|
207
|
+
};
|
|
208
|
+
#endif
|
|
209
|
+
|
|
210
|
+
template <>
|
|
211
|
+
struct Hash<int16_t> : public phmap_unary_function<int16_t, size_t>
|
|
212
|
+
{
|
|
213
|
+
inline size_t operator()(int16_t val) const noexcept
|
|
214
|
+
{ return static_cast<size_t>(val); }
|
|
215
|
+
};
|
|
216
|
+
|
|
217
|
+
template <>
|
|
218
|
+
struct Hash<uint16_t> : public phmap_unary_function<uint16_t, size_t>
|
|
219
|
+
{
|
|
220
|
+
inline size_t operator()(uint16_t val) const noexcept
|
|
221
|
+
{ return static_cast<size_t>(val); }
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
template <>
|
|
225
|
+
struct Hash<int32_t> : public phmap_unary_function<int32_t, size_t>
|
|
226
|
+
{
|
|
227
|
+
inline size_t operator()(int32_t val) const noexcept
|
|
228
|
+
{ return static_cast<size_t>(val); }
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
template <>
|
|
232
|
+
struct Hash<uint32_t> : public phmap_unary_function<uint32_t, size_t>
|
|
233
|
+
{
|
|
234
|
+
inline size_t operator()(uint32_t val) const noexcept
|
|
235
|
+
{ return static_cast<size_t>(val); }
|
|
236
|
+
};
|
|
237
|
+
|
|
238
|
+
template <>
|
|
239
|
+
struct Hash<int64_t> : public phmap_unary_function<int64_t, size_t>
|
|
240
|
+
{
|
|
241
|
+
inline size_t operator()(int64_t val) const noexcept
|
|
242
|
+
{ return fold_if_needed<sizeof(size_t)>()(static_cast<uint64_t>(val)); }
|
|
243
|
+
};
|
|
244
|
+
|
|
245
|
+
template <>
|
|
246
|
+
struct Hash<uint64_t> : public phmap_unary_function<uint64_t, size_t>
|
|
247
|
+
{
|
|
248
|
+
inline size_t operator()(uint64_t val) const noexcept
|
|
249
|
+
{ return fold_if_needed<sizeof(size_t)>()(val); }
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
template <>
|
|
253
|
+
struct Hash<float> : public phmap_unary_function<float, size_t>
|
|
254
|
+
{
|
|
255
|
+
inline size_t operator()(float val) const noexcept
|
|
256
|
+
{
|
|
257
|
+
// -0.0 and 0.0 should return same hash
|
|
258
|
+
uint32_t *as_int = reinterpret_cast<uint32_t *>(&val);
|
|
259
|
+
return (val == 0) ? static_cast<size_t>(0) :
|
|
260
|
+
static_cast<size_t>(*as_int);
|
|
261
|
+
}
|
|
262
|
+
};
|
|
263
|
+
|
|
264
|
+
template <>
|
|
265
|
+
struct Hash<double> : public phmap_unary_function<double, size_t>
|
|
266
|
+
{
|
|
267
|
+
inline size_t operator()(double val) const noexcept
|
|
268
|
+
{
|
|
269
|
+
// -0.0 and 0.0 should return same hash
|
|
270
|
+
uint64_t *as_int = reinterpret_cast<uint64_t *>(&val);
|
|
271
|
+
return (val == 0) ? static_cast<size_t>(0) :
|
|
272
|
+
fold_if_needed<sizeof(size_t)>()(*as_int);
|
|
273
|
+
}
|
|
274
|
+
};
|
|
275
|
+
|
|
276
|
+
#endif
|
|
277
|
+
|
|
278
|
+
// 32-bit rotate-left helper used by Combiner<H, 4>.
// The non-MSVC expansion parenthesizes both arguments and the whole
// expression, so it stays correct inside larger expressions (e.g. `ROTL & m`)
// and with compound arguments (e.g. `a | b`).
#if defined(_MSC_VER)
#   define PHMAP_HASH_ROTL32(x, r) _rotl(x,r)
#else
#   define PHMAP_HASH_ROTL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
#endif


// Combiner<H, sz>: folds one more hash value into a running seed of type H.
// `sz` is passed as sizeof(H) by the callers in this file; only sz == 4 and
// sz == 8 are specialized, the primary template declares operator() only.
template <class H, int sz> struct Combiner
{
    H operator()(H seed, size_t value);
};

// sz == 4: multiply / rotate / multiply on the incoming value, then
// xor / rotate / multiply-add on the seed.
template <class H> struct Combiner<H, 4>
{
    H operator()(H h1, size_t k1)
    {
        // Copyright 2005-2014 Daniel James.
        // Distributed under the Boost Software License, Version 1.0. (See accompanying
        // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

        const uint32_t c1 = 0xcc9e2d51;
        const uint32_t c2 = 0x1b873593;

        k1 *= c1;
        k1 = PHMAP_HASH_ROTL32(k1,15);
        k1 *= c2;

        h1 ^= k1;
        h1 = PHMAP_HASH_ROTL32(h1,13);
        h1 = h1*5+0xe6546b64;

        return h1;
    }
};

// sz == 8: multiply / xor-shift / multiply on the incoming value, then
// xor and multiply it into the seed and add a constant.
template <class H> struct Combiner<H, 8>
{
    H operator()(H h, size_t k)
    {
        // Copyright 2005-2014 Daniel James.
        // Distributed under the Boost Software License, Version 1.0. (See accompanying
        // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
        const uint64_t m = (uint64_t(0xc6a4a793) << 32) + 0x5bd1e995;
        const int r = 47;

        k *= m;
        k ^= k >> r;
        k *= m;

        h ^= k;
        h *= m;

        // Completely arbitrary number, to prevent 0's
        // from hashing to 0.
        h += 0xe6546b64;

        return h;
    }
};
|
|
337
|
+
|
|
338
|
+
// define HashState to combine member hashes... see the usage example in the header comment at the top of this file
|
|
339
|
+
// -----------------------------------------------------------------------------
|
|
340
|
+
template <typename H>
|
|
341
|
+
class HashStateBase {
|
|
342
|
+
public:
|
|
343
|
+
template <typename T, typename... Ts>
|
|
344
|
+
static H combine(H state, const T& value, const Ts&... values);
|
|
345
|
+
|
|
346
|
+
static H combine(H state) { return state; }
|
|
347
|
+
};
|
|
348
|
+
|
|
349
|
+
template <typename H>
|
|
350
|
+
template <typename T, typename... Ts>
|
|
351
|
+
H HashStateBase<H>::combine(H seed, const T& v, const Ts&... vs)
|
|
352
|
+
{
|
|
353
|
+
return HashStateBase<H>::combine(Combiner<H, sizeof(H)>()(
|
|
354
|
+
seed, phmap::Hash<T>()(v)),
|
|
355
|
+
vs...);
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
using HashState = HashStateBase<size_t>;
|
|
359
|
+
|
|
360
|
+
// -----------------------------------------------------------------------------
|
|
361
|
+
|
|
362
|
+
#if !defined(PHMAP_USE_ABSL_HASH)
|
|
363
|
+
|
|
364
|
+
// define Hash for std::pair
|
|
365
|
+
// -------------------------
|
|
366
|
+
template<class T1, class T2>
|
|
367
|
+
struct Hash<std::pair<T1, T2>> {
|
|
368
|
+
size_t operator()(std::pair<T1, T2> const& p) const noexcept {
|
|
369
|
+
return phmap::HashState().combine(phmap::Hash<T1>()(p.first), p.second);
|
|
370
|
+
}
|
|
371
|
+
};
|
|
372
|
+
|
|
373
|
+
// define Hash for std::tuple
|
|
374
|
+
// --------------------------
|
|
375
|
+
template<class... T>
|
|
376
|
+
struct Hash<std::tuple<T...>> {
|
|
377
|
+
size_t operator()(std::tuple<T...> const& t) const noexcept {
|
|
378
|
+
size_t seed = 0;
|
|
379
|
+
return _hash_helper(seed, t);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
private:
|
|
383
|
+
template<size_t I = 0, class TUP>
|
|
384
|
+
typename std::enable_if<I == std::tuple_size<TUP>::value, size_t>::type
|
|
385
|
+
_hash_helper(size_t seed, const TUP &) const noexcept { return seed; }
|
|
386
|
+
|
|
387
|
+
template<size_t I = 0, class TUP>
|
|
388
|
+
typename std::enable_if<I < std::tuple_size<TUP>::value, size_t>::type
|
|
389
|
+
_hash_helper(size_t seed, const TUP &t) const noexcept {
|
|
390
|
+
const auto &el = std::get<I>(t);
|
|
391
|
+
using el_type = typename std::remove_cv<typename std::remove_reference<decltype(el)>::type>::type;
|
|
392
|
+
seed = Combiner<size_t, sizeof(size_t)>()(seed, phmap::Hash<el_type>()(el));
|
|
393
|
+
return _hash_helper<I + 1>(seed, t);
|
|
394
|
+
}
|
|
395
|
+
};
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
#endif
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
} // namespace phmap
|
|
402
|
+
|
|
403
|
+
#ifdef _MSC_VER
|
|
404
|
+
#pragma warning(pop)
|
|
405
|
+
#endif
|
|
406
|
+
|
|
407
|
+
#endif // phmap_utils_h_guard_
|
package/docs/index.html
CHANGED
|
@@ -60,8 +60,9 @@
|
|
|
60
60
|
<li><strong>🚀 High Performance</strong>: C++ implementation with OpenMP SIMD optimization</li>
|
|
61
61
|
<li><strong>📦 Arena Allocation</strong>: Memory-efficient storage with 64MB chunks</li>
|
|
62
62
|
<li><strong>⚡ Fast Search</strong>: Sub-10ms similarity search for large document collections</li>
|
|
63
|
+
<li><strong>🔍 Hybrid Search</strong>: Combines vector similarity (semantic) with BM25 text search (lexical)</li>
|
|
63
64
|
<li><strong>🔧 MCP Integration</strong>: Built for Model Context Protocol servers</li>
|
|
64
|
-
<li><strong>🌐 Cross-Platform</strong>: Works on Linux
|
|
65
|
+
<li><strong>🌐 Cross-Platform</strong>: Works on Linux and macOS (Windows users: use WSL)</li>
|
|
65
66
|
<li><strong>📊 TypeScript Support</strong>: Full type definitions included</li>
|
|
66
67
|
<li><strong>🔄 Producer-Consumer Loading</strong>: Parallel document loading at 178k+ docs/sec</li>
|
|
67
68
|
</ul>
|
|
@@ -85,7 +86,7 @@
|
|
|
85
86
|
<li><strong>Linux</strong>: <code>sudo apt-get install libgomp1</code> (Ubuntu/Debian) or <code>dnf install libgomp</code> (Fedora)</li>
|
|
86
87
|
<li><strong>Alpine</strong>: <code>apk add libgomp</code></li>
|
|
87
88
|
<li><strong>macOS</strong>: <code>brew install libomp</code></li>
|
|
88
|
-
<li><strong>Windows</strong>:
|
|
89
|
+
<li><strong>Windows</strong>: Use WSL (Windows Subsystem for Linux)</li>
|
|
89
90
|
</ul>
|
|
90
91
|
</li>
|
|
91
92
|
</ul>
|
|
@@ -93,7 +94,6 @@
|
|
|
93
94
|
<ul>
|
|
94
95
|
<li>Linux (x64, arm64, musl/Alpine) - x64 builds are AWS Lambda compatible (no AVX-512)</li>
|
|
95
96
|
<li>macOS (x64, arm64/Apple Silicon)</li>
|
|
96
|
-
<li>Windows (x64)</li>
|
|
97
97
|
</ul>
|
|
98
98
|
<p>If building from source, you'll need:</p>
|
|
99
99
|
<ul>
|
|
@@ -125,8 +125,16 @@ store.finalize(); // Must call before searching!
|
|
|
125
125
|
|
|
126
126
|
// Search for similar documents
|
|
127
127
|
const queryEmbedding = new Float32Array(1536);
|
|
128
|
+
|
|
129
|
+
// Option 1: Vector-only search (traditional)
|
|
128
130
|
const results = store.search(queryEmbedding, 5); // Top 5 results
|
|
129
131
|
|
|
132
|
+
// Option 2: Hybrid search (NEW - combines vector + BM25 text search)
|
|
133
|
+
const hybridResults = store.search(queryEmbedding, 5, "your search query text");
|
|
134
|
+
|
|
135
|
+
// Option 3: BM25 text-only search
|
|
136
|
+
const textResults = store.searchBM25("your search query", 5);
|
|
137
|
+
|
|
130
138
|
// Results format - array of SearchResult objects, sorted by score (highest first):
|
|
131
139
|
console.log(results);
|
|
132
140
|
// [
|
|
@@ -284,6 +292,47 @@ if (process.env.NODE_ENV === 'development') {
|
|
|
284
292
|
fs.watch('./documents', { recursive: true }, reloadStore);
|
|
285
293
|
}
|
|
286
294
|
</code></pre>
|
|
295
|
+
<h2 id="hybrid-search">Hybrid Search</h2>
|
|
296
|
+
<p>The vector store now supports hybrid search, combining semantic similarity (vector search) with lexical matching (BM25 text search) for improved retrieval accuracy:</p>
|
|
297
|
+
<pre class="prettyprint source lang-javascript"><code>const { VectorStore } = require('native-vector-store');
|
|
298
|
+
|
|
299
|
+
const store = new VectorStore(1536);
|
|
300
|
+
store.loadDir('./documents');
|
|
301
|
+
|
|
302
|
+
// Hybrid search automatically combines vector and text search
|
|
303
|
+
const queryEmbedding = new Float32Array(1536);
|
|
304
|
+
const results = store.search(
|
|
305
|
+
queryEmbedding,
|
|
306
|
+
10, // Top 10 results
|
|
307
|
+
"machine learning algorithms" // Query text for BM25
|
|
308
|
+
);
|
|
309
|
+
|
|
310
|
+
// You can also use individual search methods
|
|
311
|
+
const vectorResults = store.searchVector(queryEmbedding, 10);
|
|
312
|
+
const textResults = store.searchBM25("machine learning", 10);
|
|
313
|
+
|
|
314
|
+
// Or explicitly control the hybrid weights
|
|
315
|
+
const customResults = store.searchHybrid(
|
|
316
|
+
queryEmbedding,
|
|
317
|
+
"machine learning",
|
|
318
|
+
10,
|
|
319
|
+
0.3, // Vector weight (30%)
|
|
320
|
+
0.7 // BM25 weight (70%)
|
|
321
|
+
);
|
|
322
|
+
|
|
323
|
+
// Tune BM25 parameters for your corpus
|
|
324
|
+
store.setBM25Parameters(
|
|
325
|
+
1.2, // k1: Term frequency saturation (default: 1.2)
|
|
326
|
+
0.75, // b: Document length normalization (default: 0.75)
|
|
327
|
+
1.0 // delta: Smoothing parameter (default: 1.0)
|
|
328
|
+
);
|
|
329
|
+
</code></pre>
|
|
330
|
+
<p>Hybrid search is particularly effective for:</p>
|
|
331
|
+
<ul>
|
|
332
|
+
<li><strong>Question answering</strong>: BM25 finds documents with exact terms while vectors capture semantic meaning</li>
|
|
333
|
+
<li><strong>Knowledge retrieval</strong>: Combines conceptual similarity with keyword matching</li>
|
|
334
|
+
<li><strong>Multi-lingual search</strong>: Vectors handle cross-language similarity while BM25 matches exact terms</li>
|
|
335
|
+
</ul>
|
|
287
336
|
<h2 id="mcp-server-integration">MCP Server Integration</h2>
|
|
288
337
|
<p>Perfect for building local RAG capabilities in MCP servers:</p>
|
|
289
338
|
<pre class="prettyprint source lang-javascript"><code>const { MCPVectorServer } = require('native-vector-store/examples/mcp-server');
|
package/lib/index.d.ts
CHANGED
|
@@ -31,11 +31,45 @@ export class VectorStore {
|
|
|
31
31
|
|
|
32
32
|
/**
|
|
33
33
|
* Search for k most similar documents
|
|
34
|
+
* Uses hybrid search if queryText is provided, otherwise vector-only search
|
|
35
|
+
* @param query - Query embedding vector
|
|
36
|
+
* @param k - Number of results to return
|
|
37
|
+
* @param queryText - Optional text query for hybrid search (BM25 + vector)
|
|
38
|
+
*/
|
|
39
|
+
search(query: Float32Array, k: number, queryText?: string): SearchResult[];
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Pure vector similarity search
|
|
34
43
|
* @param query - Query embedding vector
|
|
35
44
|
* @param k - Number of results to return
|
|
36
45
|
* @param normalizeQuery - Whether to L2 normalize the query (default: true)
|
|
37
46
|
*/
|
|
38
|
-
|
|
47
|
+
searchVector(query: Float32Array, k: number, normalizeQuery?: boolean): SearchResult[];
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* Pure BM25 text search
|
|
51
|
+
* @param queryText - Text query or array of query terms
|
|
52
|
+
* @param k - Number of results to return
|
|
53
|
+
*/
|
|
54
|
+
searchBM25(queryText: string | string[], k: number): SearchResult[];
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Hybrid search combining vector similarity and BM25 text search
|
|
58
|
+
* @param query - Query embedding vector
|
|
59
|
+
* @param queryText - Text query for BM25 component
|
|
60
|
+
* @param k - Number of results to return
|
|
61
|
+
* @param vectorWeight - Weight for vector similarity (default: 0.5)
|
|
62
|
+
* @param bm25Weight - Weight for BM25 score (default: 0.5)
|
|
63
|
+
*/
|
|
64
|
+
searchHybrid(query: Float32Array, queryText: string, k: number, vectorWeight?: number, bm25Weight?: number): SearchResult[];
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Set BM25 parameters for text search
|
|
68
|
+
* @param k1 - Controls term frequency saturation (default: 1.2)
|
|
69
|
+
* @param b - Controls document length normalization (default: 0.75)
|
|
70
|
+
* @param delta - Smoothing parameter (default: 1.0)
|
|
71
|
+
*/
|
|
72
|
+
setBM25Parameters(k1: number, b: number, delta?: number): void;
|
|
39
73
|
|
|
40
74
|
/**
|
|
41
75
|
* Normalize all stored embeddings
|
package/package.json
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/src/Makefile
CHANGED
|
@@ -14,7 +14,7 @@ CXXFLAGS = -std=c++17 -g -O0 -fno-omit-frame-pointer -DDEBUG
|
|
|
14
14
|
# OS-specific configuration
|
|
15
15
|
ifeq ($(UNAME_S),Darwin)
|
|
16
16
|
# macOS configuration
|
|
17
|
-
INCLUDES = -I. -I../deps/simdjson -I../deps/atomic_queue -I/opt/homebrew/opt/libomp/include
|
|
17
|
+
INCLUDES = -I. -I../deps/simdjson -I../deps/atomic_queue -I../deps -I/opt/homebrew/opt/libomp/include
|
|
18
18
|
LDFLAGS = -L/opt/homebrew/opt/libomp/lib
|
|
19
19
|
LIBS = -lomp
|
|
20
20
|
# Add flags for better debugging with lldb
|
|
@@ -23,7 +23,7 @@ ifeq ($(UNAME_S),Darwin)
|
|
|
23
23
|
CXXFLAGS += -Xpreprocessor -fopenmp
|
|
24
24
|
else
|
|
25
25
|
# Linux configuration
|
|
26
|
-
INCLUDES = -I. -I../deps/simdjson -I../deps/atomic_queue
|
|
26
|
+
INCLUDES = -I. -I../deps/simdjson -I../deps/atomic_queue -I../deps
|
|
27
27
|
LDFLAGS = -L/usr/lib
|
|
28
28
|
LIBS = -lgomp
|
|
29
29
|
# OpenMP flags for Linux
|
|
@@ -32,12 +32,18 @@ endif
|
|
|
32
32
|
|
|
33
33
|
TARGET = test_vector_store
|
|
34
34
|
STRESS_TARGET = test_stress
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
BM25_TARGET = test_bm25
|
|
36
|
+
HYBRID_TARGET = test_hybrid_search
|
|
37
|
+
SOURCES = test_main.cpp vector_store.cpp simple_tokenizer.cpp ../deps/simdjson/simdjson.cpp
|
|
38
|
+
STRESS_SOURCES = test_stress.cpp vector_store.cpp simple_tokenizer.cpp vector_store_loader.cpp vector_store_loader_mmap.cpp vector_store_loader_adaptive.cpp ../deps/simdjson/simdjson.cpp
|
|
39
|
+
BM25_SOURCES = test_bm25.cpp vector_store.cpp simple_tokenizer.cpp ../deps/simdjson/simdjson.cpp
|
|
40
|
+
HYBRID_SOURCES = test_hybrid_search.cpp vector_store.cpp simple_tokenizer.cpp ../deps/simdjson/simdjson.cpp
|
|
37
41
|
OBJECTS = $(SOURCES:.cpp=.o)
|
|
38
42
|
STRESS_OBJECTS = $(STRESS_SOURCES:.cpp=.o)
|
|
43
|
+
BM25_OBJECTS = $(BM25_SOURCES:.cpp=.o)
|
|
44
|
+
HYBRID_OBJECTS = $(HYBRID_SOURCES:.cpp=.o)
|
|
39
45
|
|
|
40
|
-
all: $(TARGET) $(STRESS_TARGET)
|
|
46
|
+
all: $(TARGET) $(STRESS_TARGET) $(BM25_TARGET) $(HYBRID_TARGET)
|
|
41
47
|
|
|
42
48
|
$(TARGET): $(OBJECTS)
|
|
43
49
|
$(CXX) $(CXXFLAGS) $(OBJECTS) -o $(TARGET) $(LDFLAGS) $(LIBS)
|
|
@@ -45,11 +51,25 @@ $(TARGET): $(OBJECTS)
|
|
|
45
51
|
$(STRESS_TARGET): $(STRESS_OBJECTS)
|
|
46
52
|
$(CXX) $(CXXFLAGS) $(STRESS_OBJECTS) -o $(STRESS_TARGET) $(LDFLAGS) $(LIBS)
|
|
47
53
|
|
|
54
|
+
$(BM25_TARGET): $(BM25_OBJECTS)
|
|
55
|
+
$(CXX) $(CXXFLAGS) $(BM25_OBJECTS) -o $(BM25_TARGET) $(LDFLAGS) $(LIBS)
|
|
56
|
+
|
|
57
|
+
$(HYBRID_TARGET): $(HYBRID_OBJECTS)
|
|
58
|
+
$(CXX) $(CXXFLAGS) $(HYBRID_OBJECTS) -o $(HYBRID_TARGET) $(LDFLAGS) $(LIBS)
|
|
59
|
+
|
|
48
60
|
%.o: %.cpp
|
|
49
61
|
$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
|
|
50
62
|
|
|
51
63
|
clean:
|
|
52
|
-
rm -f $(OBJECTS) $(STRESS_OBJECTS) $(TARGET) $(STRESS_TARGET)
|
|
64
|
+
rm -f $(OBJECTS) $(STRESS_OBJECTS) $(BM25_OBJECTS) $(HYBRID_OBJECTS) $(TARGET) $(STRESS_TARGET) $(BM25_TARGET) $(HYBRID_TARGET)
|
|
65
|
+
|
|
66
|
+
# BM25 test
|
|
67
|
+
bm25: $(BM25_TARGET)
|
|
68
|
+
./$(BM25_TARGET)
|
|
69
|
+
|
|
70
|
+
# Hybrid search test
|
|
71
|
+
hybrid: $(HYBRID_TARGET)
|
|
72
|
+
./$(HYBRID_TARGET)
|
|
53
73
|
|
|
54
74
|
# Force rebuild (useful for CI)
|
|
55
75
|
rebuild: clean all
|