native-vector-store 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/README.md +55 -3
  2. package/binding.gyp +3 -2
  3. package/deps/parallel_hashmap/btree.h +4076 -0
  4. package/deps/parallel_hashmap/meminfo.h +195 -0
  5. package/deps/parallel_hashmap/phmap.h +5236 -0
  6. package/deps/parallel_hashmap/phmap_base.h +5115 -0
  7. package/deps/parallel_hashmap/phmap_bits.h +665 -0
  8. package/deps/parallel_hashmap/phmap_config.h +790 -0
  9. package/deps/parallel_hashmap/phmap_dump.h +335 -0
  10. package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
  11. package/deps/parallel_hashmap/phmap_utils.h +407 -0
  12. package/docs/index.html +52 -3
  13. package/lib/index.d.ts +35 -1
  14. package/package.json +1 -1
  15. package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
  16. package/prebuilds/darwin-x64/native-vector-store.node +0 -0
  17. package/prebuilds/linux-arm64/native-vector-store.node +0 -0
  18. package/prebuilds/linux-x64/native-vector-store.node +0 -0
  19. package/src/Makefile +26 -6
  20. package/src/binding.cc +185 -2
  21. package/src/english_abbreviations.h +197 -0
  22. package/src/english_dictionary.h +25185 -0
  23. package/src/english_punctuations.h +42 -0
  24. package/src/english_stop_words.h +434 -0
  25. package/src/simple_sentence_splitter.h +218 -0
  26. package/src/simple_tokenizer.cpp +92 -0
  27. package/src/simple_tokenizer.h +30 -0
  28. package/src/test_bm25.cpp +357 -0
  29. package/src/test_hybrid_search.cpp +496 -0
  30. package/src/vector_store.cpp +239 -3
  31. package/src/vector_store.h +52 -1
  32. package/src/vector_store_loader.cpp +1 -1
  33. package/src/vector_store_loader_adaptive.cpp +1 -1
  34. package/src/vector_store_loader_mmap.cpp +2 -2
  35. package/prebuilds/win32-x64/native-vector-store.node +0 -0
@@ -0,0 +1,42 @@
1
+ #pragma once
2
+ #include <unordered_set>
3
+ #include <string>
4
+
5
/// Immutable, process-wide set of English punctuation tokens.
///
/// The set is fixed at compile time and exposed through a lazily constructed
/// singleton; it can be queried and iterated but never modified.
class EnglishPunctuations {
public:
    /// Access the single shared instance.
    /// Function-local static initialization is thread-safe since C++11.
    static const EnglishPunctuations& getInstance() {
        static EnglishPunctuations shared;
        return shared;
    }

    /// @return true when `mark` is exactly one of the known punctuation tokens.
    bool contains(const std::string& mark) const noexcept {
        return dict_.count(mark) != 0;
    }

    /// @return how many punctuation tokens the set holds.
    std::size_t size() const noexcept { return dict_.size(); }

    /// Read-only iteration over the tokens (unordered).
    auto begin() const noexcept { return dict_.cbegin(); }
    auto end() const noexcept { return dict_.cend(); }

private:
    // Only getInstance() may construct the object.
    EnglishPunctuations() = default;

    // The singleton must never be duplicated.
    EnglishPunctuations(const EnglishPunctuations&) = delete;
    EnglishPunctuations& operator=(const EnglishPunctuations&) = delete;

    // The fixed token table; multi-character marks ("--", "---", "...") are
    // stored as whole tokens.
    const std::unordered_set<std::string> dict_{
        "[", "]", "(", ")", "{", "}", "<", ">", ":",
        ",", ";", "-", "--", "---", "!", "?", ".",
        "...", "`", "'", "\"", "/"
    };
};
@@ -0,0 +1,434 @@
1
+ // EnglishStopWords.h
2
+ #pragma once
3
+
4
+ #include <string>
5
+ #include <unordered_set>
6
+ #include <sstream>
7
+
8
/// Provides access to a single, compiled-in list of English stop words.
///
/// The list lives in a raw string literal (one entry per line) and is parsed
/// into an `unordered_set` the first time it is requested. Initialization of
/// the function-local static is thread-safe in C++11 and later. Lookups are
/// exact, case-sensitive string matches; all entries are lower-case.
class EnglishStopWords {
public:
    /// Returns the singleton stop-word set, building it on first use.
    static const std::unordered_set<std::string>& instance() {
        static const std::unordered_set<std::string> dict = [] {
            // One stop word per line. Blank lines are ignored and surrounding
            // whitespace is stripped when the list is parsed below.
            // "fify" is a historical typo kept for backward compatibility;
            // the intended word "fifty" is listed right after it.
            static constexpr const char* blob = R"STOPWORDS(
*


'll
're
've
n't
's
'm
'da
'n
'ye
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
daren't
needn't
oughtn't
mightn't
a
able
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fify
fifty
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
)STOPWORDS";

            // Characters stripped from both ends of every line, so that a
            // stray '\r' or indentation in the literal cannot end up inside a
            // stored word (which would make it unmatchable).
            const char* const blanks = " \t\r\n\v\f";

            std::unordered_set<std::string> words;
            std::istringstream in{blob};
            for (std::string line; std::getline(in, line); ) {
                const auto first = line.find_first_not_of(blanks);
                if (first == std::string::npos) {
                    continue;  // empty or whitespace-only line
                }
                const auto last = line.find_last_not_of(blanks);
                words.insert(line.substr(first, last - first + 1));
            }
            return words;
        }();
        return dict;
    }

    /// Returns true if `word` is a stop word (exact, case-sensitive match).
    static bool contains(const std::string& word) {
        return instance().count(word) > 0;
    }

    /// Number of distinct stop words loaded.
    static std::size_t size() {
        return instance().size();
    }

private:
    // Purely static utility: prevent instantiation or copying.
    EnglishStopWords() = delete;
    ~EnglishStopWords() = delete;
    EnglishStopWords(const EnglishStopWords&) = delete;
    EnglishStopWords& operator=(const EnglishStopWords&) = delete;
};
434
+
@@ -0,0 +1,218 @@
1
+ // SimpleSentenceSplitter.h
2
+ #pragma once
3
+
4
+ #include <string>
5
+ #include <vector>
6
+ #include <regex>
7
+ #include <algorithm>
8
+ #include <cctype>
9
+
10
+ // Project-local lookup tables this splitter relies on:
11
+ // • EnglishAbbreviations::contains(const std::string&)
12
+ // • EnglishDictionary::instance().count(const std::string&)
13
+ #include "english_abbreviations.h"
14
+ #include "english_dictionary.h"
15
+
16
+ class SimpleSentenceSplitter {
17
+ public:
18
+ /// Singleton accessor
19
+ static SimpleSentenceSplitter& getInstance() {
20
+ static SimpleSentenceSplitter instance;
21
+ return instance;
22
+ }
23
+
24
+ /// Split text into sentences.
25
+ std::vector<std::string> split(const std::string& input) {
26
+ std::vector<std::string> sentences;
27
+ int len = 0;
28
+ std::string text = input;
29
+
30
+ // 1) Normalize carriage returns to spaces
31
+ text = std::regex_replace(text, regexCarriageReturn(), " ");
32
+
33
+ // 2) Clear any stray 0x19 markers
34
+ for (char& c : text) if (c == '\x19') c = ' ';
35
+
36
+ // 3) Insert 0x19 where a space was likely forgotten after .!?
37
+ text = std::regex_replace(text, regexForgottenSpace(), "$1$2\x19$3");
38
+
39
+ // 4) Add a newline so regex can match the final sentence
40
+ text.push_back('\n');
41
+
42
+ auto begin = text.cbegin();
43
+ std::smatch m;
44
+ std::string current;
45
+
46
+ // 5) Loop over sentence-boundary matches
47
+ while (std::regex_search(begin, text.cend(), m, regexSentence())) {
48
+ // Extract groups
49
+ std::string sent = m[1].str();
50
+ std::string punct = m[2].str();
51
+
52
+ // Determine which “after” group matched, and compute its end offset
53
+ std::string after;
54
+ size_t offsetBase = begin - text.cbegin();
55
+ size_t newEnd;
56
+ if (m[3].matched) {
57
+ after = m[3].str();
58
+ newEnd = m.position(3) + m.length(3) + offsetBase;
59
+ }
60
+ else if (m[5].matched) {
61
+ after = m[5].str();
62
+ newEnd = m.position(5) + m.length(5) + offsetBase;
63
+ }
64
+ else {
65
+ after.clear();
66
+ newEnd = m.position(0) + m.length(0) + offsetBase;
67
+ }
68
+
69
+ // Count words in 'sent'
70
+ len += countWords(sent);
71
+
72
+ std::string nextWord = m[4].matched ? m[4].str() : "";
73
+
74
+ // Decide if this is a true break
75
+ bool isBreak = false;
76
+ if (punct == ".") {
77
+ if (!isAbbreviation(sent, nextWord, len)) isBreak = true;
78
+ }
79
+ else if (punct == "!" || punct == "?" || (punct == ":" && len > 6)) {
80
+ isBreak = true;
81
+ }
82
+
83
+ // Append appropriately
84
+ if (isBreak) {
85
+ appendSentence(sentences, current, sent, punct, after);
86
+ len = 0;
87
+ } else {
88
+ appendContinuation(current, sent, punct, after);
89
+ }
90
+
91
+ // Move search cursor forward
92
+ begin = text.cbegin() + newEnd;
93
+ }
94
+
95
+ // Capture any trailing text
96
+ size_t consumed = begin - text.cbegin();
97
+ if (consumed < text.size()) {
98
+ current += text.substr(consumed);
99
+ }
100
+ if (!current.empty()) {
101
+ sentences.push_back(cleanOutput(current));
102
+ }
103
+
104
+ return sentences;
105
+ }
106
+
107
+ private:
108
+ SimpleSentenceSplitter() = default;
109
+ SimpleSentenceSplitter(const SimpleSentenceSplitter&) = delete;
110
+ SimpleSentenceSplitter& operator=(const SimpleSentenceSplitter&) = delete;
111
+
112
+ // Regex factories (thread‐safe init)
113
+ static const std::regex& regexCarriageReturn() {
114
+ static const std::regex r{"[\\n\\r]+"};
115
+ return r;
116
+ }
117
+ static const std::regex& regexForgottenSpace() {
118
+ static const std::regex r{"(.)([\\.!?])([^0-9\\s\\.\"'`\\)\\}\\]])"};
119
+ return r;
120
+ }
121
+ static const std::regex& regexSentence() {
122
+ static const std::regex r{
123
+ R"((['\"`]*[\(\{\[]?[A-Za-z0-9]+.*?)([\.!\?:])"
124
+ R"(?:(?=([\(\[\{\"'`<>]*[ \x19]+)[\(\[\{\"'`\)\}\] ]*([A-Z0-9][a-z]*))"
125
+ R"(|(?=([\(\)\"'`<\}\] \x19]+)\s)))"
126
+ };
127
+ return r;
128
+ }
129
+ static const std::regex& regexWhitespace() {
130
+ static const std::regex r{"\\s+"};
131
+ return r;
132
+ }
133
+ static const std::regex& regexLastWord() {
134
+ static const std::regex r{"\\b([\\w0-9\\.']+)$"};
135
+ return r;
136
+ }
137
+
138
+ // Helpers
139
+ static size_t countWords(const std::string& s) {
140
+ return std::distance(
141
+ std::sregex_token_iterator(s.begin(), s.end(), regexWhitespace(), -1),
142
+ std::sregex_token_iterator{}
143
+ );
144
+ }
145
+
146
+ static std::string extractLastWord(const std::string& s) {
147
+ std::smatch m2;
148
+ if (std::regex_search(s, m2, regexLastWord())) return m2[1].str();
149
+ return "";
150
+ }
151
+
152
+ static bool isAbbreviation(const std::string& sentence,
153
+ const std::string& nextWord,
154
+ int wordCount)
155
+ {
156
+ std::string last = extractLastWord(sentence);
157
+ // Check vowel presence, letter patterns, single-letter
158
+ static const std::regex hasVowel{"[AEIOUaeiou]"};
159
+ static const std::regex hasLower{"[a-z]"};
160
+ static const std::regex hasY{"y"};
161
+ static const std::regex letterDot{"([A-Za-z]\\.)+"};
162
+
163
+ bool cond1 = !std::regex_search(last, hasVowel)
164
+ && std::regex_search(last, hasLower)
165
+ && !std::regex_search(last, hasY);
166
+ bool cond2 = std::regex_match(last, letterDot);
167
+ bool cond3 = (last.size()==1 && std::isalpha(last[0]) && last!="I");
168
+ bool cond4 = EnglishAbbreviations::contains(toLower(last));
169
+
170
+ if (cond1||cond2||cond3||cond4) {
171
+ if (EnglishDictionary::instance().count(nextWord) && wordCount>6) {
172
+ return false; // actually a sentence break
173
+ }
174
+ return true; // abbreviation = no break
175
+ }
176
+ return false;
177
+ }
178
+
179
+ static std::string toLower(const std::string& s) {
180
+ std::string out; out.reserve(s.size());
181
+ for (char c: s) out.push_back(std::tolower((unsigned char)c));
182
+ return out;
183
+ }
184
+
185
+ static void appendSentence(std::vector<std::string>& v,
186
+ std::string& curr,
187
+ const std::string& sent,
188
+ const std::string& punct,
189
+ const std::string& after)
190
+ {
191
+ curr += sent + punct + after;
192
+ v.push_back(cleanOutput(curr));
193
+ curr.clear();
194
+ }
195
+
196
+ static void appendContinuation(std::string& curr,
197
+ const std::string& sent,
198
+ const std::string& punct,
199
+ const std::string& after)
200
+ {
201
+ curr += sent + punct;
202
+ if (after.find('\x19')==std::string::npos) curr.push_back(' ');
203
+ }
204
+
205
+ static std::string cleanOutput(const std::string& s) {
206
+ // Remove markers and trim whitespace
207
+ std::string tmp;
208
+ tmp.reserve(s.size());
209
+ for (char c: s) if (c!='\x19') tmp.push_back(c);
210
+ // Trim
211
+ auto ws = " \t\n\r";
212
+ auto start = tmp.find_first_not_of(ws);
213
+ if (start==std::string::npos) return "";
214
+ auto end = tmp.find_last_not_of(ws);
215
+ return tmp.substr(start, end-start+1);
216
+ }
217
+ };
218
+