native-vector-store 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -3
- package/binding.gyp +3 -2
- package/deps/parallel_hashmap/btree.h +4076 -0
- package/deps/parallel_hashmap/meminfo.h +195 -0
- package/deps/parallel_hashmap/phmap.h +5236 -0
- package/deps/parallel_hashmap/phmap_base.h +5115 -0
- package/deps/parallel_hashmap/phmap_bits.h +665 -0
- package/deps/parallel_hashmap/phmap_config.h +790 -0
- package/deps/parallel_hashmap/phmap_dump.h +335 -0
- package/deps/parallel_hashmap/phmap_fwd_decl.h +186 -0
- package/deps/parallel_hashmap/phmap_utils.h +407 -0
- package/docs/index.html +52 -3
- package/lib/index.d.ts +35 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/native-vector-store.node +0 -0
- package/prebuilds/darwin-x64/native-vector-store.node +0 -0
- package/prebuilds/linux-arm64/native-vector-store.node +0 -0
- package/prebuilds/linux-x64/native-vector-store.node +0 -0
- package/src/Makefile +26 -6
- package/src/binding.cc +185 -2
- package/src/english_abbreviations.h +197 -0
- package/src/english_dictionary.h +25185 -0
- package/src/english_punctuations.h +42 -0
- package/src/english_stop_words.h +434 -0
- package/src/simple_sentence_splitter.h +218 -0
- package/src/simple_tokenizer.cpp +92 -0
- package/src/simple_tokenizer.h +30 -0
- package/src/test_bm25.cpp +357 -0
- package/src/test_hybrid_search.cpp +496 -0
- package/src/vector_store.cpp +239 -3
- package/src/vector_store.h +52 -1
- package/src/vector_store_loader.cpp +1 -1
- package/src/vector_store_loader_adaptive.cpp +1 -1
- package/src/vector_store_loader_mmap.cpp +2 -2
- package/prebuilds/win32-x64/native-vector-store.node +0 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <unordered_set>
|
|
3
|
+
#include <string>
|
|
4
|
+
|
|
5
|
+
/// Fixed, compiled-in set of English punctuation tokens.
/// Access goes through the process-wide singleton; callers cannot create,
/// copy, or move instances of this class.
class EnglishPunctuations {
public:
    /// Lazily-constructed singleton (initialization is thread-safe since C++11).
    static const EnglishPunctuations& getInstance() {
        static EnglishPunctuations singleton;
        return singleton;
    }

    /// True when `mark` is one of the known punctuation tokens (exact match).
    bool contains(const std::string& mark) const noexcept {
        return marks_.count(mark) != 0;
    }

    /// Total number of punctuation tokens in the set.
    std::size_t size() const noexcept { return marks_.size(); }

    /// Const iteration over the underlying set (unspecified order).
    auto begin() const noexcept { return marks_.begin(); }
    auto end() const noexcept { return marks_.end(); }

private:
    // Reachable only via getInstance(); builds the immutable token set.
    EnglishPunctuations() : marks_(makeMarks()) {}

    // Exactly one instance exists: copying and assignment are disabled.
    EnglishPunctuations(const EnglishPunctuations&) = delete;
    EnglishPunctuations& operator=(const EnglishPunctuations&) = delete;

    // The fixed token list, factored into a helper for readability.
    static std::unordered_set<std::string> makeMarks() {
        return {
            "[", "]", "(", ")", "{", "}", "<", ">", ":",
            ",", ";", "-", "--", "---", "!", "?", ".",
            "...", "`", "'", "\"", "/"
        };
    }

    const std::unordered_set<std::string> marks_;
};
|
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
// EnglishStopWords.h
|
|
2
|
+
#pragma once
|
|
3
|
+
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <unordered_set>
|
|
6
|
+
#include <sstream>
|
|
7
|
+
|
|
8
|
+
/// Provides access to a single, compiled-in list of English stop words.
/// The set is built once (thread-safe in C++11+) from the raw string literal below.
/// Lookups are exact and case-sensitive; the list includes contraction
/// suffixes (e.g. "'ll", "n't") and a couple of bullet glyphs on purpose.
class EnglishStopWords {
public:
    /// Returns the singleton stop-word set.
    /// The raw blob is parsed exactly once, on first call.
    static const std::unordered_set<std::string>& instance() {
        static const std::unordered_set<std::string> dict = []{
            // Paste your newline-separated stop-words list between the delimiters:
            static constexpr const char* blob = R"STOPWORDS(
*
•

'll
're
've
n't
's
'm
'da
'n
'ye
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
daren't
needn't
oughtn't
mightn't
a
able
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fify
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
)STOPWORDS";

            std::unordered_set<std::string> tmp;
            std::istringstream in{blob};
            for (std::string w; std::getline(in, w); ) {
                // Robustness fix: strip trailing whitespace (notably '\r') so the
                // list still parses word-by-word if this source file is ever saved
                // with CRLF line endings — std::getline only strips the '\n'.
                while (!w.empty() &&
                       (w.back() == '\r' || w.back() == ' ' || w.back() == '\t')) {
                    w.pop_back();
                }
                if (!w.empty()) {
                    tmp.insert(w);
                }
            }
            return tmp;
        }();
        return dict;
    }

    /// Returns true if `word` is a stop word (exact, case-sensitive match).
    static bool contains(const std::string& word) {
        return instance().count(word) > 0;
    }

    /// Number of distinct stop words loaded.
    static std::size_t size() {
        return instance().size();
    }

private:
    // Static-only utility: prevent instantiation or copying.
    EnglishStopWords() = delete;
    ~EnglishStopWords() = delete;
    EnglishStopWords(const EnglishStopWords&) = delete;
    EnglishStopWords& operator=(const EnglishStopWords&) = delete;
};
|
|
434
|
+
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
// SimpleSentenceSplitter.h
|
|
2
|
+
#pragma once
|
|
3
|
+
|
|
4
|
+
#include <string>
|
|
5
|
+
#include <vector>
|
|
6
|
+
#include <regex>
|
|
7
|
+
#include <algorithm>
|
|
8
|
+
#include <cctype>
|
|
9
|
+
|
|
10
|
+
// You’ll need C++ ports of these:
|
|
11
|
+
// • EnglishAbbreviations::contains(const std::string&)
|
|
12
|
+
// • EnglishDictionary::instance().count(const std::string&)
|
|
13
|
+
#include "english_abbreviations.h"
|
|
14
|
+
#include "english_dictionary.h"
|
|
15
|
+
|
|
16
|
+
class SimpleSentenceSplitter {
|
|
17
|
+
public:
|
|
18
|
+
/// Singleton accessor
|
|
19
|
+
static SimpleSentenceSplitter& getInstance() {
|
|
20
|
+
static SimpleSentenceSplitter instance;
|
|
21
|
+
return instance;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/// Split text into sentences.
|
|
25
|
+
std::vector<std::string> split(const std::string& input) {
|
|
26
|
+
std::vector<std::string> sentences;
|
|
27
|
+
int len = 0;
|
|
28
|
+
std::string text = input;
|
|
29
|
+
|
|
30
|
+
// 1) Normalize carriage returns to spaces
|
|
31
|
+
text = std::regex_replace(text, regexCarriageReturn(), " ");
|
|
32
|
+
|
|
33
|
+
// 2) Clear any stray 0x19 markers
|
|
34
|
+
for (char& c : text) if (c == '\x19') c = ' ';
|
|
35
|
+
|
|
36
|
+
// 3) Insert 0x19 where a space was likely forgotten after .!?
|
|
37
|
+
text = std::regex_replace(text, regexForgottenSpace(), "$1$2\x19$3");
|
|
38
|
+
|
|
39
|
+
// 4) Add a newline so regex can match the final sentence
|
|
40
|
+
text.push_back('\n');
|
|
41
|
+
|
|
42
|
+
auto begin = text.cbegin();
|
|
43
|
+
std::smatch m;
|
|
44
|
+
std::string current;
|
|
45
|
+
|
|
46
|
+
// 5) Loop over sentence-boundary matches
|
|
47
|
+
while (std::regex_search(begin, text.cend(), m, regexSentence())) {
|
|
48
|
+
// Extract groups
|
|
49
|
+
std::string sent = m[1].str();
|
|
50
|
+
std::string punct = m[2].str();
|
|
51
|
+
|
|
52
|
+
// Determine which “after” group matched, and compute its end offset
|
|
53
|
+
std::string after;
|
|
54
|
+
size_t offsetBase = begin - text.cbegin();
|
|
55
|
+
size_t newEnd;
|
|
56
|
+
if (m[3].matched) {
|
|
57
|
+
after = m[3].str();
|
|
58
|
+
newEnd = m.position(3) + m.length(3) + offsetBase;
|
|
59
|
+
}
|
|
60
|
+
else if (m[5].matched) {
|
|
61
|
+
after = m[5].str();
|
|
62
|
+
newEnd = m.position(5) + m.length(5) + offsetBase;
|
|
63
|
+
}
|
|
64
|
+
else {
|
|
65
|
+
after.clear();
|
|
66
|
+
newEnd = m.position(0) + m.length(0) + offsetBase;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Count words in 'sent'
|
|
70
|
+
len += countWords(sent);
|
|
71
|
+
|
|
72
|
+
std::string nextWord = m[4].matched ? m[4].str() : "";
|
|
73
|
+
|
|
74
|
+
// Decide if this is a true break
|
|
75
|
+
bool isBreak = false;
|
|
76
|
+
if (punct == ".") {
|
|
77
|
+
if (!isAbbreviation(sent, nextWord, len)) isBreak = true;
|
|
78
|
+
}
|
|
79
|
+
else if (punct == "!" || punct == "?" || (punct == ":" && len > 6)) {
|
|
80
|
+
isBreak = true;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Append appropriately
|
|
84
|
+
if (isBreak) {
|
|
85
|
+
appendSentence(sentences, current, sent, punct, after);
|
|
86
|
+
len = 0;
|
|
87
|
+
} else {
|
|
88
|
+
appendContinuation(current, sent, punct, after);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Move search cursor forward
|
|
92
|
+
begin = text.cbegin() + newEnd;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// Capture any trailing text
|
|
96
|
+
size_t consumed = begin - text.cbegin();
|
|
97
|
+
if (consumed < text.size()) {
|
|
98
|
+
current += text.substr(consumed);
|
|
99
|
+
}
|
|
100
|
+
if (!current.empty()) {
|
|
101
|
+
sentences.push_back(cleanOutput(current));
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
return sentences;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
private:
|
|
108
|
+
SimpleSentenceSplitter() = default;
|
|
109
|
+
SimpleSentenceSplitter(const SimpleSentenceSplitter&) = delete;
|
|
110
|
+
SimpleSentenceSplitter& operator=(const SimpleSentenceSplitter&) = delete;
|
|
111
|
+
|
|
112
|
+
// Regex factories (thread‐safe init)
|
|
113
|
+
static const std::regex& regexCarriageReturn() {
|
|
114
|
+
static const std::regex r{"[\\n\\r]+"};
|
|
115
|
+
return r;
|
|
116
|
+
}
|
|
117
|
+
static const std::regex& regexForgottenSpace() {
|
|
118
|
+
static const std::regex r{"(.)([\\.!?])([^0-9\\s\\.\"'`\\)\\}\\]])"};
|
|
119
|
+
return r;
|
|
120
|
+
}
|
|
121
|
+
static const std::regex& regexSentence() {
|
|
122
|
+
static const std::regex r{
|
|
123
|
+
R"((['\"`]*[\(\{\[]?[A-Za-z0-9]+.*?)([\.!\?:])"
|
|
124
|
+
R"(?:(?=([\(\[\{\"'`<>]*[ \x19]+)[\(\[\{\"'`\)\}\] ]*([A-Z0-9][a-z]*))"
|
|
125
|
+
R"(|(?=([\(\)\"'`<\}\] \x19]+)\s)))"
|
|
126
|
+
};
|
|
127
|
+
return r;
|
|
128
|
+
}
|
|
129
|
+
static const std::regex& regexWhitespace() {
|
|
130
|
+
static const std::regex r{"\\s+"};
|
|
131
|
+
return r;
|
|
132
|
+
}
|
|
133
|
+
static const std::regex& regexLastWord() {
|
|
134
|
+
static const std::regex r{"\\b([\\w0-9\\.']+)$"};
|
|
135
|
+
return r;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Helpers
|
|
139
|
+
static size_t countWords(const std::string& s) {
|
|
140
|
+
return std::distance(
|
|
141
|
+
std::sregex_token_iterator(s.begin(), s.end(), regexWhitespace(), -1),
|
|
142
|
+
std::sregex_token_iterator{}
|
|
143
|
+
);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
static std::string extractLastWord(const std::string& s) {
|
|
147
|
+
std::smatch m2;
|
|
148
|
+
if (std::regex_search(s, m2, regexLastWord())) return m2[1].str();
|
|
149
|
+
return "";
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
static bool isAbbreviation(const std::string& sentence,
|
|
153
|
+
const std::string& nextWord,
|
|
154
|
+
int wordCount)
|
|
155
|
+
{
|
|
156
|
+
std::string last = extractLastWord(sentence);
|
|
157
|
+
// Check vowel presence, letter patterns, single-letter
|
|
158
|
+
static const std::regex hasVowel{"[AEIOUaeiou]"};
|
|
159
|
+
static const std::regex hasLower{"[a-z]"};
|
|
160
|
+
static const std::regex hasY{"y"};
|
|
161
|
+
static const std::regex letterDot{"([A-Za-z]\\.)+"};
|
|
162
|
+
|
|
163
|
+
bool cond1 = !std::regex_search(last, hasVowel)
|
|
164
|
+
&& std::regex_search(last, hasLower)
|
|
165
|
+
&& !std::regex_search(last, hasY);
|
|
166
|
+
bool cond2 = std::regex_match(last, letterDot);
|
|
167
|
+
bool cond3 = (last.size()==1 && std::isalpha(last[0]) && last!="I");
|
|
168
|
+
bool cond4 = EnglishAbbreviations::contains(toLower(last));
|
|
169
|
+
|
|
170
|
+
if (cond1||cond2||cond3||cond4) {
|
|
171
|
+
if (EnglishDictionary::instance().count(nextWord) && wordCount>6) {
|
|
172
|
+
return false; // actually a sentence break
|
|
173
|
+
}
|
|
174
|
+
return true; // abbreviation = no break
|
|
175
|
+
}
|
|
176
|
+
return false;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
static std::string toLower(const std::string& s) {
|
|
180
|
+
std::string out; out.reserve(s.size());
|
|
181
|
+
for (char c: s) out.push_back(std::tolower((unsigned char)c));
|
|
182
|
+
return out;
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
static void appendSentence(std::vector<std::string>& v,
|
|
186
|
+
std::string& curr,
|
|
187
|
+
const std::string& sent,
|
|
188
|
+
const std::string& punct,
|
|
189
|
+
const std::string& after)
|
|
190
|
+
{
|
|
191
|
+
curr += sent + punct + after;
|
|
192
|
+
v.push_back(cleanOutput(curr));
|
|
193
|
+
curr.clear();
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
static void appendContinuation(std::string& curr,
|
|
197
|
+
const std::string& sent,
|
|
198
|
+
const std::string& punct,
|
|
199
|
+
const std::string& after)
|
|
200
|
+
{
|
|
201
|
+
curr += sent + punct;
|
|
202
|
+
if (after.find('\x19')==std::string::npos) curr.push_back(' ');
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
static std::string cleanOutput(const std::string& s) {
|
|
206
|
+
// Remove markers and trim whitespace
|
|
207
|
+
std::string tmp;
|
|
208
|
+
tmp.reserve(s.size());
|
|
209
|
+
for (char c: s) if (c!='\x19') tmp.push_back(c);
|
|
210
|
+
// Trim
|
|
211
|
+
auto ws = " \t\n\r";
|
|
212
|
+
auto start = tmp.find_first_not_of(ws);
|
|
213
|
+
if (start==std::string::npos) return "";
|
|
214
|
+
auto end = tmp.find_last_not_of(ws);
|
|
215
|
+
return tmp.substr(start, end-start+1);
|
|
216
|
+
}
|
|
217
|
+
};
|
|
218
|
+
|