@elizaos/core 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/index.browser.js +120 -120
- package/dist/browser/index.browser.js.map +5 -21
- package/dist/browser/index.d.ts +3 -1
- package/dist/index.d.ts +2 -3
- package/dist/index.js +1 -5
- package/dist/node/index.d.ts +3 -1
- package/package.json +10 -4
- package/src/__tests__/action-chaining-simple.test.ts +203 -0
- package/src/__tests__/actions.test.ts +218 -0
- package/src/__tests__/buffer.test.ts +337 -0
- package/src/__tests__/character-validation.test.ts +309 -0
- package/src/__tests__/database.test.ts +750 -0
- package/src/__tests__/entities.test.ts +727 -0
- package/src/__tests__/env.test.ts +23 -0
- package/src/__tests__/environment.test.ts +285 -0
- package/src/__tests__/logger-browser-node.test.ts +716 -0
- package/src/__tests__/logger.test.ts +403 -0
- package/src/__tests__/messages.test.ts +196 -0
- package/src/__tests__/mockCharacter.ts +544 -0
- package/src/__tests__/parsing.test.ts +58 -0
- package/src/__tests__/prompts.test.ts +159 -0
- package/src/__tests__/roles.test.ts +331 -0
- package/src/__tests__/runtime-embedding.test.ts +343 -0
- package/src/__tests__/runtime.test.ts +978 -0
- package/src/__tests__/search.test.ts +15 -0
- package/src/__tests__/services-by-type.test.ts +204 -0
- package/src/__tests__/services.test.ts +136 -0
- package/src/__tests__/settings.test.ts +810 -0
- package/src/__tests__/utils.test.ts +1105 -0
- package/src/__tests__/uuid.test.ts +94 -0
- package/src/actions.ts +122 -0
- package/src/database.ts +579 -0
- package/src/entities.ts +406 -0
- package/src/index.browser.ts +48 -0
- package/src/index.node.ts +39 -0
- package/src/index.ts +50 -0
- package/src/logger.ts +527 -0
- package/src/prompts.ts +243 -0
- package/src/roles.ts +85 -0
- package/src/runtime.ts +2514 -0
- package/src/schemas/character.ts +149 -0
- package/src/search.ts +1543 -0
- package/src/sentry/instrument.browser.ts +65 -0
- package/src/sentry/instrument.node.ts +57 -0
- package/src/sentry/instrument.ts +82 -0
- package/src/services.ts +105 -0
- package/src/settings.ts +409 -0
- package/src/test_resources/constants.ts +12 -0
- package/src/test_resources/testSetup.ts +21 -0
- package/src/test_resources/types.ts +22 -0
- package/src/types/agent.ts +112 -0
- package/src/types/browser.ts +145 -0
- package/src/types/components.ts +184 -0
- package/src/types/database.ts +348 -0
- package/src/types/email.ts +162 -0
- package/src/types/environment.ts +129 -0
- package/src/types/events.ts +249 -0
- package/src/types/index.ts +29 -0
- package/src/types/knowledge.ts +65 -0
- package/src/types/lp.ts +124 -0
- package/src/types/memory.ts +228 -0
- package/src/types/message.ts +233 -0
- package/src/types/messaging.ts +57 -0
- package/src/types/model.ts +359 -0
- package/src/types/pdf.ts +77 -0
- package/src/types/plugin.ts +78 -0
- package/src/types/post.ts +271 -0
- package/src/types/primitives.ts +97 -0
- package/src/types/runtime.ts +190 -0
- package/src/types/service.ts +198 -0
- package/src/types/settings.ts +30 -0
- package/src/types/state.ts +60 -0
- package/src/types/task.ts +72 -0
- package/src/types/tee.ts +107 -0
- package/src/types/testing.ts +30 -0
- package/src/types/token.ts +96 -0
- package/src/types/transcription.ts +133 -0
- package/src/types/video.ts +108 -0
- package/src/types/wallet.ts +56 -0
- package/src/types/web-search.ts +146 -0
- package/src/utils/__tests__/buffer.test.ts +80 -0
- package/src/utils/__tests__/environment.test.ts +58 -0
- package/src/utils/__tests__/stringToUuid.test.ts +88 -0
- package/src/utils/buffer.ts +312 -0
- package/src/utils/environment.ts +316 -0
- package/src/utils/server-health.ts +117 -0
- package/src/utils.ts +1076 -0
- package/dist/tsconfig.build.tsbuildinfo +0 -1
package/src/search.ts
ADDED
|
@@ -0,0 +1,1543 @@
|
|
|
1
|
+
// Implementation of BM25 and Porter2 stemming
|
|
2
|
+
// https://github.com/eilvelia/porter2.js
|
|
3
|
+
// https://www.npmjs.com/package/fast-bm25
|
|
4
|
+
|
|
5
|
+
// The MIT License
|
|
6
|
+
|
|
7
|
+
// Copyright (c) 2024 eilvelia <hi@eilvelia.cat>
|
|
8
|
+
|
|
9
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
// of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
// in the Software without restriction, including without limitation the rights
|
|
12
|
+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
// copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
// furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
// The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
// copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
// SOFTWARE.
|
|
26
|
+
|
|
27
|
+
// MIT License
|
|
28
|
+
|
|
29
|
+
// Copyright (c) 2024 Vivek Patel <me@patelvivek.dev>.
|
|
30
|
+
|
|
31
|
+
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
32
|
+
// of this software and associated documentation files (the "Software"), to deal
|
|
33
|
+
// in the Software without restriction, including without limitation the rights
|
|
34
|
+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
35
|
+
// copies of the Software, and to permit persons to whom the Software is
|
|
36
|
+
// furnished to do so, subject to the following conditions:
|
|
37
|
+
|
|
38
|
+
// The above copyright notice and this permission notice shall be included in all
|
|
39
|
+
// copies or substantial portions of the Software.
|
|
40
|
+
|
|
41
|
+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
42
|
+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
43
|
+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
44
|
+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
45
|
+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
46
|
+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
47
|
+
// SOFTWARE.
|
|
48
|
+
|
|
49
|
+
/**
 * Tells whether a character code is one of the lowercase vowels recognised
 * by the stemmer: a, e, i, o, u or y.
 * @param char - The character code to test.
 * @returns `true` when the code is one of the six vowel codes, otherwise `false`.
 */
const isV = (char: number): boolean =>
  char === 97 || // a
  char === 101 || // e
  char === 105 || // i
  char === 111 || // o
  char === 117 || // u
  char === 121; // y
|
|
67
|
+
|
|
68
|
+
/**
 * Tells whether a character code is 'w', 'x', uppercase 'Y' (code 89) or
 * one of the lowercase vowels a, e, i, o, u, y.
 * Used when deciding whether a word ends in a short syllable.
 * @param char - The character code to test.
 * @returns `true` for w, x, Y or a vowel code, otherwise `false`.
 */
const isWxy = (char: number): boolean => {
  // w, x, Y
  if (char === 119 || char === 120 || char === 89) {
    return true;
  }
  // a, e, i, o, u, y
  return (
    char === 97 || char === 101 || char === 105 || char === 111 || char === 117 || char === 121
  );
};
|
|
90
|
+
|
|
91
|
+
/**
 * Tells whether a character code is a letter that may precede a removable
 * "li" suffix: c, d, e, g, h, k, m, n, r or t.
 * @param char - The character code to test.
 * @returns `true` when the code is one of the ten listed letters, otherwise `false`.
 */
const isValidLi = (char: number): boolean => {
  const isCDE = char === 99 || char === 100 || char === 101;
  const isGHK = char === 103 || char === 104 || char === 107;
  const isMN = char === 109 || char === 110;
  const isRT = char === 114 || char === 116;
  return isCDE || isGHK || isMN || isRT;
};
|
|
114
|
+
|
|
115
|
+
/**
 * Tells whether a character code is a letter that counts as a "double"
 * when repeated: b, d, f, g, m, n, p, r or t
 * (i.e. bb, dd, ff, gg, mm, nn, pp, rr, tt).
 * @param char - The character code to test.
 * @returns `true` when the code is one of the nine listed letters, otherwise `false`.
 */
const isDouble = (char: number): boolean => {
  if (char === 98 || char === 100 || char === 102) return true; // b d f
  if (char === 103 || char === 109 || char === 110) return true; // g m n
  if (char === 112 || char === 114 || char === 116) return true; // p r t
  return false;
};
|
|
137
|
+
|
|
138
|
+
/**
 * Tells whether the first `len` character codes of `w` end in a short syllable.
 *
 * The code at `len - 2` must be a vowel, and then either:
 * 1. `len` is exactly 2 and the final code is a non-vowel, or
 * 2. `len` is at least 3, the code at `len - 3` is a non-vowel, and the final
 *    code is neither a vowel nor w, x, Y.
 *
 * @param w - Array of character codes representing the word.
 * @param len - The effective length of the word being considered.
 * @returns `true` when the considered prefix ends in a short syllable, otherwise `false`.
 */
const isShortV = (w: number[], len: number): boolean => {
  // The second-to-last position must exist and hold a vowel.
  if (len < 2 || !isV(w[len - 2])) {
    return false;
  }
  // Two-letter case: vowel followed by a non-vowel.
  if (len === 2) {
    return !isV(w[len - 1]);
  }
  // General case: non-vowel, vowel, then a non-vowel other than w, x, Y.
  return !isV(w[len - 3]) && !isWxy(w[len - 1]);
};
|
|
155
|
+
|
|
156
|
+
// #endregion Porter2 Stemmer Helper Functions
|
|
157
|
+
|
|
158
|
+
// #region Porter2 Stemmer Algorithm
|
|
159
|
+
|
|
160
|
+
/**
 * Stems a single word with the Porter2 (Snowball English) algorithm.
 *
 * The word is copied into an array of character codes `w` with an effective
 * length `l`. Suffix rules either shrink `l` or overwrite trailing codes, and
 * the result is rebuilt from `w[0..l)`. All comparisons are against lowercase
 * character codes and no case folding happens here.
 * NOTE(review): presumably callers pass lowercase words — confirm.
 *
 * Regions used by the rules:
 * - rv: index just past the first vowel (equals `l` when there is no vowel).
 * - R1: starts after the first non-vowel following a vowel; forced to 5 for
 *   words starting with "gener" or "arsen" and to 6 for "commun".
 * - R2: the same definition applied again starting from R1.
 *
 * Processing order:
 * 1. Words shorter than 3 characters are returned unchanged; a fixed list of
 *    exception words (length <= 6) is mapped directly.
 * 2. A leading apostrophe and trailing `'` / `'s` are dropped.
 * 3. Step_1a: -sses, -ies / -ied, and plain -s.
 * 4. A second exception list (inning, outing, exceed, ...) returns early.
 * 5. Step_1b: -eed(ly), -ed(ly), -ing(ly), with the follow-up repairs.
 * 6. Step_1c: final y/Y after a non-vowel becomes i.
 * 7. Step_2 and Step_3: longer suffixes are mapped to shorter ones inside R1.
 * 8. Step_4: suffixes such as -ant, -ence, -ment are removed inside R2.
 * 9. Step_5: a final -e or the second l of -ll is removed.
 *
 * @param word - The word to be stemmed.
 * @returns The stemmed version of the word.
 */
const stem = (word: string): string => {
  if (word.length < 3) return word;
  // exception1
  if (word.length <= 6) {
    switch (word) {
      case 'ski':
        return 'ski';
      case 'skies':
        return 'sky';
      case 'dying':
        return 'die';
      case 'lying':
        return 'lie';
      case 'tying':
        return 'tie';
      // special -LY cases
      case 'idly':
        return 'idl';
      case 'gently':
        return 'gentl';
      case 'ugly':
        return 'ugli';
      case 'early':
        return 'earli';
      case 'only':
        return 'onli';
      case 'singly':
        return 'singl';
      // invariant forms
      case 'sky':
      case 'news':
      case 'howe':
      // not plural forms
      case 'atlas':
      case 'cosmos':
      case 'bias':
      case 'andes':
        return word;
    }
  }
  // Skip a leading apostrophe when copying the word into `w`.
  const initialOffset = word.charCodeAt(0) === 39 /* ' */ ? 1 : 0;
  let l = word.length - initialOffset;
  const w = new Array<number>(l);
  let yFound = false;
  for (let i = 0; i < l; ++i) {
    const ch = word.charCodeAt(i + initialOffset);
    // A 'y' at the start of the word or right after a vowel is stored as 'Y' (89),
    // which isV() does not treat as a vowel; it is turned back into 'y' on output.
    if (ch === 121 && (i === 0 || isV(w[i - 1]))) {
      yFound = true;
      w[i] = 89;
      continue;
    }
    w[i] = ch;
  }
  // Drop a trailing apostrophe, then a trailing 's.
  if (w[l - 1] === 39 /* ' */) --l;
  if (l >= 2 && w[l - 2] === 39 /* ' */ && w[l - 1] === 115 /* s */) l -= 2;
  // mark_regions
  let rv = 0;
  // rv is the position after the first vowel
  while (rv < l && !isV(w[rv])) ++rv;
  if (rv < l) ++rv;
  let r1 = rv;
  if (
    l >= 5 &&
    ((w[0] === 103 && w[1] === 101 && w[2] === 110 && w[3] === 101 && w[4] === 114) || // gener
      (w[0] === 97 && w[1] === 114 && w[2] === 115 && w[3] === 101 && w[4] === 110)) // arsen
  )
    r1 = 5;
  else if (
    l >= 6 &&
    w[0] === 99 && // c
    w[1] === 111 && // o
    w[2] === 109 && // m
    w[3] === 109 && // m
    w[4] === 117 && // u
    w[5] === 110 // n
  )
    // commun
    r1 = 6;
  else {
    // > R1 is the region after the first non-vowel following a vowel,
    // > or the end of the word if there is no such non-vowel.
    while (r1 < l && isV(w[r1])) ++r1;
    if (r1 < l) ++r1;
  }
  // > R2 is the region after the first non-vowel following a vowel in R1,
  // > or the end of the word if there is no such non-vowel.
  let r2 = r1;
  while (r2 < l && !isV(w[r2])) ++r2;
  while (r2 < l && isV(w[r2])) ++r2;
  if (r2 < l) ++r2;
  // Step_1a
  if (l >= 3) {
    if (w[l - 1] === 115) {
      // s
      if (l >= 4 && w[l - 2] === 101 && w[l - 3] === 115 && w[l - 4] === 115)
        // sses
        l -= 2; // sses -> ss
      else if (w[l - 2] === 101 && w[l - 3] === 105)
        // ies: drop "es" when the word has at least 5 letters, otherwise only "s"
        l -= l >= 5 ? 2 : 1; // ies
      else if (w[l - 2] !== 117 && w[l - 2] !== 115 && rv < l - 1)
        // us ss -> <nothing>; s -> "delete if the preceding word part
        // contains a vowel not immediately before the s"
        l -= 1;
    } else if (w[l - 1] === 100 && w[l - 2] === 101 && w[l - 3] === 105) l -= l >= 5 ? 2 : 1; // ied
  }
  // exception2: these words are returned as they stand after Step_1a
  if (
    (l === 6 &&
      ((w[0] === 105 && // i
        w[1] === 110 && // n
        w[2] === 110 && // n
        w[3] === 105 && // i
        w[4] === 110 && // n
        w[5] === 103) || // g (inning)
        (w[0] === 111 && // o
          w[1] === 117 && // u
          w[2] === 116 && // t
          w[3] === 105 && // i
          w[4] === 110 && // n
          w[5] === 103) || // g (outing)
        (w[0] === 101 && // e
          w[1] === 120 && // x
          w[2] === 99 && // c
          w[3] === 101 && // e
          w[4] === 101 && // e
          w[5] === 100))) || // d (exceed)
    (l === 7 &&
      ((w[0] === 99 && // c
        w[1] === 97 && // a
        w[2] === 110 && // n
        w[3] === 110 && // n
        w[4] === 105 && // i
        w[5] === 110 && // n
        w[6] === 103) || // g (canning)
        (w[0] === 104 && // h
          w[1] === 101 && // e
          w[2] === 114 && // r
          w[3] === 114 && // r
          w[4] === 105 && // i
          w[5] === 110 && // n
          w[6] === 103) || // g (herring)
        (w[0] === 101 && // e
          w[1] === 97 && // a
          w[2] === 114 && // r
          w[3] === 114 && // r
          w[4] === 105 && // i
          w[5] === 110 && // n
          w[6] === 103) || // g (earring)
        (w[0] === 112 && // p
          w[1] === 114 && // r
          w[2] === 111 && // o
          w[3] === 99 && // c
          w[4] === 101 && // e
          w[5] === 101 && // e
          w[6] === 100) || // d (proceed)
        (w[0] === 115 && // s
          w[1] === 117 && // u
          w[2] === 99 && // c
          w[3] === 99 && // c
          w[4] === 101 && // e
          w[5] === 101 && // e
          w[6] === 100))) // d (succeed)
  ) {
    let exp2Out = '';
    for (let i = 0; i < l; ++i) exp2Out += String.fromCharCode(w[i]);
    return exp2Out;
  }
  // Step_1b
  let ll =
    // l (length) without the -ly ending
    l >= 2 && w[l - 1] === 121 && w[l - 2] === 108 ? l - 2 : l;
  if (ll >= 3) {
    if (w[ll - 3] === 101 && w[ll - 2] === 101 && w[ll - 1] === 100) {
      // eed
      if (ll >= r1 + 3) l = ll - 1; // eed eedly -> ee (if in R1)
    } else {
      // ll without: ed edly ing ingly (-1 if not found)
      if (w[ll - 2] === 101 && w[ll - 1] === 100)
        ll -= 2; // ed
      else if (w[ll - 3] === 105 && w[ll - 2] === 110 && w[ll - 1] === 103)
        ll -= 3; // ing
      else ll = -1;
      // Only strip the suffix when the part before it contains a vowel.
      if (ll >= 0 && rv <= ll) {
        l = ll;
        if (l >= 2) {
          if (
            (w[l - 1] === 116 && w[l - 2] === 97) || // at
            (w[l - 1] === 108 && w[l - 2] === 98) || // bl
            (w[l - 1] === 122 && w[l - 2] === 105) // iz
          ) {
            // at -> ate bl -> ble iz -> ize
            w[l] = 101;
            ++l;
          } else if (w[l - 2] === w[l - 1] && isDouble(w[l - 1])) {
            // doubled consonant: drop the last letter
            --l;
          } else if (r1 >= l && isShortV(w, l)) {
            // <shortv> -> e
            w[l] = 101;
            ++l;
          }
        }
      }
    }
  }
  // Step_1c
  if (l >= 3 && (w[l - 1] === 89 || w[l - 1] === 121) && !isV(w[l - 2])) w[l - 1] = 105; // i
  // Step_2 (dispatches on the last letter; suffixes must lie in R1)
  if (l >= r1 + 2) {
    switch (w[l - 1]) {
      case 108: // l
        if (
          l >= r1 + 6 &&
          w[l - 2] === 97 && // a
          w[l - 3] === 110 && // n
          w[l - 4] === 111 && // o
          w[l - 5] === 105 && // i
          w[l - 6] === 116 // t (tional)
        ) {
          if (l >= 7 && w[l - 7] === 97) {
            // a (ational)
            if (l >= r1 + 7) {
              // ational -> ate
              l -= 4;
              w[l - 1] = 101; // e
            }
          } else {
            l -= 2; // tional -> tion
          }
        }
        break;
      case 110: // n
        if (
          l >= r1 + 5 &&
          w[l - 2] === 111 && // o
          w[l - 3] === 105 && // i
          w[l - 4] === 116 && // t
          w[l - 5] === 97 // a (ation)
        ) {
          if (l >= 7 && w[l - 6] === 122 && w[l - 7] === 105) {
            // iz (ization)
            if (l >= r1 + 7) {
              // ization -> ize
              l -= 4;
              w[l - 1] = 101; // e
            }
          } else {
            // ation -> ate
            l -= 2;
            w[l - 1] = 101; // e
          }
        }
        break;
      case 114: // r
        if (l >= r1 + 4) {
          if (w[l - 2] === 101) {
            // e (er)
            if (w[l - 3] === 122 && w[l - 4] === 105) --l; // izer -> ize
          } else if (w[l - 2] === 111) {
            // o (or)
            if (w[l - 3] === 116 && w[l - 4] === 97) {
              // ator -> ate
              --l;
              w[l - 1] = 101; // e
            }
          }
        }
        break;
      case 115: // s
        if (
          l >= r1 + 7 &&
          w[l - 2] === 115 && // s
          w[l - 3] === 101 && // e
          w[l - 4] === 110 && // n (ness)
          ((w[l - 5] === 108 && w[l - 6] === 117 && w[l - 7] === 102) || // fulness
            (w[l - 5] === 115 && w[l - 6] === 117 && w[l - 7] === 111) || // ousness
            (w[l - 5] === 101 && w[l - 6] === 118 && w[l - 7] === 105)) // iveness
        ) {
          l -= 4; // fulness -> ful ousness -> ous iveness -> ive
        }
        break;
      case 109: // m
        if (
          l >= r1 + 5 &&
          w[l - 2] === 115 && // s
          w[l - 3] === 105 && // i
          w[l - 4] === 108 && // l
          w[l - 5] === 97 // a (alism)
        )
          l -= 3; // alism -> al
        break;
      case 105: // i
        if (w[l - 2] === 99) {
          // c (ic)
          if (l >= r1 + 4 && (w[l - 4] === 101 || w[l - 4] === 97) && w[l - 3] === 110) {
            // enci anci
            w[l - 1] = 101; // enci -> ence anci -> ance
          }
        } else if (w[l - 2] === 103) {
          // g (gi)
          if (l >= r1 + 3 && l >= 4 && w[l - 2] === 103 && w[l - 3] === 111 && w[l - 4] === 108)
            // logi
            --l; // ogi -> og (if preceded by l)
        } else if (w[l - 2] === 116) {
          // t (ti)
          if (l >= r1 + 5 && w[l - 3] === 105) {
            // iti
            if (w[l - 4] === 108) {
              // liti
              if (l >= 6 && w[l - 5] === 105 && w[l - 6] === 98) {
                // biliti
                if (l >= r1 + 6) {
                  // biliti -> ble
                  l -= 3;
                  w[l - 2] = 108; // l
                  w[l - 1] = 101; // e
                }
              } else if (w[l - 4] === 108 && w[l - 5] === 97) {
                // aliti
                l -= 3; // aliti -> al
              }
            } else if (w[l - 4] === 118 && w[l - 5] === 105) {
              // iviti
              // iviti -> ive
              l -= 2;
              w[l - 1] = 101; // e
            }
          }
        } else if (w[l - 2] === 108 && l >= 3) {
          // l (li)
          if (w[l - 3] === 98) {
            // bli
            if (l >= 4 && w[l - 4] === 97) {
              // abli
              if (l >= r1 + 4) w[l - 1] = 101; // abli -> able
            } else if (l >= r1 + 3) {
              w[l - 1] = 101; // bli -> ble
            }
          } else {
            // Remove li
            if (w[l - 3] === 108) {
              // lli
              if (l >= 5 && w[l - 4] === 117 && w[l - 5] === 102) {
                // fulli
                if (l >= r1 + 5) l -= 2; // fulli -> ful
              } else if (l >= r1 + 4 && w[l - 4] === 97) {
                // alli
                l -= 2; // alli -> al
              }
            } else if (w[l - 3] === 115) {
              // sli
              if (l >= 6 && w[l - 4] === 115 && w[l - 5] === 101 && w[l - 6] === 108) {
                // lessli
                if (l >= r1 + 6) l -= 2; // lessli -> less
              } else if (l >= r1 + 5 && w[l - 4] === 117 && w[l - 5] === 111) {
                // ousli
                l -= 2; // ousli -> ous
              }
            } else if (l >= 5 && w[l - 3] === 116 && w[l - 4] === 110 && w[l - 5] === 101) {
              // entli
              if (l >= r1 + 5) l -= 2; // entli -> ent
            } else if (isValidLi(w[l - 3])) {
              // li after one of c d e g h k m n r t -> <delete>
              l -= 2;
            }
          }
        }
    }
  }
  // Step_3 (dispatches on the last letter; suffixes must lie in R1, "ative" in R2)
  if (l >= r1 + 3) {
    switch (w[l - 1]) {
      case 108: // l
        if (w[l - 3] === 99) {
          // cal
          if (l >= r1 + 4 && w[l - 4] === 105 && w[l - 2] === 97) l -= 2; // ical -> ic
        } else if (w[l - 3] === 102) {
          // ful
          if (w[l - 2] === 117) l -= 3; // ful -> <delete>
        } else if (w[l - 3] === 110) {
          // nal
          if (
            l >= r1 + 6 &&
            w[l - 2] === 97 && // a
            w[l - 4] === 111 && // o
            w[l - 5] === 105 && // i
            w[l - 6] === 116 // t (tional)
          ) {
            if (l >= 7 && w[l - 7] === 97) {
              // ational
              if (l >= r1 + 7) {
                // ational -> ate
                l -= 4;
                w[l - 1] = 101; // e
              }
            } else {
              l -= 2; // tional -> tion
            }
          }
        }
        break;
      case 101: // e
        if (w[l - 2] === 122) {
          // ze
          if (l >= r1 + 5 && w[l - 3] === 105 && w[l - 4] === 108 && w[l - 5] === 97) l -= 3; // alize -> al
        } else if (w[l - 2] === 116) {
          // te
          if (l >= r1 + 5 && w[l - 3] === 97 && w[l - 4] === 99 && w[l - 5] === 105) l -= 3; // icate -> ic
        } else if (w[l - 2] === 118) {
          // ve
          if (l >= r2 + 5 && w[l - 3] === 105 && w[l - 4] === 116 && w[l - 5] === 97) l -= 5; // ative -> <delete> (if in R2)
        }
        break;
      case 105: // i
        if (
          l >= r1 + 5 &&
          w[l - 2] === 116 && // t
          w[l - 3] === 105 && // i
          w[l - 4] === 99 && // c
          w[l - 5] === 105 // i (iciti)
        )
          l -= 3; // iciti -> ic
        break;
      case 115: // s
        if (l >= r1 + 4 && w[l - 2] === 115 && w[l - 3] === 101 && w[l - 4] === 110) l -= 4; // ness -> <delete>
    }
  }
  // Step_4 (dispatches on the last letter; suffixes must lie in R2 and are deleted)
  if (l >= r2 + 2) {
    switch (w[l - 1]) {
      case 110: // n
        if (
          l >= r2 + 3 &&
          w[l - 2] === 111 && // o
          w[l - 3] === 105 && // i (ion)
          (w[l - 4] === 115 || w[l - 4] === 116) // s or t
        )
          l -= 3; // ion -> <delete> (if preceded by s or t)
        break;
      case 108: // l
        if (w[l - 2] === 97) l -= 2; // al
        break;
      case 114: // r
        if (w[l - 2] === 101) l -= 2; // er
        break;
      case 99: // c
        if (w[l - 2] === 105) l -= 2; // ic
        break;
      case 109: // m
        if (l >= r2 + 3 && w[l - 2] === 115 && w[l - 3] === 105) l -= 3; // ism
        break;
      case 105: // i
        if (l >= r2 + 3 && w[l - 2] === 116 && w[l - 3] === 105) l -= 3; // iti
        break;
      case 115: // s
        if (l >= r2 + 3 && w[l - 2] === 117 && w[l - 3] === 111) l -= 3; // ous
        break;
      case 116: // t
        if (l >= r2 + 3 && w[l - 2] === 110) {
          // nt
          if (w[l - 3] === 97) {
            // ant
            l -= 3; // ant
          } else if (w[l - 3] === 101) {
            // ent
            if (l >= 4 && w[l - 4] === 109) {
              // ment
              if (l >= 5 && w[l - 5] === 101) {
                // ement
                if (l >= r2 + 5) l -= 5; // ement
              } else if (l >= r2 + 4) {
                l -= 4; // ment
              }
            } else {
              l -= 3; // ent
            }
          }
        }
        break;
      case 101: // e
        if (w[l - 2] === 99) {
          // ce
          if (l >= r2 + 4 && w[l - 3] === 110 && (w[l - 4] === 97 || w[l - 4] === 101)) l -= 4; // ance ence
        } else if (w[l - 2] === 108) {
          // le
          if (l >= r2 + 4 && w[l - 3] === 98 && (w[l - 4] === 97 || w[l - 4] === 105)) l -= 4; // able ible
        } else if (w[l - 2] === 116) {
          // te
          if (l >= r2 + 3 && w[l - 3] === 97) l -= 3; // ate
        } else if (l >= r2 + 3 && (w[l - 2] === 118 || w[l - 2] === 122) && w[l - 3] === 105) {
          // ive ize
          l -= 3; // ive ize
        }
    }
  }
  // Step_5: drop the last letter of "ll" (in R2), or a final "e" that is in R2,
  // or in R1 and not preceded by a short syllable
  if (
    l >= r1 + 1 && // r1 is >= 1
    ((l >= r2 + 1 && w[l - 1] === 108 && w[l - 2] === 108) || // ll
      (w[l - 1] === 101 && (l >= r2 + 1 || !isShortV(w, l - 1)))) // e
  )
    --l;
  // Rebuild the string from the first l codes; the Y -> y mapping is only
  // needed when a 'Y' marker was written during the copy.
  let out = '';
  if (yFound) {
    for (let i = 0; i < l; ++i) {
      out += String.fromCharCode(w[i] === 89 ? 121 : w[i]); // Y -> y
    }
  } else {
    for (let i = 0; i < l; ++i) out += String.fromCharCode(w[i]);
  }
  return out;
};
|
|
690
|
+
|
|
691
|
+
// #endregion Porter2 Stemmer Algorithm
|
|
692
|
+
|
|
693
|
+
// src/constants.ts
|
|
694
|
+
/**
 * Module-level default option values.
 *
 * NOTE(review): `k1` and `b` look like the usual BM25 tuning parameters, but
 * the code that consumes this object is not visible here — confirm.
 */
const DEFAULT_OPTIONS = {
  k1: 1.2,
  b: 0.75,
  /** Default minimum token length. */
  minLength: 2,
  /** Default English stop-word list. */
  stopWords: /* @__PURE__ */ new Set<string>([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
    'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
    'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will',
    'with',
  ]),
  /** Stemming is off by default. */
  stemming: false,
  /** Default stemmer: returns the word unchanged. */
  stemWords: (word: string): string => {
    return word;
  },
};
|
|
728
|
+
|
|
729
|
+
/**
 * Counters and timing reported for one tokenization run.
 */
interface TokenizationStats {
  /** Word count of the input text before any processing. */
  originalWordCount: number;

  /** How many words were dropped as stop words. */
  stopWordsRemoved: number;

  /** How many words were stemmed (only meaningful when stemming is enabled). */
  stemmedWords: number;

  /** Tokenization time, in milliseconds. */
  processingTimeMs: number;
}
|
|
742
|
+
|
|
743
|
+
/**
 * Output of a tokenization run.
 */
interface TokenizationResult {
  /** The processed tokens (words). */
  tokens: string[];

  /** Statistics about the run; may be absent. */
  stats?: TokenizationStats;
}
|
|
752
|
+
|
|
753
|
+
/**
 * A custom suffix-rewriting rule for stemming.
 */
interface StemmingRule {
  /** Suffix matcher, given either as a RegExp or as a string. */
  pattern: RegExp | string;

  /** Replacement text, or a replacer function producing it. */
  replacement: string | ((substring: string, ...args: any[]) => string);

  /** Optional minimum measure (complexity) the word stem needs for the rule to apply. */
  minMeasure?: number;
}
|
|
764
|
+
|
|
765
|
+
/**
 * Configuration accepted by the Tokenizer; every field is optional.
 */
interface TokenizerOptions {
  /** Words to ignore during tokenization. Defaults to an empty set. */
  stopWords?: Set<string>;

  /** Minimum length for a token to be kept. Defaults to 2. Numeric tokens are always kept. */
  minLength?: number;

  /** Whether tokens are stemmed. Defaults to false. */
  stemming?: boolean;

  /** Custom stemming rules applied before the default Porter2 stemmer. Defaults to an empty array. */
  stemmingRules?: StemmingRule[];
}
|
|
778
|
+
|
|
779
|
+
/**
 * Flexible text tokenizer with support for stop words, minimum token length,
 * Unicode normalization, and optional Porter2 stemming with custom rules.
 */
class Tokenizer {
  /** Set of stop words to ignore. */
  readonly stopWords: Set<string>;
  /** Minimum length of tokens to keep. */
  readonly minLength: number;
  /** Flag indicating if stemming is enabled. */
  readonly stemming: boolean;
  /** Custom stemming rules, with every pattern already compiled to a RegExp. */
  readonly stemmingRules: {
    pattern: RegExp;
    replacement: string | ((substring: string, ...args: any[]) => string);
    minMeasure?: number;
  }[];

  /** Default options for the Tokenizer. */
  static readonly DEFAULT_OPTIONS: Required<TokenizerOptions> = {
    stopWords: /* @__PURE__ */ new Set<string>(),
    minLength: 2,
    stemming: false,
    stemmingRules: [],
  };

  /**
   * Creates a new tokenizer instance.
   *
   * Options that are missing, or explicitly `undefined`/`null`, fall back to
   * `Tokenizer.DEFAULT_OPTIONS` (a plain object spread would let an explicit
   * `undefined` overwrite the default and crash later on).
   *
   * @param options - Tokenization options including stop words, min length, stemming, and custom rules.
   */
  constructor(options: TokenizerOptions = {}) {
    const defaults = Tokenizer.DEFAULT_OPTIONS;
    this.stopWords = options.stopWords ?? defaults.stopWords;
    this.minLength = options.minLength ?? defaults.minLength;
    this.stemming = options.stemming ?? defaults.stemming;
    // Ensure all rule patterns are RegExp objects
    this.stemmingRules = (options.stemmingRules ?? defaults.stemmingRules).map((rule) => ({
      ...rule,
      pattern: typeof rule.pattern === 'string' ? new RegExp(rule.pattern) : rule.pattern,
    }));
  }

  /**
   * Tokenizes input text into an array of processed terms.
   * Steps:
   * 1. Cleans the text (lowercase, normalize, remove punctuation/symbols).
   * 2. Splits the cleaned text on whitespace.
   * 3. Drops stop words and tokens shorter than `minLength` (numeric tokens are kept).
   * 4. Applies stemming if `stemming` is true (custom rules first, then Porter2).
   * 5. Fills in statistics when `includeStats` is true.
   *
   * @param text - The input text string to tokenize.
   * @param includeStats - If true, `stats` carries real figures; otherwise every figure is 0. Defaults to false.
   * @returns A `TokenizationResult` object containing the array of tokens and the stats object.
   * @throws {Error} If the input text is null, undefined, or empty.
   */
  tokenize(text: string, includeStats = false): TokenizationResult {
    if (!text) {
      throw new Error('Input text cannot be null or empty');
    }
    const startTime = Date.now();
    const tokens: string[] = [];
    let stopWordsRemoved = 0;

    for (const candidate of this.cleanText(text).split(/\s+/)) {
      // Splitting an empty cleaned string yields [''], which is never a real token.
      if (candidate.length === 0) continue;
      // Count stop-word hits directly instead of inferring them from a length
      // difference, which went negative whenever punctuation split one word in two.
      if (this.stopWords.has(candidate)) {
        stopWordsRemoved++;
        continue;
      }
      if (!this.isValidToken(candidate)) continue;
      tokens.push(this.stemming ? this.stemWord(candidate) : candidate);
    }

    const stats: TokenizationStats = includeStats
      ? {
          originalWordCount: text.split(/\s+/).filter((word) => word.length > 0).length,
          stopWordsRemoved,
          stemmedWords: this.stemming ? tokens.length : 0,
          processingTimeMs: Date.now() - startTime,
        }
      : {
          originalWordCount: 0,
          stopWordsRemoved: 0,
          stemmedWords: 0,
          processingTimeMs: 0,
        };
    return { tokens, stats };
  }

  /**
   * Cleans and normalizes text for tokenization.
   * - Converts to lowercase.
   * - Normalizes Unicode characters (NFKD).
   * - Removes non-whitespace control characters and zero-width spaces
   *   (tab, newline, etc. are kept so they still separate words).
   * - Removes diacritical marks (accents).
   * - Recomposes (NFC) so Hangul syllables split by NFKD survive the filter below.
   * - Removes emojis and pictographs.
   * - Removes common symbols (™, ®, ©, ℠, ‼).
   * - Replaces Unicode punctuation with spaces.
   * - Replaces characters not matching basic Latin, digits, CJK, Hangul, or whitespace with spaces.
   * - Collapses multiple spaces into single spaces.
   * - Trims leading/trailing whitespace.
   *
   * @param text - Input text to clean.
   * @returns Cleaned and normalized text, ready for splitting into tokens.
   *
   * @example
   * cleanText("Hello, World™!") // "hello world"
   * cleanText("héllo 👋") // "hello"
   * cleanText("Hello 世界!") // "hello 世界"
   * cleanText("I'm don't") // "i m don t" (apostrophes are punctuation and become spaces)
   * cleanText("test©2023") // "test2023" (symbols are removed, not replaced by a space)
   * cleanText("line one\nline two") // "line one line two"
   */
  cleanText(text: string): string {
    return text
      .toLowerCase()
      .normalize('NFKD')
      // Control characters & zero-width spaces. \t \n \v \f \r (U+0009-U+000D) are
      // deliberately excluded: deleting them would glue adjacent words together.
      .replace(/[\u0000-\u0008\u000E-\u001F\u007F-\u009F\u200B-\u200D\uFEFF]/g, '')
      .replace(/[\u0300-\u036f]/g, '') // Diacritical marks
      // NFKD splits Hangul syllables into jamo that the keep-list below would drop;
      // recompose now that the accents are gone.
      .normalize('NFC')
      .replace(/[\p{Emoji_Presentation}\p{Extended_Pictographic}]/gu, '') // Emojis and pictographs
      .replace(/[™®©℠‼]/g, '') // Common symbols
      .replace(/[\p{P}]/gu, ' ') // Unicode punctuation to space
      .replace(/[^a-z0-9\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\s]/gu, ' ') // Keep only latin, cjk, hangul, numbers, whitespace
      .replace(/\s+/g, ' ') // Collapse multiple spaces
      .trim();
  }

  /**
   * Checks if a token is valid (meets `minLength` criteria and is not a stop word).
   * Numeric tokens are always considered valid regardless of length.
   * @param token - The token string to validate.
   * @returns `true` if the token is valid, `false` otherwise.
   */
  isValidToken(token: string): boolean {
    const isNumeric = /^\d+$/.test(token);
    return (token.length >= this.minLength || isNumeric) && !this.stopWords.has(token);
  }

  /**
   * Applies stemming to a single word.
   * Every custom rule in `stemmingRules` is tried in order; each rule that matches
   * (and satisfies its `minMeasure`, if any) rewrites the word. If the custom rules
   * changed the word, that result is returned as is. Otherwise the default Porter2
   * stemmer is applied.
   * Words shorter than 3 characters are not stemmed.
   * @param word - The word to stem.
   * @returns The stemmed word.
   */
  stemWord(word: string): string {
    if (word.length < 3) return word;
    let stemmed = word;
    for (const { pattern, replacement, minMeasure } of this.stemmingRules) {
      const match = stemmed.match(pattern);
      if (!match) continue;
      // The measure is taken over the part of the word that precedes the match.
      if (minMeasure && this.measure(stemmed.substring(0, match.index)) < minMeasure) continue;
      // Both branches look alike, but each one selects a different overload of
      // String.prototype.replace (string replacement vs. replacer function).
      if (typeof replacement === 'string') {
        stemmed = stemmed.replace(pattern, replacement);
      } else {
        stemmed = stemmed.replace(pattern, replacement);
      }
    }
    // A custom rule changed the word: its result wins over Porter2.
    if (stemmed !== word) return stemmed;
    return stem(stemmed);
  }

  /**
   * Checks if the character at a given index in a word is a consonant.
   * 'a', 'e', 'i', 'o', 'u' are vowels. 'y' counts as a consonant when it is the
   * first letter or follows a vowel, and as a vowel when it follows a consonant.
   * @param word - The word string.
   * @param i - The index of the character to check.
   * @returns `true` if the character is a consonant, `false` otherwise.
   */
  isConsonant(word: string, i: number): boolean {
    const char = word[i];
    if ('aeiou'.includes(char)) return false;
    if (char !== 'y') return true;
    return i === 0 || !this.isConsonant(word, i - 1);
  }

  /**
   * Calculates the "measure" of a word stem (approximates syllable count).
   * The measure (m) is the number of times a sequence of vowels is followed by a
   * sequence of consonants (VC). Used in some stemming rules.
   * Example: measure("tree") = 0, measure("trouble") = 1, measure("private") = 2
   * @param word - The word (or stem) to measure.
   * @returns The measure (m) of the word.
   */
  measure(word: string): number {
    let m = 0;
    let vowelSeen = false;
    for (let i = 0; i < word.length; i++) {
      if (!this.isConsonant(word, i)) {
        vowelSeen = true;
      } else if (vowelSeen) {
        // A vowel run has just been closed by a consonant: one more VC pair.
        m++;
        vowelSeen = false;
      }
    }
    return m;
  }
}
|
|
989
|
+
|
|
990
|
+
/**
 * Configuration accepted by the BM25 index: every tokenizer option plus the
 * BM25-specific tuning parameters below.
 */
interface BM25Options extends TokenizerOptions {
  /**
   * k1, the term-frequency saturation parameter. It governs how fast repeated
   * occurrences of a term stop adding to the score; larger values let high
   * counts keep contributing. Usually chosen between 1.2 and 2.0.
   * Documented default: 1.2.
   */
  k1?: number;
  /**
   * b, the document-length normalization parameter. 0 switches length
   * normalization off, 1 applies it fully. Usually chosen around 0.75.
   * Documented default: 0.75.
   */
  b?: number;
  /**
   * Per-field boost factors, keyed by document field name. Terms found in a
   * field with a larger factor weigh more in the score.
   * Example: `{ title: 2, body: 1 }`. Documented default: `{}` (no boosts).
   */
  fieldBoosts?: { [key: string]: number };
}
|
|
1014
|
+
|
|
1015
|
+
/**
 * One entry in a list of search hits.
 */
interface SearchResult {
  /** Position of the matching document in the original document array. */
  index: number;
  /** BM25 relevance score of the document; a larger score means a better match. */
  score: number;
  /** The document object itself. Optional: whether it is present depends on how results are retrieved. */
  doc?: any; // Consider using a generic <T> for BM25 class if docs are typed
}
|
|
1026
|
+
|
|
1027
|
+
/**
|
|
1028
|
+
* Implements the Okapi BM25 (Best Matching 25) ranking function for information retrieval.
|
|
1029
|
+
*
|
|
1030
|
+
* BM25 ranks documents based on the query terms appearing in each document,
|
|
1031
|
+
* considering term frequency (TF) and inverse document frequency (IDF).
|
|
1032
|
+
* It improves upon basic TF-IDF by incorporating:
|
|
1033
|
+
* - Term Frequency Saturation (k1): Prevents overly frequent terms from dominating the score.
|
|
1034
|
+
* - Document Length Normalization (b): Penalizes documents that are longer than average,
|
|
1035
|
+
* assuming longer documents are more likely to contain query terms by chance.
|
|
1036
|
+
*
|
|
1037
|
+
* Key Components:
|
|
1038
|
+
* - Tokenizer: Processes text into terms (words), handles stop words and stemming.
|
|
1039
|
+
* - Document Indexing: Stores document lengths, term frequencies per document,
|
|
1040
|
+
* and overall document frequency for each term.
|
|
1041
|
+
* - IDF Calculation: Measures the informativeness of a term based on how many documents contain it.
|
|
1042
|
+
* - Scoring: Combines TF, IDF, document length, and parameters k1/b to calculate relevance.
|
|
1043
|
+
*/
|
|
1044
|
+
export class BM25 {
|
|
1045
|
+
/** Term frequency saturation parameter (k1). */
|
|
1046
|
+
readonly termFrequencySaturation: number; // k1
|
|
1047
|
+
/** Document length normalization factor (b). */
|
|
1048
|
+
readonly lengthNormalizationFactor: number; // b
|
|
1049
|
+
/** Tokenizer instance used for processing text. */
|
|
1050
|
+
readonly tokenizer: Tokenizer;
|
|
1051
|
+
/** Array storing the length (number of tokens, adjusted by field boosts) of each document. */
|
|
1052
|
+
documentLengths: Uint32Array;
|
|
1053
|
+
/** Average length of all documents in the index. */
|
|
1054
|
+
averageDocLength: number;
|
|
1055
|
+
/** Map from term (string) to its unique integer index. */
|
|
1056
|
+
termToIndex: Map<string, number>;
|
|
1057
|
+
/** Array storing the document frequency (number of docs containing the term) for each term index. */
|
|
1058
|
+
documentFrequency: Uint32Array; // DF for each term index
|
|
1059
|
+
/** Map from term index to another map storing `docIndex: termFrequencyInDoc`. */
|
|
1060
|
+
termFrequencies: Map<number, Map<number, number>>; // TermIndex -> { DocIndex -> TF }
|
|
1061
|
+
/** Boost factors for different fields within documents. */
|
|
1062
|
+
readonly fieldBoosts: { [key: string]: number };
|
|
1063
|
+
/** Array storing the original documents added to the index. */
|
|
1064
|
+
documents: any[]; // Consider using a generic <T>
|
|
1065
|
+
|
|
1066
|
+
/**
 * Builds a BM25 search instance, optionally indexing an initial set of documents.
 *
 * The caller's options are layered over the module-level `DEFAULT_OPTIONS`, and the
 * merged object also configures the tokenizer.
 *
 * @param docs - Optional array of initial documents (objects with string fields) to index.
 * @param options - BM25 parameters (k1, b), tokenizer settings (stopWords, stemming, minLength), and field boosts.
 */
constructor(docs?: any[], options: BM25Options = {}) {
  const settings = { ...DEFAULT_OPTIONS, ...options };
  // k1 and b are asserted non-null: DEFAULT_OPTIONS is expected to supply both.
  this.termFrequencySaturation = settings.k1!;
  this.lengthNormalizationFactor = settings.b!;
  this.tokenizer = new Tokenizer(settings);
  this.fieldBoosts = settings.fieldBoosts || {};

  if (docs && docs.length > 0) {
    // Keep a shallow copy of the caller's array, then adopt the freshly built index.
    this.documents = [...docs];
    const built = this.processDocuments(docs);
    this.documentLengths = built.documentLengths;
    this.termToIndex = built.termToIndex;
    this.documentFrequency = built.documentFrequency;
    this.averageDocLength = built.averageDocLength;
    this.termFrequencies = built.termFrequencies;
  } else {
    // Nothing to index yet: start from empty structures.
    this.documents = [];
    this.documentLengths = new Uint32Array(0);
    this.termToIndex = new Map<string, number>();
    this.documentFrequency = new Uint32Array(0);
    this.averageDocLength = 0;
    this.termFrequencies = new Map<number, Map<number, number>>(); // TermIndex -> { DocIndex -> TF }
  }
}
|
|
1099
|
+
|
|
1100
|
+
/**
 * Processes an array of documents to build the initial index structures.
 * Calculates document lengths, term frequencies, document frequencies, and average document length.
 *
 * Only string fields are indexed. Empty-string fields are skipped, because the
 * tokenizer throws on empty input and one empty field would otherwise abort
 * the whole index build.
 *
 * @param docs - Array of documents to process.
 * @returns An object containing the calculated index data.
 * @internal
 */
private processDocuments(docs: any[]): {
  documentLengths: Uint32Array;
  termToIndex: Map<string, number>;
  documentFrequency: Uint32Array;
  averageDocLength: number;
  termFrequencies: Map<number, Map<number, number>>;
} {
  const numDocs = docs.length;
  const documentLengths = new Uint32Array(numDocs);
  const termToIndex = new Map<string, number>();
  const termFrequencies = new Map<number, Map<number, number>>(); // TermIndex -> { DocIndex -> TF }
  let totalDocLength = 0;

  docs.forEach((doc, docIndex) => {
    let currentDocLength = 0;
    const docTermFrequencies = new Map<number, number>(); // TermIndex -> TF for this doc

    for (const [field, content] of Object.entries(doc)) {
      // Skip non-string fields, and empty strings (tokenize() throws on them).
      if (typeof content !== 'string' || content.length === 0) continue;

      const fieldBoost = this.fieldBoosts[field] || 1;
      const { tokens } = this.tokenizer.tokenize(content);
      currentDocLength += tokens.length * fieldBoost;

      for (const term of tokens) {
        // Terms receive consecutive indices in order of first appearance.
        let termIndexVal = termToIndex.get(term);
        if (termIndexVal === undefined) {
          termIndexVal = termToIndex.size;
          termToIndex.set(term, termIndexVal);
        }
        // TF weighted by the boost of the field the term was found in.
        docTermFrequencies.set(
          termIndexVal,
          (docTermFrequencies.get(termIndexVal) || 0) + fieldBoost
        );
      }
    }

    // Store the calculated length for this document.
    documentLengths[docIndex] = currentDocLength;
    // Sum the value as stored (Uint32Array truncates fractional lengths) so the
    // average agrees with what recalculateAverageLength() computes from the same array.
    totalDocLength += documentLengths[docIndex];

    // Merge this document's term frequencies into the main structure.
    docTermFrequencies.forEach((freq, termIndexVal) => {
      let postings = termFrequencies.get(termIndexVal);
      if (!postings) {
        postings = new Map<number, number>();
        termFrequencies.set(termIndexVal, postings);
      }
      postings.set(docIndex, freq);
    });
  });

  // Document frequency (DF) of a term = number of documents in its posting map.
  const documentFrequency = new Uint32Array(termToIndex.size);
  termFrequencies.forEach((postings, termIndexVal) => {
    documentFrequency[termIndexVal] = postings.size;
  });

  return {
    documentLengths,
    termToIndex,
    documentFrequency,
    averageDocLength: numDocs > 0 ? totalDocLength / numDocs : 0,
    termFrequencies,
  };
}
|
|
1183
|
+
|
|
1184
|
+
/**
 * Refreshes `averageDocLength` from the current contents of `documentLengths`.
 * An empty index gets an average of 0.
 * @internal
 */
private recalculateAverageLength(): void {
  const docCount = this.documentLengths.length;
  if (docCount === 0) {
    this.averageDocLength = 0;
    return;
  }
  let totalLength = 0;
  for (const len of this.documentLengths) {
    totalLength += len;
  }
  this.averageDocLength = totalLength / docCount;
}
|
|
1197
|
+
|
|
1198
|
+
/**
 * Ranks the indexed documents against a query string with the BM25 formula.
 *
 * The query is tokenized with the index's tokenizer. Terms that are absent from
 * the index, or whose IDF is not positive, contribute nothing.
 *
 * @param query - The search query text.
 * @param topK - The maximum number of top-scoring results to return. Defaults to 10.
 * @returns An array of `SearchResult` objects, sorted by descending BM25 score.
 *          Documents whose score is not positive are left out.
 */
search(query: string, topK = 10): SearchResult[] {
  const queryTerms = this.tokenizer.tokenize(query).tokens;
  const scores = new Float32Array(this.documentLengths.length); // zero-initialized
  const k1 = this.termFrequencySaturation;
  const lengthNorm = this.lengthNormalizationFactor;

  // Accumulate, per document, the contribution of every query term.
  for (const term of queryTerms) {
    const termIndex = this.termToIndex.get(term);
    if (termIndex === undefined) continue; // term not in the index

    const idf = this.calculateIdf(termIndex);
    if (idf <= 0) continue; // e.g. a term present in every document

    const postings = this.termFrequencies.get(termIndex); // Map<DocIndex, TF>
    if (!postings) continue; // should not happen when termIndex exists

    for (const [docIndex, tf] of postings) {
      const docLength = this.documentLengths[docIndex];
      // TF saturated by k1 and normalized by the document's length relative to the average.
      const numerator = tf * (k1 + 1);
      const denominator =
        tf + k1 * (1 - lengthNorm + (lengthNorm * docLength) / this.averageDocLength);
      scores[docIndex] += idf * (numerator / denominator);
    }
  }

  // Keep positive scores only, best first, capped at topK.
  const ranked: SearchResult[] = [];
  for (let i = 0; i < scores.length; i++) {
    if (scores[i] > 0) {
      ranked.push({ index: i, score: scores[i] });
    }
  }
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, topK);
}
|
|
1252
|
+
|
|
1253
|
+
/**
 * Searches for an exact phrase within the indexed documents.
 * A document matches when the phrase's tokens appear as a contiguous sequence
 * in one of its string fields.
 * Note: This is a basic implementation. More sophisticated phrase search might consider proximity.
 *
 * Each matching document is scored once, from the first field (in property order)
 * that contains the phrase: the phrase score multiplied by that field's boost.
 * Empty-string fields are skipped, because the tokenizer throws on empty input
 * and would otherwise abort the whole search.
 *
 * @param phrase - The exact phrase to search for.
 * @param topK - The maximum number of results to return. Defaults to 10.
 * @returns An array of `SearchResult` objects, sorted by score, for documents containing the phrase.
 */
searchPhrase(phrase: string, topK = 10): SearchResult[] {
  const { tokens: phraseTokens } = this.tokenizer.tokenize(phrase);
  if (phraseTokens.length === 0) return []; // Cannot search for empty phrase

  // --- Find Candidate Documents ---
  // Only documents containing every phrase term can contain the phrase.
  let candidateDocs: Set<number> | null = null;
  for (const term of phraseTokens) {
    const termIndex = this.termToIndex.get(term);
    if (termIndex === undefined) return []; // Phrase cannot exist if any term is missing

    const postings = this.termFrequencies.get(termIndex);
    if (!postings) return []; // Should not happen, but check

    if (candidateDocs === null) {
      // First term initializes the candidates
      candidateDocs = new Set(postings.keys());
    } else {
      // Intersect against the posting map directly; no need to copy its keys first.
      candidateDocs = new Set(
        [...candidateDocs].filter((docIdx: number) => postings.has(docIdx))
      );
    }

    // If intersection becomes empty, the phrase cannot exist
    if (candidateDocs.size === 0) return [];
  }
  if (candidateDocs === null) return [];

  // --- Verify Phrase Occurrence and Score ---
  const results: SearchResult[] = [];
  const lastStartOffset = phraseTokens.length;

  for (const docIndex of candidateDocs) {
    const doc = this.getDocument(docIndex); // Get the original document content

    for (const [field, content] of Object.entries(doc)) {
      // Skip non-strings, and empty strings (tokenize() throws on them).
      if (typeof content !== 'string' || content.length === 0) continue;

      // Tokenize the field content using the same settings as the index.
      const { tokens: docTokens } = this.tokenizer.tokenize(content);

      // Sliding window check for the exact token sequence.
      let found = false;
      for (let start = 0; start <= docTokens.length - lastStartOffset && !found; start++) {
        found = phraseTokens.every((token, offset) => docTokens[start + offset] === token);
      }
      if (!found) continue;

      const fieldBoost = this.fieldBoosts[field] || 1;
      results.push({
        index: docIndex,
        score: this.calculatePhraseScore(phraseTokens, docIndex) * fieldBoost,
      });
      break; // Only score once per doc even if the phrase repeats
    }
  }

  // --- Format and Sort Results ---
  return results
    .sort((left, right) => right.score - left.score) // Sort by score descending
    .slice(0, topK); // Limit results
}
|
|
1336
|
+
|
|
1337
|
+
/**
 * Computes a BM25-like score for a tokenized phrase against one document by
 * adding up the individual BM25 contributions of the phrase's terms.
 * Terms missing from the index add nothing.
 * @param phraseTokens - The tokenized phrase.
 * @param docIndex - The index of the document to score against.
 * @returns The calculated phrase score for the document.
 * @internal
 */
private calculatePhraseScore(phraseTokens: string[], docIndex: number): number {
  const k1 = this.termFrequencySaturation;
  const lengthNorm = this.lengthNormalizationFactor;
  let total = 0;

  for (const term of phraseTokens) {
    const termIndex = this.termToIndex.get(term);
    // Unindexed terms are ignored (should not occur after candidate selection).
    if (termIndex === undefined) continue;

    const idf = this.calculateIdf(termIndex);
    const tf = this.termFrequencies.get(termIndex)?.get(docIndex) || 0;
    const docLength = this.documentLengths[docIndex];

    // BM25 contribution of this single term: IDF * normalized TF.
    const numerator = tf * (k1 + 1);
    const denominator =
      tf + k1 * (1 - lengthNorm + (lengthNorm * docLength) / this.averageDocLength);
    total = total + idf * (numerator / denominator);
  }

  return total;
}
|
|
1368
|
+
|
|
1369
|
+
/**
 * Adds a single new document to the index and updates the internal index
 * structures incrementally.
 *
 * The document is fully tokenized into local structures before any index
 * state is modified, so an exception thrown by the tokenizer leaves the index
 * exactly as it was before the call.
 *
 * Note: every call allocates a new `documentLengths` array one slot larger
 * and copies the previous contents into it, so the cost of a call grows with
 * the number of documents already indexed.
 *
 * @param doc - The document object to add. Only own enumerable properties
 *   whose value is a string are tokenized; all other properties are ignored.
 * @throws {Error} If the document is falsy (for example null or undefined).
 *   Because the method is `async`, the error surfaces as a rejected promise.
 */
async addDocument(doc: any): Promise<void> {
  if (!doc) throw new Error('Document cannot be null');

  // --- Phase 1: analyse the document without touching the index ---
  let currentDocLength = 0;
  // Term -> boost-weighted TF for this document. A Map iterates in insertion
  // order, so new terms are later registered in first-seen order.
  const weightedTermCounts = new Map<string, number>();

  for (const [field, content] of Object.entries(doc)) {
    if (typeof content !== 'string') continue;

    const fieldBoost = this.fieldBoosts[field] || 1;
    const { tokens } = this.tokenizer.tokenize(content);
    // Document length is boost-weighted as well.
    currentDocLength += tokens.length * fieldBoost;

    tokens.forEach((term) => {
      const seenSoFar = weightedTermCounts.get(term) || 0;
      weightedTermCounts.set(term, seenSoFar + fieldBoost);
    });
  }

  // --- Phase 2: commit to the index (tokenization can no longer fail) ---
  const docIndex = this.documentLengths.length; // Index for the new document
  this.documents.push(doc);

  // Typed arrays cannot grow in place: allocate one extra slot and copy.
  const newDocLengths = new Uint32Array(docIndex + 1);
  newDocLengths.set(this.documentLengths, 0);
  newDocLengths[docIndex] = currentDocLength;
  this.documentLengths = newDocLengths;

  weightedTermCounts.forEach((freq, term) => {
    let termIndexVal = this.termToIndex.get(term);

    if (termIndexVal === undefined) {
      // First occurrence of this term anywhere: give it the next free index.
      termIndexVal = this.termToIndex.size;
      this.termToIndex.set(term, termIndexVal);

      // Ensure documentFrequency is large enough, growing exponentially.
      if (this.documentFrequency.length <= termIndexVal) {
        const oldDf = this.documentFrequency;
        // `|| 1` makes the doubled size at least 1 when the old array is empty.
        const newSize = Math.max(termIndexVal + 1, oldDf.length * 2 || 1);
        this.documentFrequency = new Uint32Array(newSize);
        this.documentFrequency.set(oldDf, 0);
      }
      // Initialize DF for the new term (incremented below).
      this.documentFrequency[termIndexVal] = 0;
    }

    // Record the TF entry for (term, new document).
    let postings = this.termFrequencies.get(termIndexVal);
    if (!postings) {
      postings = new Map<number, number>();
      this.termFrequencies.set(termIndexVal, postings);
    }
    postings.set(docIndex, freq);

    // One more document now contains this term.
    if (termIndexVal < this.documentFrequency.length) {
      this.documentFrequency[termIndexVal]++;
    } else {
      // Defensive: only reachable if termToIndex and documentFrequency were
      // already out of sync before this call.
      console.error(
        `Error: termIndexVal ${termIndexVal} is out of bounds for documentFrequency (length ${this.documentFrequency.length}). This indicates an issue with array resizing or term indexing.`
      );
    }
  });

  // Refresh the average document length now that a document was added.
  this.recalculateAverageLength();
}
|
|
1456
|
+
|
|
1457
|
+
/**
 * Calculates the Inverse Document Frequency (IDF) for a given term index.
 *
 * Uses the BM25 IDF formula `log(1 + (N - n + 0.5) / (n + 0.5))`, where
 * N is the total number of documents (`documentLengths.length`) and n is the
 * number of documents containing the term (`documentFrequency[termIndex]`).
 * Because n never exceeds N past the guards below, the ratio is positive and
 * the result is always greater than 0.
 *
 * @param termIndex - The integer index of the term.
 * @returns The IDF score for the term, or 0 when the index is not a valid
 *   integer position in `documentFrequency`, or when the stored document
 *   frequency is 0 or larger than the number of documents.
 */
calculateIdf(termIndex: number): number {
  // A NaN or fractional index passes plain range comparisons but reads
  // `undefined` from the typed array, which would turn the result into NaN.
  // Treat such indices like any other unknown term.
  if (
    !Number.isInteger(termIndex) ||
    termIndex < 0 ||
    termIndex >= this.documentFrequency.length
  ) {
    return 0; // Term not in index or index out of bounds
  }

  const totalDocs = this.documentLengths.length; // N
  const docFreq = this.documentFrequency[termIndex]; // n

  // Zero occurrences, or more containing documents than exist (error state).
  if (docFreq <= 0 || docFreq > totalDocs) {
    return 0;
  }

  const numerator = totalDocs - docFreq + 0.5;
  const denominator = docFreq + 0.5;

  // The added 1 keeps the logarithm's argument above 1.
  return Math.log(1 + numerator / denominator);
}
|
|
1485
|
+
|
|
1486
|
+
/**
 * Looks up the stored term frequency (TF) of one term in one document.
 *
 * @param termIndex - The integer index of the term.
 * @param docIndex - The index of the document.
 * @returns The stored frequency, or 0 when there is no entry for the term,
 *   no entry for the document under that term, or the stored value is falsy.
 */
getTermFrequency(termIndex: number, docIndex: number): number {
  const postings = this.termFrequencies.get(termIndex);
  if (!postings) {
    return 0;
  }

  const frequency = postings.get(docIndex);
  return frequency ? frequency : 0;
}
|
|
1495
|
+
|
|
1496
|
+
/**
 * Returns the original document object stored at the given position.
 *
 * The return type is `any`; a generic document type would be a possible
 * future refinement.
 *
 * @param index - The index of the document to retrieve.
 * @returns The document object stored at `index`.
 * @throws {Error} If `index` is negative or not smaller than the number of
 *   stored documents.
 */
getDocument(index: number): any {
  const total = this.documents.length;
  const belowRange = index < 0;
  const aboveRange = index >= total;

  if (belowRange || aboveRange) {
    throw new Error(`Document index ${index} out of bounds (0-${total - 1})`);
  }

  return this.documents[index];
}
|
|
1509
|
+
|
|
1510
|
+
/**
 * Removes every indexed document and empties all derived index data.
 *
 * Only the fields assigned or cleared below are affected; any other
 * configuration held by the instance is left untouched.
 */
clearDocuments(): void {
  // Vocabulary and per-term postings.
  this.termToIndex.clear();
  this.termFrequencies.clear();
  this.documentFrequency = new Uint32Array(0);

  // Stored documents and their length statistics.
  this.documents = [];
  this.documentLengths = new Uint32Array(0);
  this.averageDocLength = 0;
}
|
|
1521
|
+
|
|
1522
|
+
/**
 * Reports how many documents are currently stored in the index.
 *
 * @returns The length of the internal `documents` array.
 */
getDocumentCount(): number {
  const { length: documentCount } = this.documents;
  return documentCount;
}
|
|
1529
|
+
|
|
1530
|
+
/**
 * Adds multiple documents by calling `addDocument` once per array element.
 *
 * Every `addDocument` call is started, in array order, before the combined
 * promise is returned, so a rejection for one document does not stop the
 * calls for the other documents from being made.
 *
 * @param docs - An array of documents to add.
 * @returns A promise that resolves once every `addDocument` call has
 *   resolved, and rejects if any of them rejects.
 */
async addDocuments(docs: any[]): Promise<void[]> {
  const pendingAdditions = docs.map((doc) => this.addDocument(doc));
  return Promise.all(pendingAdditions);
}
|
|
1543
|
+
}
|