@elizaos/core 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/dist/browser/index.browser.js +120 -120
  2. package/dist/browser/index.browser.js.map +5 -21
  3. package/dist/browser/index.d.ts +3 -1
  4. package/dist/index.d.ts +2 -3
  5. package/dist/index.js +1 -5
  6. package/dist/node/index.d.ts +3 -1
  7. package/package.json +10 -4
  8. package/src/__tests__/action-chaining-simple.test.ts +203 -0
  9. package/src/__tests__/actions.test.ts +218 -0
  10. package/src/__tests__/buffer.test.ts +337 -0
  11. package/src/__tests__/character-validation.test.ts +309 -0
  12. package/src/__tests__/database.test.ts +750 -0
  13. package/src/__tests__/entities.test.ts +727 -0
  14. package/src/__tests__/env.test.ts +23 -0
  15. package/src/__tests__/environment.test.ts +285 -0
  16. package/src/__tests__/logger-browser-node.test.ts +716 -0
  17. package/src/__tests__/logger.test.ts +403 -0
  18. package/src/__tests__/messages.test.ts +196 -0
  19. package/src/__tests__/mockCharacter.ts +544 -0
  20. package/src/__tests__/parsing.test.ts +58 -0
  21. package/src/__tests__/prompts.test.ts +159 -0
  22. package/src/__tests__/roles.test.ts +331 -0
  23. package/src/__tests__/runtime-embedding.test.ts +343 -0
  24. package/src/__tests__/runtime.test.ts +978 -0
  25. package/src/__tests__/search.test.ts +15 -0
  26. package/src/__tests__/services-by-type.test.ts +204 -0
  27. package/src/__tests__/services.test.ts +136 -0
  28. package/src/__tests__/settings.test.ts +810 -0
  29. package/src/__tests__/utils.test.ts +1105 -0
  30. package/src/__tests__/uuid.test.ts +94 -0
  31. package/src/actions.ts +122 -0
  32. package/src/database.ts +579 -0
  33. package/src/entities.ts +406 -0
  34. package/src/index.browser.ts +48 -0
  35. package/src/index.node.ts +39 -0
  36. package/src/index.ts +50 -0
  37. package/src/logger.ts +527 -0
  38. package/src/prompts.ts +243 -0
  39. package/src/roles.ts +85 -0
  40. package/src/runtime.ts +2514 -0
  41. package/src/schemas/character.ts +149 -0
  42. package/src/search.ts +1543 -0
  43. package/src/sentry/instrument.browser.ts +65 -0
  44. package/src/sentry/instrument.node.ts +57 -0
  45. package/src/sentry/instrument.ts +82 -0
  46. package/src/services.ts +105 -0
  47. package/src/settings.ts +409 -0
  48. package/src/test_resources/constants.ts +12 -0
  49. package/src/test_resources/testSetup.ts +21 -0
  50. package/src/test_resources/types.ts +22 -0
  51. package/src/types/agent.ts +112 -0
  52. package/src/types/browser.ts +145 -0
  53. package/src/types/components.ts +184 -0
  54. package/src/types/database.ts +348 -0
  55. package/src/types/email.ts +162 -0
  56. package/src/types/environment.ts +129 -0
  57. package/src/types/events.ts +249 -0
  58. package/src/types/index.ts +29 -0
  59. package/src/types/knowledge.ts +65 -0
  60. package/src/types/lp.ts +124 -0
  61. package/src/types/memory.ts +228 -0
  62. package/src/types/message.ts +233 -0
  63. package/src/types/messaging.ts +57 -0
  64. package/src/types/model.ts +359 -0
  65. package/src/types/pdf.ts +77 -0
  66. package/src/types/plugin.ts +78 -0
  67. package/src/types/post.ts +271 -0
  68. package/src/types/primitives.ts +97 -0
  69. package/src/types/runtime.ts +190 -0
  70. package/src/types/service.ts +198 -0
  71. package/src/types/settings.ts +30 -0
  72. package/src/types/state.ts +60 -0
  73. package/src/types/task.ts +72 -0
  74. package/src/types/tee.ts +107 -0
  75. package/src/types/testing.ts +30 -0
  76. package/src/types/token.ts +96 -0
  77. package/src/types/transcription.ts +133 -0
  78. package/src/types/video.ts +108 -0
  79. package/src/types/wallet.ts +56 -0
  80. package/src/types/web-search.ts +146 -0
  81. package/src/utils/__tests__/buffer.test.ts +80 -0
  82. package/src/utils/__tests__/environment.test.ts +58 -0
  83. package/src/utils/__tests__/stringToUuid.test.ts +88 -0
  84. package/src/utils/buffer.ts +312 -0
  85. package/src/utils/environment.ts +316 -0
  86. package/src/utils/server-health.ts +117 -0
  87. package/src/utils.ts +1076 -0
  88. package/dist/tsconfig.build.tsbuildinfo +0 -1
package/src/search.ts ADDED
@@ -0,0 +1,1543 @@
1
+ // Implementation of BM25 and Porter2 stemming
2
+ // https://github.com/eilvelia/porter2.js
3
+ // https://www.npmjs.com/package/fast-bm25
4
+
5
+ // The MIT License
6
+
7
+ // Copyright (c) 2024 eilvelia <hi@eilvelia.cat>
8
+
9
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ // of this software and associated documentation files (the "Software"), to deal
11
+ // in the Software without restriction, including without limitation the rights
12
+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ // copies of the Software, and to permit persons to whom the Software is
14
+ // furnished to do so, subject to the following conditions:
15
+
16
+ // The above copyright notice and this permission notice shall be included in all
17
+ // copies or substantial portions of the Software.
18
+
19
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ // SOFTWARE.
26
+
27
+ // MIT License
28
+
29
+ // Copyright (c) 2024 Vivek Patel <me@patelvivek.dev>.
30
+
31
+ // Permission is hereby granted, free of charge, to any person obtaining a copy
32
+ // of this software and associated documentation files (the "Software"), to deal
33
+ // in the Software without restriction, including without limitation the rights
34
+ // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
35
+ // copies of the Software, and to permit persons to whom the Software is
36
+ // furnished to do so, subject to the following conditions:
37
+
38
+ // The above copyright notice and this permission notice shall be included in all
39
+ // copies or substantial portions of the Software.
40
+
41
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
42
+ // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
43
+ // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
44
+ // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
45
+ // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
46
+ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
47
+ // SOFTWARE.
48
+ // #region Porter2 Stemmer Helper Functions
49
/**
 * Tells whether a character code is one of the lowercase vowels
 * `a`, `e`, `i`, `o`, `u` or `y`.
 * @param char - The character code to classify.
 * @returns True for the codes 97, 101, 105, 111, 117 and 121; false for anything else.
 */
const isV = (char: number): boolean =>
  char === 97 || // a
  char === 101 || // e
  char === 105 || // i
  char === 111 || // o
  char === 117 || // u
  char === 121; // y
67
+
68
/**
 * Tells whether a character code is `w`, `x`, the upper-case marker `Y`,
 * or one of the lowercase vowels `a`, `e`, `i`, `o`, `u`, `y`.
 * The short-syllable check uses it to reject these characters in final position.
 * @param char - The character code to classify.
 * @returns True when the code is 119, 120, 89 or a lowercase vowel code; false otherwise.
 */
const isWxy = (char: number): boolean => {
  if (char === 119 /* w */ || char === 120 /* x */ || char === 89 /* Y */) {
    return true;
  }
  return (
    char === 97 || // a
    char === 101 || // e
    char === 105 || // i
    char === 111 || // o
    char === 117 || // u
    char === 121 // y
  );
};
90
+
91
/**
 * Tells whether a character code is a letter that may precede a removable
 * `li` suffix (the "valid li-ending" test used by Step 2 of the stemmer).
 * Accepted letters: c, d, e, g, h, k, m, n, r, t.
 * @param char - The character code to classify.
 * @returns True if the code is one of the accepted letters, false otherwise.
 */
const isValidLi = (char: number): boolean =>
  char === 99 || // c
  char === 100 || // d
  char === 101 || // e
  char === 103 || // g
  char === 104 || // h
  char === 107 || // k
  char === 109 || // m
  char === 110 || // n
  char === 114 || // r
  char === 116; // t
114
+
115
/**
 * Tells whether a character code is a consonant that counts as a "double"
 * when it appears twice in a row (bb, dd, ff, gg, mm, nn, pp, rr, tt).
 * The caller is responsible for checking that the two adjacent letters are equal.
 * @param char - The character code of the repeated letter.
 * @returns True for b, d, f, g, m, n, p, r and t; false otherwise.
 */
const isDouble = (char: number): boolean =>
  char === 98 || // b
  char === 100 || // d
  char === 102 || // f
  char === 103 || // g
  char === 109 || // m
  char === 110 || // n
  char === 112 || // p
  char === 114 || // r
  char === 116; // t
137
+
138
/**
 * Checks whether the first `len` characters of a word end in a short syllable.
 * Two shapes qualify:
 * 1. The prefix is exactly two characters: a vowel followed by a non-vowel (e.g., "at").
 * 2. A non-vowel, then a vowel, then a final character that is neither a vowel
 *    nor 'w', 'x' or the marker 'Y' (e.g., the "rap" in "trap").
 * @param w - Array of character codes representing the word.
 * @param len - The current effective length of the word being considered.
 * @returns True if the prefix ends in a short syllable, false otherwise.
 */
const isShortV = (w: number[], len: number): boolean => {
  // Both shapes need a vowel in the second-to-last position.
  if (len < 2 || !isV(w[len - 2])) return false;
  // Shape 1: vowel at the start of the word followed by a non-vowel.
  if (len === 2) return !isV(w[len - 1]);
  // Shape 2: non-vowel, vowel, then a final character outside vowels/w/x/Y.
  return !isV(w[len - 3]) && !isWxy(w[len - 1]);
};
155
+
156
+ // #endregion Porter2 Stemmer Helper Functions
157
+
158
+ // #region Porter2 Stemmer Algorithm
159
+
160
+ /**
161
+ * Stems a given word using the Porter2 (Snowball English) stemming algorithm.
162
+ * Words shorter than 3 characters are returned unchanged; a fixed list of exception words maps to hard-coded stems.
163
+ * The algorithm works in several steps, applying suffix stripping rules based on
164
+ * regions R1 and R2 within the word.
165
+ * - R1: The region after the first non-vowel following a vowel (forced to 5 for the prefixes gener/arsen and 6 for commun).
166
+ * - R2: The region after the first non-vowel following a vowel in R1.
167
+ *
168
+ * The steps generally involve:
169
+ * 1. Handling plurals and past participles (-s, -es, -ed, -ing).
170
+ * 2. Turning a terminal 'y' into 'i' when it follows a non-vowel that is not the first letter.
171
+ * 3. Mapping double suffixes to single ones (e.g., -ization to -ize).
172
+ * 4. Dealing with suffixes like -ful, -ness, etc.
173
+ * 5. Removing suffixes like -ant, -ence, etc.
174
+ * 6. Removing a final -e, or one 'l' of a final -ll that lies in R2.
175
+ *
176
+ * @param word - The word to be stemmed; the rules compare against lowercase letter codes, so presumably it is already lower-cased (confirm at call sites).
177
+ * @returns The stemmed version of the word.
178
+ */
179
+ const stem = (word: string): string => {
180
+ if (word.length < 3) return word;
181
+ // exception1: whole-word special cases (none is longer than 6 characters)
182
+ if (word.length <= 6) {
183
+ switch (word) {
184
+ case 'ski':
185
+ return 'ski';
186
+ case 'skies':
187
+ return 'sky';
188
+ case 'dying':
189
+ return 'die';
190
+ case 'lying':
191
+ return 'lie';
192
+ case 'tying':
193
+ return 'tie';
194
+ // special -LY cases
195
+ case 'idly':
196
+ return 'idl';
197
+ case 'gently':
198
+ return 'gentl';
199
+ case 'ugly':
200
+ return 'ugli';
201
+ case 'early':
202
+ return 'earli';
203
+ case 'only':
204
+ return 'onli';
205
+ case 'singly':
206
+ return 'singl';
207
+ // invariant forms
208
+ case 'sky':
209
+ case 'news':
210
+ case 'howe':
211
+ // not plural forms
212
+ case 'atlas':
213
+ case 'cosmos':
214
+ case 'bias':
215
+ case 'andes':
216
+ return word;
217
+ }
218
+ }
219
+ const initialOffset = word.charCodeAt(0) === 39 /* ' */ ? 1 : 0;
220
+ let l = word.length - initialOffset;
221
+ const w = new Array<number>(l);
222
+ let yFound = false;
223
+ for (let i = 0; i < l; ++i) {
224
+ const ch = word.charCodeAt(i + initialOffset);
225
+ if (ch === 121 && (i === 0 || isV(w[i - 1]))) {
226
+ yFound = true;
227
+ w[i] = 89;
228
+ continue;
229
+ }
230
+ w[i] = ch;
231
+ }
232
+ if (w[l - 1] === 39 /* ' */) --l;
233
+ if (l >= 2 && w[l - 2] === 39 /* ' */ && w[l - 1] === 115 /* s */) l -= 2;
234
+ // mark_regions
235
+ let rv = 0;
236
+ // rv is the position after the first vowel
237
+ while (rv < l && !isV(w[rv])) ++rv;
238
+ if (rv < l) ++rv;
239
+ let r1 = rv;
240
+ if (
241
+ l >= 5 &&
242
+ ((w[0] === 103 && w[1] === 101 && w[2] === 110 && w[3] === 101 && w[4] === 114) || // gener
243
+ (w[0] === 97 && w[1] === 114 && w[2] === 115 && w[3] === 101 && w[4] === 110)) // arsen
244
+ )
245
+ r1 = 5;
246
+ else if (
247
+ l >= 6 &&
248
+ w[0] === 99 && // c
249
+ w[1] === 111 && // o
250
+ w[2] === 109 && // m
251
+ w[3] === 109 && // m
252
+ w[4] === 117 && // u
253
+ w[5] === 110 // n
254
+ )
255
+ // commun
256
+ r1 = 6;
257
+ else {
258
+ // > R1 is the region after the first non-vowel following a vowel,
259
+ // > or the end of the word if there is no such non-vowel.
260
+ while (r1 < l && isV(w[r1])) ++r1;
261
+ if (r1 < l) ++r1;
262
+ }
263
+ // > R2 is the region after the first non-vowel following a vowel in R1,
264
+ // > or the end of the word if there is no such non-vowel.
265
+ let r2 = r1;
266
+ while (r2 < l && !isV(w[r2])) ++r2;
267
+ while (r2 < l && isV(w[r2])) ++r2;
268
+ if (r2 < l) ++r2;
269
+ // Step_1a: sses, ies/ied and plain -s endings
270
+ if (l >= 3) {
271
+ if (w[l - 1] === 115) {
272
+ // s
273
+ if (l >= 4 && w[l - 2] === 101 && w[l - 3] === 115 && w[l - 4] === 115)
274
+ // sses
275
+ l -= 2; // sses -> ss
276
+ else if (w[l - 2] === 101 && w[l - 3] === 105)
277
+ // ies
278
+ l -= l >= 5 ? 2 : 1; // ies
279
+ else if (w[l - 2] !== 117 && w[l - 2] !== 115 && rv < l - 1)
280
+ // us ss -> <nothing>; s -> "delete if the preceding word part
281
+ // contains a vowel not immediately before the s"
282
+ l -= 1;
283
+ } else if (w[l - 1] === 100 && w[l - 2] === 101 && w[l - 3] === 105) l -= l >= 5 ? 2 : 1; // ied
284
+ }
285
+ // exception2: these words are returned as they stand after Step_1a
286
+ if (
287
+ (l === 6 &&
288
+ ((w[0] === 105 && // i
289
+ w[1] === 110 && // n
290
+ w[2] === 110 && // n
291
+ w[3] === 105 && // i
292
+ w[4] === 110 && // n
293
+ w[5] === 103) || // g (inning)
294
+ (w[0] === 111 && // o
295
+ w[1] === 117 && // u
296
+ w[2] === 116 && // t
297
+ w[3] === 105 && // i
298
+ w[4] === 110 && // n
299
+ w[5] === 103) || // g (outing)
300
+ (w[0] === 101 && // e
301
+ w[1] === 120 && // x
302
+ w[2] === 99 && // c
303
+ w[3] === 101 && // e
304
+ w[4] === 101 && // e
305
+ w[5] === 100))) || // d (exceed)
306
+ (l === 7 &&
307
+ ((w[0] === 99 && // c
308
+ w[1] === 97 && // a
309
+ w[2] === 110 && // n
310
+ w[3] === 110 && // n
311
+ w[4] === 105 && // i
312
+ w[5] === 110 && // n
313
+ w[6] === 103) || // g (canning)
314
+ (w[0] === 104 && // h
315
+ w[1] === 101 && // e
316
+ w[2] === 114 && // r
317
+ w[3] === 114 && // r
318
+ w[4] === 105 && // i
319
+ w[5] === 110 && // n
320
+ w[6] === 103) || // g (herring)
321
+ (w[0] === 101 && // e
322
+ w[1] === 97 && // a
323
+ w[2] === 114 && // r
324
+ w[3] === 114 && // r
325
+ w[4] === 105 && // i
326
+ w[5] === 110 && // n
327
+ w[6] === 103) || // g (earring)
328
+ (w[0] === 112 && // p
329
+ w[1] === 114 && // r
330
+ w[2] === 111 && // o
331
+ w[3] === 99 && // c
332
+ w[4] === 101 && // e
333
+ w[5] === 101 && // e
334
+ w[6] === 100) || // d (proceed)
335
+ (w[0] === 115 && // s
336
+ w[1] === 117 && // u
337
+ w[2] === 99 && // c
338
+ w[3] === 99 && // c
339
+ w[4] === 101 && // e
340
+ w[5] === 101 && // e
341
+ w[6] === 100))) // d (succeed)
342
+ ) {
343
+ let exp2Out = '';
344
+ for (let i = 0; i < l; ++i) exp2Out += String.fromCharCode(w[i]);
345
+ return exp2Out;
346
+ }
347
+ // Step_1b: eed, ed and ing endings, each optionally followed by ly
348
+ let ll =
349
+ // l (length) without the -ly ending
350
+ l >= 2 && w[l - 1] === 121 && w[l - 2] === 108 ? l - 2 : l;
351
+ if (ll >= 3) {
352
+ if (w[ll - 3] === 101 && w[ll - 2] === 101 && w[ll - 1] === 100) {
353
+ // eed
354
+ if (ll >= r1 + 3) l = ll - 1; // eed eedly -> ee (if in R1)
355
+ } else {
356
+ // ll without: ed edly ing ingly (-1 if not found)
357
+ if (w[ll - 2] === 101 && w[ll - 1] === 100)
358
+ ll -= 2; // ed
359
+ else if (w[ll - 3] === 105 && w[ll - 2] === 110 && w[ll - 1] === 103)
360
+ ll -= 3; // ing
361
+ else ll = -1;
362
+ if (ll >= 0 && rv <= ll) {
363
+ l = ll;
364
+ if (l >= 2) {
365
+ if (
366
+ (w[l - 1] === 116 && w[l - 2] === 97) || // at
367
+ (w[l - 1] === 108 && w[l - 2] === 98) || // bl
368
+ (w[l - 1] === 122 && w[l - 2] === 105) // iz
369
+ ) {
370
+ // at -> ate bl -> ble iz -> ize
371
+ w[l] = 101;
372
+ ++l;
373
+ } else if (w[l - 2] === w[l - 1] && isDouble(w[l - 1])) {
374
+ --l;
375
+ } else if (r1 >= l && isShortV(w, l)) {
376
+ // <shortv> -> e
377
+ w[l] = 101;
378
+ ++l;
379
+ }
380
+ }
381
+ }
382
+ }
383
+ }
384
+ // Step_1c: terminal y/Y after a non-vowel becomes i
385
+ if (l >= 3 && (w[l - 1] === 89 || w[l - 1] === 121) && !isV(w[l - 2])) w[l - 1] = 105; // i
386
+ // Step_2: longer derivational suffixes, which must lie in R1
387
+ if (l >= r1 + 2) {
388
+ switch (w[l - 1]) {
389
+ case 108: // l
390
+ if (
391
+ l >= r1 + 6 &&
392
+ w[l - 2] === 97 && // a
393
+ w[l - 3] === 110 && // n
394
+ w[l - 4] === 111 && // o
395
+ w[l - 5] === 105 && // i
396
+ w[l - 6] === 116 // t (tional)
397
+ ) {
398
+ if (l >= 7 && w[l - 7] === 97) {
399
+ // a (ational)
400
+ if (l >= r1 + 7) {
401
+ // ational -> ate
402
+ l -= 4;
403
+ w[l - 1] = 101; // e
404
+ }
405
+ } else {
406
+ l -= 2; // tional -> tion
407
+ }
408
+ }
409
+ break;
410
+ case 110: // n
411
+ if (
412
+ l >= r1 + 5 &&
413
+ w[l - 2] === 111 && // o
414
+ w[l - 3] === 105 && // i
415
+ w[l - 4] === 116 && // t
416
+ w[l - 5] === 97 // a (ation)
417
+ ) {
418
+ if (l >= 7 && w[l - 6] === 122 && w[l - 7] === 105) {
419
+ // iz (ization)
420
+ if (l >= r1 + 7) {
421
+ // ization -> ize
422
+ l -= 4;
423
+ w[l - 1] = 101; // e
424
+ }
425
+ } else {
426
+ // ation -> ate
427
+ l -= 2;
428
+ w[l - 1] = 101; // e
429
+ }
430
+ }
431
+ break;
432
+ case 114: // r
433
+ if (l >= r1 + 4) {
434
+ if (w[l - 2] === 101) {
435
+ // e (er)
436
+ if (w[l - 3] === 122 && w[l - 4] === 105) --l; // izer -> ize
437
+ } else if (w[l - 2] === 111) {
438
+ // o (or)
439
+ if (w[l - 3] === 116 && w[l - 4] === 97) {
440
+ // ator
441
+ --l;
442
+ w[l - 1] = 101; // e
443
+ }
444
+ }
445
+ }
446
+ break;
447
+ case 115: // s
448
+ if (
449
+ l >= r1 + 7 &&
450
+ w[l - 2] === 115 && // s
451
+ w[l - 3] === 101 && // e
452
+ w[l - 4] === 110 && // n (ness)
453
+ ((w[l - 5] === 108 && w[l - 6] === 117 && w[l - 7] === 102) || // fulness
454
+ (w[l - 5] === 115 && w[l - 6] === 117 && w[l - 7] === 111) || // ousness
455
+ (w[l - 5] === 101 && w[l - 6] === 118 && w[l - 7] === 105)) // iveness
456
+ ) {
457
+ l -= 4; // fulness -> ful ousness -> ous iveness -> ive
458
+ }
459
+ break;
460
+ case 109: // m
461
+ if (
462
+ l >= r1 + 5 &&
463
+ w[l - 2] === 115 && // s
464
+ w[l - 3] === 105 && // i
465
+ w[l - 4] === 108 && // l
466
+ w[l - 5] === 97 // a (alism)
467
+ )
468
+ l -= 3; // alism -> al
469
+ break;
470
+ case 105: // i
471
+ if (w[l - 2] === 99) {
472
+ // c (ci)
473
+ if (l >= r1 + 4 && (w[l - 4] === 101 || w[l - 4] === 97) && w[l - 3] === 110) {
474
+ // enci anci
475
+ w[l - 1] = 101; // enci -> ence anci -> ance
476
+ }
477
+ } else if (w[l - 2] === 103) {
478
+ // g (gi)
479
+ if (l >= r1 + 3 && l >= 4 && w[l - 2] === 103 && w[l - 3] === 111 && w[l - 4] === 108)
480
+ // logi
481
+ --l; // ogi -> og (if preceded by l)
482
+ } else if (w[l - 2] === 116) {
483
+ // t (ti)
484
+ if (l >= r1 + 5 && w[l - 3] === 105) {
485
+ // iti
486
+ if (w[l - 4] === 108) {
487
+ // liti
488
+ if (l >= 6 && w[l - 5] === 105 && w[l - 6] === 98) {
489
+ // biliti
490
+ if (l >= r1 + 6) {
491
+ // biliti -> ble
492
+ l -= 3;
493
+ w[l - 2] = 108; // l
494
+ w[l - 1] = 101; // e
495
+ }
496
+ } else if (w[l - 4] === 108 && w[l - 5] === 97) {
497
+ // aliti
498
+ l -= 3; // aliti -> al
499
+ }
500
+ } else if (w[l - 4] === 118 && w[l - 5] === 105) {
501
+ // iviti
502
+ // iviti -> ive
503
+ l -= 2;
504
+ w[l - 1] = 101; // e
505
+ }
506
+ }
507
+ } else if (w[l - 2] === 108 && l >= 3) {
508
+ // l (li)
509
+ if (w[l - 3] === 98) {
510
+ // bli
511
+ if (l >= 4 && w[l - 4] === 97) {
512
+ // abli
513
+ if (l >= r1 + 4) w[l - 1] = 101; // abli -> able
514
+ } else if (l >= r1 + 3) {
515
+ w[l - 1] = 101; // bli -> ble
516
+ }
517
+ } else {
518
+ // Remove li
519
+ if (w[l - 3] === 108) {
520
+ // lli
521
+ if (l >= 5 && w[l - 4] === 117 && w[l - 5] === 102) {
522
+ // fulli
523
+ if (l >= r1 + 5) l -= 2; // fulli -> ful
524
+ } else if (l >= r1 + 4 && w[l - 4] === 97) {
525
+ // alli
526
+ l -= 2; // alli -> al
527
+ }
528
+ } else if (w[l - 3] === 115) {
529
+ // sli
530
+ if (l >= 6 && w[l - 4] === 115 && w[l - 5] === 101 && w[l - 6] === 108) {
531
+ // lessli
532
+ if (l >= r1 + 6) l -= 2; // lessli -> less
533
+ } else if (l >= r1 + 5 && w[l - 4] === 117 && w[l - 5] === 111) {
534
+ // ousli
535
+ l -= 2; // ousli -> ous
536
+ }
537
+ } else if (l >= 5 && w[l - 3] === 116 && w[l - 4] === 110 && w[l - 5] === 101) {
538
+ // entli
539
+ if (l >= r1 + 5) l -= 2; // entli -> ent
540
+ } else if (isValidLi(w[l - 3])) {
541
+ l -= 2;
542
+ }
543
+ }
544
+ }
545
+ }
546
+ }
547
+ // Step_3: further suffixes in R1 (ative must be in R2)
548
+ if (l >= r1 + 3) {
549
+ switch (w[l - 1]) {
550
+ case 108: // l
551
+ if (w[l - 3] === 99) {
552
+ // cal
553
+ if (l >= r1 + 4 && w[l - 4] === 105 && w[l - 2] === 97) l -= 2; // ical -> ic
554
+ } else if (w[l - 3] === 102) {
555
+ // ful
556
+ if (w[l - 2] === 117) l -= 3; // ful -> <delete>
557
+ } else if (w[l - 3] === 110) {
558
+ // nal
559
+ if (
560
+ l >= r1 + 6 &&
561
+ w[l - 2] === 97 && // a
562
+ w[l - 4] === 111 && // o
563
+ w[l - 5] === 105 && // i
564
+ w[l - 6] === 116 // t (tional)
565
+ ) {
566
+ if (l >= 7 && w[l - 7] === 97) {
567
+ // ational
568
+ if (l >= r1 + 7) {
569
+ // ational -> ate
570
+ l -= 4;
571
+ w[l - 1] = 101; // e
572
+ }
573
+ } else {
574
+ l -= 2; // tional -> tion
575
+ }
576
+ }
577
+ }
578
+ break;
579
+ case 101: // e
580
+ if (w[l - 2] === 122) {
581
+ // ze
582
+ if (l >= r1 + 5 && w[l - 3] === 105 && w[l - 4] === 108 && w[l - 5] === 97) l -= 3; // alize -> al
583
+ } else if (w[l - 2] === 116) {
584
+ // te
585
+ if (l >= r1 + 5 && w[l - 3] === 97 && w[l - 4] === 99 && w[l - 5] === 105) l -= 3; // icate -> ic
586
+ } else if (w[l - 2] === 118) {
587
+ // ve
588
+ if (l >= r2 + 5 && w[l - 3] === 105 && w[l - 4] === 116 && w[l - 5] === 97) l -= 5; // ative -> <delete> (if in R2)
589
+ }
590
+ break;
591
+ case 105: // i
592
+ if (
593
+ l >= r1 + 5 &&
594
+ w[l - 2] === 116 && // t
595
+ w[l - 3] === 105 && // i
596
+ w[l - 4] === 99 && // c
597
+ w[l - 5] === 105 // i (iciti)
598
+ )
599
+ l -= 3; // iciti -> ic
600
+ break;
601
+ case 115: // s
602
+ if (l >= r1 + 4 && w[l - 2] === 115 && w[l - 3] === 101 && w[l - 4] === 110) l -= 4; // ness -> <delete>
603
+ }
604
+ }
605
+ // Step_4: delete residual suffixes found in R2
606
+ if (l >= r2 + 2) {
607
+ switch (w[l - 1]) {
608
+ case 110: // n
609
+ if (
610
+ l >= r2 + 3 &&
611
+ w[l - 2] === 111 && // o
612
+ w[l - 3] === 105 && // i (ion)
613
+ (w[l - 4] === 115 || w[l - 4] === 116) // s or t
614
+ )
615
+ l -= 3; // ion -> <delete> (if preceded by s or t)
616
+ break;
617
+ case 108: // l
618
+ if (w[l - 2] === 97) l -= 2; // al
619
+ break;
620
+ case 114: // r
621
+ if (w[l - 2] === 101) l -= 2; // er
622
+ break;
623
+ case 99: // c
624
+ if (w[l - 2] === 105) l -= 2; // ic
625
+ break;
626
+ case 109: // m
627
+ if (l >= r2 + 3 && w[l - 2] === 115 && w[l - 3] === 105) l -= 3; // ism
628
+ break;
629
+ case 105: // i
630
+ if (l >= r2 + 3 && w[l - 2] === 116 && w[l - 3] === 105) l -= 3; // iti
631
+ break;
632
+ case 115: // s
633
+ if (l >= r2 + 3 && w[l - 2] === 117 && w[l - 3] === 111) l -= 3; // ous
634
+ break;
635
+ case 116: // t
636
+ if (l >= r2 + 3 && w[l - 2] === 110) {
637
+ // nt
638
+ if (w[l - 3] === 97) {
639
+ // ant
640
+ l -= 3; // ant
641
+ } else if (w[l - 3] === 101) {
642
+ // ent
643
+ if (l >= 4 && w[l - 4] === 109) {
644
+ // ment
645
+ if (l >= 5 && w[l - 5] === 101) {
646
+ // ement
647
+ if (l >= r2 + 5) l -= 5; // ement
648
+ } else if (l >= r2 + 4) {
649
+ l -= 4; // ment
650
+ }
651
+ } else {
652
+ l -= 3; // ent
653
+ }
654
+ }
655
+ }
656
+ break;
657
+ case 101: // e
658
+ if (w[l - 2] === 99) {
659
+ // ce
660
+ if (l >= r2 + 4 && w[l - 3] === 110 && (w[l - 4] === 97 || w[l - 4] === 101)) l -= 4; // ance ence
661
+ } else if (w[l - 2] === 108) {
662
+ // le
663
+ if (l >= r2 + 4 && w[l - 3] === 98 && (w[l - 4] === 97 || w[l - 4] === 105)) l -= 4; // able ible
664
+ } else if (w[l - 2] === 116) {
665
+ // te
666
+ if (l >= r2 + 3 && w[l - 3] === 97) l -= 3; // ate
667
+ } else if (l >= r2 + 3 && (w[l - 2] === 118 || w[l - 2] === 122) && w[l - 3] === 105) {
668
+ // ive ize
669
+ l -= 3; // ive ize
670
+ }
671
+ }
672
+ }
673
+ // Step_5: drop a final e, or the second l of a final ll
674
+ if (
675
+ l >= r1 + 1 && // r1 is >= 1
676
+ ((l >= r2 + 1 && w[l - 1] === 108 && w[l - 2] === 108) || // ll
677
+ (w[l - 1] === 101 && (l >= r2 + 1 || !isShortV(w, l - 1)))) // e
678
+ )
679
+ --l;
680
+ let out = '';
681
+ if (yFound) {
682
+ for (let i = 0; i < l; ++i) {
683
+ out += String.fromCharCode(w[i] === 89 ? 121 : w[i]); // Y -> y
684
+ }
685
+ } else {
686
+ for (let i = 0; i < l; ++i) out += String.fromCharCode(w[i]);
687
+ }
688
+ return out;
689
+ };
690
+
691
+ // #endregion Porter2 Stemmer Algorithm
692
+
693
+ // src/constants.ts
694
/**
 * Module-level default option values.
 * `k1` and `b` are presumably the BM25 scoring parameters (their use is not
 * visible in this part of the file); the remaining fields configure
 * tokenization: minimum token length, the English stop-word list, and stemming.
 */
const DEFAULT_OPTIONS = {
  k1: 1.2,
  b: 0.75,
  minLength: 2,
  // 25 common English function words
  stopWords: /* @__PURE__ */ new Set<string>([
    'a', 'an', 'and', 'are', 'as',
    'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it',
    'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
  ]),
  stemming: false,
  // Identity function: by default a word is returned untouched
  stemWords: (word: string): string => word,
};
728
+
729
/**
 * Counters and timing reported for a single tokenization run.
 */
interface TokenizationStats {
  /** How many whitespace-separated words the raw input contained. */
  originalWordCount: number;
  /** How many words were dropped for being stop words. */
  stopWordsRemoved: number;
  /** How many words went through stemming (0 when stemming is disabled). */
  stemmedWords: number;
  /** Elapsed tokenization time, in milliseconds. */
  processingTimeMs: number;
}
742
+
743
/**
 * What a tokenization call hands back to its caller.
 */
interface TokenizationResult {
  /** The processed tokens, in the order they were produced. */
  tokens: string[];
  /** Statistics about the run; may be absent. */
  stats?: TokenizationStats;
}
752
+
753
/**
 * A single custom suffix-rewriting rule for the tokenizer.
 */
interface StemmingRule {
  /** Suffix matcher: either a RegExp or a string holding a regular-expression source. */
  pattern: RegExp | string;
  /** Replacement text, or a replacer callback in the style of `String.prototype.replace`. */
  replacement: string | ((substring: string, ...args: any[]) => string);
  /** Optional minimum measure (complexity) the word stem must have for the rule to apply. */
  minMeasure?: number;
}
764
+
765
/**
 * Settings accepted by the Tokenizer constructor. Every field is optional.
 */
interface TokenizerOptions {
  /** Words to drop during tokenization. Defaults to an empty set. */
  stopWords?: Set<string>;
  /** Shortest token length that is kept. Defaults to 2. Purely numeric tokens are kept regardless. */
  minLength?: number;
  /** Turns stemming of tokens on or off. Defaults to false. */
  stemming?: boolean;
  /** Custom rules tried before the built-in Porter2 stemmer. Defaults to an empty array. */
  stemmingRules?: StemmingRule[];
}
778
+
779
+ /**
780
+ * Flexible text tokenizer with support for stop words, minimum token length,
781
+ * Unicode normalization, and optional Porter2 stemming with custom rules.
782
+ */
783
+ class Tokenizer {
784
+ /** Set of stop words to ignore. */
785
+ readonly stopWords: Set<string>;
786
+ /** Minimum length of tokens to keep. */
787
+ readonly minLength: number;
788
+ /** Flag indicating if stemming is enabled. */
789
+ readonly stemming: boolean;
790
+ /** Custom stemming rules. */
791
+ readonly stemmingRules: {
792
+ pattern: RegExp;
793
+ replacement: string | ((substring: string, ...args: any[]) => string);
794
+ minMeasure?: number;
795
+ }[];
796
+
797
+ /** Default options for the Tokenizer. */
798
+ static readonly DEFAULT_OPTIONS: Required<TokenizerOptions> = {
799
+ stopWords: /* @__PURE__ */ new Set<string>(),
800
+ minLength: 2,
801
+ stemming: false,
802
+ stemmingRules: [],
803
+ };
804
+
805
/**
 * Builds a tokenizer, taking any option the caller leaves out from
 * `Tokenizer.DEFAULT_OPTIONS`.
 * @param options - Stop words, minimum token length, stemming flag and custom stemming rules.
 */
constructor(options: TokenizerOptions = {}) {
  const { stopWords, minLength, stemming, stemmingRules } = {
    ...Tokenizer.DEFAULT_OPTIONS,
    ...options,
  };
  this.stopWords = stopWords;
  this.minLength = minLength;
  this.stemming = stemming;
  // String patterns are compiled here so that every stored rule carries a RegExp
  this.stemmingRules = stemmingRules.map((rule) => {
    const pattern = typeof rule.pattern === 'string' ? new RegExp(rule.pattern) : rule.pattern;
    return { ...rule, pattern };
  });
}
820
+
821
/**
 * Tokenizes input text into an array of processed terms.
 * Steps:
 * 1. Cleans the text with `cleanText` (lowercase, normalize, remove punctuation/symbols).
 * 2. Splits the cleaned text on whitespace into candidate tokens.
 * 3. Drops candidates rejected by `isValidToken` (`minLength` / `stopWords`).
 * 4. Passes each surviving token through `stemWord` when `stemming` is true.
 * 5. Fills in the statistics when `includeStats` is true.
 *
 * @param text - The input text string to tokenize.
 * @param includeStats - If true, the returned `stats` carry real values; otherwise every
 *   statistic is 0. Defaults to false.
 * @returns A `TokenizationResult` with the tokens and a `stats` object (always present).
 * @throws {Error} If the input text is null, undefined, or empty.
 */
tokenize(text: string, includeStats = false): TokenizationResult {
  if (!text) {
    throw new Error('Input text cannot be null or empty');
  }
  const startTime = Date.now();
  const tokens: string[] = [];
  let stopWordsRemoved = 0;
  for (const candidate of this.cleanText(text).split(/\s+/)) {
    if (!this.isValidToken(candidate)) {
      // Count only genuine stop-word hits. Subtracting token counts would also
      // include tokens dropped for length, and could even go negative when
      // punctuation splits one input word into several tokens.
      if (this.stopWords.has(candidate)) ++stopWordsRemoved;
      continue;
    }
    tokens.push(this.stemming ? this.stemWord(candidate) : candidate);
  }
  const stats: TokenizationStats = includeStats
    ? {
        // Words are counted on the raw text, before any cleaning
        originalWordCount: text.split(/\s+/).filter((word) => word.length > 0).length,
        stopWordsRemoved,
        // Every kept token goes through stemWord when stemming is enabled
        stemmedWords: this.stemming ? tokens.length : 0,
        processingTimeMs: Date.now() - startTime,
      }
    : {
        originalWordCount: 0,
        stopWordsRemoved: 0,
        stemmedWords: 0,
        processingTimeMs: 0,
      };
  return { tokens, stats };
}
861
+
862
/**
 * Cleans and normalizes text for tokenization.
 * - Converts to lowercase.
 * - Normalizes Unicode characters (NFKD).
 * - Removes control characters and zero-width spaces.
 * - Removes diacritical marks (accents).
 * - Recomposes what is left (NFC), so Hangul syllables that NFKD split into
 *   conjoining jamo are put back together instead of being stripped by the final filter.
 * - Removes emojis and pictographs.
 * - Removes common symbols (™, ®, ©, ℠, ‼).
 * - Replaces Unicode punctuation (apostrophes included) with spaces.
 * - Replaces characters not matching basic Latin, digits, CJK, Hangul, or whitespace with spaces.
 * - Collapses multiple spaces into single spaces.
 * - Trims leading/trailing whitespace.
 *
 * @param text - Input text to clean.
 * @returns Cleaned and normalized text, ready for splitting into tokens.
 *
 * @example
 * cleanText("Hello, World™!") // "hello world"
 * cleanText("héllo 👋") // "hello"
 * cleanText("Hello 世界!") // "hello 世界"
 * cleanText("I'm don't") // "i m don t" (apostrophes are punctuation and become spaces)
 * cleanText("test©2023") // "test2023" (the symbol is removed, not replaced by a space)
 * cleanText("한국어 검색") // "한국어 검색"
 */
cleanText(text: string): string {
  return text
    .toLowerCase()
    .normalize('NFKD')
    .replace(/[\u0000-\u001F\u007F-\u009F\u200B-\u200D\uFEFF]/g, '') // Control characters & zero-width spaces
    .replace(/[\u0300-\u036f]/g, '') // Diacritical marks
    .normalize('NFC') // Recompose Hangul jamo (and kana voicing marks) left decomposed by NFKD
    .replace(/[\p{Emoji_Presentation}\p{Extended_Pictographic}]/gu, '') // Emojis and pictographs
    .replace(/[™®©℠‼]/g, '') // Common symbols
    .replace(/[\p{P}]/gu, ' ') // Unicode punctuation to space
    .replace(/[^a-z0-9\u3040-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF\s]/gu, ' ') // Keep only latin, cjk, hangul, numbers, whitespace
    .replace(/\s+/g, ' ') // Collapse multiple spaces
    .trim();
}
898
+
899
+ /**
900
+ * Checks if a token is valid (meets `minLength` criteria and is not a stop word).
901
+ * Numeric tokens are always considered valid regardless of length.
902
+ * @param token - The token string to validate.
903
+ * @returns `true` if the token is valid, `false` otherwise.
904
+ */
905
+ isValidToken(token: string): boolean {
906
+ const isNumeric = /^\d+$/.test(token);
907
+ return (token.length >= this.minLength || isNumeric) && !this.stopWords.has(token);
908
+ }
909
+
910
+ /**
911
+ * Applies stemming to a single word.
912
+ * First, tries to apply custom stemming rules defined in `stemmingRules`.
913
+ * If no custom rule matches, applies the default Porter2 stemming algorithm.
914
+ * Words shorter than 3 characters are not stemmed.
915
+ * @param word - The word to stem.
916
+ * @returns The stemmed word.
917
+ */
918
+ stemWord(word: string): string {
919
+ if (word.length < 3) return word;
920
+ let customRuleApplied = false;
921
+ let stemmed = word;
922
+ for (const rule of this.stemmingRules) {
923
+ const match = stemmed.match(rule.pattern);
924
+ if (match) {
925
+ if (
926
+ !rule.minMeasure ||
927
+ this.measure(stemmed.substring(0, match.index)) >= rule.minMeasure
928
+ ) {
929
+ // Apply replacement
930
+ if (typeof rule.replacement === 'string') {
931
+ stemmed = stemmed.replace(rule.pattern, rule.replacement);
932
+ } else {
933
+ // If replacement is a function, it might need more specific arguments based on its definition.
934
+ // Assuming it takes the matched substring and potentially other match groups.
935
+ stemmed = stemmed.replace(rule.pattern, (...args) =>
936
+ (rule.replacement as Function)(...args)
937
+ );
938
+ }
939
+ customRuleApplied = true; // Mark that a custom rule was (potentially) applied
940
+ // Depending on stemming strategy, might want to break or continue applying rules
941
+ }
942
+ }
943
+ }
944
+ // If a custom rule was applied and modified the word, return it.
945
+ // Otherwise, or if custom rules are meant to precede default stemming, apply Porter2.
946
+ if (customRuleApplied && stemmed !== word) return stemmed; // Return if custom rule changed the word
947
+
948
+ // Fallback to Porter2 if no custom rule applied or if custom rules are pre-processing
949
+ return stem(stemmed); // Apply Porter2 to the (potentially already custom-stemmed) word
950
+ }
951
+
952
+ /**
953
+ * Checks if the character at a given index in a word is a consonant.
954
+ * Treats 'y' as a consonant if it's the first letter or follows a consonant.
955
+ * @param word - The word string.
956
+ * @param i - The index of the character to check.
957
+ * @returns `true` if the character is a consonant, `false` otherwise.
958
+ */
959
+ isConsonant(word: string, i: number): boolean {
960
+ const char = word[i];
961
+ if ('aeiou'.includes(char)) return false;
962
+ return char !== 'y' || (i === 0 ? true : !this.isConsonant(word, i - 1));
963
+ }
964
+
965
+ /**
966
+ * Calculates the "measure" of a word stem (approximates syllable count).
967
+ * The measure (m) is the number of times a sequence of vowels is followed by a
968
+ * sequence of consonants (VC). Used in some stemming rules.
969
+ * Example: measure("tree") = 0, measure("trouble") = 1, measure("private") = 2
970
+ * @param word - The word (or stem) to measure.
971
+ * @returns The measure (m) of the word.
972
+ */
973
+ measure(word: string): number {
974
+ let m = 0;
975
+ let vowelSeen = false;
976
+ for (let i = 0; i < word.length; i++) {
977
+ if (this.isConsonant(word, i)) {
978
+ if (vowelSeen) {
979
+ m++;
980
+ vowelSeen = false;
981
+ }
982
+ } else {
983
+ vowelSeen = true;
984
+ }
985
+ }
986
+ return m;
987
+ }
988
+ }
989
+
990
/**
 * Options accepted by the BM25 constructor.
 * Combines the tokenizer settings inherited from `TokenizerOptions` with the
 * BM25 scoring parameters below. Every field is optional; the constructor
 * merges the supplied object over `DEFAULT_OPTIONS`.
 */
interface BM25Options extends TokenizerOptions {
  /**
   * Term frequency saturation parameter (k1). Larger values let repeated
   * occurrences of a term keep adding to the score for longer before
   * saturating. Typical values lie between 1.2 and 2.0 (documented default: 1.2).
   */
  k1?: number;
  /**
   * Document length normalization parameter (b). 0 disables length
   * normalization, 1 applies it fully. Typical value and documented
   * default: 0.75.
   */
  b?: number;
  /**
   * Per-field boost factors, keyed by document field name. Tokens found in a
   * boosted field contribute proportionally more to term frequency and
   * document length. Example: `{ title: 2, body: 1 }`. Fields without an
   * entry use a boost of 1.
   */
  fieldBoosts?: Record<string, number>;
}
1014
+
1015
/**
 * One ranked hit returned by a BM25 search.
 */
interface SearchResult {
  /** Position of the matching document in the indexed document array. */
  index: number;
  /** BM25 relevance score of the document; larger means more relevant. */
  score: number;
  /**
   * The matched document itself. Optional: results may carry only `index`
   * and `score`, in which case the document can be looked up by index.
   */
  doc?: any; // Consider using a generic <T> for BM25 class if docs are typed
}
1026
+
1027
/**
 * Implements the Okapi BM25 (Best Matching 25) ranking function for information retrieval.
 *
 * BM25 ranks documents based on the query terms appearing in each document,
 * considering term frequency (TF) and inverse document frequency (IDF).
 * It improves upon basic TF-IDF by incorporating:
 * - Term Frequency Saturation (k1): Prevents overly frequent terms from dominating the score.
 * - Document Length Normalization (b): Penalizes documents that are longer than average.
 *
 * Index layout:
 * - `termToIndex` assigns every distinct token an integer id.
 * - `termFrequencies` maps term id -> (document index -> boost-weighted TF).
 * - `documentFrequency[termId]` is the number of documents containing the term.
 * - `documentLengths[docIndex]` is the boost-weighted token count of the document.
 *   It is a `Uint32Array`, so fractional lengths produced by non-integer field
 *   boosts are truncated when stored.
 *
 * Only string-valued fields of a document are indexed. Empty string fields
 * contribute no tokens.
 */
export class BM25 {
  /** Term frequency saturation parameter (k1). */
  readonly termFrequencySaturation: number; // k1
  /** Document length normalization factor (b). */
  readonly lengthNormalizationFactor: number; // b
  /** Tokenizer instance used for processing text. */
  readonly tokenizer: Tokenizer;
  /** Length (number of tokens, weighted by field boosts) of each document. */
  documentLengths: Uint32Array;
  /** Average of `documentLengths` over all indexed documents. */
  averageDocLength: number;
  /** Map from term (string) to its unique integer index. */
  termToIndex: Map<string, number>;
  /** Document frequency (number of docs containing the term) per term index. May be over-allocated. */
  documentFrequency: Uint32Array;
  /** Map from term index to a map of `docIndex -> termFrequencyInDoc`. */
  termFrequencies: Map<number, Map<number, number>>;
  /** Boost factors for different fields within documents. */
  readonly fieldBoosts: { [key: string]: number };
  /** The original documents, in index order. */
  documents: any[]; // Consider using a generic <T>

  /**
   * Creates a new BM25 search instance.
   * @param docs - Optional array of initial documents (objects with string fields) to index.
   * @param options - BM25 parameters (k1, b), tokenizer options and field boosts;
   *   merged over `DEFAULT_OPTIONS`.
   */
  constructor(docs?: any[], options: BM25Options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    this.termFrequencySaturation = opts.k1!; // DEFAULT_OPTIONS is expected to provide k1
    this.lengthNormalizationFactor = opts.b!; // DEFAULT_OPTIONS is expected to provide b
    this.tokenizer = new Tokenizer(opts);
    this.fieldBoosts = opts.fieldBoosts || {};

    // Start from an empty index.
    this.documents = [];
    this.documentLengths = new Uint32Array(0);
    this.termToIndex = new Map<string, number>();
    this.documentFrequency = new Uint32Array(0);
    this.averageDocLength = 0;
    this.termFrequencies = new Map<number, Map<number, number>>();

    // Bulk-index the initial documents, if any.
    if (docs && docs.length > 0) {
      this.documents = [...docs];
      const { documentLengths, termToIndex, documentFrequency, averageDocLength, termFrequencies } =
        this.processDocuments(docs);
      this.documentLengths = documentLengths;
      this.termToIndex = termToIndex;
      this.documentFrequency = documentFrequency;
      this.averageDocLength = averageDocLength;
      this.termFrequencies = termFrequencies;
    }
  }

  /**
   * Tokenizes one field value. The tokenizer rejects empty input with an
   * exception, so an empty field is treated here as "no tokens" instead of
   * aborting indexing of the whole document.
   * @internal
   */
  private tokenizeField(content: string): string[] {
    if (content.length === 0) return [];
    return this.tokenizer.tokenize(content).tokens;
  }

  /**
   * Tokenizes every string field of a document without touching the index.
   * @returns The boost-weighted document length and, per term, the
   *   boost-weighted term frequency (terms in first-occurrence order).
   * @internal
   */
  private analyzeDocument(doc: any): { length: number; termCounts: Map<string, number> } {
    let length = 0;
    const termCounts = new Map<string, number>();

    for (const [field, content] of Object.entries(doc)) {
      if (typeof content !== 'string') continue; // Only string fields are indexed

      const fieldBoost = this.fieldBoosts[field] || 1;
      const tokens = this.tokenizeField(content);
      length += tokens.length * fieldBoost;
      for (const term of tokens) {
        termCounts.set(term, (termCounts.get(term) || 0) + fieldBoost); // TF weighted by boost
      }
    }
    return { length, termCounts };
  }

  /**
   * BM25 contribution of a single term to a single document:
   * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * docLength / avgDocLength))`.
   * When the average length is 0 the document is treated as average-length,
   * which avoids a 0/0 division.
   * @internal
   */
  private scoreTerm(idf: number, tf: number, docLength: number): number {
    const k1 = this.termFrequencySaturation;
    const b = this.lengthNormalizationFactor;
    const lengthTerm = this.averageDocLength > 0 ? (b * docLength) / this.averageDocLength : b;
    const numerator = tf * (k1 + 1);
    const denominator = tf + k1 * (1 - b + lengthTerm);
    return idf * (numerator / denominator);
  }

  /**
   * Builds fresh index structures for an array of documents.
   * @param docs - Array of documents to process.
   * @returns Document lengths, term ids, document frequencies, average length and term frequencies.
   * @internal
   */
  private processDocuments(docs: any[]): {
    documentLengths: Uint32Array;
    termToIndex: Map<string, number>;
    documentFrequency: Uint32Array;
    averageDocLength: number;
    termFrequencies: Map<number, Map<number, number>>;
  } {
    const numDocs = docs.length;
    const documentLengths = new Uint32Array(numDocs);
    const termToIndex = new Map<string, number>();
    const termFrequencies = new Map<number, Map<number, number>>();
    const docFrequencyCounts: number[] = []; // indexed by term id
    let totalDocLength = 0;

    docs.forEach((doc, docIndex) => {
      const { length, termCounts } = this.analyzeDocument(doc);

      documentLengths[docIndex] = length;
      // Sum the stored (possibly truncated) value so the average matches what
      // recalculateAverageLength() computes after incremental additions.
      totalDocLength += documentLengths[docIndex];

      termCounts.forEach((freq, term) => {
        let termIndexVal = termToIndex.get(term);
        if (termIndexVal === undefined) {
          termIndexVal = termToIndex.size;
          termToIndex.set(term, termIndexVal);
          termFrequencies.set(termIndexVal, new Map<number, number>());
          docFrequencyCounts.push(0);
        }
        termFrequencies.get(termIndexVal)!.set(docIndex, freq);
        // Each term appears once per document in termCounts, so this counts documents.
        docFrequencyCounts[termIndexVal]++;
      });
    });

    return {
      documentLengths,
      termToIndex,
      documentFrequency: Uint32Array.from(docFrequencyCounts),
      averageDocLength: numDocs > 0 ? totalDocLength / numDocs : 0,
      termFrequencies,
    };
  }

  /**
   * Recalculates the average document length based on the current `documentLengths`.
   * @internal
   */
  private recalculateAverageLength(): void {
    if (this.documentLengths.length === 0) {
      this.averageDocLength = 0;
      return;
    }
    const totalLength = this.documentLengths.reduce((sum, len) => sum + len, 0);
    this.averageDocLength = totalLength / this.documentLengths.length;
  }

  /**
   * Searches the indexed documents for a given query string using the BM25 ranking formula.
   * Query terms that are not in the index, or whose IDF is not positive, are ignored.
   *
   * @param query - The search query text. The tokenizer throws on empty input.
   * @param topK - The maximum number of top-scoring results to return. Defaults to 10.
   * @returns `SearchResult` objects (index and score) with positive scores, sorted by descending score.
   */
  search(query: string, topK = 10): SearchResult[] {
    const { tokens: queryTokens } = this.tokenizer.tokenize(query);
    const scores = new Float32Array(this.documentLengths.length); // zero-initialized

    for (const term of queryTokens) {
      const termIndex = this.termToIndex.get(term);
      if (termIndex === undefined) continue;

      const idf = this.calculateIdf(termIndex);
      if (idf <= 0) continue;

      const postings = this.termFrequencies.get(termIndex); // Map<DocIndex, TF>
      if (!postings) continue;

      // Only documents containing the term receive a contribution.
      postings.forEach((tf, docIndex) => {
        scores[docIndex] += this.scoreTerm(idf, tf, this.documentLengths[docIndex]);
      });
    }

    return Array.from(scores, (score, index) => ({ index, score }))
      .filter((result) => result.score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);
  }

  /**
   * Searches for an exact phrase within the indexed documents.
   * A document matches when one of its string fields, tokenized with the same
   * tokenizer, contains the phrase tokens as a contiguous run. Its score is
   * the sum of the BM25 scores of the phrase terms multiplied by the boost of
   * the first field in which the phrase was found; each document is scored once.
   *
   * @param phrase - The exact phrase to search for. The tokenizer throws on empty input.
   * @param topK - The maximum number of results to return. Defaults to 10.
   * @returns `SearchResult` objects for matching documents, sorted by descending score.
   */
  searchPhrase(phrase: string, topK = 10): SearchResult[] {
    const { tokens: phraseTokens } = this.tokenizer.tokenize(phrase);
    if (phraseTokens.length === 0) return [];

    // --- Candidate documents: those containing every phrase term ---
    let candidateDocs: Set<number> | null = null;
    for (const term of phraseTokens) {
      const termIndex = this.termToIndex.get(term);
      if (termIndex === undefined) return []; // A missing term rules the phrase out

      const postings = this.termFrequencies.get(termIndex);
      if (!postings) return [];

      const docsWithTerm = new Set(postings.keys());
      if (candidateDocs === null) {
        candidateDocs = docsWithTerm;
      } else {
        candidateDocs = new Set(
          [...candidateDocs].filter((docIdx: number) => docsWithTerm.has(docIdx))
        );
      }
      if (candidateDocs.size === 0) return [];
    }
    if (candidateDocs === null || candidateDocs.size === 0) return [];

    // --- Verify the tokens are adjacent and score ---
    const scores = new Map<number, number>(); // DocIndex -> score

    candidateDocs.forEach((docIndex) => {
      const doc = this.getDocument(docIndex);

      for (const [field, content] of Object.entries(doc)) {
        if (typeof content !== 'string') continue;

        const docTokens = this.tokenizeField(content);
        let found = false;
        // Sliding-window comparison against the phrase tokens.
        for (let i = 0; i <= docTokens.length - phraseTokens.length && !found; i++) {
          found = phraseTokens.every((token, j) => docTokens[i + j] === token);
        }
        if (found) {
          const fieldBoost = this.fieldBoosts[field] || 1;
          scores.set(docIndex, this.calculatePhraseScore(phraseTokens, docIndex) * fieldBoost);
          break; // Score each document once, using the first matching field
        }
      }
    });

    return Array.from(scores.entries())
      .map(([index, score]) => ({ index, score }))
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);
  }

  /**
   * Sums the individual BM25 scores of the phrase terms for one document.
   * @param phraseTokens - The tokenized phrase.
   * @param docIndex - The index of the document to score against.
   * @returns The calculated phrase score for the document.
   * @internal
   */
  private calculatePhraseScore(phraseTokens: string[], docIndex: number): number {
    const docLength = this.documentLengths[docIndex];
    return phraseTokens.reduce((currentScore, term) => {
      const termIndex = this.termToIndex.get(term);
      if (termIndex === undefined) return currentScore;

      const idf = this.calculateIdf(termIndex);
      const tf = this.termFrequencies.get(termIndex)?.get(docIndex) || 0;
      return currentScore + this.scoreTerm(idf, tf, docLength);
    }, 0);
  }

  /**
   * Adds a single new document to the index.
   * The document is tokenized completely before any index structure is
   * modified, so if tokenization throws the index is left exactly as it was.
   *
   * @param doc - The document object (with string fields) to add.
   * @throws {Error} If the document is null or undefined (surfaces as a rejected promise).
   */
  async addDocument(doc: any): Promise<void> {
    if (!doc) throw new Error('Document cannot be null');

    // Phase 1: analyze (may throw, mutates nothing).
    const { length, termCounts } = this.analyzeDocument(doc);

    // Phase 2: commit.
    const docIndex = this.documentLengths.length;
    this.documents.push(doc);

    const newDocLengths = new Uint32Array(docIndex + 1);
    newDocLengths.set(this.documentLengths, 0);
    newDocLengths[docIndex] = length;
    this.documentLengths = newDocLengths;

    termCounts.forEach((freq, term) => {
      let termIndexVal = this.termToIndex.get(term);
      if (termIndexVal === undefined) {
        termIndexVal = this.termToIndex.size;
        this.termToIndex.set(term, termIndexVal);

        // Grow the DF array geometrically so repeated additions stay cheap.
        if (this.documentFrequency.length <= termIndexVal) {
          const oldDf = this.documentFrequency;
          const newSize = Math.max(termIndexVal + 1, oldDf.length * 2 || 1);
          this.documentFrequency = new Uint32Array(newSize);
          this.documentFrequency.set(oldDf, 0);
        }
        this.documentFrequency[termIndexVal] = 0;
      }

      let postings = this.termFrequencies.get(termIndexVal);
      if (!postings) {
        postings = new Map<number, number>();
        this.termFrequencies.set(termIndexVal, postings);
      }
      postings.set(docIndex, freq);
      this.documentFrequency[termIndexVal]++;
    });

    this.recalculateAverageLength();
  }

  /**
   * Calculates the Inverse Document Frequency (IDF) for a given term index.
   * Uses the BM25 IDF formula: log(1 + (N - n + 0.5) / (n + 0.5))
   * where N is the total number of documents and n is the number of documents
   * containing the term. Adding 1 inside the logarithm keeps the result non-negative.
   *
   * @param termIndex - The integer index of the term.
   * @returns The IDF score; 0 if the index is out of range, the term has no
   *   documents, or its document frequency exceeds the document count.
   */
  calculateIdf(termIndex: number): number {
    if (termIndex < 0 || termIndex >= this.documentFrequency.length) {
      return 0;
    }

    const docFreq = this.documentFrequency[termIndex]; // n
    const N = this.documentLengths.length; // total number of documents
    if (docFreq <= 0 || docFreq > N) {
      return 0;
    }

    const numerator = N - docFreq + 0.5;
    const denominator = docFreq + 0.5;
    return Math.log(1 + numerator / denominator);
  }

  /**
   * Retrieves the (boost-weighted) term frequency for a term in a document.
   * @param termIndex - The integer index of the term.
   * @param docIndex - The index of the document.
   * @returns The term frequency, or 0 if there is no entry for that pair.
   */
  getTermFrequency(termIndex: number, docIndex: number): number {
    return this.termFrequencies.get(termIndex)?.get(docIndex) || 0;
  }

  /**
   * Retrieves the original document object stored at a given index.
   * @param index - The index of the document to retrieve.
   * @returns The document object.
   * @throws {Error} If the index is out of bounds.
   */
  getDocument(index: number): any {
    // Consider using a generic <T>
    if (index < 0 || index >= this.documents.length) {
      throw new Error(`Document index ${index} out of bounds (0-${this.documents.length - 1})`);
    }
    return this.documents[index];
  }

  /**
   * Clears all indexed documents and resets the index structures to empty.
   */
  clearDocuments(): void {
    this.documents = [];
    this.documentLengths = new Uint32Array(0);
    this.termToIndex.clear();
    this.documentFrequency = new Uint32Array(0);
    this.averageDocLength = 0;
    this.termFrequencies.clear();
  }

  /**
   * Gets the total number of documents currently indexed.
   * @returns The document count.
   */
  getDocumentCount(): number {
    return this.documents.length;
  }

  /**
   * Adds multiple documents by calling `addDocument` for each, in array order.
   * `addDocument` contains no `await`, so each call finishes updating the
   * index before the next one starts; `Promise.all` only aggregates the
   * resulting promises. If a document is rejected the returned promise
   * rejects, but the other documents in the array are still added.
   * @param docs - An array of documents to add.
   */
  async addDocuments(docs: any[]): Promise<void[]> {
    return Promise.all(docs.map((doc) => this.addDocument(doc)));
  }
}