webpeel 0.15.2 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js +5 -0
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +43 -11
- package/dist/cli.js.map +1 -1
- package/dist/core/crawler.d.ts +2 -0
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +12 -3
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/pipeline.d.ts +1 -0
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +63 -2
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/quick-answer.d.ts +26 -0
- package/dist/core/quick-answer.d.ts.map +1 -1
- package/dist/core/quick-answer.js +451 -84
- package/dist/core/quick-answer.js.map +1 -1
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stemmer.d.ts +39 -0
- package/dist/core/stemmer.d.ts.map +1 -0
- package/dist/core/stemmer.js +510 -0
- package/dist/core/stemmer.js.map +1 -0
- package/dist/core/synonyms.d.ts +43 -0
- package/dist/core/synonyms.d.ts.map +1 -0
- package/dist/core/synonyms.js +185 -0
- package/dist/core/synonyms.js.map +1 -0
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts +1 -0
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +76 -10
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/auth.d.ts +2 -1
- package/dist/server/middleware/auth.d.ts.map +1 -1
- package/dist/server/middleware/auth.js +25 -12
- package/dist/server/middleware/auth.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +1 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +20 -11
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.d.ts.map +1 -1
- package/dist/server/routes/batch.js +126 -1
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/fetch.d.ts +1 -0
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +193 -55
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/jobs.d.ts.map +1 -1
- package/dist/server/routes/jobs.js +115 -2
- package/dist/server/routes/jobs.js.map +1 -1
- package/dist/server/routes/mcp.d.ts +1 -0
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +113 -6
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/server/types.d.ts +16 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +8 -0
- package/dist/server/types.js.map +1 -0
- package/dist/server/utils/response.d.ts +45 -0
- package/dist/server/utils/response.d.ts.map +1 -0
- package/dist/server/utils/response.js +70 -0
- package/dist/server/utils/response.js.map +1 -0
- package/dist/server/utils/sse.d.ts +23 -0
- package/dist/server/utils/sse.d.ts.map +1 -0
- package/dist/server/utils/sse.js +39 -0
- package/dist/server/utils/sse.js.map +1 -0
- package/dist/types.d.ts +2 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Porter Stemmer — Lightweight implementation of the Porter stemming algorithm.
|
|
3
|
+
*
|
|
4
|
+
* Based on: Martin Porter, "An algorithm for suffix stripping", 1980.
|
|
5
|
+
* Reference: https://tartarus.org/martin/PorterStemmer/
|
|
6
|
+
*
|
|
7
|
+
* This is a well-tested, deterministic implementation with no external dependencies.
|
|
8
|
+
* It correctly handles all standard Porter stemmer rules including steps 1a-5b.
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Irregular verb forms → base form.
|
|
12
|
+
* Porter stemmer only handles regular morphology (-ed, -ing, -s).
|
|
13
|
+
* English has ~200 irregular verbs; we cover the most common ones.
|
|
14
|
+
* This table normalizes irregular forms before stemming so that
|
|
15
|
+
* "built" → "build" → stem("build") = "build" matches stem("build").
|
|
16
|
+
*
|
|
17
|
+
* Ambiguous words are intentionally excluded:
|
|
18
|
+
* "found" — could be find (past) OR establish (base form "found a company")
|
|
19
|
+
* "left" — could be leave (past) OR direction
|
|
20
|
+
* "bore"/"borne"/"born" — could be bear (past) OR bore=boring OR born=birth
|
|
21
|
+
* "bound" — could be bind (past) OR boundary (noun)
|
|
22
|
+
*/
|
|
23
|
+
export declare const IRREGULAR_FORMS: Record<string, string>;
|
|
24
|
+
/**
|
|
25
|
+
* Stem a single word using the Porter stemming algorithm.
|
|
26
|
+
*
|
|
27
|
+
* Returns the stemmed word (lowercase). Input is also lowercased.
|
|
28
|
+
* Words shorter than 3 characters are lowercased but not stemmed.
|
|
29
|
+
*
|
|
30
|
+
* Irregular verb forms (e.g. "built", "ran", "spoke") are first normalized
|
|
31
|
+
* to their base form before Porter steps are applied, ensuring that
|
|
32
|
+
* stem("built") === stem("build"), stem("spoke") === stem("speak"), etc.
|
|
33
|
+
*/
|
|
34
|
+
export declare function stem(word: string): string;
|
|
35
|
+
/**
|
|
36
|
+
* Stem an array of tokens.
|
|
37
|
+
*/
|
|
38
|
+
export declare function stemTokens(tokens: string[]): string[];
|
|
39
|
+
//# sourceMappingURL=stemmer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stemmer.d.ts","sourceRoot":"","sources":["../../src/core/stemmer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAmTH;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAyKlD,CAAC;AAMF;;;;;;;;;GASG;AACH,wBAAgB,IAAI,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAqBzC;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAErD"}
|
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Porter Stemmer — Lightweight implementation of the Porter stemming algorithm.
|
|
3
|
+
*
|
|
4
|
+
* Based on: Martin Porter, "An algorithm for suffix stripping", 1980.
|
|
5
|
+
* Reference: https://tartarus.org/martin/PorterStemmer/
|
|
6
|
+
*
|
|
7
|
+
* This is a well-tested, deterministic implementation with no external dependencies.
|
|
8
|
+
* It correctly handles all standard Porter stemmer rules including steps 1a-5b.
|
|
9
|
+
*/
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Vowel / consonant helpers
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/**
 * Decide whether the character at index `i` of `word` counts as a vowel
 * under Porter's definition.
 *
 * - a, e, i, o, u are always vowels.
 * - y is a vowel only when it is not the first character and the character
 *   before it is a consonant (evaluated recursively, so runs of y alternate).
 * - Anything else, including an out-of-range index, is not a vowel.
 */
function isVowelAt(word, i) {
    const ch = word[i];
    switch (ch) {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
            return true;
        case 'y':
            // A leading y is a consonant; otherwise y takes the opposite
            // class of the character that precedes it.
            return i > 0 ? !isVowelAt(word, i - 1) : false;
        default:
            return false;
    }
}
|
|
25
|
+
/**
 * Compute Porter's "measure" m for a candidate stem.
 *
 * Viewing the stem as [C](VC)^m[V], m is the number of times a run of
 * vowels is followed by a consonant. Each vowel → consonant boundary found
 * while scanning left to right adds one to the count.
 */
function getMeasure(stem) {
    let count = 0;
    let prevWasVowel = false;
    for (let pos = 0; pos < stem.length; pos++) {
        const vowelHere = isVowelAt(stem, pos);
        // A consonant directly after a vowel closes one VC pair.
        if (prevWasVowel && !vowelHere) {
            count += 1;
        }
        prevWasVowel = vowelHere;
    }
    return count;
}
|
|
45
|
+
/**
 * Porter's "*v*" condition: true when at least one position in the stem is
 * a vowel according to isVowelAt, false otherwise (including for an empty stem).
 */
function containsVowel(stem) {
    let pos = 0;
    while (pos < stem.length) {
        if (isVowelAt(stem, pos)) {
            return true;
        }
        pos += 1;
    }
    return false;
}
|
|
53
|
+
/**
 * Porter's "*d" condition: true when the last two characters of the stem are
 * the same character and that final character is a consonant.
 */
function endsDoubleConsonant(stem) {
    const lastIdx = stem.length - 1;
    // Fewer than two characters cannot form a doubled ending.
    if (lastIdx < 1) {
        return false;
    }
    if (stem[lastIdx] !== stem[lastIdx - 1]) {
        return false;
    }
    return !isVowelAt(stem, lastIdx);
}
|
|
60
|
+
/**
 * Porter's "*o" condition: the stem ends consonant-vowel-consonant and the
 * final consonant is not w, x, or y.
 *
 * Stems shorter than three characters never satisfy the condition.
 */
function endsCVC(stem) {
    const len = stem.length;
    if (len < 3) {
        return false;
    }
    // Check the C-V-C shape from the end of the stem backwards.
    if (isVowelAt(stem, len - 1)) {
        return false;
    }
    if (!isVowelAt(stem, len - 2)) {
        return false;
    }
    if (isVowelAt(stem, len - 3)) {
        return false;
    }
    const finalChar = stem[len - 1];
    return finalChar !== 'w' && finalChar !== 'x' && finalChar !== 'y';
}
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
// Step 1a — Plurals
|
|
76
|
+
// ---------------------------------------------------------------------------
|
|
77
|
+
/**
 * Step 1a — strip plural endings.
 *
 *   SSES → SS   (caresses → caress)
 *   IES  → I    (ponies   → poni)
 *   SS   → SS   (caress   → caress)
 *   S    →      (cats     → cat)
 *
 * Words that do not end in "s" are returned unchanged.
 */
function step1a(word) {
    // SSES → SS and IES → I both amount to dropping the last two characters.
    if (word.endsWith('sses') || word.endsWith('ies')) {
        return word.slice(0, -2);
    }
    // A trailing "ss" is kept; any other trailing "s" on a word longer than
    // one character is removed.
    const dropsFinalS = word.length > 1 && word.endsWith('s') && !word.endsWith('ss');
    return dropsFinalS ? word.slice(0, -1) : word;
}
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
// Step 1b — Past tenses / gerunds
|
|
94
|
+
// ---------------------------------------------------------------------------
|
|
95
|
+
/**
 * Clean-up applied by step 1b after an "ed" or "ing" suffix has been removed.
 *
 * Rules are tried in order and the first that applies wins:
 *   AT → ATE                          (conflat → conflate)
 *   BL → BLE                          (troubl  → trouble)
 *   IZ → IZE                          (siz     → size)
 *   doubled consonant other than
 *   l, s, z → single consonant        (hopp    → hop)
 *   measure 1 and ends CVC (*o) → + E (fil     → file)
 *
 * If no rule applies the word is returned unchanged.
 */
function step1bFixup(word) {
    const restoresE = word.endsWith('at') || word.endsWith('bl') || word.endsWith('iz');
    if (restoresE) {
        return word + 'e';
    }
    // Undouble a final consonant unless it is l, s or z (fall, hiss, fizz keep theirs).
    const finalChar = word[word.length - 1];
    if (endsDoubleConsonant(word) &&
        finalChar !== 'l' &&
        finalChar !== 's' &&
        finalChar !== 'z') {
        return word.slice(0, -1);
    }
    // Short CVC stems get their silent "e" back.
    const isShortCvc = getMeasure(word) === 1 && endsCVC(word);
    return isShortCvc ? word + 'e' : word;
}
|
|
119
|
+
/**
 * Step 1b — past tenses and gerunds.
 *
 *   (m>0) EED → EE    agreed → agree; feed stays feed (the stem "f" has m = 0)
 *   (*v*) ED  →       removed when the remaining stem contains a vowel
 *   (*v*) ING →       removed when the remaining stem contains a vowel
 *
 * After a successful ED/ING removal the result is passed through
 * step1bFixup. A word ending in "eed" is never considered for the "ed" rule.
 */
function step1b(word) {
    if (word.endsWith('eed')) {
        const beforeEed = word.slice(0, -3);
        return getMeasure(beforeEed) > 0 ? word.slice(0, -1) : word;
    }
    for (const suffix of ['ed', 'ing']) {
        if (word.endsWith(suffix)) {
            const base = word.slice(0, -suffix.length);
            // Only strip when something pronounceable is left behind.
            return containsVowel(base) ? step1bFixup(base) : word;
        }
    }
    return word;
}
|
|
146
|
+
// ---------------------------------------------------------------------------
|
|
147
|
+
// Step 1c — y → i
|
|
148
|
+
// ---------------------------------------------------------------------------
|
|
149
|
+
/**
 * Step 1c — (*v*) Y → I.
 *
 * A final "y" becomes "i" when the word is longer than two characters and
 * the part before the "y" contains a vowel (happy → happi). Otherwise the
 * word is returned unchanged.
 */
function step1c(word) {
    if (word.length <= 2 || !word.endsWith('y')) {
        return word;
    }
    const head = word.slice(0, -1);
    return containsVowel(head) ? head + 'i' : word;
}
|
|
158
|
+
// ---------------------------------------------------------------------------
|
|
159
|
+
// Step 2 — Suffix removal (m > 0)
|
|
160
|
+
// ---------------------------------------------------------------------------
|
|
161
|
+
/**
 * Step 2 rewrite rules as [suffix, replacement] pairs.
 *
 * The list is scanned top to bottom and only the first suffix that matches
 * the end of the word is considered, so the order of entries is significant
 * (e.g. "ational" is listed before "tional", "ization" before "ation").
 */
const STEP2_RULES = [
    ['ational', 'ate'],
    ['tional', 'tion'],
    ['enci', 'ence'],
    ['anci', 'ance'],
    ['izer', 'ize'],
    ['abli', 'able'],
    ['alli', 'al'],
    ['entli', 'ent'],
    ['eli', 'e'],
    ['ousli', 'ous'],
    ['ization', 'ize'],
    ['ation', 'ate'],
    ['ator', 'ate'],
    ['alism', 'al'],
    ['iveness', 'ive'],
    ['fulness', 'ful'],
    ['ousness', 'ous'],
    ['aliti', 'al'],
    ['iviti', 'ive'],
    ['biliti', 'ble'],
];
/**
 * Step 2 — map double suffixes to single ones.
 *
 * Finds the first rule whose suffix ends the word. The suffix is replaced
 * only if the remaining stem has measure > 0; otherwise, or if no rule
 * matches, the word is returned unchanged.
 */
function step2(word) {
    const rule = STEP2_RULES.find(([suffix]) => word.endsWith(suffix));
    if (rule === undefined) {
        return word;
    }
    const [suffix, replacement] = rule;
    const base = word.slice(0, word.length - suffix.length);
    return getMeasure(base) > 0 ? base + replacement : word;
}
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
// Step 3 — Suffix removal (m > 0)
|
|
197
|
+
// ---------------------------------------------------------------------------
|
|
198
|
+
/**
 * Step 3 rewrite rules as [suffix, replacement] pairs. An empty replacement
 * means the suffix is simply removed. The first matching entry wins.
 */
const STEP3_RULES = [
    ['icate', 'ic'],
    ['ative', ''],
    ['alize', 'al'],
    ['iciti', 'ic'],
    ['ical', 'ic'],
    ['ful', ''],
    ['ness', ''],
];
/**
 * Step 3 — simplify or drop suffixes such as -icate, -ful and -ness.
 *
 * Finds the first rule whose suffix ends the word. The rule is applied only
 * if the remaining stem has measure > 0; otherwise, or if no rule matches,
 * the word is returned unchanged.
 */
function step3(word) {
    const rule = STEP3_RULES.find(([suffix]) => word.endsWith(suffix));
    if (rule === undefined) {
        return word;
    }
    const [suffix, replacement] = rule;
    const base = word.slice(0, word.length - suffix.length);
    return getMeasure(base) > 0 ? base + replacement : word;
}
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
// Step 4 — Suffix removal (m > 1)
|
|
221
|
+
// ---------------------------------------------------------------------------
|
|
222
|
+
/**
 * Step 4 removal rules as [suffix, replacement] pairs; every replacement is
 * the empty string. The first matching entry wins, so longer suffixes appear
 * before shorter ones they contain ("ement" → "ment" → "ent").
 */
const STEP4_RULES = [
    ['ement', ''],
    ['ment', ''],
    ['ance', ''],
    ['ence', ''],
    ['able', ''],
    ['ible', ''],
    ['ism', ''],
    ['ate', ''],
    ['iti', ''],
    ['ous', ''],
    ['ive', ''],
    ['ize', ''],
    ['ant', ''],
    ['ent', ''],
    ['al', ''],
    ['er', ''],
    ['ic', ''],
    ['ou', ''],
];
/**
 * Step 4 — strip residual suffixes when the remaining stem has measure > 1.
 *
 * "ion" is handled first as a special case: it is removed only when the
 * remaining stem has measure > 1 and ends in "s" or "t". A word ending in
 * "ion" never falls through to the table. For all other words the first
 * matching table entry is applied under the measure > 1 condition; if the
 * condition fails or nothing matches, the word is returned unchanged.
 */
function step4(word) {
    if (word.endsWith('ion')) {
        const beforeIon = word.slice(0, -3);
        const endsInSOrT = beforeIon.endsWith('s') || beforeIon.endsWith('t');
        return getMeasure(beforeIon) > 1 && endsInSOrT ? beforeIon : word;
    }
    const rule = STEP4_RULES.find(([suffix]) => word.endsWith(suffix));
    if (rule === undefined) {
        return word;
    }
    const [suffix, replacement] = rule;
    const base = word.slice(0, word.length - suffix.length);
    return getMeasure(base) > 1 ? base + replacement : word;
}
|
|
262
|
+
// ---------------------------------------------------------------------------
|
|
263
|
+
// Step 5a — Final E removal
|
|
264
|
+
// ---------------------------------------------------------------------------
|
|
265
|
+
/**
 * Step 5a — remove a final "e".
 *
 * The "e" is dropped when the remaining stem has measure > 1, or when it has
 * measure exactly 1 and does not end consonant-vowel-consonant (the *o
 * condition). In every other case the word is returned unchanged.
 */
function step5a(word) {
    if (!word.endsWith('e')) {
        return word;
    }
    const base = word.slice(0, -1);
    const measure = getMeasure(base);
    const canDropE = measure > 1 || (measure === 1 && !endsCVC(base));
    return canDropE ? base : word;
}
|
|
276
|
+
// ---------------------------------------------------------------------------
|
|
277
|
+
// Step 5b — Double L removal
|
|
278
|
+
// ---------------------------------------------------------------------------
|
|
279
|
+
/**
 * Step 5b — reduce a final "ll" to "l" when the whole word has measure > 1.
 * Any other word is returned unchanged.
 */
function step5b(word) {
    const endsInDoubleL = word.endsWith('ll');
    return endsInDoubleL && getMeasure(word) > 1 ? word.slice(0, -1) : word;
}
|
|
285
|
+
// ---------------------------------------------------------------------------
|
|
286
|
+
// Irregular verb forms table
|
|
287
|
+
// ---------------------------------------------------------------------------
|
|
288
|
+
/**
 * Lookup table from irregular verb form (past tense / past participle) to
 * its base form.
 *
 * The Porter steps only cover regular morphology (-ed, -ing, -s), so these
 * forms are mapped to their base before stemming; for example "built" is
 * replaced by "build" so both end up with the same stem. Only a selection of
 * common irregular verbs is listed.
 *
 * Deliberately left out because the surface form is ambiguous:
 *   "found"               — past of find, or the verb "to found"
 *   "left"                — past of leave, or the direction
 *   "bore"/"borne"/"born" — past of bear, or boring / birth
 *   "bound"               — past of bind, or boundary
 *
 * NOTE(review): some listed forms are homographs as well (e.g. "saw", "rose",
 * "bit", "spoke", "led") — confirm that mapping them is intended.
 */
export const IRREGULAR_FORMS = {
    built: 'build',
    ran: 'run',
    made: 'make',
    wrote: 'write',
    written: 'write',
    began: 'begin',
    begun: 'begin',
    gave: 'give',
    given: 'give',
    took: 'take',
    taken: 'take',
    went: 'go',
    gone: 'go',
    came: 'come',
    saw: 'see',
    seen: 'see',
    knew: 'know',
    known: 'know',
    thought: 'think',
    told: 'tell',
    said: 'say',
    got: 'get',
    gotten: 'get',
    bought: 'buy',
    brought: 'bring',
    sent: 'send',
    spent: 'spend',
    kept: 'keep',
    held: 'hold',
    stood: 'stand',
    lost: 'lose',
    paid: 'pay',
    met: 'meet',
    led: 'lead',
    grew: 'grow',
    grown: 'grow',
    drew: 'draw',
    drawn: 'draw',
    broke: 'break',
    broken: 'break',
    spoke: 'speak',
    spoken: 'speak',
    chose: 'choose',
    chosen: 'choose',
    fell: 'fall',
    fallen: 'fall',
    drove: 'drive',
    driven: 'drive',
    rose: 'rise',
    risen: 'rise',
    flew: 'fly',
    flown: 'fly',
    threw: 'throw',
    thrown: 'throw',
    wore: 'wear',
    worn: 'wear',
    hid: 'hide',
    hidden: 'hide',
    sat: 'sit',
    swam: 'swim',
    swum: 'swim',
    sang: 'sing',
    sung: 'sing',
    rang: 'ring',
    rung: 'ring',
    drank: 'drink',
    drunk: 'drink',
    woke: 'wake',
    woken: 'wake',
    froze: 'freeze',
    frozen: 'freeze',
    stole: 'steal',
    stolen: 'steal',
    tore: 'tear',
    torn: 'tear',
    shook: 'shake',
    shaken: 'shake',
    forgave: 'forgive',
    forgiven: 'forgive',
    forgot: 'forget',
    forgotten: 'forget',
    bit: 'bite',
    bitten: 'bite',
    blew: 'blow',
    blown: 'blow',
    caught: 'catch',
    taught: 'teach',
    fought: 'fight',
    sought: 'seek',
    sold: 'sell',
    won: 'win',
    fed: 'feed',
    felt: 'feel',
    meant: 'mean',
    lent: 'lend',
    bent: 'bend',
    dug: 'dig',
    stuck: 'stick',
    struck: 'strike',
    stricken: 'strike',
    swore: 'swear',
    sworn: 'swear',
    spun: 'spin',
    hung: 'hang',
    slid: 'slide',
    shone: 'shine',
    shot: 'shoot',
    slept: 'sleep',
    swept: 'sweep',
    crept: 'creep',
    wept: 'weep',
    dealt: 'deal',
    // British-style -t past forms
    dreamt: 'dream',
    learnt: 'learn',
    burnt: 'burn',
    leant: 'lean',
    leapt: 'leap',
    spelt: 'spell',
    spilt: 'spill',
};
|
|
471
|
+
// ---------------------------------------------------------------------------
|
|
472
|
+
// Main stem function
|
|
473
|
+
// ---------------------------------------------------------------------------
|
|
474
|
+
/**
 * Stem a single word using the Porter stemming algorithm.
 *
 * - Empty input (any falsy value) is returned unchanged.
 * - The word is lowercased first; the result is always lowercase.
 * - Words of one or two characters are lowercased but not stemmed.
 * - Irregular verb forms listed in IRREGULAR_FORMS (e.g. "built", "ran",
 *   "spoke") are replaced by their base form before the Porter steps run,
 *   so that stem("built") === stem("build").
 * - Steps 1a, 1b, 1c, 2, 3, 4, 5a and 5b are then applied in that order.
 *
 * @param word The token to stem.
 * @returns The stemmed, lowercased token.
 */
export function stem(word) {
    if (!word) {
        return word;
    }
    const lower = word.toLowerCase();
    // Short words: don't stem
    if (lower.length <= 2) {
        return lower;
    }
    // Normalize irregular verb forms to base before stemming.
    // Only own keys of the table count: a plain `IRREGULAR_FORMS[lower]`
    // lookup also resolves names inherited from Object.prototype, so a token
    // such as "constructor" would yield a function instead of a string and
    // the first Porter step would throw on `.endsWith`.
    const isIrregular = Object.prototype.hasOwnProperty.call(IRREGULAR_FORMS, lower);
    let w = isIrregular ? IRREGULAR_FORMS[lower] : lower;
    w = step1a(w);
    w = step1b(w);
    w = step1c(w);
    w = step2(w);
    w = step3(w);
    w = step4(w);
    w = step5a(w);
    w = step5b(w);
    return w;
}
|
|
504
|
+
/**
 * Stem every token in an array.
 *
 * Returns a new array of the same length in which each element is the
 * result of calling stem on the corresponding input token.
 */
export function stemTokens(tokens) {
    return tokens.map((token) => stem(token));
}
|
|
510
|
+
//# sourceMappingURL=stemmer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stemmer.js","sourceRoot":"","sources":["../../src/core/stemmer.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E;;;GAGG;AACH,SAAS,SAAS,CAAC,IAAY,EAAE,CAAS;IACxC,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,IAAI,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IACrC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IAC/D,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,SAAS,UAAU,CAAC,IAAY;IAC9B,IAAI,CAAC,GAAG,CAAC,CAAC;IACV,IAAI,OAAO,GAAG,KAAK,CAAC;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,CAAC,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC7B,IAAI,OAAO,IAAI,CAAC,CAAC,EAAE,CAAC;YAClB,CAAC,EAAE,CAAC;YACJ,OAAO,GAAG,KAAK,CAAC;QAClB,CAAC;aAAM,IAAI,CAAC,OAAO,IAAI,CAAC,EAAE,CAAC;YACzB,OAAO,GAAG,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED,4DAA4D;AAC5D,SAAS,aAAa,CAAC,IAAY;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC;YAAE,OAAO,IAAI,CAAC;IACtC,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,kFAAkF;AAClF,SAAS,mBAAmB,CAAC,IAAY;IACvC,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;IACtB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACxB,OAAO,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;AAChE,CAAC;AAED;;;GAGG;AACH,SAAS,OAAO,CAAC,IAAY;IAC3B,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;IACtB,IAAI,CAAC,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IACxB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvB,OAAO,CACL,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC;QACvB,SAAS,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC;QACvB,EAAE,KAAK,GAAG,IAAI,EAAE,KAAK,GAAG,IAAI,EAAE,KAAK,GAAG,CACvC,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,oBAAoB;AACpB,8EAA8E;AAE9E,SAAS,MAAM,CAAC,IAAY;IAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QAC1B,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC
,CAAC,CAAC,CAAC,CAAC,oBAAoB;IAChD,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,gBAAgB;IAC5C,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,OAAO,IAAI,CAAC,CAAC,8BAA8B;IAC7C,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1C,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa;IACzC,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,SAAS,WAAW,CAAC,IAAY;IAC/B,WAAW;IACX,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,GAAG,GAAG,CAAC,CAAC,uBAAuB;IACnE,WAAW;IACX,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,GAAG,GAAG,CAAC,CAAC,qBAAqB;IACjE,WAAW;IACX,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,GAAG,GAAG,CAAC,CAAC,eAAe;IAE3D,8CAA8C;IAC9C,IACE,mBAAmB,CAAC,IAAI,CAAC;QACzB,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;QACpB,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC;QACpB,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EACpB,CAAC;QACD,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,+BAA+B;IAC3D,CAAC;IAED,2BAA2B;IAC3B,IAAI,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5C,OAAO,IAAI,GAAG,GAAG,CAAC,CAAC,6CAA6C;QAChE,+EAA+E;IACjF,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,MAAM,CAAC,IAAY;IAC1B,iBAAiB;IACjB,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACzB,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,6BAA6B;QACzD,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,4BAA4B;IAC5B,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,6BAA6B;IAC7B,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;
YACxB,OAAO,WAAW,CAAC,IAAI,CAAC,CAAC;QAC3B,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,kBAAkB;AAClB,8EAA8E;AAE9E,SAAS,MAAM,CAAC,IAAY;IAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,OAAO,IAAI,GAAG,GAAG,CAAC,CAAC,gBAAgB;QACrC,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,MAAM,WAAW,GAA4B;IAC3C,CAAC,SAAS,EAAE,KAAK,CAAC;IAClB,CAAC,QAAQ,EAAE,MAAM,CAAC;IAClB,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,MAAM,EAAE,KAAK,CAAC;IACf,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,MAAM,EAAE,IAAI,CAAC;IACd,CAAC,OAAO,EAAE,KAAK,CAAC;IAChB,CAAC,KAAK,EAAE,GAAG,CAAC;IACZ,CAAC,OAAO,EAAE,KAAK,CAAC;IAChB,CAAC,SAAS,EAAE,KAAK,CAAC;IAClB,CAAC,OAAO,EAAE,KAAK,CAAC;IAChB,CAAC,MAAM,EAAE,KAAK,CAAC;IACf,CAAC,OAAO,EAAE,IAAI,CAAC;IACf,CAAC,SAAS,EAAE,KAAK,CAAC;IAClB,CAAC,SAAS,EAAE,KAAK,CAAC;IAClB,CAAC,SAAS,EAAE,KAAK,CAAC;IAClB,CAAC,OAAO,EAAE,IAAI,CAAC;IACf,CAAC,OAAO,EAAE,KAAK,CAAC;IAChB,CAAC,QAAQ,EAAE,KAAK,CAAC;CAClB,CAAC;AAEF,SAAS,KAAK,CAAC,IAAY;IACzB,KAAK,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,IAAI,WAAW,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;YAC3C,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,OAAO,IAAI,GAAG,WAAW,CAAC;YAC5B,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,MAAM,WAAW,GAA4B;IAC3C,CAAC,OAAO,EAAE,IAAI,CAAC;IACf,CAAC,OAAO,EAAE,EAAE,CAAC;IACb,CAAC,OAAO,EAAE,IAAI,CAAC;IACf,CAAC,OAAO,EAAE,IAAI,CAAC;IACf,CAAC,MAAM,EAAE,IAAI,CAAC;IACd,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,MAAM,EAAE,EAAE,CAAC;CACb,CAAC;AAEF,SAAS,KAAK,CAAC,IAAY;IACzB,KAAK,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,IAAI,WAAW,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,M
AAM,CAAC,MAAM,CAAC,CAAC;YAC3C,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,OAAO,IAAI,GAAG,WAAW,CAAC;YAC5B,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,MAAM,WAAW,GAA4B;IAC3C,CAAC,OAAO,EAAE,EAAE,CAAC;IACb,CAAC,MAAM,EAAE,EAAE,CAAC;IACZ,CAAC,MAAM,EAAE,EAAE,CAAC;IACZ,CAAC,MAAM,EAAE,EAAE,CAAC;IACZ,CAAC,MAAM,EAAE,EAAE,CAAC;IACZ,CAAC,MAAM,EAAE,EAAE,CAAC;IACZ,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,KAAK,EAAE,EAAE,CAAC;IACX,CAAC,IAAI,EAAE,EAAE,CAAC;IACV,CAAC,IAAI,EAAE,EAAE,CAAC;IACV,CAAC,IAAI,EAAE,EAAE,CAAC;IACV,CAAC,IAAI,EAAE,EAAE,CAAC;CACX,CAAC;AAEF,SAAS,KAAK,CAAC,IAAY;IACzB,8CAA8C;IAC9C,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;YACvE,OAAO,IAAI,CAAC;QACd,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC;IAED,KAAK,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,IAAI,WAAW,EAAE,CAAC;QAChD,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;YAC3C,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACzB,OAAO,IAAI,GAAG,WAAW,CAAC;YAC5B,CAAC;YACD,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,SAAS,MAAM,CAAC,IAAY;IAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;QAC3B,IAAI,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QACvB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;YAAE,OAAO,IAAI,CAAC;IAC7C,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,6BAA6B;AAC7B,8EAA8E;AAE9E,SAAS,MAAM,CAAC,IAAY;IAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,IAA
I,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;QAChD,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;IAC3B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,8EAA8E;AAC9E,6BAA6B;AAC7B,8EAA8E;AAE9E;;;;;;;;;;;;GAYG;AACH,MAAM,CAAC,MAAM,eAAe,GAA2B;IACrD,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,MAAM;IACN,KAAK,EAAE,KAAK;IACZ,OAAO;IACP,MAAM,EAAE,MAAM;IACd,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO;IACpC,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO;IAClC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,KAAK;IACL,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI;IAC1B,OAAO;IACP,MAAM,EAAE,MAAM;IACd,MAAM;IACN,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK;IAC3B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,QAAQ;IACR,SAAS,EAAE,OAAO;IAClB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,MAAM;IACN,MAAM,EAAE,KAAK;IACb,MAAM;IACN,KAAK,EAAE,KAAK,EAAE,QAAQ,EAAE,KAAK;IAC7B,MAAM;IACN,QAAQ,EAAE,KAAK;IACf,QAAQ;IACR,SAAS,EAAE,OAAO;IAClB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,OAAO;IACP,MAAM,EAAE,MAAM;IACd,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,MAAM;IACN,MAAM,EAAE,KAAK;IACb,OAAO;IACP,KAAK,EAAE,MAAM;IACb,OAAO;IACP,KAAK,EAAE,MAAM;IACb,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,SAAS;IACT,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ;IACrC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM;IAChC,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,MAAM;IACN,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK;IAC7B,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9B,OAAO;IACP,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM;IAC/B,MAAM;IACN,KAAK,EAAE,KAAK;IACZ,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9B,Q
AAQ;IACR,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO;IAClC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,SAAS;IACT,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ;IACrC,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAC9B,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO;IACnC,UAAU;IACV,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,SAAS;IAC3C,SAAS;IACT,QAAQ,EAAE,QAAQ,EAAE,WAAW,EAAE,QAAQ;IACzC,OAAO;IACP,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM;IAC/B,OAAO;IACP,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM;IAC/B,QAAQ;IACR,QAAQ,EAAE,OAAO;IACjB,QAAQ;IACR,QAAQ,EAAE,OAAO;IACjB,QAAQ;IACR,QAAQ,EAAE,OAAO;IACjB,OAAO;IACP,QAAQ,EAAE,MAAM;IAChB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,MAAM;IACN,KAAK,EAAE,KAAK;IACZ,OAAO;IACP,KAAK,EAAE,MAAM;IACb,OAAO;IACP,MAAM,EAAE,MAAM;IACd,OAAO;IACP,OAAO,EAAE,MAAM;IACf,OAAO;IACP,MAAM,EAAE,MAAM;IACd,OAAO;IACP,MAAM,EAAE,MAAM;IACd,MAAM;IACN,KAAK,EAAE,KAAK;IACZ,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,SAAS;IACT,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ;IACxC,QAAQ;IACR,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO;IAClC,OAAO;IACP,MAAM,EAAE,MAAM;IACd,OAAO;IACP,MAAM,EAAE,MAAM;IACd,QAAQ;IACR,MAAM,EAAE,OAAO;IACf,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,QAAQ;IACR,MAAM,EAAE,OAAO;IACf,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,OAAO;IACP,MAAM,EAAE,MAAM;IACd,OAAO;IACP,OAAO,EAAE,MAAM;IACf,4BAA4B;IAC5B,QAAQ,EAAE,OAAO;IACjB,4BAA4B;IAC5B,QAAQ,EAAE,OAAO;IACjB,2BAA2B;IAC3B,OAAO,EAAE,MAAM;IACf,OAAO;IACP,OAAO,EAAE,MAAM;IACf,OAAO;IACP,OAAO,EAAE,MAAM;IACf,QAAQ;IACR,OAAO,EAAE,OAAO;IAChB,QAAQ;IACR,OAAO,EAAE,OAAO;CACjB,CAAC;AAEF,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E;;;;;;;;;GASG;AACH,MAAM,UAAU,IAAI,CAAC,IAAY;IAC/B,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IACvB,MAAM,KAAK,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IAEjC,0BAA0B;IAC1B,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IAEpC,yDAAyD;IACzD,MAAM,UAAU,GAAG,eAAe,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC;IAEnD,IAAI,CAAC,GAAG,UAAU,CAAC;IACnB,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IACd,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IACd,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAA
C;IACd,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACb,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACb,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACb,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IACd,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IAEd,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,MAAgB;IACzC,OAAO,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synonym expansion for query broadening.
|
|
3
|
+
*
|
|
4
|
+
* Provides stemmed synonym groups and a function to expand a set of stemmed
|
|
5
|
+
* query tokens with related synonyms (at a lower weight).
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* const queryTerms = tokenizeQuestion(question); // already stemmed
|
|
9
|
+
* const expanded = expandWithSynonyms(queryTerms);
|
|
10
|
+
* // expanded includes originals (weight=1.0) + synonyms (weight=0.5)
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Raw synonym groups. Each group is a set of words with equivalent or near-
|
|
14
|
+
* equivalent meaning in the context of software/web documentation queries.
|
|
15
|
+
*
|
|
16
|
+
* These are stored in unstemmed form for readability; the build process stems
|
|
17
|
+
* them into STEMMED_SYNONYM_GROUPS and builds an index.
|
|
18
|
+
*/
|
|
19
|
+
export declare const SYNONYM_GROUPS: string[][];
|
|
20
|
+
/**
|
|
21
|
+
* Stemmed synonym groups.
|
|
22
|
+
* Each word in each group has been run through the Porter stemmer.
|
|
23
|
+
* Stems that occur more than once within a group are kept only once.
|
|
24
|
+
*/
|
|
25
|
+
export declare const STEMMED_SYNONYM_GROUPS: string[][];
|
|
26
|
+
export interface ExpandedTerm {
|
|
27
|
+
/** The stemmed term */
|
|
28
|
+
term: string;
|
|
29
|
+
/** 1.0 for original query terms, 0.5 for synonym expansions */
|
|
30
|
+
weight: number;
|
|
31
|
+
/** True if this term came from the original query */
|
|
32
|
+
isOriginal: boolean;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Expand a list of stemmed query tokens with their synonyms.
|
|
36
|
+
*
|
|
37
|
+
* @param terms - Already-stemmed tokens from the query
|
|
38
|
+
* @returns Array of ExpandedTerm objects. Original terms have weight=1.0,
|
|
39
|
+
* synonym expansions have weight=0.5.
|
|
40
|
+
* The returned array lists the original terms first, followed by the synonym expansions.
|
|
41
|
+
*/
|
|
42
|
+
export declare function expandWithSynonyms(terms: string[]): ExpandedTerm[];
|
|
43
|
+
//# sourceMappingURL=synonyms.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synonyms.d.ts","sourceRoot":"","sources":["../../src/core/synonyms.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAQH;;;;;;GAMG;AACH,eAAO,MAAM,cAAc,EAAE,MAAM,EAAE,EA8GpC,CAAC;AAMF;;;;GAIG;AACH,eAAO,MAAM,sBAAsB,EAAE,MAAM,EAAE,EAI3C,CAAC;AAsBH,MAAM,WAAW,YAAY;IAC3B,uBAAuB;IACvB,IAAI,EAAE,MAAM,CAAC;IACb,+DAA+D;IAC/D,MAAM,EAAE,MAAM,CAAC;IACf,qDAAqD;IACrD,UAAU,EAAE,OAAO,CAAC;CACrB;AAED;;;;;;;GAOG;AACH,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,YAAY,EAAE,CA0BlE"}
|