webpeel 0.15.2 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js +5 -0
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +43 -11
- package/dist/cli.js.map +1 -1
- package/dist/core/crawler.d.ts +2 -0
- package/dist/core/crawler.d.ts.map +1 -1
- package/dist/core/crawler.js +12 -3
- package/dist/core/crawler.js.map +1 -1
- package/dist/core/pipeline.d.ts +1 -0
- package/dist/core/pipeline.d.ts.map +1 -1
- package/dist/core/pipeline.js +63 -2
- package/dist/core/pipeline.js.map +1 -1
- package/dist/core/quick-answer.d.ts +26 -0
- package/dist/core/quick-answer.d.ts.map +1 -1
- package/dist/core/quick-answer.js +451 -84
- package/dist/core/quick-answer.js.map +1 -1
- package/dist/core/search-provider.d.ts +47 -4
- package/dist/core/search-provider.d.ts.map +1 -1
- package/dist/core/search-provider.js +278 -7
- package/dist/core/search-provider.js.map +1 -1
- package/dist/core/stemmer.d.ts +39 -0
- package/dist/core/stemmer.d.ts.map +1 -0
- package/dist/core/stemmer.js +510 -0
- package/dist/core/stemmer.js.map +1 -0
- package/dist/core/synonyms.d.ts +43 -0
- package/dist/core/synonyms.d.ts.map +1 -0
- package/dist/core/synonyms.js +185 -0
- package/dist/core/synonyms.js.map +1 -0
- package/dist/mcp/server.js +109 -4
- package/dist/mcp/server.js.map +1 -1
- package/dist/server/app.d.ts +1 -0
- package/dist/server/app.d.ts.map +1 -1
- package/dist/server/app.js +76 -10
- package/dist/server/app.js.map +1 -1
- package/dist/server/middleware/auth.d.ts +2 -1
- package/dist/server/middleware/auth.d.ts.map +1 -1
- package/dist/server/middleware/auth.js +25 -12
- package/dist/server/middleware/auth.js.map +1 -1
- package/dist/server/middleware/rate-limit.d.ts +1 -0
- package/dist/server/middleware/rate-limit.d.ts.map +1 -1
- package/dist/server/middleware/rate-limit.js +20 -11
- package/dist/server/middleware/rate-limit.js.map +1 -1
- package/dist/server/routes/agent.d.ts +4 -0
- package/dist/server/routes/agent.d.ts.map +1 -1
- package/dist/server/routes/agent.js +196 -9
- package/dist/server/routes/agent.js.map +1 -1
- package/dist/server/routes/batch.d.ts.map +1 -1
- package/dist/server/routes/batch.js +126 -1
- package/dist/server/routes/batch.js.map +1 -1
- package/dist/server/routes/fetch.d.ts +1 -0
- package/dist/server/routes/fetch.d.ts.map +1 -1
- package/dist/server/routes/fetch.js +193 -55
- package/dist/server/routes/fetch.js.map +1 -1
- package/dist/server/routes/jobs.d.ts.map +1 -1
- package/dist/server/routes/jobs.js +115 -2
- package/dist/server/routes/jobs.js.map +1 -1
- package/dist/server/routes/mcp.d.ts +1 -0
- package/dist/server/routes/mcp.d.ts.map +1 -1
- package/dist/server/routes/mcp.js +113 -6
- package/dist/server/routes/mcp.js.map +1 -1
- package/dist/server/routes/search.js +1 -1
- package/dist/server/routes/search.js.map +1 -1
- package/dist/server/types.d.ts +16 -0
- package/dist/server/types.d.ts.map +1 -0
- package/dist/server/types.js +8 -0
- package/dist/server/types.js.map +1 -0
- package/dist/server/utils/response.d.ts +45 -0
- package/dist/server/utils/response.d.ts.map +1 -0
- package/dist/server/utils/response.js +70 -0
- package/dist/server/utils/response.js.map +1 -0
- package/dist/server/utils/sse.d.ts +23 -0
- package/dist/server/utils/sse.d.ts.map +1 -0
- package/dist/server/utils/sse.js +39 -0
- package/dist/server/utils/sse.js.map +1 -0
- package/dist/types.d.ts +2 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -4,8 +4,12 @@
|
|
|
4
4
|
* Answers a question about page content without any API key.
|
|
5
5
|
* Uses BM25 relevance scoring + answer-signal boosting to surface
|
|
6
6
|
* the most relevant sentences.
|
|
7
|
+
*
|
|
8
|
+
* v2: Added Porter stemming, synonym expansion, and sliding window scoring.
|
|
7
9
|
*/
|
|
8
10
|
import { scoreBM25 } from './bm25-filter.js';
|
|
11
|
+
import { stem } from './stemmer.js';
|
|
12
|
+
import { expandWithSynonyms } from './synonyms.js';
|
|
9
13
|
// ---------------------------------------------------------------------------
|
|
10
14
|
// Stopwords — removed from question before BM25 scoring
|
|
11
15
|
// ---------------------------------------------------------------------------
|
|
@@ -21,8 +25,12 @@ const STOPWORDS = new Set([
|
|
|
21
25
|
]);
|
|
22
26
|
function detectQuestionType(question) {
|
|
23
27
|
const q = question.toLowerCase().trim();
|
|
24
|
-
|
|
28
|
+
// Fix #1: Distinguish "how many/much/long" (quantity/duration) from "how do/does/can/to/is" (process/explanation)
|
|
29
|
+
if (/how\s+many|how\s+much|how\s+long|what\s+price|what\s+cost|pricing/.test(q))
|
|
25
30
|
return 'how_many';
|
|
31
|
+
// Fix #11: Yes/no questions (starts with auxiliary verb)
|
|
32
|
+
if (/^(is|does|can|will|are|has|do|did|was|were|could|should|would)\b/i.test(q))
|
|
33
|
+
return 'yes_no';
|
|
26
34
|
if (/when\b/.test(q))
|
|
27
35
|
return 'when';
|
|
28
36
|
if (/where\b/.test(q))
|
|
@@ -36,14 +44,33 @@ function detectQuestionType(question) {
|
|
|
36
44
|
return 'who';
|
|
37
45
|
if (/what\b/.test(q))
|
|
38
46
|
return 'what';
|
|
47
|
+
// Fix #1: "how do/does/can/to/is" → 'how' (process/explanation), bare 'how' → 'how' (not 'how_many')
|
|
48
|
+
if (/how\s+(?:do|does|can|to|is|are|was|were|will|would|could|should)\b/.test(q))
|
|
49
|
+
return 'how';
|
|
39
50
|
if (/how\b/.test(q))
|
|
40
|
-
return 'how_many';
|
|
51
|
+
return 'how';
|
|
41
52
|
return 'other';
|
|
42
53
|
}
|
|
43
54
|
// ---------------------------------------------------------------------------
|
|
44
55
|
// Tokenization
|
|
45
56
|
// ---------------------------------------------------------------------------
|
|
57
|
+
/**
|
|
58
|
+
* Tokenize and stem text. Used for BM25 scoring — both query and content
|
|
59
|
+
* go through the same stemming pipeline so "limitations" matches "limit".
|
|
60
|
+
*/
|
|
46
61
|
function tokenize(text) {
|
|
62
|
+
return text
|
|
63
|
+
.toLowerCase()
|
|
64
|
+
.replace(/[^\w\s]/g, ' ')
|
|
65
|
+
.split(/\s+/)
|
|
66
|
+
.filter(t => t.length > 1)
|
|
67
|
+
.map(t => stem(t));
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Tokenize WITHOUT stemming. Used for regex pattern building in
|
|
71
|
+
* tryDirectExtraction so that exact text patterns still match.
|
|
72
|
+
*/
|
|
73
|
+
function tokenizeRaw(text) {
|
|
47
74
|
return text
|
|
48
75
|
.toLowerCase()
|
|
49
76
|
.replace(/[^\w\s]/g, ' ')
|
|
@@ -51,7 +78,10 @@ function tokenize(text) {
|
|
|
51
78
|
.filter(t => t.length > 1);
|
|
52
79
|
}
|
|
53
80
|
function tokenizeQuestion(question) {
|
|
54
|
-
|
|
81
|
+
// Filter stopwords on raw tokens (before stemming), then stem
|
|
82
|
+
return tokenizeRaw(question)
|
|
83
|
+
.filter(t => !STOPWORDS.has(t))
|
|
84
|
+
.map(t => stem(t));
|
|
55
85
|
}
|
|
56
86
|
// ---------------------------------------------------------------------------
|
|
57
87
|
// Sentence splitting
|
|
@@ -59,6 +89,7 @@ function tokenizeQuestion(question) {
|
|
|
59
89
|
/**
|
|
60
90
|
* Split text into sentences. Handles common abbreviations to avoid false splits.
|
|
61
91
|
* Returns an array of sentences with their start position (index in original text).
|
|
92
|
+
* Also extracts list items (markdown bullets/numbers) as pseudo-sentences.
|
|
62
93
|
*/
|
|
63
94
|
function splitIntoSentences(content) {
|
|
64
95
|
// Strip markdown formatting while preserving positions is complex;
|
|
@@ -81,6 +112,13 @@ function splitIntoSentences(content) {
|
|
|
81
112
|
PLACEHOLDER_MAP.set(ph, m);
|
|
82
113
|
return ph;
|
|
83
114
|
});
|
|
115
|
+
// Protect version numbers with multiple dots (e.g., 0.9.0, 1.2.3, 3.11.4)
|
|
116
|
+
// Must run BEFORE the decimal number protection to avoid partial replacement
|
|
117
|
+
protected_ = protected_.replace(/\b(\d+\.\d+(?:\.\d+)+)/g, (m) => {
|
|
118
|
+
const ph = `\x00VER${placeholderIdx++}\x00`;
|
|
119
|
+
PLACEHOLDER_MAP.set(ph, m);
|
|
120
|
+
return ph;
|
|
121
|
+
});
|
|
84
122
|
// Protect decimal numbers (e.g., 3.14, $29.99)
|
|
85
123
|
protected_ = protected_.replace(/\b(\d+)\.(\d+)/g, (_m, a, b) => {
|
|
86
124
|
const ph = `\x00NUM${placeholderIdx++}\x00`;
|
|
@@ -114,10 +152,23 @@ function splitIntoSentences(content) {
|
|
|
114
152
|
sentences.push({ text: remaining, start: lastEnd });
|
|
115
153
|
}
|
|
116
154
|
}
|
|
117
|
-
//
|
|
155
|
+
// Fix #12: Also extract list items (markdown bullets/numbers) as "sentences"
|
|
156
|
+
const listPattern = /^[\s]*[-*+]\s+(.+)$/gm;
|
|
157
|
+
let listMatch;
|
|
158
|
+
while ((listMatch = listPattern.exec(content)) !== null) {
|
|
159
|
+
const item = listMatch[1].trim();
|
|
160
|
+
if (item.length >= 10 && item.length <= 800) {
|
|
161
|
+
// Only add if not already captured by sentence splitting
|
|
162
|
+
const isDuplicate = sentences.some(s => s.text.includes(item) || item.includes(s.text));
|
|
163
|
+
if (!isDuplicate) {
|
|
164
|
+
sentences.push({ text: item, start: listMatch.index });
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
// Fix #7: Increase max sentence length from 500 to 800 chars
|
|
118
169
|
return sentences.filter(s => {
|
|
119
170
|
const len = s.text.length;
|
|
120
|
-
return len >= 10 && len <= 500;
|
|
171
|
+
return len >= 10 && len <= 800;
|
|
121
172
|
});
|
|
122
173
|
}
|
|
123
174
|
// ---------------------------------------------------------------------------
|
|
@@ -131,8 +182,8 @@ function computeBoost(sentence, questionType, isTopicSentence) {
|
|
|
131
182
|
}
|
|
132
183
|
switch (questionType) {
|
|
133
184
|
case 'how_many': {
|
|
134
|
-
// Contains a number or price
|
|
135
|
-
if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user)/i.test(sentence)) {
|
|
185
|
+
// Contains a number or price or duration
|
|
186
|
+
if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user|minute|second|hour|degree|meter|mile|kg|lb)/i.test(sentence)) {
|
|
136
187
|
boost += 0.3;
|
|
137
188
|
}
|
|
138
189
|
else if (/\b\d+\b/.test(sentence)) {
|
|
@@ -140,6 +191,18 @@ function computeBoost(sentence, questionType, isTopicSentence) {
|
|
|
140
191
|
}
|
|
141
192
|
break;
|
|
142
193
|
}
|
|
194
|
+
// Fix #1: New 'how' (process/explanation) boost
|
|
195
|
+
case 'how': {
|
|
196
|
+
// Process/explanation sentences
|
|
197
|
+
if (/\b(by using|through|works by|in order to|step|first|then|next|finally|process|method|approach|technique|way to|can be done)\b/i.test(s)) {
|
|
198
|
+
boost += 0.4;
|
|
199
|
+
}
|
|
200
|
+
// Instructional patterns
|
|
201
|
+
if (/\b(install|run|execute|configure|set up|use|import|require|enable|disable|create|build|deploy)\b/i.test(s)) {
|
|
202
|
+
boost += 0.2;
|
|
203
|
+
}
|
|
204
|
+
break;
|
|
205
|
+
}
|
|
143
206
|
case 'when': {
|
|
144
207
|
// Contains a date
|
|
145
208
|
if (/\b(january|february|march|april|may|june|july|august|september|october|november|december|\d{4}|\d+\s*(days?|weeks?|months?|years?))\b/i.test(sentence)) {
|
|
@@ -151,10 +214,21 @@ function computeBoost(sentence, questionType, isTopicSentence) {
|
|
|
151
214
|
}
|
|
152
215
|
break;
|
|
153
216
|
}
|
|
217
|
+
// Fix #4: Use more specific location indicators
|
|
154
218
|
case 'where': {
|
|
155
|
-
//
|
|
156
|
-
if (/\b
|
|
157
|
-
|
|
219
|
+
// Primary location signal — strong indicator (located/headquartered/based in + geographic proper noun)
|
|
220
|
+
if (/\b(located|headquartered|based|founded|established)\s+(in|at)\b/i.test(s) ||
|
|
221
|
+
/\b(?:in|at)\s+(?:the\s+)?[A-Z][a-z]+(?:(?:\s+[A-Z][a-z]+)*|(?:,\s+[A-Z][a-z]+)*)\b/.test(sentence) ||
|
|
222
|
+
/\b(city|country|state|region|continent|capital|office|campus|location|address)\b/i.test(s)) {
|
|
223
|
+
boost += 0.6;
|
|
224
|
+
}
|
|
225
|
+
// Specific geographic indicators including country names
|
|
226
|
+
if (/\b(street|avenue|boulevard|road|highway|route|district|province|county|netherlands|amsterdam|berlin|london|paris|tokyo|beijing|moscow|france|germany|japan|china|india|canada|australia|san francisco|new york|los angeles|seattle|chicago|boston|austin|miami)\b/i.test(s)) {
|
|
227
|
+
boost += 0.4;
|
|
228
|
+
}
|
|
229
|
+
// Birth/origin patterns
|
|
230
|
+
if (/\b(born|raised|grew up|native|hometown|birthplace|originally from)\b/i.test(s)) {
|
|
231
|
+
boost += 0.4;
|
|
158
232
|
}
|
|
159
233
|
break;
|
|
160
234
|
}
|
|
@@ -170,13 +244,17 @@ function computeBoost(sentence, questionType, isTopicSentence) {
|
|
|
170
244
|
if (/\b(because|due to|reason|therefore|since|as a result|consequently|thus)\b/.test(s)) {
|
|
171
245
|
boost += 0.4;
|
|
172
246
|
}
|
|
247
|
+
// Purpose/goal sentences ("as a successor to", "in order to", "to allow", "to provide")
|
|
248
|
+
if (/\b(as a successor|successor to|in order to|so that|to allow|to provide|to enable|to support|to replace|to improve|to address|to solve)\b/i.test(s)) {
|
|
249
|
+
boost += 0.4;
|
|
250
|
+
}
|
|
173
251
|
break;
|
|
174
252
|
}
|
|
175
253
|
case 'who': {
|
|
176
254
|
// Pattern: "[topic] was created/designed/developed by [Person]"
|
|
177
255
|
// Or: "[Person] created/designed/developed [topic]"
|
|
178
|
-
if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived)\s+by\b/i.test(s) ||
|
|
179
|
-
/\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced)\b/.test(sentence)) {
|
|
256
|
+
if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived|released|launched|established)\s+(?:\w+\s+){0,4}by\b/i.test(s) ||
|
|
257
|
+
/\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced|conceived|began)\b/.test(sentence)) {
|
|
180
258
|
boost += 0.5;
|
|
181
259
|
}
|
|
182
260
|
// Also boost if contains person names (capitalized words that aren't sentence starters)
|
|
@@ -190,20 +268,62 @@ function computeBoost(sentence, questionType, isTopicSentence) {
|
|
|
190
268
|
}
|
|
191
269
|
break;
|
|
192
270
|
}
|
|
271
|
+
// Fix #11: Yes/no question boost
|
|
272
|
+
case 'yes_no': {
|
|
273
|
+
if (/\b(yes|no|not|does not|doesn't|cannot|can't|isn't|aren't|won't|supports?|enables?|allows?|provides?|includes?)\b/i.test(s)) {
|
|
274
|
+
boost += 0.3;
|
|
275
|
+
}
|
|
276
|
+
break;
|
|
277
|
+
}
|
|
193
278
|
}
|
|
194
279
|
return boost;
|
|
195
280
|
}
|
|
196
|
-
|
|
281
|
+
// Fix #9: Remove unused `_question` parameter
|
|
282
|
+
// NOTE: topicTerms must be RAW (unstemmed) for correct regex pattern building
|
|
283
|
+
function tryDirectExtraction(content, questionType, topicTerms) {
|
|
197
284
|
if (topicTerms.length === 0)
|
|
198
285
|
return null;
|
|
199
286
|
// Build a regex pattern that matches any topic term (case-insensitive)
|
|
200
287
|
const topicPattern = topicTerms.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
|
|
288
|
+
// --- Tiered 'who' infobox extraction ---
|
|
289
|
+
// Wikipedia infobox entries appear as list items like:
|
|
290
|
+
// "- Founders · Sam AltmanElon Musk..."
|
|
291
|
+
// We search for the field pattern directly (no topic prefix required) since
|
|
292
|
+
// "Founders ·" is specific enough to avoid false positives.
|
|
293
|
+
// Split into two tiers: creator fields (always try first) vs. developer/maintainer fields
|
|
294
|
+
// (skip for creation questions so we don't return "The Rust Team" for "Who created Rust?")
|
|
295
|
+
if (questionType === 'who') {
|
|
296
|
+
// Detect if question is about creation/origin.
|
|
297
|
+
// These are stem prefixes (e.g. "creat" from "created"), so use leading \b only —
|
|
298
|
+
// no trailing \b, since the stem appears INSIDE the full word.
|
|
299
|
+
const isCreationQuestion = /\b(?:creat|built|invent|found|design|start|conceiv|originat|develop|made|wrote|began)\w*/i.test(topicTerms.join(' '));
|
|
300
|
+
// Tier 1: Original creator fields (always try first) — search directly without topic prefix
|
|
301
|
+
const creatorFields = /(?:Original\s+author|Creator|Inventor|Designed\s+by|Created\s+by|Founded\s+by|Founders)\s*[·:]\s*(.+)/i;
|
|
302
|
+
const creatorMatch = content.match(creatorFields);
|
|
303
|
+
if (creatorMatch?.[1]) {
|
|
304
|
+
const value = creatorMatch[1].split('\n')[0].trim().slice(0, 300);
|
|
305
|
+
if (value.length > 2) {
|
|
306
|
+
return { text: value, context: creatorMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
// Tier 2: General developer fields (skip for creation questions — let BM25 find the original creator)
|
|
310
|
+
if (!isCreationQuestion) {
|
|
311
|
+
const devFields = /(?:Developers|Developer|Maintainer|Author)\s*[·:]\s*(.+)/i;
|
|
312
|
+
const devMatch = content.match(devFields);
|
|
313
|
+
if (devMatch?.[1]) {
|
|
314
|
+
const value = devMatch[1].split('\n')[0].trim().slice(0, 300);
|
|
315
|
+
if (value.length > 2) {
|
|
316
|
+
return { text: value, context: devMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
}
|
|
201
321
|
// --- Infobox patterns (Wikipedia-style: "Topic: Field · Value") ---
|
|
202
322
|
// Note: Wikipedia uses \u00A0 (NBSP) in infobox fields, so we use \\s+ (which matches NBSP) instead of literal spaces
|
|
203
323
|
const infoboxPatterns = [
|
|
204
|
-
{ type: ['who'], field: new RegExp(`(?:${topicPattern}).*?(?:Designed\\s+by|Created\\s+by|Developed\\s+by|Founded\\s+by|Original\\s+author|Developers|Developer|Maintainer|Author|Inventor|Creator)\\s*[·:]\\s*(.+)`, 'i') },
|
|
205
324
|
{ type: ['when'], field: new RegExp(`(?:${topicPattern}).*?(?:First\\s+appeared|Released|Founded|Established|Created|Launch\\s+date|Initial\\s+release)\\s*[·:]\\s*(.+)`, 'i') },
|
|
206
325
|
{ type: ['what'], field: new RegExp(`(?:${topicPattern}).*?(?:Type|Genre|Category|Classification)\\s*[·:]\\s*(.+)`, 'i') },
|
|
326
|
+
{ type: ['where'], field: /(?:Headquarters|Headquartered|Location|Address|HQ|Head\s+office|Based\s+in)\s*[·:]\s*(.+)/i },
|
|
207
327
|
];
|
|
208
328
|
for (const pat of infoboxPatterns) {
|
|
209
329
|
if (!pat.type.includes(questionType))
|
|
@@ -225,7 +345,7 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
|
|
|
225
345
|
// "developed/designed/created by [Name]" in first 20% of content
|
|
226
346
|
const first20 = content.slice(0, Math.max(500, Math.floor(content.length * 0.2)));
|
|
227
347
|
// Use case-insensitive for verbs, but validate name casing separately
|
|
228
|
-
const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined)\s+by\s+(\S+(?:\s+\S+){0,3})/i;
|
|
348
|
+
const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined|conceived|released|started|launched|begun|proposed|established)\s+(?:\w+\s+){0,4}by\s+(\S+(?:\s+\S+){0,3})/i;
|
|
229
349
|
const byMatch = first20.match(byPattern);
|
|
230
350
|
if (byMatch?.[1]) {
|
|
231
351
|
const candidateName = byMatch[1].trim();
|
|
@@ -249,7 +369,10 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
|
|
|
249
369
|
if (questionType === 'when') {
|
|
250
370
|
// Look for a date near topic terms in first 30% of content
|
|
251
371
|
const first30 = content.slice(0, Math.max(600, Math.floor(content.length * 0.3)));
|
|
252
|
-
|
|
372
|
+
// Note: "began"/"started" are intentionally excluded — they can match
|
|
373
|
+
// construction/start events that don't answer the specific question
|
|
374
|
+
// (e.g. "When did X fall?" should NOT match "began on Aug 13, 1961").
|
|
375
|
+
const datePattern = /(?:released|launched|first appeared|founded|established|created|introduced|conceived|opened|invented)\s+(?:\w+\s+){0,2}(?:in|on)\s+(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i;
|
|
253
376
|
const dateMatch = first30.match(datePattern);
|
|
254
377
|
if (dateMatch) {
|
|
255
378
|
const idx = first30.indexOf(dateMatch[0]);
|
|
@@ -266,6 +389,51 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
|
|
|
266
389
|
return null;
|
|
267
390
|
}
|
|
268
391
|
// ---------------------------------------------------------------------------
|
|
392
|
+
// Entity extraction — for who/when questions answered by BM25
|
|
393
|
+
// ---------------------------------------------------------------------------
|
|
394
|
+
/**
|
|
395
|
+
* Try to extract a specific entity (person name, date) from a BM25-selected passage.
|
|
396
|
+
* Returns the entity string if found, or null.
|
|
397
|
+
*/
|
|
398
|
+
function extractEntity(passage, questionType) {
|
|
399
|
+
if (questionType === 'who') {
|
|
400
|
+
// Try: "by [Name Name]"
|
|
401
|
+
const byMatch = passage.match(/\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/);
|
|
402
|
+
if (byMatch)
|
|
403
|
+
return byMatch[1];
|
|
404
|
+
// Try: "[Name Name] created/founded/..."
|
|
405
|
+
const nameVerbMatch = passage.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\s+(?:created|founded|designed|developed|built|invented|authored|introduced)/);
|
|
406
|
+
if (nameVerbMatch)
|
|
407
|
+
return nameVerbMatch[1];
|
|
408
|
+
return null;
|
|
409
|
+
}
|
|
410
|
+
if (questionType === 'when') {
|
|
411
|
+
const dateMatch = passage.match(/\b(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})\b/);
|
|
412
|
+
if (dateMatch)
|
|
413
|
+
return dateMatch[1];
|
|
414
|
+
return null;
|
|
415
|
+
}
|
|
416
|
+
return null;
|
|
417
|
+
}
|
|
418
|
+
// ---------------------------------------------------------------------------
|
|
419
|
+
// Entity type check for confidence formula
|
|
420
|
+
// ---------------------------------------------------------------------------
|
|
421
|
+
function hasExpectedEntityType(text, questionType) {
|
|
422
|
+
switch (questionType) {
|
|
423
|
+
case 'who':
|
|
424
|
+
return /[A-Z][a-z]+\s+[A-Z][a-z]+/.test(text);
|
|
425
|
+
case 'when':
|
|
426
|
+
return /\b\d{4}\b|\b(january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(text);
|
|
427
|
+
case 'how_many':
|
|
428
|
+
case 'how_much':
|
|
429
|
+
return /\b\d+\b/.test(text);
|
|
430
|
+
case 'where':
|
|
431
|
+
return /\b(in|at|near|located|based|headquarter)\b/i.test(text);
|
|
432
|
+
default:
|
|
433
|
+
return true;
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
// ---------------------------------------------------------------------------
|
|
269
437
|
// Content cleaning — strip citation/reference noise before BM25 scoring
|
|
270
438
|
// ---------------------------------------------------------------------------
|
|
271
439
|
/**
|
|
@@ -275,6 +443,25 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
|
|
|
275
443
|
*/
|
|
276
444
|
function cleanContentForQA(content) {
|
|
277
445
|
let cleaned = content;
|
|
446
|
+
// Strip markdown formatting to get clean text for BM25 scoring
|
|
447
|
+
// Images: ![alt](url) → remove entirely
|
|
448
|
+
cleaned = cleaned.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
|
|
449
|
+
// Links: [text](url "title") → text (keep link text, remove URL and title)
|
|
450
|
+
cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
|
|
451
|
+
// Bold/italic: ***text***, **text**, *text* → text
|
|
452
|
+
cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
|
|
453
|
+
// Inline code: `text` → text
|
|
454
|
+
cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
|
|
455
|
+
// Heading markers: ## Heading → Heading
|
|
456
|
+
cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
|
|
457
|
+
// Horizontal rules
|
|
458
|
+
cleaned = cleaned.replace(/^---+$/gm, '');
|
|
459
|
+
// HTML entities
|
|
460
|
+
cleaned = cleaned.replace(/&amp;/g, '&');
|
|
461
|
+
cleaned = cleaned.replace(/&lt;/g, '<');
|
|
462
|
+
cleaned = cleaned.replace(/&gt;/g, '>');
|
|
463
|
+
cleaned = cleaned.replace(/&nbsp;/g, ' ');
|
|
464
|
+
cleaned = cleaned.replace(/&#\d+;/g, '');
|
|
278
465
|
// Remove Wikipedia citation metadata (CS1_maint, Category:, etc.)
|
|
279
466
|
cleaned = cleaned.replace(/CS1[_\s]\w+[:\s][^\n]*/gi, '');
|
|
280
467
|
cleaned = cleaned.replace(/Category:[^\n]*/gi, '');
|
|
@@ -288,12 +475,11 @@ function cleanContentForQA(content) {
|
|
|
288
475
|
cleaned = cleaned.replace(/\b(retrieved|archived from the original)\b[^\n]{0,100}/gi, '');
|
|
289
476
|
// Remove "External links" and everything after (usually just URLs)
|
|
290
477
|
cleaned = cleaned.replace(/^#{1,3}\s*External\s+links[\s\S]*$/im, '');
|
|
291
|
-
// Remove
|
|
292
|
-
// (
|
|
478
|
+
// Fix #8: Remove entire "See also", "Notes", "Further reading" sections
|
|
479
|
+
// (heading + all content until the next heading)
|
|
480
|
+
cleaned = cleaned.replace(/^#{1,3}\s*(?:See\s+also|Notes|Further\s+reading)\s*\n(?:(?!^#{1,3}\s).*\n?)*/gim, '');
|
|
481
|
+
// Remove "References" heading only (keep nearby content that may be relevant)
|
|
293
482
|
cleaned = cleaned.replace(/^#{1,3}\s*References\s*$/im, '');
|
|
294
|
-
cleaned = cleaned.replace(/^#{1,3}\s*Further\s+reading\s*$/im, '');
|
|
295
|
-
cleaned = cleaned.replace(/^#{1,3}\s*See\s+also\s*$/im, '');
|
|
296
|
-
cleaned = cleaned.replace(/^#{1,3}\s*Notes\s*$/im, '');
|
|
297
483
|
// Remove lines that are mostly citation-like (very short with lots of punctuation/numbers)
|
|
298
484
|
cleaned = cleaned.split('\n').filter(line => {
|
|
299
485
|
const trimmed = line.trim();
|
|
@@ -318,6 +504,30 @@ function cleanContentForQA(content) {
|
|
|
318
504
|
// ---------------------------------------------------------------------------
|
|
319
505
|
// Main quickAnswer function
|
|
320
506
|
// ---------------------------------------------------------------------------
|
|
507
|
+
/**
|
|
508
|
+
* Answer a question about fetched page content using BM25 + heuristics.
|
|
509
|
+
*
|
|
510
|
+
* This is a fully offline, LLM-free approach. It:
|
|
511
|
+
* 1. Cleans the content (strips Wikipedia citations, reference noise, etc.)
|
|
512
|
+
* 2. Tries direct pattern extraction for structured content (infoboxes, definitions)
|
|
513
|
+
* 3. Falls back to BM25 sentence scoring with question-type-aware boosting
|
|
514
|
+
* 4. Uses sliding windows (1-3 sentences) to capture multi-sentence answers
|
|
515
|
+
* 5. Expands query terms with synonyms for broader matching
|
|
516
|
+
* 6. Returns the top passages with scores and surrounding context
|
|
517
|
+
*
|
|
518
|
+
* @param options - Question, content, and optional tuning parameters
|
|
519
|
+
* @returns A result object with answer text, confidence score, and ranked passages
|
|
520
|
+
*
|
|
521
|
+
* @example
|
|
522
|
+
* ```ts
|
|
523
|
+
* const result = await quickAnswer({
|
|
524
|
+
* question: 'What is the pricing?',
|
|
525
|
+
* content: pageMarkdown,
|
|
526
|
+
* url: 'https://example.com/pricing',
|
|
527
|
+
* });
|
|
528
|
+
* console.log(result.answer, result.confidence);
|
|
529
|
+
* ```
|
|
530
|
+
*/
|
|
321
531
|
export function quickAnswer(options) {
|
|
322
532
|
const { question, content, maxPassages = 3, maxChars = 2000, url = '', } = options;
|
|
323
533
|
const emptyResult = {
|
|
@@ -334,12 +544,23 @@ export function quickAnswer(options) {
|
|
|
334
544
|
return emptyResult;
|
|
335
545
|
// Clean content to remove citation/reference noise before BM25 scoring
|
|
336
546
|
const cleanedContent = cleanContentForQA(content);
|
|
547
|
+
// For very long content, focus on the most relevant portion.
|
|
548
|
+
// Wikipedia article tails contain references, tangential details, and noise.
|
|
549
|
+
const MAX_QA_CHARS = 20000;
|
|
550
|
+
let qaContent = cleanedContent;
|
|
551
|
+
if (qaContent.length > MAX_QA_CHARS) {
|
|
552
|
+
// Keep the first 70% — definitions, key facts, and main content
|
|
553
|
+
// are almost always in the first 2/3 of the article
|
|
554
|
+
qaContent = qaContent.slice(0, Math.floor(qaContent.length * 0.7));
|
|
555
|
+
}
|
|
337
556
|
// Step 0: Direct pattern extraction — try to find structured answers before BM25
|
|
338
557
|
// This catches infobox patterns (e.g. "TypeScript: Designed by · Anders Hejlsberg")
|
|
339
558
|
// and definition sentences (e.g. "TypeScript is ... developed by Microsoft")
|
|
340
559
|
const questionType = detectQuestionType(question);
|
|
341
|
-
|
|
342
|
-
const
|
|
560
|
+
// RAW (unstemmed) topic terms for tryDirectExtraction regex patterns
|
|
561
|
+
const topicTermsRaw = tokenizeRaw(question).filter(t => !STOPWORDS.has(t));
|
|
562
|
+
// Fix #9: Remove the unused `question` argument from the call site
|
|
563
|
+
const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTermsRaw);
|
|
343
564
|
if (directAnswer) {
|
|
344
565
|
return {
|
|
345
566
|
question,
|
|
@@ -350,98 +571,245 @@ export function quickAnswer(options) {
|
|
|
350
571
|
method: 'bm25',
|
|
351
572
|
};
|
|
352
573
|
}
|
|
353
|
-
// Step 1: Split into sentences
|
|
354
|
-
const sentences = splitIntoSentences(
|
|
574
|
+
// Step 1: Split into sentences (use qaContent — truncated for long articles)
|
|
575
|
+
const sentences = splitIntoSentences(qaContent);
|
|
355
576
|
if (sentences.length === 0)
|
|
356
577
|
return emptyResult;
|
|
357
|
-
// Step 2: Tokenize question (remove stopwords)
|
|
578
|
+
// Step 2: Tokenize question (remove stopwords, then stem)
|
|
358
579
|
const queryTerms = tokenizeQuestion(question);
|
|
359
580
|
if (queryTerms.length === 0) {
|
|
360
|
-
// Fall back to all tokens if all were stopwords
|
|
361
|
-
|
|
581
|
+
// Fall back to all stemmed tokens if all were stopwords
|
|
582
|
+
const fallback = tokenize(question);
|
|
583
|
+
if (fallback.length === 0)
|
|
584
|
+
return emptyResult;
|
|
585
|
+
queryTerms.push(...fallback);
|
|
362
586
|
}
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
//
|
|
366
|
-
const
|
|
367
|
-
|
|
368
|
-
//
|
|
369
|
-
//
|
|
587
|
+
// Expand query with synonyms for broader matching
|
|
588
|
+
const expanded = expandWithSynonyms(queryTerms);
|
|
589
|
+
// Use all expanded terms for BM25 (IDF naturally downweights common synonyms)
|
|
590
|
+
const uniqueQueryTerms = [...new Set(expanded.map(e => e.term))];
|
|
591
|
+
// Step 3: Create stemmed scoring blocks for each sentence.
|
|
592
|
+
// We pass stemmed text to scoreBM25 so that its internal tokenizer gets stemmed tokens,
|
|
593
|
+
// matching the stemmed queryTerms. The original sentence text is preserved for display.
|
|
594
|
+
const scoringBlocks = sentences.map((s, index) => ({
|
|
595
|
+
raw: tokenize(s.text).join(' '), // pre-stemmed text for BM25 scoring
|
|
596
|
+
index,
|
|
597
|
+
}));
|
|
598
|
+
// ---------------------------------------------------------------------------
|
|
599
|
+
// Step 3.5: Lightweight topic propagation (coreference approximation)
|
|
600
|
+
// ---------------------------------------------------------------------------
|
|
601
|
+
// When a sentence uses a referent phrase like "The platform" or "The company"
|
|
602
|
+
// instead of the topic entity name, BM25 can't match it. We inject stemmed
|
|
603
|
+
// topic terms into scoring blocks of nearby referent sentences so BM25 has
|
|
604
|
+
// something to work with.
|
|
605
|
+
//
|
|
606
|
+
// Only active for question types where coreference resolution helps:
|
|
607
|
+
// where, who, when — NOT for what/how/yes_no/how_many (no entity tracking needed).
|
|
608
|
+
//
|
|
609
|
+
// Heuristic: A sentence gets topic injection if:
|
|
610
|
+
// 1. It contains a common referent pattern (the platform/company/service/etc.)
|
|
611
|
+
// 2. It is within PROXIMITY_WINDOW sentences of a sentence containing the topic
|
|
612
|
+
// 3. OR the content has fewer than SMALL_CONTENT_THRESHOLD sentences AND
|
|
613
|
+
// the topic is actually mentioned somewhere in the content (topicSentenceIndices non-empty)
|
|
614
|
+
if (questionType === 'where' || questionType === 'who' || questionType === 'when') {
|
|
615
|
+
const REFERENT_PATTERNS = /\b(?:the\s+)?(?:platform|company|service|product|tool|application|system|framework|library|project|organization|software|language|program|site|website|app|api|sdk|package|module|engine|firm|startup|corporation)\b|^(?:It|They|He|She)\s/im;
|
|
616
|
+
const PROXIMITY_WINDOW = 5;
|
|
617
|
+
const SMALL_CONTENT_THRESHOLD = 15;
|
|
618
|
+
// Find which sentences contain at least one topic term
|
|
619
|
+
const topicSentenceIndices = new Set();
|
|
620
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
621
|
+
const stemmedSentence = scoringBlocks[i].raw;
|
|
622
|
+
if (queryTerms.some(t => stemmedSentence.includes(t))) {
|
|
623
|
+
topicSentenceIndices.add(i);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
// Only inject if the topic is actually mentioned somewhere (non-empty topicSentenceIndices)
|
|
627
|
+
if (topicSentenceIndices.size > 0) {
|
|
628
|
+
// Inject topic terms into referent sentences that are near topic sentences
|
|
629
|
+
const topicInjection = ' ' + queryTerms.join(' ');
|
|
630
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
631
|
+
if (topicSentenceIndices.has(i))
|
|
632
|
+
continue; // already has topic terms
|
|
633
|
+
const hasReferent = REFERENT_PATTERNS.test(sentences[i].text);
|
|
634
|
+
if (!hasReferent)
|
|
635
|
+
continue;
|
|
636
|
+
// Check proximity: is this sentence within PROXIMITY_WINDOW of a topic sentence?
|
|
637
|
+
const isNearTopic = sentences.length < SMALL_CONTENT_THRESHOLD ||
|
|
638
|
+
[...topicSentenceIndices].some(j => Math.abs(i - j) <= PROXIMITY_WINDOW);
|
|
639
|
+
if (isNearTopic) {
|
|
640
|
+
scoringBlocks[i].raw += topicInjection;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
// Step 4: Score sentences with BM25
|
|
646
|
+
const bm25Scores = scoreBM25(scoringBlocks, uniqueQueryTerms);
|
|
647
|
+
// Step 5: Compute max possible score for normalization
|
|
370
648
|
const maxPossibleScore = Math.max(...bm25Scores, 0.001);
|
|
371
|
-
// Step
|
|
649
|
+
// Step 6: Apply boosts (position bias, question type, definition patterns)
|
|
372
650
|
const totalSentences = sentences.length;
|
|
373
651
|
const sentenceScores = sentences.map((s, i) => {
|
|
374
|
-
|
|
375
|
-
// We detect this by checking if the previous character in the content is a newline
|
|
376
|
-
const isTopicSentence = i === 0 || cleanedContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
|
|
652
|
+
const isTopicSentence = i === 0 || qaContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
|
|
377
653
|
const base = bm25Scores[i];
|
|
378
654
|
const boost = computeBoost(s.text, questionType, isTopicSentence);
|
|
379
|
-
// Position bias
|
|
380
|
-
|
|
381
|
-
// Decays linearly: first 10% of sentences get full boost (0.4), drops to 0 by 50%.
|
|
655
|
+
// Fix #3: Position bias — reduce for 'why' and 'how' (answers can be anywhere)
|
|
656
|
+
const maxPositionBoost = (questionType === 'why' || questionType === 'how') ? 0.15 : 0.4;
|
|
382
657
|
const positionRatio = i / totalSentences;
|
|
383
|
-
|
|
384
|
-
|
|
658
|
+
// Fix position bias: scale by how many query terms THIS sentence matches.
|
|
659
|
+
// A sentence matching only 1/3 query terms (e.g., just "python") gets 1/3 of the
|
|
660
|
+
// position boost — prevents the first sentence from winning on position alone.
|
|
661
|
+
const sentTokens = tokenize(s.text);
|
|
662
|
+
const sentTermMatches = uniqueQueryTerms.filter(t => sentTokens.includes(t)).length;
|
|
663
|
+
const sentTermCoverage = uniqueQueryTerms.length > 0
|
|
664
|
+
? sentTermMatches / Math.min(uniqueQueryTerms.length, 5)
|
|
665
|
+
: 0;
|
|
666
|
+
const rawPositionBoost = positionRatio < 0.1 ? maxPositionBoost
|
|
667
|
+
: positionRatio < 0.5 ? maxPositionBoost * (1 - (positionRatio - 0.1) / 0.4)
|
|
385
668
|
: 0;
|
|
386
|
-
|
|
669
|
+
const positionBoost = rawPositionBoost * sentTermCoverage;
|
|
670
|
+
// Fix #2: Only apply definitionBoost for 'what' and 'other' question types.
|
|
387
671
|
const sl = s.text.toLowerCase();
|
|
388
|
-
const definitionBoost =
|
|
389
|
-
|
|
672
|
+
const definitionBoost = (questionType === 'what' || questionType === 'other') &&
|
|
673
|
+
/\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
|
|
674
|
+
// Extra boost for definition sentences very early in the content (for 'what' questions)
|
|
675
|
+
// This handles Wikipedia-style articles where the first sentence IS the answer
|
|
676
|
+
const earlyDefinitionBoost = (questionType === 'what' &&
|
|
677
|
+
positionRatio < 0.05 &&
|
|
678
|
+
/\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(sl)) ? 0.5 : 0;
|
|
679
|
+
const total = base + (boost + positionBoost + definitionBoost + earlyDefinitionBoost) * maxPossibleScore;
|
|
390
680
|
return { text: s.text, index: i, score: total, base };
|
|
391
681
|
});
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
682
|
+
const windows = [];
|
|
683
|
+
// Single-sentence windows (preserve existing behavior)
|
|
684
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
685
|
+
const score = sentenceScores[i].score;
|
|
686
|
+
const lengthPenalty = 0;
|
|
687
|
+
windows.push({
|
|
688
|
+
text: sentences[i].text,
|
|
689
|
+
indices: [i],
|
|
690
|
+
startSentenceIdx: i,
|
|
691
|
+
score: score * (1 - lengthPenalty),
|
|
692
|
+
});
|
|
693
|
+
}
|
|
694
|
+
// 2-sentence windows
|
|
695
|
+
for (let i = 0; i < sentences.length - 1; i++) {
|
|
696
|
+
const score = (sentenceScores[i].score + sentenceScores[i + 1].score) / 2;
|
|
697
|
+
const lengthPenalty = 0.05;
|
|
698
|
+
windows.push({
|
|
699
|
+
text: sentences[i].text + ' ' + sentences[i + 1].text,
|
|
700
|
+
indices: [i, i + 1],
|
|
701
|
+
startSentenceIdx: i,
|
|
702
|
+
score: score * (1 - lengthPenalty),
|
|
703
|
+
});
|
|
704
|
+
}
|
|
705
|
+
// 3-sentence windows (only when content has enough sentences)
|
|
706
|
+
if (sentences.length >= 5) {
|
|
707
|
+
for (let i = 0; i < sentences.length - 2; i++) {
|
|
708
|
+
const score = (sentenceScores[i].score + sentenceScores[i + 1].score + sentenceScores[i + 2].score) / 3;
|
|
709
|
+
const lengthPenalty = 0.10;
|
|
710
|
+
windows.push({
|
|
711
|
+
text: sentences[i].text + ' ' + sentences[i + 1].text + ' ' + sentences[i + 2].text,
|
|
712
|
+
indices: [i, i + 1, i + 2],
|
|
713
|
+
startSentenceIdx: i,
|
|
714
|
+
score: score * (1 - lengthPenalty),
|
|
715
|
+
});
|
|
716
|
+
}
|
|
717
|
+
}
|
|
718
|
+
// Step 8: Sort windows by score
|
|
719
|
+
const sortedWindows = [...windows].sort((a, b) => b.score - a.score);
|
|
720
|
+
// Step 9: Select top N non-overlapping windows
|
|
397
721
|
const selectedPassages = [];
|
|
398
|
-
const
|
|
399
|
-
for (const
|
|
400
|
-
if (
|
|
722
|
+
const usedSentenceIndices = new Set();
|
|
723
|
+
for (const win of sortedWindows) {
|
|
724
|
+
if (selectedPassages.length >= maxPassages)
|
|
725
|
+
break;
|
|
726
|
+
// Skip if any sentence in this window was already used
|
|
727
|
+
const hasOverlap = win.indices.some(i => usedSentenceIndices.has(i));
|
|
728
|
+
if (hasOverlap)
|
|
401
729
|
continue;
|
|
402
|
-
|
|
730
|
+
// Mark all sentences in this window as used
|
|
731
|
+
for (const i of win.indices)
|
|
732
|
+
usedSentenceIndices.add(i);
|
|
733
|
+
// Build context: include sentence before the window and after
|
|
734
|
+
const firstIdx = win.indices[0];
|
|
735
|
+
const lastIdx = win.indices[win.indices.length - 1];
|
|
403
736
|
const contextParts = [];
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
contextParts.push(sentences[i - 1].text);
|
|
737
|
+
if (firstIdx > 0 && !usedSentenceIndices.has(firstIdx - 1)) {
|
|
738
|
+
contextParts.push(sentences[firstIdx - 1].text);
|
|
407
739
|
}
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
if (i < sentences.length - 1 && !usedIndices.has(i + 1)) {
|
|
412
|
-
contextParts.push(sentences[i + 1].text);
|
|
740
|
+
contextParts.push(win.text);
|
|
741
|
+
if (lastIdx < sentences.length - 1 && !usedSentenceIndices.has(lastIdx + 1)) {
|
|
742
|
+
contextParts.push(sentences[lastIdx + 1].text);
|
|
413
743
|
}
|
|
414
|
-
// Mark
|
|
415
|
-
if (
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
usedIndices.add(i + 1);
|
|
744
|
+
// Mark surrounding context sentences as used to avoid overlap
|
|
745
|
+
if (firstIdx > 0)
|
|
746
|
+
usedSentenceIndices.add(firstIdx - 1);
|
|
747
|
+
if (lastIdx < sentences.length - 1)
|
|
748
|
+
usedSentenceIndices.add(lastIdx + 1);
|
|
420
749
|
const context = contextParts.join(' ');
|
|
421
750
|
selectedPassages.push({
|
|
422
|
-
text:
|
|
423
|
-
score: parseFloat((
|
|
751
|
+
text: win.text,
|
|
752
|
+
score: Math.min(1, parseFloat((win.score / (maxPossibleScore || 1)).toFixed(4))),
|
|
424
753
|
context,
|
|
754
|
+
startIdx: firstIdx,
|
|
755
|
+
indices: win.indices,
|
|
425
756
|
});
|
|
426
757
|
}
|
|
427
|
-
//
|
|
428
|
-
|
|
429
|
-
|
|
758
|
+
// ---------------------------------------------------------------------------
|
|
759
|
+
// Step 10: Confidence computation — multi-signal formula
|
|
760
|
+
// ---------------------------------------------------------------------------
|
|
761
|
+
const topWindow = sortedWindows[0];
|
|
762
|
+
const topBase = topWindow ? Math.max(...topWindow.indices.map(i => sentenceScores[i].base)) : 0;
|
|
430
763
|
const meanScore = bm25Scores.reduce((a, b) => a + b, 0) / bm25Scores.length;
|
|
764
|
+
// Signal 1: Score gap
|
|
431
765
|
const scoreGap = maxPossibleScore > 0 ? (topBase - meanScore) / maxPossibleScore : 0;
|
|
432
|
-
//
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
const
|
|
766
|
+
// Signal 2: Term coverage — what % of query terms appear in top window
|
|
767
|
+
// Also count synonym-mediated matches (at 0.7 weight)
|
|
768
|
+
const topWindowTokens = tokenize(topWindow?.text || '');
|
|
769
|
+
const directMatches = queryTerms.filter(t => topWindowTokens.includes(t)).length;
|
|
770
|
+
const matchedTerms = queryTerms.filter(t => {
|
|
771
|
+
if (topWindowTokens.includes(t))
|
|
772
|
+
return true;
|
|
773
|
+
// Check if any synonym of this term appears in the top window
|
|
774
|
+
const synonymsForTerm = expandWithSynonyms([t]);
|
|
775
|
+
return synonymsForTerm.some(e => !e.isOriginal && topWindowTokens.includes(e.term));
|
|
776
|
+
});
|
|
777
|
+
const synonymMatches = matchedTerms.length - directMatches;
|
|
778
|
+
const effectiveCoverage = queryTerms.length > 0
|
|
779
|
+
? (directMatches + synonymMatches * 0.7) / queryTerms.length
|
|
780
|
+
: 0;
|
|
781
|
+
// Signal 3: Position signal — early in document is more reliable for factual Qs
|
|
782
|
+
const positionSignal = (topWindow?.startSentenceIdx ?? 999) < sentences.length * 0.2 ? 0.1 : 0;
|
|
783
|
+
// Signal 4: Answer type match — does the answer look like it answers the question type?
|
|
784
|
+
const typeMatch = hasExpectedEntityType(topWindow?.text || '', questionType) ? 0.20 : 0;
|
|
785
|
+
const rawConfidence = Math.min(1, Math.max(0, 0.1 + // reduced baseline (was 0.2)
|
|
786
|
+
scoreGap * 0.35 +
|
|
787
|
+
effectiveCoverage * 0.25 + // synonym-aware term coverage (was 0.30)
|
|
788
|
+
positionSignal +
|
|
789
|
+
typeMatch));
|
|
790
|
+
// Penalty: noise/metadata in top answer reduces confidence
|
|
791
|
+
const topAnswerText = (topWindow?.text || '').toLowerCase();
|
|
436
792
|
const noisePenalty = (/\bcs1[_\s]/i.test(topAnswerText) ||
|
|
437
793
|
/\bcategory:/i.test(topAnswerText) ||
|
|
438
794
|
/\b(archived|retrieved)\s+(from|on)\b/i.test(topAnswerText) ||
|
|
439
795
|
/\b(isbn|issn|doi|arxiv|bibcode|pmid)\b/i.test(topAnswerText) ||
|
|
440
|
-
// Line is mostly URLs
|
|
441
796
|
(topAnswerText.match(/https?:\/\//g) || []).length > 2) ? 0.5 : 0;
|
|
442
|
-
|
|
443
|
-
|
|
797
|
+
// Fix #13: Penalty for UI chrome / navigation elements
|
|
798
|
+
const uiChromePenalty = (/\b(sign in|sign up|log in|log out|subscribe|newsletter|cookie|privacy policy|terms of service)\b/i.test(topAnswerText) ||
|
|
799
|
+
/\b(skip to|main menu|navigation|sidebar|footer|header|breadcrumb)\b/i.test(topAnswerText)) ? 0.3 : 0;
|
|
800
|
+
const confidence = Math.max(0, rawConfidence - noisePenalty - uiChromePenalty);
|
|
801
|
+
// ---------------------------------------------------------------------------
|
|
802
|
+
// Step 11: Try entity extraction for who/when questions (BM25 fallback)
|
|
803
|
+
// ---------------------------------------------------------------------------
|
|
444
804
|
let answerText = selectedPassages[0]?.context || selectedPassages[0]?.text || '';
|
|
805
|
+
// For who/when, try to surface a concise entity from the top passage
|
|
806
|
+
if ((questionType === 'who' || questionType === 'when') && selectedPassages[0]) {
|
|
807
|
+
const entity = extractEntity(selectedPassages[0].text, questionType);
|
|
808
|
+
if (entity && selectedPassages[0].text.includes(entity)) {
|
|
809
|
+
// Keep full passage text as answer (it contains the entity)
|
|
810
|
+
answerText = selectedPassages[0].text;
|
|
811
|
+
}
|
|
812
|
+
}
|
|
445
813
|
if (answerText.length > maxChars) {
|
|
446
814
|
answerText = answerText.slice(0, maxChars).replace(/\s+\S*$/, '') + '…';
|
|
447
815
|
}
|
|
@@ -452,9 +820,8 @@ export function quickAnswer(options) {
|
|
|
452
820
|
? p.context.slice(0, Math.max(0, maxChars - totalChars)).replace(/\s+\S*$/, '') + '…'
|
|
453
821
|
: p.context;
|
|
454
822
|
totalChars += contextTrimmed.length;
|
|
455
|
-
return {
|
|
823
|
+
return { text: p.text, score: p.score, context: contextTrimmed };
|
|
456
824
|
});
|
|
457
|
-
void topScore; // consumed via sorted[0]
|
|
458
825
|
return {
|
|
459
826
|
question,
|
|
460
827
|
answer: answerText,
|