webpeel 0.15.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +2 -2
  2. package/dist/cli-auth.d.ts.map +1 -1
  3. package/dist/cli-auth.js +5 -0
  4. package/dist/cli-auth.js.map +1 -1
  5. package/dist/cli.js +43 -11
  6. package/dist/cli.js.map +1 -1
  7. package/dist/core/crawler.d.ts +2 -0
  8. package/dist/core/crawler.d.ts.map +1 -1
  9. package/dist/core/crawler.js +12 -3
  10. package/dist/core/crawler.js.map +1 -1
  11. package/dist/core/pipeline.d.ts +1 -0
  12. package/dist/core/pipeline.d.ts.map +1 -1
  13. package/dist/core/pipeline.js +63 -2
  14. package/dist/core/pipeline.js.map +1 -1
  15. package/dist/core/quick-answer.d.ts +26 -0
  16. package/dist/core/quick-answer.d.ts.map +1 -1
  17. package/dist/core/quick-answer.js +451 -84
  18. package/dist/core/quick-answer.js.map +1 -1
  19. package/dist/core/search-provider.d.ts +47 -4
  20. package/dist/core/search-provider.d.ts.map +1 -1
  21. package/dist/core/search-provider.js +278 -7
  22. package/dist/core/search-provider.js.map +1 -1
  23. package/dist/core/stemmer.d.ts +39 -0
  24. package/dist/core/stemmer.d.ts.map +1 -0
  25. package/dist/core/stemmer.js +510 -0
  26. package/dist/core/stemmer.js.map +1 -0
  27. package/dist/core/synonyms.d.ts +43 -0
  28. package/dist/core/synonyms.d.ts.map +1 -0
  29. package/dist/core/synonyms.js +185 -0
  30. package/dist/core/synonyms.js.map +1 -0
  31. package/dist/mcp/server.js +109 -4
  32. package/dist/mcp/server.js.map +1 -1
  33. package/dist/server/app.d.ts +1 -0
  34. package/dist/server/app.d.ts.map +1 -1
  35. package/dist/server/app.js +76 -10
  36. package/dist/server/app.js.map +1 -1
  37. package/dist/server/middleware/auth.d.ts +2 -1
  38. package/dist/server/middleware/auth.d.ts.map +1 -1
  39. package/dist/server/middleware/auth.js +25 -12
  40. package/dist/server/middleware/auth.js.map +1 -1
  41. package/dist/server/middleware/rate-limit.d.ts +1 -0
  42. package/dist/server/middleware/rate-limit.d.ts.map +1 -1
  43. package/dist/server/middleware/rate-limit.js +20 -11
  44. package/dist/server/middleware/rate-limit.js.map +1 -1
  45. package/dist/server/routes/agent.d.ts +4 -0
  46. package/dist/server/routes/agent.d.ts.map +1 -1
  47. package/dist/server/routes/agent.js +196 -9
  48. package/dist/server/routes/agent.js.map +1 -1
  49. package/dist/server/routes/batch.d.ts.map +1 -1
  50. package/dist/server/routes/batch.js +126 -1
  51. package/dist/server/routes/batch.js.map +1 -1
  52. package/dist/server/routes/fetch.d.ts +1 -0
  53. package/dist/server/routes/fetch.d.ts.map +1 -1
  54. package/dist/server/routes/fetch.js +193 -55
  55. package/dist/server/routes/fetch.js.map +1 -1
  56. package/dist/server/routes/jobs.d.ts.map +1 -1
  57. package/dist/server/routes/jobs.js +115 -2
  58. package/dist/server/routes/jobs.js.map +1 -1
  59. package/dist/server/routes/mcp.d.ts +1 -0
  60. package/dist/server/routes/mcp.d.ts.map +1 -1
  61. package/dist/server/routes/mcp.js +113 -6
  62. package/dist/server/routes/mcp.js.map +1 -1
  63. package/dist/server/routes/search.js +1 -1
  64. package/dist/server/routes/search.js.map +1 -1
  65. package/dist/server/types.d.ts +16 -0
  66. package/dist/server/types.d.ts.map +1 -0
  67. package/dist/server/types.js +8 -0
  68. package/dist/server/types.js.map +1 -0
  69. package/dist/server/utils/response.d.ts +45 -0
  70. package/dist/server/utils/response.d.ts.map +1 -0
  71. package/dist/server/utils/response.js +70 -0
  72. package/dist/server/utils/response.js.map +1 -0
  73. package/dist/server/utils/sse.d.ts +23 -0
  74. package/dist/server/utils/sse.d.ts.map +1 -0
  75. package/dist/server/utils/sse.js +39 -0
  76. package/dist/server/utils/sse.js.map +1 -0
  77. package/dist/types.d.ts +2 -0
  78. package/dist/types.d.ts.map +1 -1
  79. package/dist/types.js.map +1 -1
  80. package/package.json +1 -1
@@ -4,8 +4,12 @@
4
4
  * Answers a question about page content without any API key.
5
5
  * Uses BM25 relevance scoring + answer-signal boosting to surface
6
6
  * the most relevant sentences.
7
+ *
8
+ * v2: Added Porter stemming, synonym expansion, and sliding window scoring.
7
9
  */
8
10
  import { scoreBM25 } from './bm25-filter.js';
11
+ import { stem } from './stemmer.js';
12
+ import { expandWithSynonyms } from './synonyms.js';
9
13
  // ---------------------------------------------------------------------------
10
14
  // Stopwords — removed from question before BM25 scoring
11
15
  // ---------------------------------------------------------------------------
@@ -21,8 +25,12 @@ const STOPWORDS = new Set([
21
25
  ]);
22
26
  function detectQuestionType(question) {
23
27
  const q = question.toLowerCase().trim();
24
- if (/how\s+many|how\s+much|what\s+price|what\s+cost|pricing/.test(q))
28
+ // Fix #1: Distinguish "how many/much/long" (quantity/duration) from "how do/does/can/to/is" (process/explanation)
29
+ if (/how\s+many|how\s+much|how\s+long|what\s+price|what\s+cost|pricing/.test(q))
25
30
  return 'how_many';
31
+ // Fix #11: Yes/no questions (starts with auxiliary verb)
32
+ if (/^(is|does|can|will|are|has|do|did|was|were|could|should|would)\b/i.test(q))
33
+ return 'yes_no';
26
34
  if (/when\b/.test(q))
27
35
  return 'when';
28
36
  if (/where\b/.test(q))
@@ -36,14 +44,33 @@ function detectQuestionType(question) {
36
44
  return 'who';
37
45
  if (/what\b/.test(q))
38
46
  return 'what';
47
+ // Fix #1: "how do/does/can/to/is" → 'how' (process/explanation), bare 'how' → 'how' (not 'how_many')
48
+ if (/how\s+(?:do|does|can|to|is|are|was|were|will|would|could|should)\b/.test(q))
49
+ return 'how';
39
50
  if (/how\b/.test(q))
40
- return 'how_many';
51
+ return 'how';
41
52
  return 'other';
42
53
  }
43
54
  // ---------------------------------------------------------------------------
44
55
  // Tokenization
45
56
  // ---------------------------------------------------------------------------
57
+ /**
58
+ * Tokenize and stem text. Used for BM25 scoring — both query and content
59
+ * go through the same stemming pipeline so "limitations" matches "limit".
60
+ */
46
61
  function tokenize(text) {
62
+ return text
63
+ .toLowerCase()
64
+ .replace(/[^\w\s]/g, ' ')
65
+ .split(/\s+/)
66
+ .filter(t => t.length > 1)
67
+ .map(t => stem(t));
68
+ }
69
+ /**
70
+ * Tokenize WITHOUT stemming. Used for regex pattern building in
71
+ * tryDirectExtraction so that exact text patterns still match.
72
+ */
73
+ function tokenizeRaw(text) {
47
74
  return text
48
75
  .toLowerCase()
49
76
  .replace(/[^\w\s]/g, ' ')
@@ -51,7 +78,10 @@ function tokenize(text) {
51
78
  .filter(t => t.length > 1);
52
79
  }
53
80
  function tokenizeQuestion(question) {
54
- return tokenize(question).filter(t => !STOPWORDS.has(t));
81
+ // Filter stopwords on raw tokens (before stemming), then stem
82
+ return tokenizeRaw(question)
83
+ .filter(t => !STOPWORDS.has(t))
84
+ .map(t => stem(t));
55
85
  }
56
86
  // ---------------------------------------------------------------------------
57
87
  // Sentence splitting
@@ -59,6 +89,7 @@ function tokenizeQuestion(question) {
59
89
  /**
60
90
  * Split text into sentences. Handles common abbreviations to avoid false splits.
61
91
  * Returns an array of sentences with their start position (index in original text).
92
+ * Also extracts list items (markdown bullets/numbers) as pseudo-sentences.
62
93
  */
63
94
  function splitIntoSentences(content) {
64
95
  // Strip markdown formatting while preserving positions is complex;
@@ -81,6 +112,13 @@ function splitIntoSentences(content) {
81
112
  PLACEHOLDER_MAP.set(ph, m);
82
113
  return ph;
83
114
  });
115
+ // Protect version numbers with multiple dots (e.g., 0.9.0, 1.2.3, 3.11.4)
116
+ // Must run BEFORE the decimal number protection to avoid partial replacement
117
+ protected_ = protected_.replace(/\b(\d+\.\d+(?:\.\d+)+)/g, (m) => {
118
+ const ph = `\x00VER${placeholderIdx++}\x00`;
119
+ PLACEHOLDER_MAP.set(ph, m);
120
+ return ph;
121
+ });
84
122
  // Protect decimal numbers (e.g., 3.14, $29.99)
85
123
  protected_ = protected_.replace(/\b(\d+)\.(\d+)/g, (_m, a, b) => {
86
124
  const ph = `\x00NUM${placeholderIdx++}\x00`;
@@ -114,10 +152,23 @@ function splitIntoSentences(content) {
114
152
  sentences.push({ text: remaining, start: lastEnd });
115
153
  }
116
154
  }
117
- // Filter: keep sentences between 10 and 500 chars
155
+ // Fix #12: Also extract list items (markdown bullets/numbers) as "sentences"
156
+ const listPattern = /^[\s]*[-*+]\s+(.+)$/gm;
157
+ let listMatch;
158
+ while ((listMatch = listPattern.exec(content)) !== null) {
159
+ const item = listMatch[1].trim();
160
+ if (item.length >= 10 && item.length <= 800) {
161
+ // Only add if not already captured by sentence splitting
162
+ const isDuplicate = sentences.some(s => s.text.includes(item) || item.includes(s.text));
163
+ if (!isDuplicate) {
164
+ sentences.push({ text: item, start: listMatch.index });
165
+ }
166
+ }
167
+ }
168
+ // Fix #7: Increase max sentence length from 500 to 800 chars
118
169
  return sentences.filter(s => {
119
170
  const len = s.text.length;
120
- return len >= 10 && len <= 500;
171
+ return len >= 10 && len <= 800;
121
172
  });
122
173
  }
123
174
  // ---------------------------------------------------------------------------
@@ -131,8 +182,8 @@ function computeBoost(sentence, questionType, isTopicSentence) {
131
182
  }
132
183
  switch (questionType) {
133
184
  case 'how_many': {
134
- // Contains a number or price
135
- if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user)/i.test(sentence)) {
185
+ // Contains a number or price or duration
186
+ if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user|minute|second|hour|degree|meter|mile|kg|lb)/i.test(sentence)) {
136
187
  boost += 0.3;
137
188
  }
138
189
  else if (/\b\d+\b/.test(sentence)) {
@@ -140,6 +191,18 @@ function computeBoost(sentence, questionType, isTopicSentence) {
140
191
  }
141
192
  break;
142
193
  }
194
+ // Fix #1: New 'how' (process/explanation) boost
195
+ case 'how': {
196
+ // Process/explanation sentences
197
+ if (/\b(by using|through|works by|in order to|step|first|then|next|finally|process|method|approach|technique|way to|can be done)\b/i.test(s)) {
198
+ boost += 0.4;
199
+ }
200
+ // Instructional patterns
201
+ if (/\b(install|run|execute|configure|set up|use|import|require|enable|disable|create|build|deploy)\b/i.test(s)) {
202
+ boost += 0.2;
203
+ }
204
+ break;
205
+ }
143
206
  case 'when': {
144
207
  // Contains a date
145
208
  if (/\b(january|february|march|april|may|june|july|august|september|october|november|december|\d{4}|\d+\s*(days?|weeks?|months?|years?))\b/i.test(sentence)) {
@@ -151,10 +214,21 @@ function computeBoost(sentence, questionType, isTopicSentence) {
151
214
  }
152
215
  break;
153
216
  }
217
+ // Fix #4: Use more specific location indicators
154
218
  case 'where': {
155
- // Contains a location hint (capitalized proper noun)
156
- if (/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/.test(sentence) && !/^(The|A|An|In|On|At|For)\b/.test(sentence)) {
157
- boost += 0.3;
219
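+ // Primary location signal — strong indicator: a "located/headquartered/based/founded/established in|at" phrase, OR "in/at" followed by a capitalized proper noun, OR a generic location noun (city, country, office, address, ...)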
220
+ if (/\b(located|headquartered|based|founded|established)\s+(in|at)\b/i.test(s) ||
221
+ /\b(?:in|at)\s+(?:the\s+)?[A-Z][a-z]+(?:(?:\s+[A-Z][a-z]+)*|(?:,\s+[A-Z][a-z]+)*)\b/.test(sentence) ||
222
+ /\b(city|country|state|region|continent|capital|office|campus|location|address)\b/i.test(s)) {
223
+ boost += 0.6;
224
+ }
225
+ // Specific geographic indicators including country names
226
+ if (/\b(street|avenue|boulevard|road|highway|route|district|province|county|netherlands|amsterdam|berlin|london|paris|tokyo|beijing|moscow|france|germany|japan|china|india|canada|australia|san francisco|new york|los angeles|seattle|chicago|boston|austin|miami)\b/i.test(s)) {
227
+ boost += 0.4;
228
+ }
229
+ // Birth/origin patterns
230
+ if (/\b(born|raised|grew up|native|hometown|birthplace|originally from)\b/i.test(s)) {
231
+ boost += 0.4;
158
232
  }
159
233
  break;
160
234
  }
@@ -170,13 +244,17 @@ function computeBoost(sentence, questionType, isTopicSentence) {
170
244
  if (/\b(because|due to|reason|therefore|since|as a result|consequently|thus)\b/.test(s)) {
171
245
  boost += 0.4;
172
246
  }
247
+ // Purpose/goal sentences ("as a successor to", "in order to", "to allow", "to provide")
248
+ if (/\b(as a successor|successor to|in order to|so that|to allow|to provide|to enable|to support|to replace|to improve|to address|to solve)\b/i.test(s)) {
249
+ boost += 0.4;
250
+ }
173
251
  break;
174
252
  }
175
253
  case 'who': {
176
254
  // Pattern: "[topic] was created/designed/developed by [Person]"
177
255
  // Or: "[Person] created/designed/developed [topic]"
178
- if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived)\s+by\b/i.test(s) ||
179
- /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced)\b/.test(sentence)) {
256
+ if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived|released|launched|established)\s+(?:\w+\s+){0,4}by\b/i.test(s) ||
257
+ /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced|conceived|began)\b/.test(sentence)) {
180
258
  boost += 0.5;
181
259
  }
182
260
  // Also boost if contains person names (capitalized words that aren't sentence starters)
@@ -190,20 +268,62 @@ function computeBoost(sentence, questionType, isTopicSentence) {
190
268
  }
191
269
  break;
192
270
  }
271
+ // Fix #11: Yes/no question boost
272
+ case 'yes_no': {
273
+ if (/\b(yes|no|not|does not|doesn't|cannot|can't|isn't|aren't|won't|supports?|enables?|allows?|provides?|includes?)\b/i.test(s)) {
274
+ boost += 0.3;
275
+ }
276
+ break;
277
+ }
193
278
  }
194
279
  return boost;
195
280
  }
196
- function tryDirectExtraction(content, questionType, topicTerms, _question) {
281
+ // Fix #9: Remove unused `_question` parameter
282
+ // NOTE: topicTerms must be RAW (unstemmed) for correct regex pattern building
283
+ function tryDirectExtraction(content, questionType, topicTerms) {
197
284
  if (topicTerms.length === 0)
198
285
  return null;
199
286
  // Build a regex pattern that matches any topic term (case-insensitive)
200
287
  const topicPattern = topicTerms.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
288
+ // --- Tiered 'who' infobox extraction ---
289
+ // Wikipedia infobox entries appear as list items like:
290
+ // "- Founders · Sam AltmanElon Musk..."
291
+ // We search for the field pattern directly (no topic prefix required) since
292
+ // "Founders ·" is specific enough to avoid false positives.
293
+ // Split into two tiers: creator fields (always try first) vs. developer/maintainer fields
294
+ // (skip for creation questions so we don't return "The Rust Team" for "Who created Rust?")
295
+ if (questionType === 'who') {
296
+ // Detect if question is about creation/origin.
297
+ // These are stem prefixes (e.g. "creat" from "created"), so use leading \b only —
298
+ // no trailing \b, since the stem appears INSIDE the full word.
299
+ const isCreationQuestion = /\b(?:creat|built|invent|found|design|start|conceiv|originat|develop|made|wrote|began)\w*/i.test(topicTerms.join(' '));
300
+ // Tier 1: Original creator fields (always try first) — search directly without topic prefix
301
+ const creatorFields = /(?:Original\s+author|Creator|Inventor|Designed\s+by|Created\s+by|Founded\s+by|Founders)\s*[·:]\s*(.+)/i;
302
+ const creatorMatch = content.match(creatorFields);
303
+ if (creatorMatch?.[1]) {
304
+ const value = creatorMatch[1].split('\n')[0].trim().slice(0, 300);
305
+ if (value.length > 2) {
306
+ return { text: value, context: creatorMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
307
+ }
308
+ }
309
+ // Tier 2: General developer fields (skip for creation questions — let BM25 find the original creator)
310
+ if (!isCreationQuestion) {
311
+ const devFields = /(?:Developers|Developer|Maintainer|Author)\s*[·:]\s*(.+)/i;
312
+ const devMatch = content.match(devFields);
313
+ if (devMatch?.[1]) {
314
+ const value = devMatch[1].split('\n')[0].trim().slice(0, 300);
315
+ if (value.length > 2) {
316
+ return { text: value, context: devMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
317
+ }
318
+ }
319
+ }
320
+ }
201
321
  // --- Infobox patterns (Wikipedia-style: "Topic: Field · Value") ---
202
322
  // Note: Wikipedia uses \u00A0 (NBSP) in infobox fields, so we use \\s+ (which matches NBSP) instead of literal spaces
203
323
  const infoboxPatterns = [
204
- { type: ['who'], field: new RegExp(`(?:${topicPattern}).*?(?:Designed\\s+by|Created\\s+by|Developed\\s+by|Founded\\s+by|Original\\s+author|Developers|Developer|Maintainer|Author|Inventor|Creator)\\s*[·:]\\s*(.+)`, 'i') },
205
324
  { type: ['when'], field: new RegExp(`(?:${topicPattern}).*?(?:First\\s+appeared|Released|Founded|Established|Created|Launch\\s+date|Initial\\s+release)\\s*[·:]\\s*(.+)`, 'i') },
206
325
  { type: ['what'], field: new RegExp(`(?:${topicPattern}).*?(?:Type|Genre|Category|Classification)\\s*[·:]\\s*(.+)`, 'i') },
326
+ { type: ['where'], field: /(?:Headquarters|Headquartered|Location|Address|HQ|Head\s+office|Based\s+in)\s*[·:]\s*(.+)/i },
207
327
  ];
208
328
  for (const pat of infoboxPatterns) {
209
329
  if (!pat.type.includes(questionType))
@@ -225,7 +345,7 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
225
345
  // "developed/designed/created by [Name]" in first 20% of content
226
346
  const first20 = content.slice(0, Math.max(500, Math.floor(content.length * 0.2)));
227
347
  // Use case-insensitive for verbs, but validate name casing separately
228
- const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined)\s+by\s+(\S+(?:\s+\S+){0,3})/i;
348
+ const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined|conceived|released|started|launched|begun|proposed|established)\s+(?:\w+\s+){0,4}by\s+(\S+(?:\s+\S+){0,3})/i;
229
349
  const byMatch = first20.match(byPattern);
230
350
  if (byMatch?.[1]) {
231
351
  const candidateName = byMatch[1].trim();
@@ -249,7 +369,10 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
249
369
  if (questionType === 'when') {
250
370
  // Look for a date near topic terms in first 30% of content
251
371
  const first30 = content.slice(0, Math.max(600, Math.floor(content.length * 0.3)));
252
- const datePattern = /(?:released|launched|first appeared|founded|established|created|introduced|began|started)\s+(?:in|on)?\s*(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i;
372
+ // Note: "began"/"started" are intentionally excluded — they can match
373
+ // construction/start events that don't answer the specific question
374
+ // (e.g. "When did X fall?" should NOT match "began on Aug 13, 1961").
375
+ const datePattern = /(?:released|launched|first appeared|founded|established|created|introduced|conceived|opened|invented)\s+(?:\w+\s+){0,2}(?:in|on)\s+(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i;
253
376
  const dateMatch = first30.match(datePattern);
254
377
  if (dateMatch) {
255
378
  const idx = first30.indexOf(dateMatch[0]);
@@ -266,6 +389,51 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
266
389
  return null;
267
390
  }
268
391
  // ---------------------------------------------------------------------------
392
+ // Entity extraction — for who/when questions answered by BM25
393
+ // ---------------------------------------------------------------------------
394
+ /**
395
+ * Try to extract a specific entity (person name, date) from a BM25-selected passage.
396
+ * Returns the entity string if found, or null.
397
+ */
398
+ function extractEntity(passage, questionType) {
399
+ if (questionType === 'who') {
400
+ // Try: "by [Name Name]"
401
+ const byMatch = passage.match(/\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/);
402
+ if (byMatch)
403
+ return byMatch[1];
404
+ // Try: "[Name Name] created/founded/..."
405
+ const nameVerbMatch = passage.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\s+(?:created|founded|designed|developed|built|invented|authored|introduced)/);
406
+ if (nameVerbMatch)
407
+ return nameVerbMatch[1];
408
+ return null;
409
+ }
410
+ if (questionType === 'when') {
411
+ const dateMatch = passage.match(/\b(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})\b/);
412
+ if (dateMatch)
413
+ return dateMatch[1];
414
+ return null;
415
+ }
416
+ return null;
417
+ }
418
+ // ---------------------------------------------------------------------------
419
+ // Entity type check for confidence formula
420
+ // ---------------------------------------------------------------------------
421
+ function hasExpectedEntityType(text, questionType) {
422
+ switch (questionType) {
423
+ case 'who':
424
+ return /[A-Z][a-z]+\s+[A-Z][a-z]+/.test(text);
425
+ case 'when':
426
+ return /\b\d{4}\b|\b(january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(text);
427
+ case 'how_many':
428
+ case 'how_much':
429
+ return /\b\d+\b/.test(text);
430
+ case 'where':
431
+ return /\b(in|at|near|located|based|headquarter)\b/i.test(text);
432
+ default:
433
+ return true;
434
+ }
435
+ }
436
+ // ---------------------------------------------------------------------------
269
437
  // Content cleaning — strip citation/reference noise before BM25 scoring
270
438
  // ---------------------------------------------------------------------------
271
439
  /**
@@ -275,6 +443,25 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
275
443
  */
276
444
  function cleanContentForQA(content) {
277
445
  let cleaned = content;
446
+ // Strip markdown formatting to get clean text for BM25 scoring
447
+ // Images: ![alt](url) → remove entirely
448
+ cleaned = cleaned.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
449
+ // Links: [text](url "title") → text (keep link text, remove URL and title)
450
+ cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
451
+ // Bold/italic: ***text***, **text**, *text* → text
452
+ cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
453
+ // Inline code: `text` → text
454
+ cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
455
+ // Heading markers: ## Heading → Heading
456
+ cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
457
+ // Horizontal rules
458
+ cleaned = cleaned.replace(/^---+$/gm, '');
459
+ // HTML entities
460
+ cleaned = cleaned.replace(/&amp;/g, '&');
461
+ cleaned = cleaned.replace(/&lt;/g, '<');
462
+ cleaned = cleaned.replace(/&gt;/g, '>');
463
+ cleaned = cleaned.replace(/&nbsp;/g, ' ');
464
+ cleaned = cleaned.replace(/&#\d+;/g, '');
278
465
  // Remove Wikipedia citation metadata (CS1_maint, Category:, etc.)
279
466
  cleaned = cleaned.replace(/CS1[_\s]\w+[:\s][^\n]*/gi, '');
280
467
  cleaned = cleaned.replace(/Category:[^\n]*/gi, '');
@@ -288,12 +475,11 @@ function cleanContentForQA(content) {
288
475
  cleaned = cleaned.replace(/\b(retrieved|archived from the original)\b[^\n]{0,100}/gi, '');
289
476
  // Remove "External links" and everything after (usually just URLs)
290
477
  cleaned = cleaned.replace(/^#{1,3}\s*External\s+links[\s\S]*$/im, '');
291
- // Remove section headers for reference-like sections
292
- // (but keep real content that happens to be after these headings)
478
+ // Fix #8: Remove entire "See also", "Notes", "Further reading" sections
479
+ // (heading + all content until the next heading)
480
+ cleaned = cleaned.replace(/^#{1,3}\s*(?:See\s+also|Notes|Further\s+reading)\s*\n(?:(?!^#{1,3}\s).*\n?)*/gim, '');
481
+ // Remove "References" heading only (keep nearby content that may be relevant)
293
482
  cleaned = cleaned.replace(/^#{1,3}\s*References\s*$/im, '');
294
- cleaned = cleaned.replace(/^#{1,3}\s*Further\s+reading\s*$/im, '');
295
- cleaned = cleaned.replace(/^#{1,3}\s*See\s+also\s*$/im, '');
296
- cleaned = cleaned.replace(/^#{1,3}\s*Notes\s*$/im, '');
297
483
  // Remove lines that are mostly citation-like (very short with lots of punctuation/numbers)
298
484
  cleaned = cleaned.split('\n').filter(line => {
299
485
  const trimmed = line.trim();
@@ -318,6 +504,30 @@ function cleanContentForQA(content) {
318
504
  // ---------------------------------------------------------------------------
319
505
  // Main quickAnswer function
320
506
  // ---------------------------------------------------------------------------
507
+ /**
508
+ * Answer a question about fetched page content using BM25 + heuristics.
509
+ *
510
+ * This is a fully offline, LLM-free approach. It:
511
+ * 1. Cleans the content (strips Wikipedia citations, reference noise, etc.)
512
+ * 2. Tries direct pattern extraction for structured content (infoboxes, definitions)
513
+ * 3. Falls back to BM25 sentence scoring with question-type-aware boosting
514
+ * 4. Uses sliding windows (1-3 sentences) to capture multi-sentence answers
515
+ * 5. Expands query terms with synonyms for broader matching
516
+ * 6. Returns the top passages with scores and surrounding context
517
+ *
518
+ * @param options - Question, content, and optional tuning parameters
519
+ * @returns A result object with answer text, confidence score, and ranked passages
520
+ *
521
+ * @example
522
+ * ```ts
523
+ * const result = quickAnswer({
524
+ * question: 'What is the pricing?',
525
+ * content: pageMarkdown,
526
+ * url: 'https://example.com/pricing',
527
+ * });
528
+ * console.log(result.answer, result.confidence);
529
+ * ```
530
+ */
321
531
  export function quickAnswer(options) {
322
532
  const { question, content, maxPassages = 3, maxChars = 2000, url = '', } = options;
323
533
  const emptyResult = {
@@ -334,12 +544,23 @@ export function quickAnswer(options) {
334
544
  return emptyResult;
335
545
  // Clean content to remove citation/reference noise before BM25 scoring
336
546
  const cleanedContent = cleanContentForQA(content);
547
+ // For very long content, focus on the most relevant portion.
548
+ // Wikipedia article tails contain references, tangential details, and noise.
549
+ const MAX_QA_CHARS = 20000;
550
+ let qaContent = cleanedContent;
551
+ if (qaContent.length > MAX_QA_CHARS) {
552
+ // Keep the first 70% — definitions, key facts, and main content
553
+ // are almost always in the first 2/3 of the article
554
+ qaContent = qaContent.slice(0, Math.floor(qaContent.length * 0.7));
555
+ }
337
556
  // Step 0: Direct pattern extraction — try to find structured answers before BM25
338
557
  // This catches infobox patterns (e.g. "TypeScript: Designed by · Anders Hejlsberg")
339
558
  // and definition sentences (e.g. "TypeScript is ... developed by Microsoft")
340
559
  const questionType = detectQuestionType(question);
341
- const topicTerms = tokenizeQuestion(question);
342
- const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTerms, question);
560
+ // RAW (unstemmed) topic terms for tryDirectExtraction regex patterns
561
+ const topicTermsRaw = tokenizeRaw(question).filter(t => !STOPWORDS.has(t));
562
+ // Fix #9: Remove the unused `question` argument from the call site
563
+ const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTermsRaw);
343
564
  if (directAnswer) {
344
565
  return {
345
566
  question,
@@ -350,98 +571,245 @@ export function quickAnswer(options) {
350
571
  method: 'bm25',
351
572
  };
352
573
  }
353
- // Step 1: Split into sentences
354
- const sentences = splitIntoSentences(cleanedContent);
574
+ // Step 1: Split into sentences (use qaContent — truncated for long articles)
575
+ const sentences = splitIntoSentences(qaContent);
355
576
  if (sentences.length === 0)
356
577
  return emptyResult;
357
- // Step 2: Tokenize question (remove stopwords)
578
+ // Step 2: Tokenize question (remove stopwords, then stem)
358
579
  const queryTerms = tokenizeQuestion(question);
359
580
  if (queryTerms.length === 0) {
360
- // Fall back to all tokens if all were stopwords
361
- queryTerms.push(...tokenize(question));
581
+ // Fall back to all stemmed tokens if all were stopwords
582
+ const fallback = tokenize(question);
583
+ if (fallback.length === 0)
584
+ return emptyResult;
585
+ queryTerms.push(...fallback);
362
586
  }
363
- if (queryTerms.length === 0)
364
- return emptyResult;
365
- // Step 3: Score sentences with BM25 (questionType already computed in Step 0)
366
- const blocks = sentences.map((s, index) => ({ raw: s.text, index }));
367
- const bm25Scores = scoreBM25(blocks, queryTerms);
368
- // Step 4: Compute max possible score for normalization
369
- // (the sentence with the highest BM25 score against itself as a reference)
587
+ // Expand query with synonyms for broader matching
588
+ const expanded = expandWithSynonyms(queryTerms);
589
+ // Use all expanded terms for BM25 (IDF naturally downweights common synonyms)
590
+ const uniqueQueryTerms = [...new Set(expanded.map(e => e.term))];
591
+ // Step 3: Create stemmed scoring blocks for each sentence.
592
+ // We pass stemmed text to scoreBM25 so that its internal tokenizer gets stemmed tokens,
593
+ // matching the stemmed queryTerms. The original sentence text is preserved for display.
594
+ const scoringBlocks = sentences.map((s, index) => ({
595
+ raw: tokenize(s.text).join(' '), // pre-stemmed text for BM25 scoring
596
+ index,
597
+ }));
598
+ // ---------------------------------------------------------------------------
599
+ // Step 3.5: Lightweight topic propagation (coreference approximation)
600
+ // ---------------------------------------------------------------------------
601
+ // When a sentence uses a referent phrase like "The platform" or "The company"
602
+ // instead of the topic entity name, BM25 can't match it. We inject stemmed
603
+ // topic terms into scoring blocks of nearby referent sentences so BM25 has
604
+ // something to work with.
605
+ //
606
+ // Only active for question types where coreference resolution helps:
607
+ // where, who, when — NOT for what/how/yes_no/how_many (no entity tracking needed).
608
+ //
609
+ // Heuristic: A sentence gets topic injection if:
610
+ // 1. It contains a common referent pattern (the platform/company/service/etc.)
611
+ // 2. It is within PROXIMITY_WINDOW sentences of a sentence containing the topic
612
+ // 3. OR the content has fewer than SMALL_CONTENT_THRESHOLD sentences AND
613
+ // the topic is actually mentioned somewhere in the content (topicSentenceIndices non-empty)
614
+ if (questionType === 'where' || questionType === 'who' || questionType === 'when') {
615
+ const REFERENT_PATTERNS = /\b(?:the\s+)?(?:platform|company|service|product|tool|application|system|framework|library|project|organization|software|language|program|site|website|app|api|sdk|package|module|engine|firm|startup|corporation)\b|^(?:It|They|He|She)\s/im;
616
+ const PROXIMITY_WINDOW = 5;
617
+ const SMALL_CONTENT_THRESHOLD = 15;
618
+ // Find which sentences contain at least one topic term
619
+ const topicSentenceIndices = new Set();
620
+ for (let i = 0; i < sentences.length; i++) {
621
+ const stemmedSentence = scoringBlocks[i].raw;
622
+ if (queryTerms.some(t => stemmedSentence.includes(t))) {
623
+ topicSentenceIndices.add(i);
624
+ }
625
+ }
626
+ // Only inject if the topic is actually mentioned somewhere (non-empty topicSentenceIndices)
627
+ if (topicSentenceIndices.size > 0) {
628
+ // Inject topic terms into referent sentences that are near topic sentences
629
+ const topicInjection = ' ' + queryTerms.join(' ');
630
+ for (let i = 0; i < sentences.length; i++) {
631
+ if (topicSentenceIndices.has(i))
632
+ continue; // already has topic terms
633
+ const hasReferent = REFERENT_PATTERNS.test(sentences[i].text);
634
+ if (!hasReferent)
635
+ continue;
636
+ // Check proximity: is this sentence within PROXIMITY_WINDOW of a topic sentence?
637
+ const isNearTopic = sentences.length < SMALL_CONTENT_THRESHOLD ||
638
+ [...topicSentenceIndices].some(j => Math.abs(i - j) <= PROXIMITY_WINDOW);
639
+ if (isNearTopic) {
640
+ scoringBlocks[i].raw += topicInjection;
641
+ }
642
+ }
643
+ }
644
+ }
645
+ // Step 4: Score sentences with BM25
646
+ const bm25Scores = scoreBM25(scoringBlocks, uniqueQueryTerms);
647
+ // Step 5: Compute max possible score for normalization
370
648
  const maxPossibleScore = Math.max(...bm25Scores, 0.001);
371
- // Step 5: Apply boosts (including position bias — intro sentences are more likely to answer factual questions)
649
+ // Step 6: Apply boosts (position bias, question type, definition patterns)
372
650
  const totalSentences = sentences.length;
373
651
  const sentenceScores = sentences.map((s, i) => {
374
- // A "topic sentence" is the first sentence in a paragraph/section
375
- // We detect this by checking if the previous character in the content is a newline
376
- const isTopicSentence = i === 0 || cleanedContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
652
+ const isTopicSentence = i === 0 || qaContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
377
653
  const base = bm25Scores[i];
378
654
  const boost = computeBoost(s.text, questionType, isTopicSentence);
379
- // Position bias: early sentences get a boost (answers to factual questions
380
- // are typically in the intro paragraph, especially on Wikipedia/docs).
381
- // Decays linearly: first 10% of sentences get full boost (0.4), drops to 0 by 50%.
655
+ // Fix #3: Position bias — reduce it for 'why' and 'how' questions (answers can be anywhere)
656
+ const maxPositionBoost = (questionType === 'why' || questionType === 'how') ? 0.15 : 0.4;
382
657
  const positionRatio = i / totalSentences;
383
- const positionBoost = positionRatio < 0.1 ? 0.4
384
- : positionRatio < 0.5 ? 0.4 * (1 - (positionRatio - 0.1) / 0.4)
658
+ // Fix position bias: scale by how many query terms THIS sentence matches.
659
+ // A sentence matching only 1/3 query terms (e.g., just "python") gets 1/3 of the
660
+ // position boost — prevents the first sentence from winning on position alone.
661
+ const sentTokens = tokenize(s.text);
662
+ const sentTermMatches = uniqueQueryTerms.filter(t => sentTokens.includes(t)).length;
663
+ const sentTermCoverage = uniqueQueryTerms.length > 0
664
+ ? sentTermMatches / Math.min(uniqueQueryTerms.length, 5)
665
+ : 0;
666
+ const rawPositionBoost = positionRatio < 0.1 ? maxPositionBoost
667
+ : positionRatio < 0.5 ? maxPositionBoost * (1 - (positionRatio - 0.1) / 0.4)
385
668
  : 0;
386
- // Definition sentences anywhere get a boost (covers "X is a Y" patterns)
669
+ const positionBoost = rawPositionBoost * sentTermCoverage;
670
+ // Fix #2: Only apply definitionBoost for 'what' and 'other' question types.
387
671
  const sl = s.text.toLowerCase();
388
- const definitionBoost = /\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
389
- const total = base + (boost + positionBoost + definitionBoost) * maxPossibleScore;
672
+ const definitionBoost = (questionType === 'what' || questionType === 'other') &&
673
+ /\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
674
+ // Extra boost for definition sentences very early in the content (for 'what' questions)
675
+ // This handles Wikipedia-style articles where the first sentence IS the answer
676
+ const earlyDefinitionBoost = (questionType === 'what' &&
677
+ positionRatio < 0.05 &&
678
+ /\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(sl)) ? 0.5 : 0;
679
+ const total = base + (boost + positionBoost + definitionBoost + earlyDefinitionBoost) * maxPossibleScore;
390
680
  return { text: s.text, index: i, score: total, base };
391
681
  });
392
- // Step 6: Sort by score and select top N
393
- const sorted = [...sentenceScores].sort((a, b) => b.score - a.score);
394
- const topN = Math.min(maxPassages, sorted.length);
395
- const topSentences = sorted.slice(0, topN);
396
- // Step 7: For each top sentence, collect context (surrounding sentences)
682
+ const windows = [];
683
+ // Single-sentence windows (preserve existing behavior)
684
+ for (let i = 0; i < sentences.length; i++) {
685
+ const score = sentenceScores[i].score;
686
+ const lengthPenalty = 0;
687
+ windows.push({
688
+ text: sentences[i].text,
689
+ indices: [i],
690
+ startSentenceIdx: i,
691
+ score: score * (1 - lengthPenalty),
692
+ });
693
+ }
694
+ // 2-sentence windows
695
+ for (let i = 0; i < sentences.length - 1; i++) {
696
+ const score = (sentenceScores[i].score + sentenceScores[i + 1].score) / 2;
697
+ const lengthPenalty = 0.05;
698
+ windows.push({
699
+ text: sentences[i].text + ' ' + sentences[i + 1].text,
700
+ indices: [i, i + 1],
701
+ startSentenceIdx: i,
702
+ score: score * (1 - lengthPenalty),
703
+ });
704
+ }
705
+ // 3-sentence windows (only when content has enough sentences)
706
+ if (sentences.length >= 5) {
707
+ for (let i = 0; i < sentences.length - 2; i++) {
708
+ const score = (sentenceScores[i].score + sentenceScores[i + 1].score + sentenceScores[i + 2].score) / 3;
709
+ const lengthPenalty = 0.10;
710
+ windows.push({
711
+ text: sentences[i].text + ' ' + sentences[i + 1].text + ' ' + sentences[i + 2].text,
712
+ indices: [i, i + 1, i + 2],
713
+ startSentenceIdx: i,
714
+ score: score * (1 - lengthPenalty),
715
+ });
716
+ }
717
+ }
718
+ // Step 8: Sort windows by score
719
+ const sortedWindows = [...windows].sort((a, b) => b.score - a.score);
720
+ // Step 9: Select top N non-overlapping windows
397
721
  const selectedPassages = [];
398
- const usedIndices = new Set();
399
- for (const entry of topSentences) {
400
- if (usedIndices.has(entry.index))
722
+ const usedSentenceIndices = new Set();
723
+ for (const win of sortedWindows) {
724
+ if (selectedPassages.length >= maxPassages)
725
+ break;
726
+ // Skip if any sentence in this window was already used
727
+ const hasOverlap = win.indices.some(i => usedSentenceIndices.has(i));
728
+ if (hasOverlap)
401
729
  continue;
402
- const i = entry.index;
730
+ // Mark all sentences in this window as used
731
+ for (const i of win.indices)
732
+ usedSentenceIndices.add(i);
733
+ // Build context: include sentence before the window and after
734
+ const firstIdx = win.indices[0];
735
+ const lastIdx = win.indices[win.indices.length - 1];
403
736
  const contextParts = [];
404
- // Include sentence before
405
- if (i > 0 && !usedIndices.has(i - 1)) {
406
- contextParts.push(sentences[i - 1].text);
737
+ if (firstIdx > 0 && !usedSentenceIndices.has(firstIdx - 1)) {
738
+ contextParts.push(sentences[firstIdx - 1].text);
407
739
  }
408
- // The sentence itself
409
- contextParts.push(entry.text);
410
- // Include sentence after
411
- if (i < sentences.length - 1 && !usedIndices.has(i + 1)) {
412
- contextParts.push(sentences[i + 1].text);
740
+ contextParts.push(win.text);
741
+ if (lastIdx < sentences.length - 1 && !usedSentenceIndices.has(lastIdx + 1)) {
742
+ contextParts.push(sentences[lastIdx + 1].text);
413
743
  }
414
- // Mark all context indices as used to avoid overlap
415
- if (i > 0)
416
- usedIndices.add(i - 1);
417
- usedIndices.add(i);
418
- if (i < sentences.length - 1)
419
- usedIndices.add(i + 1);
744
+ // Mark surrounding context sentences as used to avoid overlap
745
+ if (firstIdx > 0)
746
+ usedSentenceIndices.add(firstIdx - 1);
747
+ if (lastIdx < sentences.length - 1)
748
+ usedSentenceIndices.add(lastIdx + 1);
420
749
  const context = contextParts.join(' ');
421
750
  selectedPassages.push({
422
- text: entry.text,
423
- score: parseFloat((entry.score / (maxPossibleScore || 1)).toFixed(4)),
751
+ text: win.text,
752
+ score: Math.min(1, parseFloat((win.score / (maxPossibleScore || 1)).toFixed(4))),
424
753
  context,
754
+ startIdx: firstIdx,
755
+ indices: win.indices,
425
756
  });
426
757
  }
427
- // Step 8: Compute confidence from how much the top BM25 score stands out vs. the mean
428
- const topScore = sorted[0]?.score ?? 0;
429
- const topBase = sorted[0]?.base ?? 0;
758
+ // ---------------------------------------------------------------------------
759
+ // Step 10: Confidence computation — multi-signal formula
760
+ // ---------------------------------------------------------------------------
761
+ const topWindow = sortedWindows[0];
762
+ const topBase = topWindow ? Math.max(...topWindow.indices.map(i => sentenceScores[i].base)) : 0;
430
763
  const meanScore = bm25Scores.reduce((a, b) => a + b, 0) / bm25Scores.length;
764
+ // Signal 1: Score gap
431
765
  const scoreGap = maxPossibleScore > 0 ? (topBase - meanScore) / maxPossibleScore : 0;
432
- // 0.3 baseline (we found something), up to 1.0 if top answer dominates
433
- const rawConfidence = Math.min(1, Math.max(0, 0.3 + scoreGap * 0.7));
434
- // Penalty: if the top answer still looks like citation/metadata noise, reduce confidence
435
- const topAnswerText = sorted[0]?.text?.toLowerCase() || '';
766
+ // Signal 2: Term coverage — what % of query terms appear in top window
767
+ // Also count synonym-mediated matches (at 0.7 weight)
768
+ const topWindowTokens = tokenize(topWindow?.text || '');
769
+ const directMatches = queryTerms.filter(t => topWindowTokens.includes(t)).length;
770
+ const matchedTerms = queryTerms.filter(t => {
771
+ if (topWindowTokens.includes(t))
772
+ return true;
773
+ // Check if any synonym of this term appears in the top window
774
+ const synonymsForTerm = expandWithSynonyms([t]);
775
+ return synonymsForTerm.some(e => !e.isOriginal && topWindowTokens.includes(e.term));
776
+ });
777
+ const synonymMatches = matchedTerms.length - directMatches;
778
+ const effectiveCoverage = queryTerms.length > 0
779
+ ? (directMatches + synonymMatches * 0.7) / queryTerms.length
780
+ : 0;
781
+ // Signal 3: Position signal — early in document is more reliable for factual Qs
782
+ const positionSignal = (topWindow?.startSentenceIdx ?? 999) < sentences.length * 0.2 ? 0.1 : 0;
783
+ // Signal 4: Answer type match — does the answer look like it answers the question type?
784
+ const typeMatch = hasExpectedEntityType(topWindow?.text || '', questionType) ? 0.20 : 0;
785
+ const rawConfidence = Math.min(1, Math.max(0, 0.1 + // reduced baseline (was 0.2)
786
+ scoreGap * 0.35 +
787
+ effectiveCoverage * 0.25 + // synonym-aware term coverage (was 0.30)
788
+ positionSignal +
789
+ typeMatch));
790
+ // Penalty: noise/metadata in top answer reduces confidence
791
+ const topAnswerText = (topWindow?.text || '').toLowerCase();
436
792
  const noisePenalty = (/\bcs1[_\s]/i.test(topAnswerText) ||
437
793
  /\bcategory:/i.test(topAnswerText) ||
438
794
  /\b(archived|retrieved)\s+(from|on)\b/i.test(topAnswerText) ||
439
795
  /\b(isbn|issn|doi|arxiv|bibcode|pmid)\b/i.test(topAnswerText) ||
440
- // Line is mostly URLs
441
796
  (topAnswerText.match(/https?:\/\//g) || []).length > 2) ? 0.5 : 0;
442
- const confidence = Math.max(0, rawConfidence - noisePenalty);
443
- // Step 9: Build answer — best passage text, trimmed to maxChars
797
+ // Fix #13: Penalty for UI chrome / navigation elements
798
+ const uiChromePenalty = (/\b(sign in|sign up|log in|log out|subscribe|newsletter|cookie|privacy policy|terms of service)\b/i.test(topAnswerText) ||
799
+ /\b(skip to|main menu|navigation|sidebar|footer|header|breadcrumb)\b/i.test(topAnswerText)) ? 0.3 : 0;
800
+ const confidence = Math.max(0, rawConfidence - noisePenalty - uiChromePenalty);
801
+ // ---------------------------------------------------------------------------
802
+ // Step 11: Try entity extraction for who/when questions (BM25 fallback)
803
+ // ---------------------------------------------------------------------------
444
804
  let answerText = selectedPassages[0]?.context || selectedPassages[0]?.text || '';
805
+ // For who/when, try to surface a concise entity from the top passage
806
+ if ((questionType === 'who' || questionType === 'when') && selectedPassages[0]) {
807
+ const entity = extractEntity(selectedPassages[0].text, questionType);
808
+ if (entity && selectedPassages[0].text.includes(entity)) {
809
+ // Keep full passage text as answer (it contains the entity)
810
+ answerText = selectedPassages[0].text;
811
+ }
812
+ }
445
813
  if (answerText.length > maxChars) {
446
814
  answerText = answerText.slice(0, maxChars).replace(/\s+\S*$/, '') + '…';
447
815
  }
@@ -452,9 +820,8 @@ export function quickAnswer(options) {
452
820
  ? p.context.slice(0, Math.max(0, maxChars - totalChars)).replace(/\s+\S*$/, '') + '…'
453
821
  : p.context;
454
822
  totalChars += contextTrimmed.length;
455
- return { ...p, context: contextTrimmed };
823
+ return { text: p.text, score: p.score, context: contextTrimmed };
456
824
  });
457
- void topScore; // consumed via sorted[0]
458
825
  return {
459
826
  question,
460
827
  answer: answerText,