npm - webpeel - Versions diffs - 0.15.2 → 0.16.1 - Mend

webpeel 0.15.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80)

package/README.md +2 -2
package/dist/cli-auth.d.ts.map +1 -1
package/dist/cli-auth.js +5 -0
package/dist/cli-auth.js.map +1 -1
package/dist/cli.js +43 -11
package/dist/cli.js.map +1 -1
package/dist/core/crawler.d.ts +2 -0
package/dist/core/crawler.d.ts.map +1 -1
package/dist/core/crawler.js +12 -3
package/dist/core/crawler.js.map +1 -1
package/dist/core/pipeline.d.ts +1 -0
package/dist/core/pipeline.d.ts.map +1 -1
package/dist/core/pipeline.js +63 -2
package/dist/core/pipeline.js.map +1 -1
package/dist/core/quick-answer.d.ts +26 -0
package/dist/core/quick-answer.d.ts.map +1 -1
package/dist/core/quick-answer.js +451 -84
package/dist/core/quick-answer.js.map +1 -1
package/dist/core/search-provider.d.ts +47 -4
package/dist/core/search-provider.d.ts.map +1 -1
package/dist/core/search-provider.js +278 -7
package/dist/core/search-provider.js.map +1 -1
package/dist/core/stemmer.d.ts +39 -0
package/dist/core/stemmer.d.ts.map +1 -0
package/dist/core/stemmer.js +510 -0
package/dist/core/stemmer.js.map +1 -0
package/dist/core/synonyms.d.ts +43 -0
package/dist/core/synonyms.d.ts.map +1 -0
package/dist/core/synonyms.js +185 -0
package/dist/core/synonyms.js.map +1 -0
package/dist/mcp/server.js +109 -4
package/dist/mcp/server.js.map +1 -1
package/dist/server/app.d.ts +1 -0
package/dist/server/app.d.ts.map +1 -1
package/dist/server/app.js +76 -10
package/dist/server/app.js.map +1 -1
package/dist/server/middleware/auth.d.ts +2 -1
package/dist/server/middleware/auth.d.ts.map +1 -1
package/dist/server/middleware/auth.js +25 -12
package/dist/server/middleware/auth.js.map +1 -1
package/dist/server/middleware/rate-limit.d.ts +1 -0
package/dist/server/middleware/rate-limit.d.ts.map +1 -1
package/dist/server/middleware/rate-limit.js +20 -11
package/dist/server/middleware/rate-limit.js.map +1 -1
package/dist/server/routes/agent.d.ts +4 -0
package/dist/server/routes/agent.d.ts.map +1 -1
package/dist/server/routes/agent.js +196 -9
package/dist/server/routes/agent.js.map +1 -1
package/dist/server/routes/batch.d.ts.map +1 -1
package/dist/server/routes/batch.js +126 -1
package/dist/server/routes/batch.js.map +1 -1
package/dist/server/routes/fetch.d.ts +1 -0
package/dist/server/routes/fetch.d.ts.map +1 -1
package/dist/server/routes/fetch.js +193 -55
package/dist/server/routes/fetch.js.map +1 -1
package/dist/server/routes/jobs.d.ts.map +1 -1
package/dist/server/routes/jobs.js +115 -2
package/dist/server/routes/jobs.js.map +1 -1
package/dist/server/routes/mcp.d.ts +1 -0
package/dist/server/routes/mcp.d.ts.map +1 -1
package/dist/server/routes/mcp.js +113 -6
package/dist/server/routes/mcp.js.map +1 -1
package/dist/server/routes/search.js +1 -1
package/dist/server/routes/search.js.map +1 -1
package/dist/server/types.d.ts +16 -0
package/dist/server/types.d.ts.map +1 -0
package/dist/server/types.js +8 -0
package/dist/server/types.js.map +1 -0
package/dist/server/utils/response.d.ts +45 -0
package/dist/server/utils/response.d.ts.map +1 -0
package/dist/server/utils/response.js +70 -0
package/dist/server/utils/response.js.map +1 -0
package/dist/server/utils/sse.d.ts +23 -0
package/dist/server/utils/sse.d.ts.map +1 -0
package/dist/server/utils/sse.js +39 -0
package/dist/server/utils/sse.js.map +1 -0
package/dist/types.d.ts +2 -0
package/dist/types.d.ts.map +1 -1
package/dist/types.js.map +1 -1
package/package.json +1 -1

package/dist/core/quick-answer.js CHANGED

@@ -4,8 +4,12 @@
  * Answers a question about page content without any API key.
  * Uses BM25 relevance scoring + answer-signal boosting to surface
  * the most relevant sentences.
+ *
+ * v2: Added Porter stemming, synonym expansion, and sliding window scoring.
  */
 import { scoreBM25 } from './bm25-filter.js';
+import { stem } from './stemmer.js';
+import { expandWithSynonyms } from './synonyms.js';
 // ---------------------------------------------------------------------------
 // Stopwords — removed from question before BM25 scoring
 // ---------------------------------------------------------------------------
@@ -21,8 +25,12 @@ const STOPWORDS = new Set([
 ]);
 function detectQuestionType(question) {
     const q = question.toLowerCase().trim();
-    if (/how\s+many|how\s+much|what\s+price|what\s+cost|pricing/.test(q))
+    // Fix #1: Distinguish "how many/much/long" (quantity/duration) from "how do/does/can/to/is" (process/explanation)
+    if (/how\s+many|how\s+much|how\s+long|what\s+price|what\s+cost|pricing/.test(q))
         return 'how_many';
+    // Fix #11: Yes/no questions (starts with auxiliary verb)
+    if (/^(is|does|can|will|are|has|do|did|was|were|could|should|would)\b/i.test(q))
+        return 'yes_no';
     if (/when\b/.test(q))
         return 'when';
     if (/where\b/.test(q))
@@ -36,14 +44,33 @@ function detectQuestionType(question) {
         return 'who';
     if (/what\b/.test(q))
         return 'what';
+    // Fix #1: "how do/does/can/to/is" → 'how' (process/explanation), bare 'how' → 'how' (not 'how_many')
+    if (/how\s+(?:do|does|can|to|is|are|was|were|will|would|could|should)\b/.test(q))
+        return 'how';
     if (/how\b/.test(q))
-        return 'how_many';
+        return 'how';
     return 'other';
 }
 // ---------------------------------------------------------------------------
 // Tokenization
 // ---------------------------------------------------------------------------
+/**
+ * Tokenize and stem text. Used for BM25 scoring — both query and content
+ * go through the same stemming pipeline so "limitations" matches "limit".
+ */
 function tokenize(text) {
+    return text
+        .toLowerCase()
+        .replace(/[^\w\s]/g, ' ')
+        .split(/\s+/)
+        .filter(t => t.length > 1)
+        .map(t => stem(t));
+}
+/**
+ * Tokenize WITHOUT stemming. Used for regex pattern building in
+ * tryDirectExtraction so that exact text patterns still match.
+ */
+function tokenizeRaw(text) {
     return text
         .toLowerCase()
         .replace(/[^\w\s]/g, ' ')
@@ -51,7 +78,10 @@ function tokenize(text) {
         .filter(t => t.length > 1);
 }
 function tokenizeQuestion(question) {
-    return tokenize(question).filter(t => !STOPWORDS.has(t));
+    // Filter stopwords on raw tokens (before stemming), then stem
+    return tokenizeRaw(question)
+        .filter(t => !STOPWORDS.has(t))
+        .map(t => stem(t));
 }
 // ---------------------------------------------------------------------------
 // Sentence splitting
@@ -59,6 +89,7 @@ function tokenizeQuestion(question) {
 /**
  * Split text into sentences. Handles common abbreviations to avoid false splits.
  * Returns an array of sentences with their start position (index in original text).
+ * Also extracts list items (markdown bullets/numbers) as pseudo-sentences.
  */
 function splitIntoSentences(content) {
     // Strip markdown formatting while preserving positions is complex;
@@ -81,6 +112,13 @@ function splitIntoSentences(content) {
         PLACEHOLDER_MAP.set(ph, m);
         return ph;
     });
+    // Protect version numbers with multiple dots (e.g., 0.9.0, 1.2.3, 3.11.4)
+    // Must run BEFORE the decimal number protection to avoid partial replacement
+    protected_ = protected_.replace(/\b(\d+\.\d+(?:\.\d+)+)/g, (m) => {
+        const ph = `\x00VER${placeholderIdx++}\x00`;
+        PLACEHOLDER_MAP.set(ph, m);
+        return ph;
+    });
     // Protect decimal numbers (e.g., 3.14, $29.99)
     protected_ = protected_.replace(/\b(\d+)\.(\d+)/g, (_m, a, b) => {
         const ph = `\x00NUM${placeholderIdx++}\x00`;
@@ -114,10 +152,23 @@ function splitIntoSentences(content) {
             sentences.push({ text: remaining, start: lastEnd });
         }
     }
-    // Filter: keep sentences between 10 and 500 chars
+    // Fix #12: Also extract list items (markdown bullets/numbers) as "sentences"
+    const listPattern = /^[\s]*[-*+]\s+(.+)$/gm;
+    let listMatch;
+    while ((listMatch = listPattern.exec(content)) !== null) {
+        const item = listMatch[1].trim();
+        if (item.length >= 10 && item.length <= 800) {
+            // Only add if not already captured by sentence splitting
+            const isDuplicate = sentences.some(s => s.text.includes(item) || item.includes(s.text));
+            if (!isDuplicate) {
+                sentences.push({ text: item, start: listMatch.index });
+            }
+        }
+    }
+    // Fix #7: Increase max sentence length from 500 to 800 chars
     return sentences.filter(s => {
         const len = s.text.length;
-        return len >= 10 && len <= 500;
+        return len >= 10 && len <= 800;
     });
 }
 // ---------------------------------------------------------------------------
@@ -131,8 +182,8 @@ function computeBoost(sentence, questionType, isTopicSentence) {
     }
     switch (questionType) {
         case 'how_many': {
-            // Contains a number or price
-            if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user)/i.test(sentence)) {
+            // Contains a number or price or duration
+            if (/\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user|minute|second|hour|degree|meter|mile|kg|lb)/i.test(sentence)) {
                 boost += 0.3;
             }
             else if (/\b\d+\b/.test(sentence)) {
@@ -140,6 +191,18 @@ function computeBoost(sentence, questionType, isTopicSentence) {
             }
             break;
         }
+        // Fix #1: New 'how' (process/explanation) boost
+        case 'how': {
+            // Process/explanation sentences
+            if (/\b(by using|through|works by|in order to|step|first|then|next|finally|process|method|approach|technique|way to|can be done)\b/i.test(s)) {
+                boost += 0.4;
+            }
+            // Instructional patterns
+            if (/\b(install|run|execute|configure|set up|use|import|require|enable|disable|create|build|deploy)\b/i.test(s)) {
+                boost += 0.2;
+            }
+            break;
+        }
         case 'when': {
             // Contains a date
             if (/\b(january|february|march|april|may|june|july|august|september|october|november|december|\d{4}|\d+\s*(days?|weeks?|months?|years?))\b/i.test(sentence)) {
@@ -151,10 +214,21 @@ function computeBoost(sentence, questionType, isTopicSentence) {
             }
             break;
         }
+        // Fix #4: Use more specific location indicators
         case 'where': {
-            // Contains a location hint (capitalized proper noun)
-            if (/\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b/.test(sentence) && !/^(The|A|An|In|On|At|For)\b/.test(sentence)) {
-                boost += 0.3;
+            // Primary location signal — strong indicator (located/headquartered/based in + geographic proper noun)
+            if (/\b(located|headquartered|based|founded|established)\s+(in|at)\b/i.test(s) ||
+                /\b(?:in|at)\s+(?:the\s+)?[A-Z][a-z]+(?:(?:\s+[A-Z][a-z]+)*|(?:,\s+[A-Z][a-z]+)*)\b/.test(sentence) ||
+                /\b(city|country|state|region|continent|capital|office|campus|location|address)\b/i.test(s)) {
+                boost += 0.6;
+            }
+            // Specific geographic indicators including country names
+            if (/\b(street|avenue|boulevard|road|highway|route|district|province|county|netherlands|amsterdam|berlin|london|paris|tokyo|beijing|moscow|france|germany|japan|china|india|canada|australia|san francisco|new york|los angeles|seattle|chicago|boston|austin|miami)\b/i.test(s)) {
+                boost += 0.4;
+            }
+            // Birth/origin patterns
+            if (/\b(born|raised|grew up|native|hometown|birthplace|originally from)\b/i.test(s)) {
+                boost += 0.4;
             }
             break;
         }
@@ -170,13 +244,17 @@ function computeBoost(sentence, questionType, isTopicSentence) {
             if (/\b(because|due to|reason|therefore|since|as a result|consequently|thus)\b/.test(s)) {
                 boost += 0.4;
             }
+            // Purpose/goal sentences ("as a successor to", "in order to", "to allow", "to provide")
+            if (/\b(as a successor|successor to|in order to|so that|to allow|to provide|to enable|to support|to replace|to improve|to address|to solve)\b/i.test(s)) {
+                boost += 0.4;
+            }
             break;
         }
         case 'who': {
             // Pattern: "[topic] was created/designed/developed by [Person]"
             // Or: "[Person] created/designed/developed [topic]"
-            if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived)\s+by\b/i.test(s) ||
-                /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced)\b/.test(sentence)) {
+            if (/\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived|released|launched|established)\s+(?:\w+\s+){0,4}by\b/i.test(s) ||
+                /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced|conceived|began)\b/.test(sentence)) {
                 boost += 0.5;
             }
             // Also boost if contains person names (capitalized words that aren't sentence starters)
@@ -190,20 +268,62 @@ function computeBoost(sentence, questionType, isTopicSentence) {
             }
             break;
         }
+        // Fix #11: Yes/no question boost
+        case 'yes_no': {
+            if (/\b(yes|no|not|does not|doesn't|cannot|can't|isn't|aren't|won't|supports?|enables?|allows?|provides?|includes?)\b/i.test(s)) {
+                boost += 0.3;
+            }
+            break;
+        }
     }
     return boost;
 }
-function tryDirectExtraction(content, questionType, topicTerms, _question) {
+// Fix #9: Remove unused `_question` parameter
+// NOTE: topicTerms must be RAW (unstemmed) for correct regex pattern building
+function tryDirectExtraction(content, questionType, topicTerms) {
     if (topicTerms.length === 0)
         return null;
     // Build a regex pattern that matches any topic term (case-insensitive)
     const topicPattern = topicTerms.map(t => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
+    // --- Tiered 'who' infobox extraction ---
+    // Wikipedia infobox entries appear as list items like:
+    //   "-   Founders · Sam AltmanElon Musk..."
+    // We search for the field pattern directly (no topic prefix required) since
+    // "Founders ·" is specific enough to avoid false positives.
+    // Split into two tiers: creator fields (always try first) vs. developer/maintainer fields
+    // (skip for creation questions so we don't return "The Rust Team" for "Who created Rust?")
+    if (questionType === 'who') {
+        // Detect if question is about creation/origin.
+        // These are stem prefixes (e.g. "creat" from "created"), so use leading \b only —
+        // no trailing \b, since the stem appears INSIDE the full word.
+        const isCreationQuestion = /\b(?:creat|built|invent|found|design|start|conceiv|originat|develop|made|wrote|began)\w*/i.test(topicTerms.join(' '));
+        // Tier 1: Original creator fields (always try first) — search directly without topic prefix
+        const creatorFields = /(?:Original\s+author|Creator|Inventor|Designed\s+by|Created\s+by|Founded\s+by|Founders)\s*[·:]\s*(.+)/i;
+        const creatorMatch = content.match(creatorFields);
+        if (creatorMatch?.[1]) {
+            const value = creatorMatch[1].split('\n')[0].trim().slice(0, 300);
+            if (value.length > 2) {
+                return { text: value, context: creatorMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
+            }
+        }
+        // Tier 2: General developer fields (skip for creation questions — let BM25 find the original creator)
+        if (!isCreationQuestion) {
+            const devFields = /(?:Developers|Developer|Maintainer|Author)\s*[·:]\s*(.+)/i;
+            const devMatch = content.match(devFields);
+            if (devMatch?.[1]) {
+                const value = devMatch[1].split('\n')[0].trim().slice(0, 300);
+                if (value.length > 2) {
+                    return { text: value, context: devMatch[0].split('\n')[0].trim().slice(0, 500), confidence: 0.92 };
+                }
+            }
+        }
+    }
     // --- Infobox patterns (Wikipedia-style: "Topic: Field · Value") ---
     // Note: Wikipedia uses \u00A0 (NBSP) in infobox fields, so we use \\s+ (which matches NBSP) instead of literal spaces
     const infoboxPatterns = [
-        { type: ['who'], field: new RegExp(`(?:${topicPattern}).*?(?:Designed\\s+by|Created\\s+by|Developed\\s+by|Founded\\s+by|Original\\s+author|Developers|Developer|Maintainer|Author|Inventor|Creator)\\s*[·:]\\s*(.+)`, 'i') },
         { type: ['when'], field: new RegExp(`(?:${topicPattern}).*?(?:First\\s+appeared|Released|Founded|Established|Created|Launch\\s+date|Initial\\s+release)\\s*[·:]\\s*(.+)`, 'i') },
         { type: ['what'], field: new RegExp(`(?:${topicPattern}).*?(?:Type|Genre|Category|Classification)\\s*[·:]\\s*(.+)`, 'i') },
+        { type: ['where'], field: /(?:Headquarters|Headquartered|Location|Address|HQ|Head\s+office|Based\s+in)\s*[·:]\s*(.+)/i },
     ];
     for (const pat of infoboxPatterns) {
         if (!pat.type.includes(questionType))
@@ -225,7 +345,7 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
         // "developed/designed/created by [Name]" in first 20% of content
         const first20 = content.slice(0, Math.max(500, Math.floor(content.length * 0.2)));
         // Use case-insensitive for verbs, but validate name casing separately
-        const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined)\s+by\s+(\S+(?:\s+\S+){0,3})/i;
+        const byPattern = /(?:developed|designed|created|built|invented|founded|authored|introduced|coined|conceived|released|started|launched|begun|proposed|established)\s+(?:\w+\s+){0,4}by\s+(\S+(?:\s+\S+){0,3})/i;
         const byMatch = first20.match(byPattern);
         if (byMatch?.[1]) {
             const candidateName = byMatch[1].trim();
@@ -249,7 +369,10 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
     if (questionType === 'when') {
         // Look for a date near topic terms in first 30% of content
         const first30 = content.slice(0, Math.max(600, Math.floor(content.length * 0.3)));
-        const datePattern = /(?:released|launched|first appeared|founded|established|created|introduced|began|started)\s+(?:in|on)?\s*(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i;
+        // Note: "began"/"started" are intentionally excluded — they can match
+        // construction/start events that don't answer the specific question
+        // (e.g. "When did X fall?" should NOT match "began on Aug 13, 1961").
+        const datePattern = /(?:released|launched|first appeared|founded|established|created|introduced|conceived|opened|invented)\s+(?:\w+\s+){0,2}(?:in|on)\s+(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i;
         const dateMatch = first30.match(datePattern);
         if (dateMatch) {
             const idx = first30.indexOf(dateMatch[0]);
@@ -266,6 +389,51 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
     return null;
 }
 // ---------------------------------------------------------------------------
+// Entity extraction — for who/when questions answered by BM25
+// ---------------------------------------------------------------------------
+/**
+ * Try to extract a specific entity (person name, date) from a BM25-selected passage.
+ * Returns the entity string if found, or null.
+ */
+function extractEntity(passage, questionType) {
+    if (questionType === 'who') {
+        // Try: "by [Name Name]"
+        const byMatch = passage.match(/\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/);
+        if (byMatch)
+            return byMatch[1];
+        // Try: "[Name Name] created/founded/..."
+        const nameVerbMatch = passage.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\s+(?:created|founded|designed|developed|built|invented|authored|introduced)/);
+        if (nameVerbMatch)
+            return nameVerbMatch[1];
+        return null;
+    }
+    if (questionType === 'when') {
+        const dateMatch = passage.match(/\b(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})\b/);
+        if (dateMatch)
+            return dateMatch[1];
+        return null;
+    }
+    return null;
+}
+// ---------------------------------------------------------------------------
+// Entity type check for confidence formula
+// ---------------------------------------------------------------------------
+function hasExpectedEntityType(text, questionType) {
+    switch (questionType) {
+        case 'who':
+            return /[A-Z][a-z]+\s+[A-Z][a-z]+/.test(text);
+        case 'when':
+            return /\b\d{4}\b|\b(january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(text);
+        case 'how_many':
+        case 'how_much':
+            return /\b\d+\b/.test(text);
+        case 'where':
+            return /\b(in|at|near|located|based|headquarter)\b/i.test(text);
+        default:
+            return true;
+    }
+}
+// ---------------------------------------------------------------------------
 // Content cleaning — strip citation/reference noise before BM25 scoring
 // ---------------------------------------------------------------------------
 /**
@@ -275,6 +443,25 @@ function tryDirectExtraction(content, questionType, topicTerms, _question) {
  */
 function cleanContentForQA(content) {
     let cleaned = content;
+    // Strip markdown formatting to get clean text for BM25 scoring
+    // Images: ![alt](url) → remove entirely
+    cleaned = cleaned.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
+    // Links: [text](url "title") → text (keep link text, remove URL and title)
+    cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
+    // Bold/italic: ***text***, **text**, *text* → text
+    cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
+    // Inline code: `text` → text
+    cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
+    // Heading markers: ## Heading → Heading
+    cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
+    // Horizontal rules
+    cleaned = cleaned.replace(/^---+$/gm, '');
+    // HTML entities
+    cleaned = cleaned.replace(/&amp;/g, '&');
+    cleaned = cleaned.replace(/&lt;/g, '<');
+    cleaned = cleaned.replace(/&gt;/g, '>');
+    cleaned = cleaned.replace(/&nbsp;/g, ' ');
+    cleaned = cleaned.replace(/&#\d+;/g, '');
     // Remove Wikipedia citation metadata (CS1_maint, Category:, etc.)
     cleaned = cleaned.replace(/CS1[_\s]\w+[:\s][^\n]*/gi, '');
     cleaned = cleaned.replace(/Category:[^\n]*/gi, '');
@@ -288,12 +475,11 @@ function cleanContentForQA(content) {
     cleaned = cleaned.replace(/\b(retrieved|archived from the original)\b[^\n]{0,100}/gi, '');
     // Remove "External links" and everything after (usually just URLs)
     cleaned = cleaned.replace(/^#{1,3}\s*External\s+links[\s\S]*$/im, '');
-    // Remove section headers for reference-like sections
-    // (but keep real content that happens to be after these headings)
+    // Fix #8: Remove entire "See also", "Notes", "Further reading" sections
+    // (heading + all content until the next heading)
+    cleaned = cleaned.replace(/^#{1,3}\s*(?:See\s+also|Notes|Further\s+reading)\s*\n(?:(?!^#{1,3}\s).*\n?)*/gim, '');
+    // Remove "References" heading only (keep nearby content that may be relevant)
     cleaned = cleaned.replace(/^#{1,3}\s*References\s*$/im, '');
-    cleaned = cleaned.replace(/^#{1,3}\s*Further\s+reading\s*$/im, '');
-    cleaned = cleaned.replace(/^#{1,3}\s*See\s+also\s*$/im, '');
-    cleaned = cleaned.replace(/^#{1,3}\s*Notes\s*$/im, '');
     // Remove lines that are mostly citation-like (very short with lots of punctuation/numbers)
     cleaned = cleaned.split('\n').filter(line => {
         const trimmed = line.trim();
@@ -318,6 +504,30 @@ function cleanContentForQA(content) {
 // ---------------------------------------------------------------------------
 // Main quickAnswer function
 // ---------------------------------------------------------------------------
+/**
+ * Answer a question about fetched page content using BM25 + heuristics.
+ *
+ * This is a fully offline, LLM-free approach. It:
+ * 1. Cleans the content (strips Wikipedia citations, reference noise, etc.)
+ * 2. Tries direct pattern extraction for structured content (infoboxes, definitions)
+ * 3. Falls back to BM25 sentence scoring with question-type-aware boosting
+ * 4. Uses sliding windows (1-3 sentences) to capture multi-sentence answers
+ * 5. Expands query terms with synonyms for broader matching
+ * 6. Returns the top passages with scores and surrounding context
+ *
+ * @param options - Question, content, and optional tuning parameters
+ * @returns A result object with answer text, confidence score, and ranked passages
+ *
+ * @example
+ * ```ts
+ * const result = await quickAnswer({
+ *   question: 'What is the pricing?',
+ *   content: pageMarkdown,
+ *   url: 'https://example.com/pricing',
+ * });
+ * console.log(result.answer, result.confidence);
+ * ```
+ */
 export function quickAnswer(options) {
     const { question, content, maxPassages = 3, maxChars = 2000, url = '', } = options;
     const emptyResult = {
@@ -334,12 +544,23 @@ export function quickAnswer(options) {
         return emptyResult;
     // Clean content to remove citation/reference noise before BM25 scoring
     const cleanedContent = cleanContentForQA(content);
+    // For very long content, focus on the most relevant portion.
+    // Wikipedia article tails contain references, tangential details, and noise.
+    const MAX_QA_CHARS = 20000;
+    let qaContent = cleanedContent;
+    if (qaContent.length > MAX_QA_CHARS) {
+        // Keep the first 70% — definitions, key facts, and main content
+        // are almost always in the first 2/3 of the article
+        qaContent = qaContent.slice(0, Math.floor(qaContent.length * 0.7));
+    }
     // Step 0: Direct pattern extraction — try to find structured answers before BM25
     // This catches infobox patterns (e.g. "TypeScript: Designed by · Anders Hejlsberg")
     // and definition sentences (e.g. "TypeScript is ... developed by Microsoft")
     const questionType = detectQuestionType(question);
-    const topicTerms = tokenizeQuestion(question);
-    const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTerms, question);
+    // RAW (unstemmed) topic terms for tryDirectExtraction regex patterns
+    const topicTermsRaw = tokenizeRaw(question).filter(t => !STOPWORDS.has(t));
+    // Fix #9: Remove the unused `question` argument from the call site
+    const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTermsRaw);
     if (directAnswer) {
         return {
             question,
@@ -350,98 +571,245 @@ export function quickAnswer(options) {
             method: 'bm25',
         };
     }
-    // Step 1: Split into sentences
-    const sentences = splitIntoSentences(cleanedContent);
+    // Step 1: Split into sentences (use qaContent — truncated for long articles)
+    const sentences = splitIntoSentences(qaContent);
     if (sentences.length === 0)
         return emptyResult;
-    // Step 2: Tokenize question (remove stopwords)
+    // Step 2: Tokenize question (remove stopwords, then stem)
     const queryTerms = tokenizeQuestion(question);
     if (queryTerms.length === 0) {
-        // Fall back to all tokens if all were stopwords
-        queryTerms.push(...tokenize(question));
+        // Fall back to all stemmed tokens if all were stopwords
+        const fallback = tokenize(question);
+        if (fallback.length === 0)
+            return emptyResult;
+        queryTerms.push(...fallback);
     }
-    if (queryTerms.length === 0)
-        return emptyResult;
-    // Step 3: Score sentences with BM25 (questionType already computed in Step 0)
-    const blocks = sentences.map((s, index) => ({ raw: s.text, index }));
-    const bm25Scores = scoreBM25(blocks, queryTerms);
-    // Step 4: Compute max possible score for normalization
-    // (the sentence with the highest BM25 score against itself as a reference)
+    // Expand query with synonyms for broader matching
+    const expanded = expandWithSynonyms(queryTerms);
+    // Use all expanded terms for BM25 (IDF naturally downweights common synonyms)
+    const uniqueQueryTerms = [...new Set(expanded.map(e => e.term))];
+    // Step 3: Create stemmed scoring blocks for each sentence.
+    // We pass stemmed text to scoreBM25 so that its internal tokenizer gets stemmed tokens,
+    // matching the stemmed queryTerms. The original sentence text is preserved for display.
+    const scoringBlocks = sentences.map((s, index) => ({
+        raw: tokenize(s.text).join(' '), // pre-stemmed text for BM25 scoring
+        index,
+    }));
+    // ---------------------------------------------------------------------------
+    // Step 3.5: Lightweight topic propagation (coreference approximation)
+    // ---------------------------------------------------------------------------
+    // When a sentence uses a referent phrase like "The platform" or "The company"
+    // instead of the topic entity name, BM25 can't match it. We inject stemmed
+    // topic terms into scoring blocks of nearby referent sentences so BM25 has
+    // something to work with.
+    //
+    // Only active for question types where coreference resolution helps:
+    // where, who, when — NOT for what/how/yes_no/how_many (no entity tracking needed).
+    //
+    // Heuristic: injection only happens when the topic is mentioned somewhere in the
+    // content (topicSentenceIndices non-empty). A sentence without a topic term of its
+    // own then gets topic injection if:
+    // 1. It contains a common referent pattern (the platform/company/service/etc.), AND
+    // 2. EITHER it is within PROXIMITY_WINDOW sentences of a sentence containing the topic,
+    //    OR the content has fewer than SMALL_CONTENT_THRESHOLD sentences
+    if (questionType === 'where' || questionType === 'who' || questionType === 'when') {
+        const REFERENT_PATTERNS = /\b(?:the\s+)?(?:platform|company|service|product|tool|application|system|framework|library|project|organization|software|language|program|site|website|app|api|sdk|package|module|engine|firm|startup|corporation)\b|^(?:It|They|He|She)\s/im;
+        const PROXIMITY_WINDOW = 5;
+        const SMALL_CONTENT_THRESHOLD = 15;
+        // Find which sentences contain at least one topic term
+        const topicSentenceIndices = new Set();
+        for (let i = 0; i < sentences.length; i++) {
+            const stemmedSentence = scoringBlocks[i].raw;
+            if (queryTerms.some(t => stemmedSentence.includes(t))) {
+                topicSentenceIndices.add(i);
+            }
+        }
+        // Only inject if the topic is actually mentioned somewhere (non-empty topicSentenceIndices)
+        if (topicSentenceIndices.size > 0) {
+            // Inject topic terms into referent sentences that are near topic sentences
+            const topicInjection = ' ' + queryTerms.join(' ');
+            for (let i = 0; i < sentences.length; i++) {
+                if (topicSentenceIndices.has(i))
+                    continue; // already has topic terms
+                const hasReferent = REFERENT_PATTERNS.test(sentences[i].text);
+                if (!hasReferent)
+                    continue;
+                // Check proximity: is this sentence within PROXIMITY_WINDOW of a topic sentence?
+                const isNearTopic = sentences.length < SMALL_CONTENT_THRESHOLD ||
+                    [...topicSentenceIndices].some(j => Math.abs(i - j) <= PROXIMITY_WINDOW);
+                if (isNearTopic) {
+                    scoringBlocks[i].raw += topicInjection;
+                }
+            }
+        }
+    }
+    // Step 4: Score sentences with BM25
+    const bm25Scores = scoreBM25(scoringBlocks, uniqueQueryTerms);
+    // Step 5: Take the highest observed BM25 score (floored at 0.001) as the normalization reference
     const maxPossibleScore = Math.max(...bm25Scores, 0.001);
-    // Step 5: Apply boosts (including position bias — intro sentences are more likely to answer factual questions)
+    // Step 6: Apply boosts (position bias, question type, definition patterns)
     const totalSentences = sentences.length;
     const sentenceScores = sentences.map((s, i) => {
-        // A "topic sentence" is the first sentence in a paragraph/section
-        // We detect this by checking if the previous character in the content is a newline
-        const isTopicSentence = i === 0 || cleanedContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
+        const isTopicSentence = i === 0 || qaContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
         const base = bm25Scores[i];
         const boost = computeBoost(s.text, questionType, isTopicSentence);
-        // Position bias: early sentences get a boost (answers to factual questions
-        // are typically in the intro paragraph, especially on Wikipedia/docs).
-        // Decays linearly: first 10% of sentences get full boost (0.4), drops to 0 by 50%.
+        // Fix #3: Position bias — reduce for 'why' and 'how' (answers can be anywhere)
+        const maxPositionBoost = (questionType === 'why' || questionType === 'how') ? 0.15 : 0.4;
         const positionRatio = i / totalSentences;
-        const positionBoost = positionRatio < 0.1 ? 0.4
-            : positionRatio < 0.5 ? 0.4 * (1 - (positionRatio - 0.1) / 0.4)
+        // Fix position bias: scale by how many query terms THIS sentence matches.
+        // A sentence matching only 1/3 query terms (e.g., just "python") gets 1/3 of the
+        // position boost — prevents the first sentence from winning on position alone.
+        const sentTokens = tokenize(s.text);
+        const sentTermMatches = uniqueQueryTerms.filter(t => sentTokens.includes(t)).length;
+        const sentTermCoverage = uniqueQueryTerms.length > 0
+            ? sentTermMatches / Math.min(uniqueQueryTerms.length, 5)
+            : 0;
+        const rawPositionBoost = positionRatio < 0.1 ? maxPositionBoost
+            : positionRatio < 0.5 ? maxPositionBoost * (1 - (positionRatio - 0.1) / 0.4)
                 : 0;
-        // Definition sentences anywhere get a boost (covers "X is a Y" patterns)
+        const positionBoost = rawPositionBoost * sentTermCoverage;
+        // Fix #2: Only apply definitionBoost for 'what' and 'other' question types.
         const sl = s.text.toLowerCase();
-        const definitionBoost = /\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
-        const total = base + (boost + positionBoost + definitionBoost) * maxPossibleScore;
+        const definitionBoost = (questionType === 'what' || questionType === 'other') &&
+            /\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
+        // Extra boost for definition sentences very early in the content (for 'what' questions)
+        // This handles Wikipedia-style articles where the first sentence IS the answer
+        const earlyDefinitionBoost = (questionType === 'what' &&
+            positionRatio < 0.05 &&
+            /\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(sl)) ? 0.5 : 0;
+        const total = base + (boost + positionBoost + definitionBoost + earlyDefinitionBoost) * maxPossibleScore;
         return { text: s.text, index: i, score: total, base };
     });
-    // Step 6: Sort by score and select top N
-    const sorted = [...sentenceScores].sort((a, b) => b.score - a.score);
-    const topN = Math.min(maxPassages, sorted.length);
-    const topSentences = sorted.slice(0, topN);
-    // Step 7: For each top sentence, collect context (surrounding sentences)
+    const windows = [];
+    // Single-sentence windows (preserve existing behavior)
+    for (let i = 0; i < sentences.length; i++) {
+        const score = sentenceScores[i].score;
+        const lengthPenalty = 0;
+        windows.push({
+            text: sentences[i].text,
+            indices: [i],
+            startSentenceIdx: i,
+            score: score * (1 - lengthPenalty),
+        });
+    }
+    // 2-sentence windows
+    for (let i = 0; i < sentences.length - 1; i++) {
+        const score = (sentenceScores[i].score + sentenceScores[i + 1].score) / 2;
+        const lengthPenalty = 0.05;
+        windows.push({
+            text: sentences[i].text + ' ' + sentences[i + 1].text,
+            indices: [i, i + 1],
+            startSentenceIdx: i,
+            score: score * (1 - lengthPenalty),
+        });
+    }
+    // 3-sentence windows (only when content has enough sentences)
+    if (sentences.length >= 5) {
+        for (let i = 0; i < sentences.length - 2; i++) {
+            const score = (sentenceScores[i].score + sentenceScores[i + 1].score + sentenceScores[i + 2].score) / 3;
+            const lengthPenalty = 0.10;
+            windows.push({
+                text: sentences[i].text + ' ' + sentences[i + 1].text + ' ' + sentences[i + 2].text,
+                indices: [i, i + 1, i + 2],
+                startSentenceIdx: i,
+                score: score * (1 - lengthPenalty),
+            });
+        }
+    }
+    // Step 8: Sort windows by score
+    const sortedWindows = [...windows].sort((a, b) => b.score - a.score);
+    // Step 9: Select top N non-overlapping windows
     const selectedPassages = [];
-    const usedIndices = new Set();
-    for (const entry of topSentences) {
-        if (usedIndices.has(entry.index))
+    const usedSentenceIndices = new Set();
+    for (const win of sortedWindows) {
+        if (selectedPassages.length >= maxPassages)
+            break;
+        // Skip if any sentence in this window was already used
+        const hasOverlap = win.indices.some(i => usedSentenceIndices.has(i));
+        if (hasOverlap)
             continue;
-        const i = entry.index;
+        // Mark all sentences in this window as used
+        for (const i of win.indices)
+            usedSentenceIndices.add(i);
+        // Build context: include sentence before the window and after
+        const firstIdx = win.indices[0];
+        const lastIdx = win.indices[win.indices.length - 1];
         const contextParts = [];
-        // Include sentence before
-        if (i > 0 && !usedIndices.has(i - 1)) {
-            contextParts.push(sentences[i - 1].text);
+        if (firstIdx > 0 && !usedSentenceIndices.has(firstIdx - 1)) {
+            contextParts.push(sentences[firstIdx - 1].text);
         }
-        // The sentence itself
-        contextParts.push(entry.text);
-        // Include sentence after
-        if (i < sentences.length - 1 && !usedIndices.has(i + 1)) {
-            contextParts.push(sentences[i + 1].text);
+        contextParts.push(win.text);
+        if (lastIdx < sentences.length - 1 && !usedSentenceIndices.has(lastIdx + 1)) {
+            contextParts.push(sentences[lastIdx + 1].text);
         }
-        // Mark all context indices as used to avoid overlap
-        if (i > 0)
-            usedIndices.add(i - 1);
-        usedIndices.add(i);
-        if (i < sentences.length - 1)
-            usedIndices.add(i + 1);
+        // Mark surrounding context sentences as used to avoid overlap
+        if (firstIdx > 0)
+            usedSentenceIndices.add(firstIdx - 1);
+        if (lastIdx < sentences.length - 1)
+            usedSentenceIndices.add(lastIdx + 1);
         const context = contextParts.join(' ');
         selectedPassages.push({
-            text: entry.text,
-            score: parseFloat((entry.score / (maxPossibleScore || 1)).toFixed(4)),
+            text: win.text,
+            score: Math.min(1, parseFloat((win.score / (maxPossibleScore || 1)).toFixed(4))),
             context,
+            startIdx: firstIdx,
+            indices: win.indices,
         });
     }
-    // Step 8: Compute confidence from how much the top BM25 score stands out vs. the mean
-    const topScore = sorted[0]?.score ?? 0;
-    const topBase = sorted[0]?.base ?? 0;
+    // ---------------------------------------------------------------------------
+    // Step 10: Confidence computation — multi-signal formula
+    // ---------------------------------------------------------------------------
+    const topWindow = sortedWindows[0];
+    const topBase = topWindow ? Math.max(...topWindow.indices.map(i => sentenceScores[i].base)) : 0;
     const meanScore = bm25Scores.reduce((a, b) => a + b, 0) / bm25Scores.length;
+    // Signal 1: Score gap
     const scoreGap = maxPossibleScore > 0 ? (topBase - meanScore) / maxPossibleScore : 0;
-    // 0.3 baseline (we found something), up to 1.0 if top answer dominates
-    const rawConfidence = Math.min(1, Math.max(0, 0.3 + scoreGap * 0.7));
-    // Penalty: if the top answer still looks like citation/metadata noise, reduce confidence
-    const topAnswerText = sorted[0]?.text?.toLowerCase() || '';
+    // Signal 2: Term coverage — what % of query terms appear in top window
+    // Also count synonym-mediated matches (at 0.7 weight)
+    const topWindowTokens = tokenize(topWindow?.text || '');
+    const directMatches = queryTerms.filter(t => topWindowTokens.includes(t)).length;
+    const matchedTerms = queryTerms.filter(t => {
+        if (topWindowTokens.includes(t))
+            return true;
+        // Check if any synonym of this term appears in the top window
+        const synonymsForTerm = expandWithSynonyms([t]);
+        return synonymsForTerm.some(e => !e.isOriginal && topWindowTokens.includes(e.term));
+    });
+    const synonymMatches = matchedTerms.length - directMatches;
+    const effectiveCoverage = queryTerms.length > 0
+        ? (directMatches + synonymMatches * 0.7) / queryTerms.length
+        : 0;
+    // Signal 3: Position signal — early in document is more reliable for factual Qs
+    const positionSignal = (topWindow?.startSentenceIdx ?? 999) < sentences.length * 0.2 ? 0.1 : 0;
+    // Signal 4: Answer type match — does the answer look like it answers the question type?
+    const typeMatch = hasExpectedEntityType(topWindow?.text || '', questionType) ? 0.20 : 0;
+    const rawConfidence = Math.min(1, Math.max(0, 0.1 + // reduced baseline (was 0.2)
+        scoreGap * 0.35 +
+        effectiveCoverage * 0.25 + // synonym-aware term coverage (was 0.30)
+        positionSignal +
+        typeMatch));
+    // Penalty: noise/metadata in top answer reduces confidence
+    const topAnswerText = (topWindow?.text || '').toLowerCase();
     const noisePenalty = (/\bcs1[_\s]/i.test(topAnswerText) ||
         /\bcategory:/i.test(topAnswerText) ||
         /\b(archived|retrieved)\s+(from|on)\b/i.test(topAnswerText) ||
         /\b(isbn|issn|doi|arxiv|bibcode|pmid)\b/i.test(topAnswerText) ||
-        // Line is mostly URLs
         (topAnswerText.match(/https?:\/\//g) || []).length > 2) ? 0.5 : 0;
-    const confidence = Math.max(0, rawConfidence - noisePenalty);
-    // Step 9: Build answer — best passage text, trimmed to maxChars
+    // Fix #13: Penalty for UI chrome / navigation elements
+    const uiChromePenalty = (/\b(sign in|sign up|log in|log out|subscribe|newsletter|cookie|privacy policy|terms of service)\b/i.test(topAnswerText) ||
+        /\b(skip to|main menu|navigation|sidebar|footer|header|breadcrumb)\b/i.test(topAnswerText)) ? 0.3 : 0;
+    const confidence = Math.max(0, rawConfidence - noisePenalty - uiChromePenalty);
+    // ---------------------------------------------------------------------------
+    // Step 11: Try entity extraction for who/when questions (BM25 fallback)
+    // ---------------------------------------------------------------------------
     let answerText = selectedPassages[0]?.context || selectedPassages[0]?.text || '';
+    // For who/when: if an entity can be extracted from the top passage, answer with the passage text alone instead of the wider context
+    if ((questionType === 'who' || questionType === 'when') && selectedPassages[0]) {
+        const entity = extractEntity(selectedPassages[0].text, questionType);
+        if (entity && selectedPassages[0].text.includes(entity)) {
+            // Keep full passage text as answer (it contains the entity)
+            answerText = selectedPassages[0].text;
+        }
+    }
     if (answerText.length > maxChars) {
         answerText = answerText.slice(0, maxChars).replace(/\s+\S*$/, '') + '…';
     }
@@ -452,9 +820,8 @@ export function quickAnswer(options) {
             ? p.context.slice(0, Math.max(0, maxChars - totalChars)).replace(/\s+\S*$/, '') + '…'
             : p.context;
         totalChars += contextTrimmed.length;
-        return { ...p, context: contextTrimmed };
+        return { text: p.text, score: p.score, context: contextTrimmed };
     });
-    void topScore; // consumed via sorted[0]
     return {
         question,
         answer: answerText,