@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,833 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Quick Answer — LLM-free question answering using BM25 + heuristics
|
|
3
|
+
*
|
|
4
|
+
* Answers a question about page content without any API key.
|
|
5
|
+
* Uses BM25 relevance scoring + answer-signal boosting to surface
|
|
6
|
+
* the most relevant sentences.
|
|
7
|
+
*
|
|
8
|
+
* v2: Added Porter stemming, synonym expansion, and sliding window scoring.
|
|
9
|
+
*/
|
|
10
|
+
import { scoreBM25 } from './bm25-filter.js';
|
|
11
|
+
import { stem } from './stemmer.js';
|
|
12
|
+
import { expandWithSynonyms } from './synonyms.js';
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Stopwords — removed from question before BM25 scoring
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Function words that carry no retrieval signal. tokenizeQuestion() checks
// lowercase, unstemmed question tokens against this set before stemming.
// Entries are grouped loosely by role; insertion order is kept stable.
const STOPWORDS = new Set([
    // Interrogatives, articles and "to be" forms
    'what', 'is', 'the', 'how', 'do', 'a', 'an', 'where', 'when', 'why',
    'which', 'can', 'does', 'are', 'was', 'were', 'be', 'been', 'being',
    // Auxiliary and modal verbs
    'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may', 'might',
    'shall', 'must', 'did',
    // Personal and possessive pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their',
    // Demonstratives and prepositions
    'this', 'that', 'these', 'those',
    'of', 'in', 'on', 'at', 'by', 'for',
    'with', 'about', 'into', 'to', 'from', 'up', 'out',
    // Conjunctions, negations and other connectives
    'and', 'or', 'but',
    'if', 'so', 'as', 'not', 'no', 'than', 'then', 'also',
]);
|
|
26
|
+
/**
 * Classify a question into a coarse type using keyword heuristics.
 *
 * Rules are checked in priority order and the first match wins:
 *   1. quantity / duration / price wording          -> 'how_many'
 *   2. question starts with an auxiliary verb       -> 'yes_no'
 *   3. when / where / why / who                     -> the same-named type
 *   4. "what company|person|team|..."               -> 'who'
 *   5. what                                         -> 'what'
 *   6. how (anything not already caught by rule 1)  -> 'how'
 *   7. no rule matched                              -> 'other'
 *
 * Interrogatives are matched as whole words (leading and trailing `\b`), so
 * words that merely end in one do not trigger a rule: "show" is not "how",
 * "somewhat" is not "what", "everywhere" is not "where".
 *
 * @param {string} question - Raw question text; matching is case-insensitive.
 * @returns {'how_many'|'yes_no'|'when'|'where'|'why'|'who'|'what'|'how'|'other'}
 *   The detected question type.
 */
function detectQuestionType(question) {
    // Lowercase once so none of the patterns below need the `i` flag.
    const q = question.toLowerCase().trim();

    // "how many/much/long" and price wording ask for a quantity or duration,
    // not for a process, so they are checked before the generic "how" rule.
    if (/\b(?:how\s+(?:many|much|long)|what\s+(?:price|cost))|pricing/.test(q)) {
        return 'how_many';
    }

    // Yes/no questions start with an auxiliary verb.
    if (/^(?:is|does|can|will|are|has|do|did|was|were|could|should|would)\b/.test(q)) {
        return 'yes_no';
    }

    if (/\bwhen\b/.test(q)) {
        return 'when';
    }
    if (/\bwhere\b/.test(q)) {
        return 'where';
    }
    if (/\bwhy\b/.test(q)) {
        return 'why';
    }
    if (/\bwho\b/.test(q)) {
        return 'who';
    }

    // "what company/person/team/group/organization ..." asks for an entity,
    // so it is treated as a "who" question.
    if (/\bwhat\s+(?:company|person|people|team|group|organization|organisation|developer|author|creator|founder)\b/.test(q)) {
        return 'who';
    }
    if (/\bwhat\b/.test(q)) {
        return 'what';
    }

    // Any remaining "how" ("how do/does/can/to ...", or a bare "how") is a
    // process/explanation question; the quantity forms were handled above.
    if (/\bhow\b/.test(q)) {
        return 'how';
    }

    return 'other';
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Tokenization
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
/**
 * Turn text into stemmed tokens for BM25 scoring.
 *
 * The text is lowercased and cut into maximal runs of `\w` characters
 * (ASCII letters, digits and underscore); everything else acts as a
 * separator. Single-character tokens are discarded and each remaining token
 * is passed through `stem()`. Query and content are meant to share this
 * pipeline so that inflected forms (e.g. "limitations" / "limit") line up.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Stemmed tokens in order of appearance.
 */
function tokenize(text) {
    const words = text.toLowerCase().match(/\w+/g) || [];
    const stems = [];
    for (const word of words) {
        if (word.length > 1) {
            stems.push(stem(word));
        }
    }
    return stems;
}
|
|
69
|
+
/**
 * Turn text into lowercase word tokens WITHOUT stemming.
 *
 * Intended for building regex patterns (tryDirectExtraction), where the
 * surface form of each word has to be kept so exact text still matches.
 * Tokens are maximal runs of `\w` characters (ASCII letters, digits and
 * underscore); single-character tokens are discarded.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Lowercase tokens in order of appearance.
 */
function tokenizeRaw(text) {
    const words = text.toLowerCase().match(/\w+/g);
    if (words === null) {
        return [];
    }
    return words.filter((word) => word.length >= 2);
}
|
|
80
|
+
/**
 * Tokenize a question for scoring: raw tokens, minus stopwords, then stemmed.
 *
 * Stopwords are removed from the raw (unstemmed) tokens because STOPWORDS
 * holds surface forms; stemming first could change a token so that it no
 * longer matches its entry.
 *
 * @param {string} question - Question text.
 * @returns {string[]} Stemmed non-stopword tokens in order of appearance.
 */
function tokenizeQuestion(question) {
    const stems = [];
    for (const word of tokenizeRaw(question)) {
        if (STOPWORDS.has(word)) {
            continue;
        }
        stems.push(stem(word));
    }
    return stems;
}
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
// Sentence splitting
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
/**
 * Split text into sentences without breaking on dots that belong to URLs,
 * common abbreviations, version strings or decimal numbers.
 *
 * Those spans are temporarily swapped for NUL-delimited placeholders, the text
 * is cut after runs of `.`, `!` or `?` that are followed by whitespace or the
 * end of input, and the placeholders are swapped back in each piece.
 * Markdown bullet items (`-`, `*`, `+`) that are not already contained in (or
 * containing) a collected sentence are appended as pseudo-sentences.
 *
 * @param {string} content - Raw text; markdown is not stripped here.
 * @returns {{text: string, start: number}[]} Entries whose text is 10–800
 *   characters long. For sentences, `start` is the offset of the first
 *   character of `text` in `content`; for list items it is the offset at
 *   which the bullet-line match begins.
 */
function splitIntoSentences(content) {
    const sentences = [];
    const placeholders = new Map();
    let placeholderIdx = 0;
    // Swap every match of `pattern` for a unique placeholder and remember the original text.
    const protect = (text, pattern, tag) => text.replace(pattern, (m) => {
        const ph = `\x00${tag}${placeholderIdx++}\x00`;
        placeholders.set(ph, m);
        return ph;
    });
    // Single-pass inverse of `protect`; unknown placeholder-looking text is left untouched.
    const restore = (text) => text.replace(/\x00(?:URL|ABBR|VER|NUM)\d+\x00/g, (ph) => placeholders.get(ph) ?? ph);
    // URLs (http://... or https://...)
    let protected_ = protect(content, /https?:\/\/[^\s)>]+/g, 'URL');
    // Common abbreviations: Mr. Mrs. Dr. St. vs. etc. e.g. i.e. U.S. U.K.
    protected_ = protect(protected_, /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|vs|etc|e\.g|i\.e|U\.S|U\.K|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec|No|Vol|pp)\./g, 'ABBR');
    // Multi-dot version numbers (0.9.0, 3.11.4) — must run BEFORE decimals to avoid partial replacement.
    protected_ = protect(protected_, /\b(\d+\.\d+(?:\.\d+)+)/g, 'VER');
    // Decimal numbers (3.14, $29.99)
    protected_ = protect(protected_, /\b(\d+)\.(\d+)/g, 'NUM');
    // `cursor` walks the protected text while `originalOffset` tracks the same
    // position in `content`, so that `start` is reported in original coordinates
    // even though placeholders change the string length.
    let cursor = 0;
    let originalOffset = 0;
    const emit = (segment) => {
        const restored = restore(segment);
        const text = restored.trim();
        if (text) {
            const leadingWhitespace = restored.length - restored.trimStart().length;
            sentences.push({ text, start: originalOffset + leadingWhitespace });
        }
        originalOffset += restored.length;
    };
    // Cut AFTER sentence-ending punctuation (and the whitespace that follows it).
    const sentencePattern = /[.!?]+(?:\s+|$)/g;
    let match;
    while ((match = sentencePattern.exec(protected_)) !== null) {
        const end = match.index + match[0].length;
        emit(protected_.slice(cursor, end));
        cursor = end;
    }
    // Whatever follows the last boundary is a final, unterminated sentence.
    if (cursor < protected_.length) {
        emit(protected_.slice(cursor));
    }
    // Fix #12: Also extract list items (markdown bullets) as "sentences"
    const listPattern = /^[\s]*[-*+]\s+(.+)$/gm;
    let listMatch;
    while ((listMatch = listPattern.exec(content)) !== null) {
        const item = listMatch[1].trim();
        if (item.length < 10 || item.length > 800) {
            continue;
        }
        // Only add if not already captured by sentence splitting
        const isDuplicate = sentences.some((s) => s.text.includes(item) || item.includes(s.text));
        if (!isDuplicate) {
            sentences.push({ text: item, start: listMatch.index });
        }
    }
    // Fix #7: Max sentence length is 800 chars (raised from 500)
    return sentences.filter((s) => s.text.length >= 10 && s.text.length <= 800);
}
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
// Answer-signal boosting
|
|
176
|
+
// ---------------------------------------------------------------------------
|
|
177
|
+
/**
 * Compute an additive relevance boost for a sentence from surface cues that
 * suit the detected question type.
 *
 * @param {string} sentence - Sentence text in its original casing.
 * @param {string} questionType - One of 'how_many', 'how', 'when', 'where',
 *   'what', 'why', 'who', 'yes_no'; any other value only gets the topic bonus.
 * @param {boolean} isTopicSentence - Adds a flat 0.1 when truthy.
 * @returns {number} Sum of all bonuses that fired (0 when none did).
 */
function computeBoost(sentence, questionType, isTopicSentence) {
    const lowered = sentence.toLowerCase();
    // Bonuses are collected in firing order and summed left-to-right at the end.
    const bonuses = [];
    const award = (condition, amount) => {
        if (condition) {
            bonuses.push(amount);
        }
    };
    award(isTopicSentence, 0.1);
    if (questionType === 'how_many') {
        // A price / quantity-with-unit is a stronger signal than a bare number.
        const hasQuantity = /\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user|minute|second|hour|degree|meter|mile|kg|lb)/i.test(sentence);
        if (hasQuantity) {
            bonuses.push(0.3);
        }
        else {
            award(/\b\d+\b/.test(sentence), 0.15);
        }
    }
    else if (questionType === 'how') {
        // Fix #1: process/explanation wording
        award(/\b(by using|through|works by|in order to|step|first|then|next|finally|process|method|approach|technique|way to|can be done)\b/i.test(lowered), 0.4);
        // Instructional verbs
        award(/\b(install|run|execute|configure|set up|use|import|require|enable|disable|create|build|deploy)\b/i.test(lowered), 0.2);
    }
    else if (questionType === 'when') {
        // Month name, four-digit number, or a duration
        award(/\b(january|february|march|april|may|june|july|august|september|october|november|december|\d{4}|\d+\s*(days?|weeks?|months?|years?))\b/i.test(sentence), 0.3);
        // "released/launched/etc. in/on <digit>"
        award(/\b(released|launched|published|introduced|created|started|began|founded|established|invented)\s+(in|on|at|around)?\s*\d/i.test(sentence), 0.4);
    }
    else if (questionType === 'where') {
        // Fix #4: primary location signals — any one of the three is enough
        const hasLocationCue = /\b(located|headquartered|based|founded|established)\s+(in|at)\b/i.test(lowered) ||
            /\b(?:in|at)\s+(?:the\s+)?[A-Z][a-z]+(?:(?:\s+[A-Z][a-z]+)*|(?:,\s+[A-Z][a-z]+)*)\b/.test(sentence) ||
            /\b(city|country|state|region|continent|capital|office|campus|location|address)\b/i.test(lowered);
        award(hasLocationCue, 0.6);
        // Street-level words and a fixed list of place names
        award(/\b(street|avenue|boulevard|road|highway|route|district|province|county|netherlands|amsterdam|berlin|london|paris|tokyo|beijing|moscow|france|germany|japan|china|india|canada|australia|san francisco|new york|los angeles|seattle|chicago|boston|austin|miami)\b/i.test(lowered), 0.4);
        // Birth/origin wording
        award(/\b(born|raised|grew up|native|hometown|birthplace|originally from)\b/i.test(lowered), 0.4);
    }
    else if (questionType === 'what') {
        // Definition-style sentence
        award(/\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(lowered), 0.5);
    }
    else if (questionType === 'why') {
        // Causal connectives
        award(/\b(because|due to|reason|therefore|since|as a result|consequently|thus)\b/.test(lowered), 0.4);
        // Purpose/goal phrases ("as a successor to", "in order to", "to allow", ...)
        award(/\b(as a successor|successor to|in order to|so that|to allow|to provide|to enable|to support|to replace|to improve|to address|to solve)\b/i.test(lowered), 0.4);
    }
    else if (questionType === 'who') {
        // "<verb> ... by" on the lowercased text, or "<Capitalized Name> <verb>" on the original
        const hasAttribution = /\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived|released|launched|established)\s+(?:\w+\s+){0,4}by\b/i.test(lowered) ||
            /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced|conceived|began)\b/.test(sentence);
        award(hasAttribution, 0.5);
        // Two adjacent capitalized words, unless the sentence opens with an article/preposition
        const hasName = /\b[A-Z][a-z]+\s+[A-Z][a-z]+\b/.test(sentence);
        const opensWithFiller = /^(The|A|An|In|On|At)\b/.test(sentence);
        award(hasName && !opensWithFiller, 0.2);
        // Role / organisation words
        award(/\b(ceo|cto|founder|president|director|manager|team|company|organization|engineer|professor|researcher)\b/i.test(lowered), 0.2);
    }
    else if (questionType === 'yes_no') {
        // Fix #11: affirmation/negation and capability verbs
        award(/\b(yes|no|not|does not|doesn't|cannot|can't|isn't|aren't|won't|supports?|enables?|allows?|provides?|includes?)\b/i.test(lowered), 0.3);
    }
    return bonuses.reduce((total, bonus) => total + bonus, 0);
}
|
|
281
|
+
// Fix #9: Remove unused `_question` parameter
|
|
282
|
+
// NOTE: topicTerms must be RAW (unstemmed) for correct regex pattern building
|
|
283
|
+
/**
 * Try to answer a question straight from structured or formulaic text before
 * any BM25 scoring: infobox-style "Field · Value" / "Field: Value" entries,
 * "<verb> by <Name>" sentences (who) and "<verb> in/on <date>" sentences (when).
 *
 * @param {string} content - Page text to search.
 * @param {string} questionType - Only 'who', 'when', 'what' and 'where' have patterns.
 * @param {string[]} topicTerms - RAW (unstemmed) question terms; they are
 *   regex-escaped and used as a prefix for the 'when'/'what' infobox patterns
 *   and to detect creation-style 'who' questions.
 * @returns {{text: string, context: string, confidence: number} | null}
 *   The extracted answer (confidence 0.92 for field matches, 0.88 for
 *   sentence matches), or null when nothing matched or `topicTerms` is empty.
 */
function tryDirectExtraction(content, questionType, topicTerms) {
    if (topicTerms.length === 0) {
        return null;
    }
    // Alternation of the escaped topic terms, for use inside RegExp sources.
    const topicPattern = topicTerms
        .map((term) => term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('|');
    // Turn a "Field · Value" match into a result; null unless the first line of the value has >2 chars.
    const resultFromField = (fieldMatch) => {
        if (!fieldMatch?.[1]) {
            return null;
        }
        const value = fieldMatch[1].split('\n')[0].trim().slice(0, 300);
        if (value.length <= 2) {
            return null;
        }
        const context = fieldMatch[0].split('\n')[0].trim().slice(0, 500);
        return { text: value, context, confidence: 0.92 };
    };
    // Widen a match to the text between the surrounding '.' characters of `haystack`.
    const resultFromSentence = (haystack, phraseMatch) => {
        const phrase = phraseMatch[0];
        const at = haystack.indexOf(phrase);
        const from = Math.max(0, haystack.lastIndexOf('.', at) + 1);
        const nextDot = haystack.indexOf('.', at + phrase.length);
        const sentence = haystack.slice(from, nextDot > 0 ? nextDot + 1 : undefined).trim();
        return { text: sentence || phrase, context: sentence, confidence: 0.88 };
    };
    const isWho = questionType === 'who';
    // --- Tiered 'who' infobox extraction ---
    // Entries look like "- Founders · Sam AltmanElon Musk..."; the field label is
    // searched for directly, with no topic prefix.
    if (isWho) {
        // Tier 1: original-creator fields are always tried first.
        const creatorHit = resultFromField(content.match(/(?:Original\s+author|Creator|Inventor|Designed\s+by|Created\s+by|Founded\s+by|Founders)\s*[·:]\s*(.+)/i));
        if (creatorHit) {
            return creatorHit;
        }
        // Stem prefixes ("creat" from "created"): leading \b only, because the stem sits INSIDE the full word.
        const isCreationQuestion = /\b(?:creat|built|invent|found|design|start|conceiv|originat|develop|made|wrote|began)\w*/i.test(topicTerms.join(' '));
        // Tier 2: developer/maintainer fields — skipped for creation questions so
        // "Who created Rust?" does not come back as "The Rust Team".
        if (!isCreationQuestion) {
            const developerHit = resultFromField(content.match(/(?:Developers|Developer|Maintainer|Author)\s*[·:]\s*(.+)/i));
            if (developerHit) {
                return developerHit;
            }
        }
    }
    // --- Infobox patterns (Wikipedia-style: "Topic: Field · Value") ---
    // \s+ is used between label words instead of a literal space so that NBSP also matches.
    const infoboxFields = new Map([
        ['when', new RegExp(`(?:${topicPattern}).*?(?:First\\s+appeared|Released|Founded|Established|Created|Launch\\s+date|Initial\\s+release)\\s*[·:]\\s*(.+)`, 'i')],
        ['what', new RegExp(`(?:${topicPattern}).*?(?:Type|Genre|Category|Classification)\\s*[·:]\\s*(.+)`, 'i')],
        ['where', /(?:Headquarters|Headquartered|Location|Address|HQ|Head\s+office|Based\s+in)\s*[·:]\s*(.+)/i],
    ]);
    const infoboxField = infoboxFields.get(questionType);
    if (infoboxField) {
        const infoboxHit = resultFromField(content.match(infoboxField));
        if (infoboxHit) {
            return infoboxHit;
        }
    }
    // --- Definition sentence patterns (e.g. "X is a Y developed by Z") ---
    if (isWho) {
        // Only the opening of the document is searched: 20% of it, but at least 500 chars.
        const opening = content.slice(0, Math.max(500, Math.floor(content.length * 0.2)));
        // Verbs are matched case-insensitively; the casing of the name is validated separately below.
        const byMatch = opening.match(/(?:developed|designed|created|built|invented|founded|authored|introduced|coined|conceived|released|started|launched|begun|proposed|established)\s+(?:\w+\s+){0,4}by\s+(\S+(?:\s+\S+){0,3})/i);
        if (byMatch?.[1]) {
            // The first word after "by" must be capitalized and not a determiner
            // (rejects e.g. "by generative AI software", "by The ...").
            const leadWord = byMatch[1].trim().split(/\s+/)[0];
            const looksLikeName = /^[A-Z]/.test(leadWord) && !/^(The|A|An|This|That|Its|Their|Our|Some|Many|Most|All|Each|Every)$/.test(leadWord);
            if (looksLikeName) {
                return resultFromSentence(opening, byMatch);
            }
        }
    }
    if (questionType === 'when') {
        // Only the opening of the document is searched: 30% of it, but at least 600 chars.
        const opening = content.slice(0, Math.max(600, Math.floor(content.length * 0.3)));
        // "began"/"started" are intentionally absent: they match start events that
        // do not answer e.g. "When did X fall?" ("began on Aug 13, 1961").
        const dateMatch = opening.match(/(?:released|launched|first appeared|founded|established|created|introduced|conceived|opened|invented)\s+(?:\w+\s+){0,2}(?:in|on)\s+(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i);
        if (dateMatch) {
            return resultFromSentence(opening, dateMatch);
        }
    }
    return null;
}
|
|
391
|
+
// ---------------------------------------------------------------------------
|
|
392
|
+
// Entity extraction — for who/when questions answered by BM25
|
|
393
|
+
// ---------------------------------------------------------------------------
|
|
394
|
+
/**
 * Pull a short entity out of an already-selected passage.
 *
 * - 'who': the capitalized name following "by", otherwise a two-to-four-word
 *   capitalized name directly preceding a creation verb.
 * - 'when': the first "D Month YYYY", "Month D, YYYY" or bare four-digit match.
 *
 * @param {string} passage - Passage text to scan.
 * @param {string} questionType - Detected question type.
 * @returns {string | null} The entity, or null when nothing matches or the
 *   question type is neither 'who' nor 'when'.
 */
function extractEntity(passage, questionType) {
    // Ordered list of patterns to try; group 1 of the first hit is the answer.
    let candidates = [];
    if (questionType === 'who') {
        candidates = [
            // "by [Name Name]"
            /\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/,
            // "[Name Name] created/founded/..."
            /([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\s+(?:created|founded|designed|developed|built|invented|authored|introduced)/,
        ];
    }
    else if (questionType === 'when') {
        candidates = [
            /\b(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})\b/,
        ];
    }
    for (const pattern of candidates) {
        const hit = passage.match(pattern);
        if (hit) {
            return hit[1];
        }
    }
    return null;
}
|
|
418
|
+
// ---------------------------------------------------------------------------
|
|
419
|
+
// Entity type check for confidence formula
|
|
420
|
+
// ---------------------------------------------------------------------------
|
|
421
|
+
/**
 * Check whether a passage contains the kind of token the question type calls for.
 *
 * @param {string} text - Candidate answer text.
 * @param {string} questionType - Detected question type.
 * @returns {boolean} For 'who': two adjacent capitalized words; 'when': a
 *   four-digit number or month name; 'how_many'/'how_much': any standalone
 *   integer; 'where': a location preposition/keyword. Every other type is
 *   treated as satisfied and returns true.
 */
function hasExpectedEntityType(text, questionType) {
    const anyNumber = /\b\d+\b/;
    const expectedByType = new Map([
        ['who', /[A-Z][a-z]+\s+[A-Z][a-z]+/],
        ['when', /\b\d{4}\b|\b(january|february|march|april|may|june|july|august|september|october|november|december)\b/i],
        ['how_many', anyNumber],
        ['how_much', anyNumber],
        ['where', /\b(in|at|near|located|based|headquarter)\b/i],
    ]);
    const expected = expectedByType.get(questionType);
    // No requirement registered for this type → nothing to verify.
    return expected ? expected.test(text) : true;
}
|
|
436
|
+
// ---------------------------------------------------------------------------
|
|
437
|
+
// Content cleaning — strip citation/reference noise before BM25 scoring
|
|
438
|
+
// ---------------------------------------------------------------------------
|
|
439
|
+
/**
 * Strip markdown syntax and citation/reference noise from content before
 * BM25 scoring, so that rare citation tokens (CS1_maint, arXiv, doi, ...) do
 * not dominate the ranking.
 *
 * Order matters: whole sections are pruned first, while their `#` heading
 * markers are still present, and only then are the remaining heading markers
 * and inline markdown removed.
 *
 * @param {string} content - Markdown/text of the fetched page.
 * @returns {string} Cleaned text with runs of blank lines collapsed to one.
 */
function cleanContentForQA(content) {
    let cleaned = content;
    // --- Section pruning (must precede heading-marker stripping below) ---
    // Remove "External links" and everything after (usually just URLs)
    cleaned = cleaned.replace(/^#{1,3}\s*External\s+links[\s\S]*$/im, '');
    // Fix #8: Remove entire "See also", "Notes", "Further reading" sections
    // (heading + all content until the next h1–h3 heading)
    cleaned = cleaned.replace(/^#{1,3}\s*(?:See\s+also|Notes|Further\s+reading)\s*\n(?:(?!^#{1,3}\s).*\n?)*/gim, '');
    // Remove "References" heading only (keep nearby content that may be relevant)
    cleaned = cleaned.replace(/^#{1,3}\s*References\s*$/im, '');
    // --- Markdown formatting ---
    // Images: ![alt](url) → remove entirely
    cleaned = cleaned.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
    // Links: [text](url "title") → text (keep link text, remove URL and title)
    cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
    // Bold/italic: ***text***, **text**, *text* → text
    cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
    // Inline code: `text` → text
    cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
    // Heading markers: ## Heading → Heading
    cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
    // Horizontal rules
    cleaned = cleaned.replace(/^---+$/gm, '');
    // --- HTML entities ---
    // Decoded in ONE pass so that "&amp;lt;" becomes "&lt;" rather than "<";
    // numeric entities are dropped.
    const namedEntities = { amp: '&', lt: '<', gt: '>', nbsp: ' ' };
    cleaned = cleaned.replace(/&(?:(amp|lt|gt|nbsp)|#\d+);/g, (_entity, name) => (name ? namedEntities[name] : ''));
    // --- Citation noise ---
    // Wikipedia citation metadata (CS1_maint, Category:, etc.)
    cleaned = cleaned.replace(/CS1[_\s]\w+[:\s][^\n]*/gi, '');
    cleaned = cleaned.replace(/Category:[^\n]*/gi, '');
    // Reference number markers [1], [2], [309], etc.
    cleaned = cleaned.replace(/\[\d{1,4}\]/g, '');
    // Academic identifiers (arXiv, doi, ISBN, ISSN, Bibcode, PMID, S2CID, JSTOR, OCLC)
    cleaned = cleaned.replace(/\b(arXiv|doi|ISBN|ISSN|Bibcode|PMID|S2CID|JSTOR|OCLC)\s*[:=]\s*\S+/gi, '');
    // Bare URLs on their own line (often in reference sections)
    cleaned = cleaned.replace(/^https?:\/\/\S+$/gm, '');
    // "Retrieved DATE" and "Archived from the original" patterns
    cleaned = cleaned.replace(/\b(retrieved|archived from the original)\b[^\n]{0,100}/gi, '');
    // --- Line-level filter for citation-like lines ---
    cleaned = cleaned.split('\n').filter((line) => {
        const trimmed = line.trim();
        if (!trimmed) {
            return true; // keep blank lines
        }
        // Lines starting with "^" are Wikipedia footnotes
        if (trimmed.startsWith('^')) {
            return false;
        }
        if (trimmed.length < 10) {
            return true; // keep very short real lines
        }
        // Longer lines where fewer than 40% of the characters are ASCII letters are likely citations
        const alphaCount = (trimmed.match(/[a-zA-Z]/g) || []).length;
        return !(trimmed.length > 30 && alphaCount / trimmed.length < 0.4);
    }).join('\n');
    // Collapse multiple blank lines
    cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
    return cleaned;
}
|
|
504
|
+
// ---------------------------------------------------------------------------
|
|
505
|
+
// Main quickAnswer function
|
|
506
|
+
// ---------------------------------------------------------------------------
|
|
507
|
+
/**
|
|
508
|
+
* Answer a question about fetched page content using BM25 + heuristics.
|
|
509
|
+
*
|
|
510
|
+
* This is a fully offline, LLM-free approach. It:
|
|
511
|
+
* 1. Cleans the content (strips Wikipedia citations, reference noise, etc.)
|
|
512
|
+
* 2. Tries direct pattern extraction for structured content (infoboxes, definitions)
|
|
513
|
+
* 3. Falls back to BM25 sentence scoring with question-type-aware boosting
|
|
514
|
+
* 4. Uses sliding windows (1-3 sentences) to capture multi-sentence answers
|
|
515
|
+
* 5. Expands query terms with synonyms for broader matching
|
|
516
|
+
* 6. Returns the top passages with scores and surrounding context
|
|
517
|
+
*
|
|
518
|
+
* @param options - Question, content, and optional tuning parameters
|
|
519
|
+
* @returns A result object with answer text, confidence score, and ranked passages
|
|
520
|
+
*
|
|
521
|
+
* @example
|
|
522
|
+
* ```ts
|
|
523
|
+
* const result = await quickAnswer({
|
|
524
|
+
* question: 'What is the pricing?',
|
|
525
|
+
* content: pageMarkdown,
|
|
526
|
+
* url: 'https://example.com/pricing',
|
|
527
|
+
* });
|
|
528
|
+
* console.log(result.answer, result.confidence);
|
|
529
|
+
* ```
|
|
530
|
+
*/
|
|
531
|
+
export function quickAnswer(options) {
|
|
532
|
+
const { question, content, maxPassages = 3, maxChars = 2000, url = '', } = options;
|
|
533
|
+
const emptyResult = {
|
|
534
|
+
question,
|
|
535
|
+
answer: '',
|
|
536
|
+
confidence: 0,
|
|
537
|
+
passages: [],
|
|
538
|
+
source: url,
|
|
539
|
+
method: 'bm25',
|
|
540
|
+
};
|
|
541
|
+
if (!content || !content.trim())
|
|
542
|
+
return emptyResult;
|
|
543
|
+
if (!question || !question.trim())
|
|
544
|
+
return emptyResult;
|
|
545
|
+
// Clean content to remove citation/reference noise before BM25 scoring
|
|
546
|
+
const cleanedContent = cleanContentForQA(content);
|
|
547
|
+
// For very long content, focus on the most relevant portion.
|
|
548
|
+
// Wikipedia article tails contain references, tangential details, and noise.
|
|
549
|
+
const MAX_QA_CHARS = 20000;
|
|
550
|
+
let qaContent = cleanedContent;
|
|
551
|
+
if (qaContent.length > MAX_QA_CHARS) {
|
|
552
|
+
// Keep the first 70% — definitions, key facts, and main content
|
|
553
|
+
// are almost always in the first 2/3 of the article
|
|
554
|
+
qaContent = qaContent.slice(0, Math.floor(qaContent.length * 0.7));
|
|
555
|
+
}
|
|
556
|
+
// Step 0: Direct pattern extraction — try to find structured answers before BM25
|
|
557
|
+
// This catches infobox patterns (e.g. "TypeScript: Designed by · Anders Hejlsberg")
|
|
558
|
+
// and definition sentences (e.g. "TypeScript is ... developed by Microsoft")
|
|
559
|
+
const questionType = detectQuestionType(question);
|
|
560
|
+
// RAW (unstemmed) topic terms for tryDirectExtraction regex patterns
|
|
561
|
+
const topicTermsRaw = tokenizeRaw(question).filter(t => !STOPWORDS.has(t));
|
|
562
|
+
// Fix #9: Remove the unused `question` argument from the call site
|
|
563
|
+
const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTermsRaw);
|
|
564
|
+
if (directAnswer) {
|
|
565
|
+
return {
|
|
566
|
+
question,
|
|
567
|
+
answer: directAnswer.text.length > maxChars ? directAnswer.text.slice(0, maxChars) + '…' : directAnswer.text,
|
|
568
|
+
confidence: directAnswer.confidence,
|
|
569
|
+
passages: [{ text: directAnswer.text, score: directAnswer.confidence, context: directAnswer.context }],
|
|
570
|
+
source: url,
|
|
571
|
+
method: 'bm25',
|
|
572
|
+
};
|
|
573
|
+
}
|
|
574
|
+
// Step 1: Split into sentences (use qaContent — truncated for long articles)
|
|
575
|
+
const sentences = splitIntoSentences(qaContent);
|
|
576
|
+
if (sentences.length === 0)
|
|
577
|
+
return emptyResult;
|
|
578
|
+
// Step 2: Tokenize question (remove stopwords, then stem)
|
|
579
|
+
const queryTerms = tokenizeQuestion(question);
|
|
580
|
+
if (queryTerms.length === 0) {
|
|
581
|
+
// Fall back to all stemmed tokens if all were stopwords
|
|
582
|
+
const fallback = tokenize(question);
|
|
583
|
+
if (fallback.length === 0)
|
|
584
|
+
return emptyResult;
|
|
585
|
+
queryTerms.push(...fallback);
|
|
586
|
+
}
|
|
587
|
+
// Expand query with synonyms for broader matching
|
|
588
|
+
const expanded = expandWithSynonyms(queryTerms);
|
|
589
|
+
// Use all expanded terms for BM25 (IDF naturally downweights common synonyms)
|
|
590
|
+
const uniqueQueryTerms = [...new Set(expanded.map(e => e.term))];
|
|
591
|
+
// Step 3: Create stemmed scoring blocks for each sentence.
|
|
592
|
+
// We pass stemmed text to scoreBM25 so that its internal tokenizer gets stemmed tokens,
|
|
593
|
+
// matching the stemmed queryTerms. The original sentence text is preserved for display.
|
|
594
|
+
const scoringBlocks = sentences.map((s, index) => ({
|
|
595
|
+
raw: tokenize(s.text).join(' '), // pre-stemmed text for BM25 scoring
|
|
596
|
+
index,
|
|
597
|
+
}));
|
|
598
|
+
// ---------------------------------------------------------------------------
|
|
599
|
+
// Step 3.5: Lightweight topic propagation (coreference approximation)
|
|
600
|
+
// ---------------------------------------------------------------------------
|
|
601
|
+
// When a sentence uses a referent phrase like "The platform" or "The company"
|
|
602
|
+
// instead of the topic entity name, BM25 can't match it. We inject stemmed
|
|
603
|
+
// topic terms into scoring blocks of nearby referent sentences so BM25 has
|
|
604
|
+
// something to work with.
|
|
605
|
+
//
|
|
606
|
+
// Only active for question types where coreference resolution helps:
|
|
607
|
+
// where, who, when — NOT for what/how/yes_no/how_many (no entity tracking needed).
|
|
608
|
+
//
|
|
609
|
+
// Heuristic: A sentence gets topic injection if:
|
|
610
|
+
// 1. It contains a common referent pattern (the platform/company/service/etc.)
|
|
611
|
+
// 2. It is within PROXIMITY_WINDOW sentences of a sentence containing the topic
|
|
612
|
+
// 3. OR the content has fewer than SMALL_CONTENT_THRESHOLD sentences AND
|
|
613
|
+
// the topic is actually mentioned somewhere in the content (topicSentenceIndices non-empty)
|
|
614
|
+
if (questionType === 'where' || questionType === 'who' || questionType === 'when') {
|
|
615
|
+
const REFERENT_PATTERNS = /\b(?:the\s+)?(?:platform|company|service|product|tool|application|system|framework|library|project|organization|software|language|program|site|website|app|api|sdk|package|module|engine|firm|startup|corporation)\b|^(?:It|They|He|She)\s/im;
|
|
616
|
+
const PROXIMITY_WINDOW = 5;
|
|
617
|
+
const SMALL_CONTENT_THRESHOLD = 15;
|
|
618
|
+
// Find which sentences contain at least one topic term
|
|
619
|
+
const topicSentenceIndices = new Set();
|
|
620
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
621
|
+
const stemmedSentence = scoringBlocks[i].raw;
|
|
622
|
+
if (queryTerms.some(t => stemmedSentence.includes(t))) {
|
|
623
|
+
topicSentenceIndices.add(i);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
// Only inject if the topic is actually mentioned somewhere (non-empty topicSentenceIndices)
|
|
627
|
+
if (topicSentenceIndices.size > 0) {
|
|
628
|
+
// Inject topic terms into referent sentences that are near topic sentences
|
|
629
|
+
const topicInjection = ' ' + queryTerms.join(' ');
|
|
630
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
631
|
+
if (topicSentenceIndices.has(i))
|
|
632
|
+
continue; // already has topic terms
|
|
633
|
+
const hasReferent = REFERENT_PATTERNS.test(sentences[i].text);
|
|
634
|
+
if (!hasReferent)
|
|
635
|
+
continue;
|
|
636
|
+
// Check proximity: is this sentence within PROXIMITY_WINDOW of a topic sentence?
|
|
637
|
+
const isNearTopic = sentences.length < SMALL_CONTENT_THRESHOLD ||
|
|
638
|
+
[...topicSentenceIndices].some(j => Math.abs(i - j) <= PROXIMITY_WINDOW);
|
|
639
|
+
if (isNearTopic) {
|
|
640
|
+
scoringBlocks[i].raw += topicInjection;
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
// Step 4: Score sentences with BM25
|
|
646
|
+
const bm25Scores = scoreBM25(scoringBlocks, uniqueQueryTerms);
|
|
647
|
+
// Step 5: Compute max possible score for normalization
|
|
648
|
+
const maxPossibleScore = Math.max(...bm25Scores, 0.001);
|
|
649
|
+
// Step 6: Apply boosts (position bias, question type, definition patterns)
|
|
650
|
+
const totalSentences = sentences.length;
|
|
651
|
+
const sentenceScores = sentences.map((s, i) => {
|
|
652
|
+
const isTopicSentence = i === 0 || qaContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
|
|
653
|
+
const base = bm25Scores[i];
|
|
654
|
+
const boost = computeBoost(s.text, questionType, isTopicSentence);
|
|
655
|
+
// Fix #3: Position bias — reduce for 'why' and 'how' (answers can be anywhere)
|
|
656
|
+
const maxPositionBoost = (questionType === 'why' || questionType === 'how') ? 0.15 : 0.4;
|
|
657
|
+
const positionRatio = i / totalSentences;
|
|
658
|
+
// Fix position bias: scale by how many query terms THIS sentence matches.
|
|
659
|
+
// A sentence matching only 1/3 query terms (e.g., just "python") gets 1/3 of the
|
|
660
|
+
// position boost — prevents the first sentence from winning on position alone.
|
|
661
|
+
const sentTokens = tokenize(s.text);
|
|
662
|
+
const sentTermMatches = uniqueQueryTerms.filter(t => sentTokens.includes(t)).length;
|
|
663
|
+
const sentTermCoverage = uniqueQueryTerms.length > 0
|
|
664
|
+
? sentTermMatches / Math.min(uniqueQueryTerms.length, 5)
|
|
665
|
+
: 0;
|
|
666
|
+
const rawPositionBoost = positionRatio < 0.1 ? maxPositionBoost
|
|
667
|
+
: positionRatio < 0.5 ? maxPositionBoost * (1 - (positionRatio - 0.1) / 0.4)
|
|
668
|
+
: 0;
|
|
669
|
+
const positionBoost = rawPositionBoost * sentTermCoverage;
|
|
670
|
+
// Fix #2: Only apply definitionBoost for 'what' and 'other' question types.
|
|
671
|
+
const sl = s.text.toLowerCase();
|
|
672
|
+
const definitionBoost = (questionType === 'what' || questionType === 'other') &&
|
|
673
|
+
/\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
|
|
674
|
+
// Extra boost for definition sentences very early in the content (for 'what' questions)
|
|
675
|
+
// This handles Wikipedia-style articles where the first sentence IS the answer
|
|
676
|
+
const earlyDefinitionBoost = (questionType === 'what' &&
|
|
677
|
+
positionRatio < 0.05 &&
|
|
678
|
+
/\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(sl)) ? 0.5 : 0;
|
|
679
|
+
const total = base + (boost + positionBoost + definitionBoost + earlyDefinitionBoost) * maxPossibleScore;
|
|
680
|
+
return { text: s.text, index: i, score: total, base };
|
|
681
|
+
});
|
|
682
|
+
const windows = [];
|
|
683
|
+
// Single-sentence windows (preserve existing behavior)
|
|
684
|
+
for (let i = 0; i < sentences.length; i++) {
|
|
685
|
+
const score = sentenceScores[i].score;
|
|
686
|
+
const lengthPenalty = 0;
|
|
687
|
+
windows.push({
|
|
688
|
+
text: sentences[i].text,
|
|
689
|
+
indices: [i],
|
|
690
|
+
startSentenceIdx: i,
|
|
691
|
+
score: score * (1 - lengthPenalty),
|
|
692
|
+
});
|
|
693
|
+
}
|
|
694
|
+
// 2-sentence windows
|
|
695
|
+
for (let i = 0; i < sentences.length - 1; i++) {
|
|
696
|
+
const score = (sentenceScores[i].score + sentenceScores[i + 1].score) / 2;
|
|
697
|
+
const lengthPenalty = 0.05;
|
|
698
|
+
windows.push({
|
|
699
|
+
text: sentences[i].text + ' ' + sentences[i + 1].text,
|
|
700
|
+
indices: [i, i + 1],
|
|
701
|
+
startSentenceIdx: i,
|
|
702
|
+
score: score * (1 - lengthPenalty),
|
|
703
|
+
});
|
|
704
|
+
}
|
|
705
|
+
// 3-sentence windows (only when content has enough sentences)
|
|
706
|
+
if (sentences.length >= 5) {
|
|
707
|
+
for (let i = 0; i < sentences.length - 2; i++) {
|
|
708
|
+
const score = (sentenceScores[i].score + sentenceScores[i + 1].score + sentenceScores[i + 2].score) / 3;
|
|
709
|
+
const lengthPenalty = 0.10;
|
|
710
|
+
windows.push({
|
|
711
|
+
text: sentences[i].text + ' ' + sentences[i + 1].text + ' ' + sentences[i + 2].text,
|
|
712
|
+
indices: [i, i + 1, i + 2],
|
|
713
|
+
startSentenceIdx: i,
|
|
714
|
+
score: score * (1 - lengthPenalty),
|
|
715
|
+
});
|
|
716
|
+
}
|
|
717
|
+
}
|
|
718
|
+
// Step 8: Sort windows by score
|
|
719
|
+
const sortedWindows = [...windows].sort((a, b) => b.score - a.score);
|
|
720
|
+
// Step 9: Select top N non-overlapping windows
|
|
721
|
+
const selectedPassages = [];
|
|
722
|
+
const usedSentenceIndices = new Set();
|
|
723
|
+
for (const win of sortedWindows) {
|
|
724
|
+
if (selectedPassages.length >= maxPassages)
|
|
725
|
+
break;
|
|
726
|
+
// Skip if any sentence in this window was already used
|
|
727
|
+
const hasOverlap = win.indices.some(i => usedSentenceIndices.has(i));
|
|
728
|
+
if (hasOverlap)
|
|
729
|
+
continue;
|
|
730
|
+
// Mark all sentences in this window as used
|
|
731
|
+
for (const i of win.indices)
|
|
732
|
+
usedSentenceIndices.add(i);
|
|
733
|
+
// Build context: include sentence before the window and after
|
|
734
|
+
const firstIdx = win.indices[0];
|
|
735
|
+
const lastIdx = win.indices[win.indices.length - 1];
|
|
736
|
+
const contextParts = [];
|
|
737
|
+
if (firstIdx > 0 && !usedSentenceIndices.has(firstIdx - 1)) {
|
|
738
|
+
contextParts.push(sentences[firstIdx - 1].text);
|
|
739
|
+
}
|
|
740
|
+
contextParts.push(win.text);
|
|
741
|
+
if (lastIdx < sentences.length - 1 && !usedSentenceIndices.has(lastIdx + 1)) {
|
|
742
|
+
contextParts.push(sentences[lastIdx + 1].text);
|
|
743
|
+
}
|
|
744
|
+
// Mark surrounding context sentences as used to avoid overlap
|
|
745
|
+
if (firstIdx > 0)
|
|
746
|
+
usedSentenceIndices.add(firstIdx - 1);
|
|
747
|
+
if (lastIdx < sentences.length - 1)
|
|
748
|
+
usedSentenceIndices.add(lastIdx + 1);
|
|
749
|
+
const context = contextParts.join(' ');
|
|
750
|
+
selectedPassages.push({
|
|
751
|
+
text: win.text,
|
|
752
|
+
score: Math.min(1, parseFloat((win.score / (maxPossibleScore || 1)).toFixed(4))),
|
|
753
|
+
context,
|
|
754
|
+
startIdx: firstIdx,
|
|
755
|
+
indices: win.indices,
|
|
756
|
+
});
|
|
757
|
+
}
|
|
758
|
+
// ---------------------------------------------------------------------------
|
|
759
|
+
// Step 10: Confidence computation — multi-signal formula
|
|
760
|
+
// ---------------------------------------------------------------------------
|
|
761
|
+
const topWindow = sortedWindows[0];
|
|
762
|
+
const topBase = topWindow ? Math.max(...topWindow.indices.map(i => sentenceScores[i].base)) : 0;
|
|
763
|
+
const meanScore = bm25Scores.reduce((a, b) => a + b, 0) / bm25Scores.length;
|
|
764
|
+
// Signal 1: Score gap
|
|
765
|
+
const scoreGap = maxPossibleScore > 0 ? (topBase - meanScore) / maxPossibleScore : 0;
|
|
766
|
+
// Signal 2: Term coverage — what % of query terms appear in top window
|
|
767
|
+
// Also count synonym-mediated matches (at 0.7 weight)
|
|
768
|
+
const topWindowTokens = tokenize(topWindow?.text || '');
|
|
769
|
+
const directMatches = queryTerms.filter(t => topWindowTokens.includes(t)).length;
|
|
770
|
+
const matchedTerms = queryTerms.filter(t => {
|
|
771
|
+
if (topWindowTokens.includes(t))
|
|
772
|
+
return true;
|
|
773
|
+
// Check if any synonym of this term appears in the top window
|
|
774
|
+
const synonymsForTerm = expandWithSynonyms([t]);
|
|
775
|
+
return synonymsForTerm.some(e => !e.isOriginal && topWindowTokens.includes(e.term));
|
|
776
|
+
});
|
|
777
|
+
const synonymMatches = matchedTerms.length - directMatches;
|
|
778
|
+
const effectiveCoverage = queryTerms.length > 0
|
|
779
|
+
? (directMatches + synonymMatches * 0.7) / queryTerms.length
|
|
780
|
+
: 0;
|
|
781
|
+
// Signal 3: Position signal — early in document is more reliable for factual Qs
|
|
782
|
+
const positionSignal = (topWindow?.startSentenceIdx ?? 999) < sentences.length * 0.2 ? 0.1 : 0;
|
|
783
|
+
// Signal 4: Answer type match — does the answer look like it answers the question type?
|
|
784
|
+
const typeMatch = hasExpectedEntityType(topWindow?.text || '', questionType) ? 0.20 : 0;
|
|
785
|
+
const rawConfidence = Math.min(1, Math.max(0, 0.1 + // reduced baseline (was 0.2)
|
|
786
|
+
scoreGap * 0.35 +
|
|
787
|
+
effectiveCoverage * 0.25 + // synonym-aware term coverage (was 0.30)
|
|
788
|
+
positionSignal +
|
|
789
|
+
typeMatch));
|
|
790
|
+
// Penalty: noise/metadata in top answer reduces confidence
|
|
791
|
+
const topAnswerText = (topWindow?.text || '').toLowerCase();
|
|
792
|
+
const noisePenalty = (/\bcs1[_\s]/i.test(topAnswerText) ||
|
|
793
|
+
/\bcategory:/i.test(topAnswerText) ||
|
|
794
|
+
/\b(archived|retrieved)\s+(from|on)\b/i.test(topAnswerText) ||
|
|
795
|
+
/\b(isbn|issn|doi|arxiv|bibcode|pmid)\b/i.test(topAnswerText) ||
|
|
796
|
+
(topAnswerText.match(/https?:\/\//g) || []).length > 2) ? 0.5 : 0;
|
|
797
|
+
// Fix #13: Penalty for UI chrome / navigation elements
|
|
798
|
+
const uiChromePenalty = (/\b(sign in|sign up|log in|log out|subscribe|newsletter|cookie|privacy policy|terms of service)\b/i.test(topAnswerText) ||
|
|
799
|
+
/\b(skip to|main menu|navigation|sidebar|footer|header|breadcrumb)\b/i.test(topAnswerText)) ? 0.3 : 0;
|
|
800
|
+
const confidence = Math.max(0, rawConfidence - noisePenalty - uiChromePenalty);
|
|
801
|
+
// ---------------------------------------------------------------------------
|
|
802
|
+
// Step 11: Try entity extraction for who/when questions (BM25 fallback)
|
|
803
|
+
// ---------------------------------------------------------------------------
|
|
804
|
+
let answerText = selectedPassages[0]?.context || selectedPassages[0]?.text || '';
|
|
805
|
+
// For who/when, try to surface a concise entity from the top passage
|
|
806
|
+
if ((questionType === 'who' || questionType === 'when') && selectedPassages[0]) {
|
|
807
|
+
const entity = extractEntity(selectedPassages[0].text, questionType);
|
|
808
|
+
if (entity && selectedPassages[0].text.includes(entity)) {
|
|
809
|
+
// Keep full passage text as answer (it contains the entity)
|
|
810
|
+
answerText = selectedPassages[0].text;
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
if (answerText.length > maxChars) {
|
|
814
|
+
answerText = answerText.slice(0, maxChars).replace(/\s+\S*$/, '') + '…';
|
|
815
|
+
}
|
|
816
|
+
// Trim total passages content to maxChars
|
|
817
|
+
let totalChars = 0;
|
|
818
|
+
const finalPassages = selectedPassages.map(p => {
|
|
819
|
+
const contextTrimmed = p.context.length + totalChars > maxChars
|
|
820
|
+
? p.context.slice(0, Math.max(0, maxChars - totalChars)).replace(/\s+\S*$/, '') + '…'
|
|
821
|
+
: p.context;
|
|
822
|
+
totalChars += contextTrimmed.length;
|
|
823
|
+
return { text: p.text, score: p.score, context: contextTrimmed };
|
|
824
|
+
});
|
|
825
|
+
return {
|
|
826
|
+
question,
|
|
827
|
+
answer: answerText,
|
|
828
|
+
confidence: parseFloat(confidence.toFixed(4)),
|
|
829
|
+
passages: finalPassages,
|
|
830
|
+
source: url,
|
|
831
|
+
method: 'bm25',
|
|
832
|
+
};
|
|
833
|
+
}
|