@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,833 @@
1
+ /**
2
+ * Quick Answer — LLM-free question answering using BM25 + heuristics
3
+ *
4
+ * Answers a question about page content without any API key.
5
+ * Uses BM25 relevance scoring + answer-signal boosting to surface
6
+ * the most relevant sentences.
7
+ *
8
+ * v2: Added Porter stemming, synonym expansion, and sliding window scoring.
9
+ */
10
+ import { scoreBM25 } from './bm25-filter.js';
11
+ import { stem } from './stemmer.js';
12
+ import { expandWithSynonyms } from './synonyms.js';
// ---------------------------------------------------------------------------
// Stopwords — function words stripped from the question before BM25 scoring
// ---------------------------------------------------------------------------
// Membership is all that matters here; entries keep their original insertion order.
const STOPWORDS = new Set([
    // interrogatives, articles and auxiliaries
    'what', 'is', 'the', 'how', 'do', 'a', 'an', 'where', 'when', 'why', 'which',
    'can', 'does', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may', 'might',
    'shall', 'must', 'did',
    // pronouns and possessives
    'i', 'you', 'he', 'she', 'it', 'we', 'they',
    'me', 'him', 'her', 'us', 'them',
    'my', 'your', 'his', 'its', 'our', 'their',
    // demonstratives
    'this', 'that', 'these', 'those',
    // prepositions
    'of', 'in', 'on', 'at', 'by', 'for', 'with', 'about', 'into', 'to', 'from', 'up', 'out',
    // conjunctions and misc. connectives
    'and', 'or', 'but', 'if', 'so', 'as', 'not', 'no', 'than', 'then', 'also',
]);
/**
 * Classify a question into a coarse type that drives boosting and extraction.
 *
 * Rules are checked in priority order and the first hit wins:
 *   1. quantity / price / duration ("how many", "how much", "how long", pricing) → 'how_many'
 *   2. question starts with an auxiliary verb ("is", "does", "can", …)            → 'yes_no'
 *   3. when / where / why / who
 *   4. "what company/person/team/…"                                               → 'who'
 *   5. what
 *   6. any remaining "how" (process / explanation)                                → 'how'
 *   7. nothing matched                                                            → 'other'
 *
 * Every interrogative is anchored with a leading word boundary so that words
 * merely containing one ("everywhere", "somewhat", "show") do not trigger a
 * misclassification.
 *
 * @param {string} question - The user's question.
 * @returns {'how_many'|'yes_no'|'when'|'where'|'why'|'who'|'what'|'how'|'other'}
 */
function detectQuestionType(question) {
    const q = question.toLowerCase().trim();
    // Ordered [pattern, type] pairs; `q` is already lower-cased so no `i` flag is needed.
    const rules = [
        [/\bhow\s+many|\bhow\s+much|\bhow\s+long|\bwhat\s+price|\bwhat\s+cost|pricing/, 'how_many'],
        [/^(is|does|can|will|are|has|do|did|was|were|could|should|would)\b/, 'yes_no'],
        [/\bwhen\b/, 'when'],
        [/\bwhere\b/, 'where'],
        [/\bwhy\b/, 'why'],
        [/\bwho\b/, 'who'],
        // "what company/person/team/group/organization" is really asking for an entity → 'who'
        [/\bwhat\s+(?:company|person|people|team|group|organization|organisation|developer|author|creator|founder)\b/, 'who'],
        [/\bwhat\b/, 'what'],
        // Quantity forms were handled above, so any remaining "how" is process/explanation.
        [/\bhow\b/, 'how'],
    ];
    for (const [pattern, type] of rules) {
        if (pattern.test(q)) {
            return type;
        }
    }
    return 'other';
}
54
+ // ---------------------------------------------------------------------------
55
+ // Tokenization
56
+ // ---------------------------------------------------------------------------
/**
 * Turn free text into stemmed tokens for BM25 scoring.
 *
 * Text is lower-cased, every character that is neither a word character nor
 * whitespace becomes a space, and the result is split on whitespace. Tokens of
 * length 0 or 1 are discarded; the rest are passed through `stem`. Query and
 * content share this pipeline so that inflected forms line up.
 *
 * @param {string} text
 * @returns {string[]} Stemmed tokens in source order.
 */
function tokenize(text) {
    const words = text.toLowerCase().replace(/[^\w\s]/g, ' ').split(/\s+/);
    const stems = [];
    for (const word of words) {
        if (word.length > 1) {
            stems.push(stem(word));
        }
    }
    return stems;
}
/**
 * Tokenize WITHOUT stemming. The raw tokens feed regex construction in
 * tryDirectExtraction, where surface forms must match the page text exactly.
 *
 * A token is a maximal run of word characters (letters, digits, underscore)
 * at least two characters long, taken from the lower-cased input.
 *
 * @param {string} text
 * @returns {string[]} Lower-cased tokens in source order.
 */
function tokenizeRaw(text) {
    // Punctuation and whitespace both act as separators, so collecting runs of
    // two or more word characters yields the same tokens as "replace, split, filter".
    const runs = text.toLowerCase().match(/\w{2,}/g);
    return runs === null ? [] : runs;
}
/**
 * Tokenize a question for BM25: stopwords are removed on the raw (unstemmed)
 * tokens first, and only the surviving tokens are stemmed.
 *
 * @param {string} question
 * @returns {string[]} Stemmed, stopword-free query terms.
 */
function tokenizeQuestion(question) {
    const stems = [];
    for (const word of tokenizeRaw(question)) {
        if (STOPWORDS.has(word)) {
            continue;
        }
        stems.push(stem(word));
    }
    return stems;
}
86
+ // ---------------------------------------------------------------------------
87
+ // Sentence splitting
88
+ // ---------------------------------------------------------------------------
/**
 * Split text into sentences, avoiding false splits inside URLs and after
 * common abbreviations, and additionally surface markdown bullet items
 * (`-`, `*`, `+`) as pseudo-sentences.
 *
 * Implementation note: instead of swapping protected spans for placeholders
 * (which shifts every later offset), sentence terminators inside protected
 * spans are overwritten with a same-length sentinel. Boundaries are detected
 * on that masked copy while the text itself is sliced from the untouched
 * input, so every reported `start` is a true offset into `content`.
 *
 * Dots inside decimals and version numbers ("3.14", "1.2.3") need no masking:
 * they are always followed by a digit, and a boundary requires whitespace or
 * end-of-input after the punctuation.
 *
 * @param {string} content - Text to split.
 * @returns {{text: string, start: number}[]} Sentences between 10 and 800
 *   characters long. For regular sentences `start` is the offset in `content`
 *   of the sentence's first non-whitespace character; for bullet items it is
 *   the offset at which the bullet pattern matched (the beginning of the list
 *   line, including any leading whitespace).
 */
function splitIntoSentences(content) {
    const MASK = '\u0000';
    // Length-preserving: each terminator becomes exactly one sentinel character.
    const maskTerminators = (segment) => segment.replace(/[.!?]/g, MASK);
    const URL_PATTERN = /https?:\/\/[^\s)>]+/g;
    // Mr. Mrs. Dr. St. vs. etc. e.g. i.e. U.S. U.K., month abbreviations, No. Vol. pp.
    const ABBREV_PATTERN = /\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|vs|etc|e\.g|i\.e|U\.S|U\.K|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec|No|Vol|pp)\./g;
    const masked = content
        .replace(URL_PATTERN, maskTerminators)
        .replace(ABBREV_PATTERN, maskTerminators);
    const sentences = [];
    // Record content[from, to) as a sentence, trimmed, with the offset of its first visible char.
    const addSegment = (from, to) => {
        const rawSegment = content.slice(from, to);
        const text = rawSegment.trim();
        if (!text) {
            return;
        }
        const leadingWhitespace = rawSegment.length - rawSegment.trimStart().length;
        sentences.push({ text, start: from + leadingWhitespace });
    };
    // A boundary is sentence-ending punctuation followed by whitespace or end of input.
    const boundaryPattern = /[.!?]+(?:\s+|\n+|$)/g;
    let lastEnd = 0;
    let match;
    while ((match = boundaryPattern.exec(masked)) !== null) {
        const end = match.index + match[0].length;
        addSegment(lastEnd, end);
        lastEnd = end;
    }
    // Trailing text that has no terminating punctuation.
    if (lastEnd < content.length) {
        addSegment(lastEnd, content.length);
    }
    // Markdown bullet items become pseudo-sentences unless sentence splitting
    // already captured them (in either containment direction).
    const listPattern = /^[\s]*[-*+]\s+(.+)$/gm;
    let listMatch;
    while ((listMatch = listPattern.exec(content)) !== null) {
        const item = listMatch[1].trim();
        if (item.length < 10 || item.length > 800) {
            continue;
        }
        const isDuplicate = sentences.some(s => s.text.includes(item) || item.includes(s.text));
        if (!isDuplicate) {
            sentences.push({ text: item, start: listMatch.index });
        }
    }
    // Drop fragments that are too short to be meaningful or too long to be one sentence.
    return sentences.filter(s => s.text.length >= 10 && s.text.length <= 800);
}
174
+ // ---------------------------------------------------------------------------
175
+ // Answer-signal boosting
176
+ // ---------------------------------------------------------------------------
/**
 * Compute an additive "answer signal" boost for a candidate sentence.
 *
 * A flat 0.1 is granted to topic sentences; on top of that, each question type
 * contributes fixed increments when the sentence shows the surface cues that
 * typically accompany an answer of that type (numbers for quantities, dates
 * for "when", place cues for "where", and so on).
 *
 * @param {string} sentence - Candidate sentence in its original casing.
 * @param {string} questionType - Type as produced by detectQuestionType.
 * @param {boolean} isTopicSentence - Whether the caller flagged this as a topic sentence.
 * @returns {number} Sum of all applicable increments (0 when nothing applies).
 */
function computeBoost(sentence, questionType, isTopicSentence) {
    const lowered = sentence.toLowerCase();
    // Increments are collected in application order and summed left to right.
    const increments = [];
    const grantIf = (condition, amount) => {
        if (condition) {
            increments.push(amount);
        }
    };
    grantIf(isTopicSentence, 0.1);
    if (questionType === 'how_many') {
        // A number with a unit/price beats a bare number; only one of the two applies.
        const hasQuantityWithUnit = /\$[\d,.]+|\d+[,.]?\d*\s*(per|\/|month|year|week|day|request|api|call|token|user|minute|second|hour|degree|meter|mile|kg|lb)/i.test(sentence);
        if (hasQuantityWithUnit) {
            increments.push(0.3);
        }
        else {
            grantIf(/\b\d+\b/.test(sentence), 0.15);
        }
    }
    else if (questionType === 'how') {
        // Process / explanation vocabulary
        grantIf(/\b(by using|through|works by|in order to|step|first|then|next|finally|process|method|approach|technique|way to|can be done)\b/i.test(lowered), 0.4);
        // Instructional verbs
        grantIf(/\b(install|run|execute|configure|set up|use|import|require|enable|disable|create|build|deploy)\b/i.test(lowered), 0.2);
    }
    else if (questionType === 'when') {
        // Month name, four-digit number, or a duration
        grantIf(/\b(january|february|march|april|may|june|july|august|september|october|november|december|\d{4}|\d+\s*(days?|weeks?|months?|years?))\b/i.test(sentence), 0.3);
        // "released/launched/… in|on|at|around <digit>"
        grantIf(/\b(released|launched|published|introduced|created|started|began|founded|established|invented)\s+(in|on|at|around)?\s*\d/i.test(sentence), 0.4);
    }
    else if (questionType === 'where') {
        // Primary location signal: location verb + preposition, "in/at <Capitalized>", or a place noun
        const hasLocationVerb = /\b(located|headquartered|based|founded|established)\s+(in|at)\b/i.test(lowered);
        const hasPlacePhrase = hasLocationVerb ||
            /\b(?:in|at)\s+(?:the\s+)?[A-Z][a-z]+(?:(?:\s+[A-Z][a-z]+)*|(?:,\s+[A-Z][a-z]+)*)\b/.test(sentence) ||
            /\b(city|country|state|region|continent|capital|office|campus|location|address)\b/i.test(lowered);
        grantIf(hasPlacePhrase, 0.6);
        // Street-level words and a fixed list of well-known places
        grantIf(/\b(street|avenue|boulevard|road|highway|route|district|province|county|netherlands|amsterdam|berlin|london|paris|tokyo|beijing|moscow|france|germany|japan|china|india|canada|australia|san francisco|new york|los angeles|seattle|chicago|boston|austin|miami)\b/i.test(lowered), 0.4);
        // Birth / origin phrasing
        grantIf(/\b(born|raised|grew up|native|hometown|birthplace|originally from)\b/i.test(lowered), 0.4);
    }
    else if (questionType === 'what') {
        // Definition phrasing
        grantIf(/\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(lowered), 0.5);
    }
    else if (questionType === 'why') {
        // Causal connectives
        grantIf(/\b(because|due to|reason|therefore|since|as a result|consequently|thus)\b/.test(lowered), 0.4);
        // Purpose / goal phrasing
        grantIf(/\b(as a successor|successor to|in order to|so that|to allow|to provide|to enable|to support|to replace|to improve|to address|to solve)\b/i.test(lowered), 0.4);
    }
    else if (questionType === 'who') {
        // "<verb> … by" (passive) or "<Name> <verb>" (active) attribution
        const passiveAttribution = /\b(created|designed|developed|built|invented|founded|authored|introduced|proposed|conceived|released|launched|established)\s+(?:\w+\s+){0,4}by\b/i.test(lowered);
        const hasAttribution = passiveAttribution ||
            /\b[A-Z][a-z]+\s+(?:[A-Z][a-z]+\s+)?(?:created|designed|developed|built|invented|founded|authored|introduced|conceived|began)\b/.test(sentence);
        grantIf(hasAttribution, 0.5);
        // Two adjacent capitalized words, unless the sentence opens with an article/preposition
        const looksLikeName = /\b[A-Z][a-z]+\s+[A-Z][a-z]+\b/.test(sentence);
        grantIf(looksLikeName && !/^(The|A|An|In|On|At)\b/.test(sentence), 0.2);
        // Role / organization vocabulary
        grantIf(/\b(ceo|cto|founder|president|director|manager|team|company|organization|engineer|professor|researcher)\b/i.test(lowered), 0.2);
    }
    else if (questionType === 'yes_no') {
        // Affirmation, negation, or capability verbs
        grantIf(/\b(yes|no|not|does not|doesn't|cannot|can't|isn't|aren't|won't|supports?|enables?|allows?|provides?|includes?)\b/i.test(lowered), 0.3);
    }
    return increments.reduce((sum, amount) => sum + amount, 0);
}
/**
 * Try to answer directly from structured or near-structured text before any
 * BM25 ranking is attempted.
 *
 * Strategies, in order:
 *   1. ('who')  infobox-style "Field · value" / "Field: value" entries. Creator
 *      fields are always tried; developer/maintainer fields are tried only when
 *      the question is not about creation/origin.
 *   2. ('when' / 'what' / 'where')  infobox fields; the 'when' and 'what'
 *      patterns require a topic term earlier on the same line.
 *   3. ('who')  a "<verb> … by <ProperNoun>" phrase in the leading part of the content.
 *   4. ('when') a "<verb> … in|on <date>" phrase in the leading part of the content.
 *
 * @param {string} content - Page text to search.
 * @param {string} questionType - Type as produced by detectQuestionType.
 * @param {string[]} topicTerms - RAW (unstemmed) question terms; they are
 *   regex-escaped and embedded in patterns, so they must match surface text.
 * @returns {{text: string, context: string, confidence: number} | null}
 *   The extracted answer, or null when no pattern applies.
 */
function tryDirectExtraction(content, questionType, topicTerms) {
    if (topicTerms.length === 0) {
        return null;
    }
    // First line of `value`, trimmed and capped at `limit` characters.
    const firstLine = (value, limit) => value.split('\n')[0].trim().slice(0, limit);
    // Convert an infobox regex match into an answer; null when absent or too short.
    const infoboxAnswer = (hit) => {
        if (!hit?.[1]) {
            return null;
        }
        const value = firstLine(hit[1], 300);
        if (value.length <= 2) {
            return null;
        }
        return { text: value, context: firstLine(hit[0], 500), confidence: 0.92 };
    };
    // Expand a matched phrase to the period-delimited span of `scope` that contains it.
    const enclosingSentence = (scope, phrase) => {
        const at = scope.indexOf(phrase);
        const from = Math.max(0, scope.lastIndexOf('.', at) + 1);
        const to = scope.indexOf('.', at + phrase.length);
        return scope.slice(from, to > 0 ? to + 1 : undefined).trim();
    };
    // Alternation of regex-escaped topic terms.
    const escapedTerms = [];
    for (const term of topicTerms) {
        escapedTerms.push(term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    }
    const topicPattern = escapedTerms.join('|');
    // --- Tiered 'who' infobox extraction ---
    // Entries look like "- Founders · Sam Altman…"; the field label alone is
    // specific enough, so no topic prefix is required.
    if (questionType === 'who') {
        // Tier 1: original-creator fields
        const creatorAnswer = infoboxAnswer(content.match(/(?:Original\s+author|Creator|Inventor|Designed\s+by|Created\s+by|Founded\s+by|Founders)\s*[·:]\s*(.+)/i));
        if (creatorAnswer) {
            return creatorAnswer;
        }
        // Tier 2: developer/maintainer fields. Skipped for creation questions so
        // a current maintainer is not returned as the original creator. The
        // pattern uses word prefixes (leading \b only) to cover inflected forms.
        const asksAboutCreation = /\b(?:creat|built|invent|found|design|start|conceiv|originat|develop|made|wrote|began)\w*/i.test(topicTerms.join(' '));
        if (!asksAboutCreation) {
            const developerAnswer = infoboxAnswer(content.match(/(?:Developers|Developer|Maintainer|Author)\s*[·:]\s*(.+)/i));
            if (developerAnswer) {
                return developerAnswer;
            }
        }
    }
    // --- Infobox patterns ("Topic … Field · Value") ---
    // \s+ is used inside field names so non-breaking spaces also match.
    let infoboxField = null;
    if (questionType === 'when') {
        infoboxField = new RegExp(`(?:${topicPattern}).*?(?:First\\s+appeared|Released|Founded|Established|Created|Launch\\s+date|Initial\\s+release)\\s*[·:]\\s*(.+)`, 'i');
    }
    else if (questionType === 'what') {
        infoboxField = new RegExp(`(?:${topicPattern}).*?(?:Type|Genre|Category|Classification)\\s*[·:]\\s*(.+)`, 'i');
    }
    else if (questionType === 'where') {
        infoboxField = /(?:Headquarters|Headquartered|Location|Address|HQ|Head\s+office|Based\s+in)\s*[·:]\s*(.+)/i;
    }
    if (infoboxField !== null) {
        const fieldAnswer = infoboxAnswer(content.match(infoboxField));
        if (fieldAnswer) {
            return fieldAnswer;
        }
    }
    // --- Definition sentence: "… developed/designed/created by <Name>" ---
    if (questionType === 'who') {
        // Only the leading 20% (at least 500 chars) is searched.
        const lead = content.slice(0, Math.max(500, Math.floor(content.length * 0.2)));
        const byHit = lead.match(/(?:developed|designed|created|built|invented|founded|authored|introduced|coined|conceived|released|started|launched|begun|proposed|established)\s+(?:\w+\s+){0,4}by\s+(\S+(?:\s+\S+){0,3})/i);
        if (byHit?.[1]) {
            // Verbs match case-insensitively, but the agent must look like a
            // proper noun: capitalized and not a determiner/quantifier.
            const leadingWord = byHit[1].trim().split(/\s+/)[0];
            const startsUppercase = /^[A-Z]/.test(leadingWord);
            const isDeterminer = /^(The|A|An|This|That|Its|Their|Our|Some|Many|Most|All|Each|Every)$/.test(leadingWord);
            if (startsUppercase && !isDeterminer) {
                const sentence = enclosingSentence(lead, byHit[0]);
                return { text: sentence || byHit[0], context: sentence, confidence: 0.88 };
            }
        }
    }
    // --- Dated event sentence: "… released/founded/… in|on <date>" ---
    if (questionType === 'when') {
        // Only the leading 30% (at least 600 chars) is searched. "began"/"started"
        // are deliberately absent: they tend to match the start of something
        // rather than the event the question asks about.
        const lead = content.slice(0, Math.max(600, Math.floor(content.length * 0.3)));
        const dateHit = lead.match(/(?:released|launched|first appeared|founded|established|created|introduced|conceived|opened|invented)\s+(?:\w+\s+){0,2}(?:in|on)\s+(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})/i);
        if (dateHit) {
            const sentence = enclosingSentence(lead, dateHit[0]);
            return { text: sentence || dateHit[0], context: sentence, confidence: 0.88 };
        }
    }
    return null;
}
391
+ // ---------------------------------------------------------------------------
392
+ // Entity extraction — for who/when questions answered by BM25
393
+ // ---------------------------------------------------------------------------
/**
 * Pull a specific entity out of a passage that BM25 already selected.
 *
 * - 'who':  a capitalized name following "by", otherwise a multi-word
 *           capitalized name directly preceding a creation verb.
 * - 'when': the first date-like expression ("9 November 1989",
 *           "March 3, 2015", or a bare four-digit number).
 * - any other type: nothing is extracted.
 *
 * @param {string} passage
 * @param {string} questionType
 * @returns {string | null} The entity text, or null if none was found.
 */
function extractEntity(passage, questionType) {
    // Patterns are tried in order; capture group 1 of the first hit is the entity.
    let candidates;
    if (questionType === 'who') {
        candidates = [
            /\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3})/,
            /([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})\s+(?:created|founded|designed|developed|built|invented|authored|introduced)/,
        ];
    }
    else if (questionType === 'when') {
        candidates = [/\b(\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{1,2},?\s+\d{4}|\d{4})\b/];
    }
    else {
        return null;
    }
    for (const pattern of candidates) {
        const hit = passage.match(pattern);
        if (hit) {
            return hit[1];
        }
    }
    return null;
}
418
+ // ---------------------------------------------------------------------------
419
+ // Entity type check for confidence formula
420
+ // ---------------------------------------------------------------------------
/**
 * Check whether `text` contains the kind of entity the question type calls for.
 *
 * - 'who':                  two adjacent capitalized words (a likely name)
 * - 'when':                 a four-digit number or a month name
 * - 'how_many' / 'how_much': any standalone number
 * - 'where':                a locative preposition or location verb, including
 *                           "headquarters" / "headquartered"
 * - anything else:          always true (no specific entity is expected)
 *
 * @param {string} text - Candidate answer text.
 * @param {string} questionType - Type as produced by detectQuestionType.
 * @returns {boolean}
 */
function hasExpectedEntityType(text, questionType) {
    switch (questionType) {
        case 'who':
            return /[A-Z][a-z]+\s+[A-Z][a-z]+/.test(text);
        case 'when':
            return /\b\d{4}\b|\b(january|february|march|april|may|june|july|august|september|october|november|december)\b/i.test(text);
        case 'how_many':
        case 'how_much':
            return /\b\d+\b/.test(text);
        case 'where':
            // The optional suffix lets "headquarters"/"headquartered" match; a bare
            // \bheadquarter\b only matches the uninflected word.
            return /\b(in|at|near|located|based|headquarter(?:s|ed)?)\b/i.test(text);
        default:
            return true;
    }
}
436
+ // ---------------------------------------------------------------------------
437
+ // Content cleaning — strip citation/reference noise before BM25 scoring
438
+ // ---------------------------------------------------------------------------
/**
 * Strip markdown syntax and citation/reference noise from page content so that
 * BM25 scores prose instead of rare metadata tokens (CS1_maint, arXiv, doi, …).
 *
 * Ordering matters in three places:
 *   - Section-level removals ("External links", "See also", "Notes",
 *     "Further reading", "References") run BEFORE heading markers are stripped,
 *     because those rules locate sections by their leading `#` characters.
 *   - `&amp;` is decoded AFTER the other entities, so an escaped entity such as
 *     `&amp;lt;` decodes once to `&lt;` instead of twice to `<`.
 *   - Emphasis stripping only matches spans that start with a non-space
 *     character and stay on one line, so `* ` list bullets are left intact.
 *
 * @param {string} content - Raw page text (typically markdown).
 * @returns {string} Cleaned text.
 */
function cleanContentForQA(content) {
    let cleaned = content;
    // --- Inline markdown ---
    // Images: ![alt](url) → removed entirely
    cleaned = cleaned.replace(/!\[[^\]]*\]\([^)]*\)/g, '');
    // Links: [text](url "title") → text
    cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
    // Bold/italic: ***text***, **text**, *text* → text
    cleaned = cleaned.replace(/\*{1,3}(?=\S)([^*\n]+)\*{1,3}/g, '$1');
    // Inline code: `text` → text
    cleaned = cleaned.replace(/`([^`]+)`/g, '$1');
    // --- Section-level removals (need the `#` heading markers still in place) ---
    // "External links" and everything after it
    cleaned = cleaned.replace(/^#{1,3}\s*External\s+links[\s\S]*$/im, '');
    // Entire "See also" / "Notes" / "Further reading" sections: the heading plus
    // all lines up to the next level 1-3 heading
    cleaned = cleaned.replace(/^#{1,3}\s*(?:See\s+also|Notes|Further\s+reading)\s*\n(?:(?!^#{1,3}\s).*\n?)*/gim, '');
    // "References" heading only; nearby content may still be relevant
    cleaned = cleaned.replace(/^#{1,3}\s*References\s*$/im, '');
    // --- Remaining block-level markdown ---
    // Heading markers: "## Heading" → "Heading"
    cleaned = cleaned.replace(/^#{1,6}\s+/gm, '');
    // Horizontal rules
    cleaned = cleaned.replace(/^---+$/gm, '');
    // --- HTML entities (`&amp;` last to avoid double decoding) ---
    cleaned = cleaned.replace(/&lt;/g, '<');
    cleaned = cleaned.replace(/&gt;/g, '>');
    cleaned = cleaned.replace(/&nbsp;/g, ' ');
    cleaned = cleaned.replace(/&#\d+;/g, '');
    cleaned = cleaned.replace(/&amp;/g, '&');
    // --- Citation noise ---
    // Wikipedia citation metadata (CS1_maint, Category:, etc.)
    cleaned = cleaned.replace(/CS1[_\s]\w+[:\s][^\n]*/gi, '');
    cleaned = cleaned.replace(/Category:[^\n]*/gi, '');
    // Reference number markers: [1], [2], [309]
    cleaned = cleaned.replace(/\[\d{1,4}\]/g, '');
    // Academic identifiers (arXiv, doi, ISBN, ISSN, Bibcode, PMID, S2CID, JSTOR, OCLC)
    cleaned = cleaned.replace(/\b(arXiv|doi|ISBN|ISSN|Bibcode|PMID|S2CID|JSTOR|OCLC)\s*[:=]\s*\S+/gi, '');
    // Bare URLs on their own line
    cleaned = cleaned.replace(/^https?:\/\/\S+$/gm, '');
    // "Retrieved DATE" and "Archived from the original …" (up to 100 following chars)
    cleaned = cleaned.replace(/\b(retrieved|archived from the original)\b[^\n]{0,100}/gi, '');
    // --- Line-level filter for citation-like lines ---
    cleaned = cleaned.split('\n').filter(line => {
        const trimmed = line.trim();
        if (!trimmed) {
            return true; // keep blank lines
        }
        // Lines starting with "^" are dropped as footnote back-references
        if (trimmed.startsWith('^')) {
            return false;
        }
        if (trimmed.length < 10) {
            return true; // keep very short real lines
        }
        // Longer lines with under 40% ASCII letters are treated as citations
        const alphaCount = (trimmed.match(/[a-zA-Z]/g) || []).length;
        if (trimmed.length > 30 && alphaCount / trimmed.length < 0.4) {
            return false;
        }
        return true;
    }).join('\n');
    // Collapse runs of three or more newlines down to one blank line
    cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
    return cleaned;
}
504
+ // ---------------------------------------------------------------------------
505
+ // Main quickAnswer function
506
+ // ---------------------------------------------------------------------------
507
+ /**
508
+ * Answer a question about fetched page content using BM25 + heuristics.
509
+ *
510
+ * This is a fully offline, LLM-free approach. It:
511
+ * 1. Cleans the content (strips Wikipedia citations, reference noise, etc.)
512
+ * 2. Tries direct pattern extraction for structured content (infoboxes, definitions)
513
+ * 3. Falls back to BM25 sentence scoring with question-type-aware boosting
514
+ * 4. Uses sliding windows (1-3 sentences) to capture multi-sentence answers
515
+ * 5. Expands query terms with synonyms for broader matching
516
+ * 6. Returns the top passages with scores and surrounding context
517
+ *
518
+ * @param options - Question, content, and optional tuning parameters
519
+ * @returns A result object with answer text, confidence score, and ranked passages
520
+ *
521
+ * @example
522
+ * ```ts
523
+ * const result = quickAnswer({
524
+ * question: 'What is the pricing?',
525
+ * content: pageMarkdown,
526
+ * url: 'https://example.com/pricing',
527
+ * });
528
+ * console.log(result.answer, result.confidence);
529
+ * ```
530
+ */
531
+ export function quickAnswer(options) {
532
+ const { question, content, maxPassages = 3, maxChars = 2000, url = '', } = options;
533
+ const emptyResult = {
534
+ question,
535
+ answer: '',
536
+ confidence: 0,
537
+ passages: [],
538
+ source: url,
539
+ method: 'bm25',
540
+ };
541
+ if (!content || !content.trim())
542
+ return emptyResult;
543
+ if (!question || !question.trim())
544
+ return emptyResult;
545
+ // Clean content to remove citation/reference noise before BM25 scoring
546
+ const cleanedContent = cleanContentForQA(content);
547
+ // For very long content, focus on the most relevant portion.
548
+ // Wikipedia article tails contain references, tangential details, and noise.
549
+ const MAX_QA_CHARS = 20000;
550
+ let qaContent = cleanedContent;
551
+ if (qaContent.length > MAX_QA_CHARS) {
552
+ // Keep the first 70% — definitions, key facts, and main content
553
+ // are almost always in the first 2/3 of the article
554
+ qaContent = qaContent.slice(0, Math.floor(qaContent.length * 0.7));
555
+ }
556
+ // Step 0: Direct pattern extraction — try to find structured answers before BM25
557
+ // This catches infobox patterns (e.g. "TypeScript: Designed by · Anders Hejlsberg")
558
+ // and definition sentences (e.g. "TypeScript is ... developed by Microsoft")
559
+ const questionType = detectQuestionType(question);
560
+ // RAW (unstemmed) topic terms for tryDirectExtraction regex patterns
561
+ const topicTermsRaw = tokenizeRaw(question).filter(t => !STOPWORDS.has(t));
562
+ // Fix #9: Remove the unused `question` argument from the call site
563
+ const directAnswer = tryDirectExtraction(cleanedContent, questionType, topicTermsRaw);
564
+ if (directAnswer) {
565
+ return {
566
+ question,
567
+ answer: directAnswer.text.length > maxChars ? directAnswer.text.slice(0, maxChars) + '…' : directAnswer.text,
568
+ confidence: directAnswer.confidence,
569
+ passages: [{ text: directAnswer.text, score: directAnswer.confidence, context: directAnswer.context }],
570
+ source: url,
571
+ method: 'bm25',
572
+ };
573
+ }
574
+ // Step 1: Split into sentences (use qaContent — truncated for long articles)
575
+ const sentences = splitIntoSentences(qaContent);
576
+ if (sentences.length === 0)
577
+ return emptyResult;
578
+ // Step 2: Tokenize question (remove stopwords, then stem)
579
+ const queryTerms = tokenizeQuestion(question);
580
+ if (queryTerms.length === 0) {
581
+ // Fall back to all stemmed tokens if all were stopwords
582
+ const fallback = tokenize(question);
583
+ if (fallback.length === 0)
584
+ return emptyResult;
585
+ queryTerms.push(...fallback);
586
+ }
587
+ // Expand query with synonyms for broader matching
588
+ const expanded = expandWithSynonyms(queryTerms);
589
+ // Use all expanded terms for BM25 (IDF naturally downweights common synonyms)
590
+ const uniqueQueryTerms = [...new Set(expanded.map(e => e.term))];
591
+ // Step 3: Create stemmed scoring blocks for each sentence.
592
+ // We pass stemmed text to scoreBM25 so that its internal tokenizer gets stemmed tokens,
593
+ // matching the stemmed queryTerms. The original sentence text is preserved for display.
594
+ const scoringBlocks = sentences.map((s, index) => ({
595
+ raw: tokenize(s.text).join(' '), // pre-stemmed text for BM25 scoring
596
+ index,
597
+ }));
598
+ // ---------------------------------------------------------------------------
599
+ // Step 3.5: Lightweight topic propagation (coreference approximation)
600
+ // ---------------------------------------------------------------------------
601
+ // When a sentence uses a referent phrase like "The platform" or "The company"
602
+ // instead of the topic entity name, BM25 can't match it. We inject stemmed
603
+ // topic terms into scoring blocks of nearby referent sentences so BM25 has
604
+ // something to work with.
605
+ //
606
+ // Only active for question types where coreference resolution helps:
607
+ // where, who, when — NOT for what/how/yes_no/how_many (no entity tracking needed).
608
+ //
609
+ // Heuristic: A sentence gets topic injection if:
610
+ // 1. It contains a common referent pattern (the platform/company/service/etc.)
611
+ // 2. It is within PROXIMITY_WINDOW sentences of a sentence containing the topic
612
+ // 3. OR the content has fewer than SMALL_CONTENT_THRESHOLD sentences AND
613
+ // the topic is actually mentioned somewhere in the content (topicSentenceIndices non-empty)
614
+ if (questionType === 'where' || questionType === 'who' || questionType === 'when') {
615
+ const REFERENT_PATTERNS = /\b(?:the\s+)?(?:platform|company|service|product|tool|application|system|framework|library|project|organization|software|language|program|site|website|app|api|sdk|package|module|engine|firm|startup|corporation)\b|^(?:It|They|He|She)\s/im;
616
+ const PROXIMITY_WINDOW = 5;
617
+ const SMALL_CONTENT_THRESHOLD = 15;
618
+ // Find which sentences contain at least one topic term
619
+ const topicSentenceIndices = new Set();
620
+ for (let i = 0; i < sentences.length; i++) {
621
+ const stemmedSentence = scoringBlocks[i].raw;
622
+ if (queryTerms.some(t => stemmedSentence.includes(t))) {
623
+ topicSentenceIndices.add(i);
624
+ }
625
+ }
626
+ // Only inject if the topic is actually mentioned somewhere (non-empty topicSentenceIndices)
627
+ if (topicSentenceIndices.size > 0) {
628
+ // Inject topic terms into referent sentences that are near topic sentences
629
+ const topicInjection = ' ' + queryTerms.join(' ');
630
+ for (let i = 0; i < sentences.length; i++) {
631
+ if (topicSentenceIndices.has(i))
632
+ continue; // already has topic terms
633
+ const hasReferent = REFERENT_PATTERNS.test(sentences[i].text);
634
+ if (!hasReferent)
635
+ continue;
636
+ // Check proximity: is this sentence within PROXIMITY_WINDOW of a topic sentence?
637
+ const isNearTopic = sentences.length < SMALL_CONTENT_THRESHOLD ||
638
+ [...topicSentenceIndices].some(j => Math.abs(i - j) <= PROXIMITY_WINDOW);
639
+ if (isNearTopic) {
640
+ scoringBlocks[i].raw += topicInjection;
641
+ }
642
+ }
643
+ }
644
+ }
645
+ // Step 4: Score sentences with BM25
646
+ const bm25Scores = scoreBM25(scoringBlocks, uniqueQueryTerms);
647
+ // Step 5: Compute max possible score for normalization
648
+ const maxPossibleScore = Math.max(...bm25Scores, 0.001);
649
+ // Step 6: Apply boosts (position bias, question type, definition patterns)
650
+ const totalSentences = sentences.length;
651
+ const sentenceScores = sentences.map((s, i) => {
652
+ const isTopicSentence = i === 0 || qaContent.slice(Math.max(0, s.start - 2), s.start).includes('\n');
653
+ const base = bm25Scores[i];
654
+ const boost = computeBoost(s.text, questionType, isTopicSentence);
655
+ // Fix #3: Position bias — reduce for 'why' and 'how' (answers can be anywhere)
656
+ const maxPositionBoost = (questionType === 'why' || questionType === 'how') ? 0.15 : 0.4;
657
+ const positionRatio = i / totalSentences;
658
+ // Fix position bias: scale by how many query terms THIS sentence matches.
659
+ // A sentence matching only 1/3 query terms (e.g., just "python") gets 1/3 of the
660
+ // position boost — prevents the first sentence from winning on position alone.
661
+ const sentTokens = tokenize(s.text);
662
+ const sentTermMatches = uniqueQueryTerms.filter(t => sentTokens.includes(t)).length;
663
+ const sentTermCoverage = uniqueQueryTerms.length > 0
664
+ ? sentTermMatches / Math.min(uniqueQueryTerms.length, 5)
665
+ : 0;
666
+ const rawPositionBoost = positionRatio < 0.1 ? maxPositionBoost
667
+ : positionRatio < 0.5 ? maxPositionBoost * (1 - (positionRatio - 0.1) / 0.4)
668
+ : 0;
669
+ const positionBoost = rawPositionBoost * sentTermCoverage;
670
+ // Fix #2: Only apply definitionBoost for 'what' and 'other' question types.
671
+ const sl = s.text.toLowerCase();
672
+ const definitionBoost = (questionType === 'what' || questionType === 'other') &&
673
+ /\b(is a|is an|was a|are a|refers to|is the|was the)\b/.test(sl) ? 0.3 : 0;
674
+ // Extra boost for definition sentences very early in the content (for 'what' questions)
675
+ // This handles Wikipedia-style articles where the first sentence IS the answer
676
+ const earlyDefinitionBoost = (questionType === 'what' &&
677
+ positionRatio < 0.05 &&
678
+ /\b(is a|is an|are a|refers to|means|defined as|known as)\b/.test(sl)) ? 0.5 : 0;
679
+ const total = base + (boost + positionBoost + definitionBoost + earlyDefinitionBoost) * maxPossibleScore;
680
+ return { text: s.text, index: i, score: total, base };
681
+ });
682
+ const windows = [];
683
+ // Single-sentence windows (preserve existing behavior)
684
+ for (let i = 0; i < sentences.length; i++) {
685
+ const score = sentenceScores[i].score;
686
+ const lengthPenalty = 0;
687
+ windows.push({
688
+ text: sentences[i].text,
689
+ indices: [i],
690
+ startSentenceIdx: i,
691
+ score: score * (1 - lengthPenalty),
692
+ });
693
+ }
694
+ // 2-sentence windows
695
+ for (let i = 0; i < sentences.length - 1; i++) {
696
+ const score = (sentenceScores[i].score + sentenceScores[i + 1].score) / 2;
697
+ const lengthPenalty = 0.05;
698
+ windows.push({
699
+ text: sentences[i].text + ' ' + sentences[i + 1].text,
700
+ indices: [i, i + 1],
701
+ startSentenceIdx: i,
702
+ score: score * (1 - lengthPenalty),
703
+ });
704
+ }
705
+ // 3-sentence windows (only when content has enough sentences)
706
+ if (sentences.length >= 5) {
707
+ for (let i = 0; i < sentences.length - 2; i++) {
708
+ const score = (sentenceScores[i].score + sentenceScores[i + 1].score + sentenceScores[i + 2].score) / 3;
709
+ const lengthPenalty = 0.10;
710
+ windows.push({
711
+ text: sentences[i].text + ' ' + sentences[i + 1].text + ' ' + sentences[i + 2].text,
712
+ indices: [i, i + 1, i + 2],
713
+ startSentenceIdx: i,
714
+ score: score * (1 - lengthPenalty),
715
+ });
716
+ }
717
+ }
718
+ // Step 8: Sort windows by score
719
+ const sortedWindows = [...windows].sort((a, b) => b.score - a.score);
720
+ // Step 9: Select top N non-overlapping windows
721
+ const selectedPassages = [];
722
+ const usedSentenceIndices = new Set();
723
+ for (const win of sortedWindows) {
724
+ if (selectedPassages.length >= maxPassages)
725
+ break;
726
+ // Skip if any sentence in this window was already used
727
+ const hasOverlap = win.indices.some(i => usedSentenceIndices.has(i));
728
+ if (hasOverlap)
729
+ continue;
730
+ // Mark all sentences in this window as used
731
+ for (const i of win.indices)
732
+ usedSentenceIndices.add(i);
733
+ // Build context: include sentence before the window and after
734
+ const firstIdx = win.indices[0];
735
+ const lastIdx = win.indices[win.indices.length - 1];
736
+ const contextParts = [];
737
+ if (firstIdx > 0 && !usedSentenceIndices.has(firstIdx - 1)) {
738
+ contextParts.push(sentences[firstIdx - 1].text);
739
+ }
740
+ contextParts.push(win.text);
741
+ if (lastIdx < sentences.length - 1 && !usedSentenceIndices.has(lastIdx + 1)) {
742
+ contextParts.push(sentences[lastIdx + 1].text);
743
+ }
744
+ // Mark surrounding context sentences as used to avoid overlap
745
+ if (firstIdx > 0)
746
+ usedSentenceIndices.add(firstIdx - 1);
747
+ if (lastIdx < sentences.length - 1)
748
+ usedSentenceIndices.add(lastIdx + 1);
749
+ const context = contextParts.join(' ');
750
+ selectedPassages.push({
751
+ text: win.text,
752
+ score: Math.min(1, parseFloat((win.score / (maxPossibleScore || 1)).toFixed(4))),
753
+ context,
754
+ startIdx: firstIdx,
755
+ indices: win.indices,
756
+ });
757
+ }
758
+ // ---------------------------------------------------------------------------
759
+ // Step 10: Confidence computation — multi-signal formula
760
+ // ---------------------------------------------------------------------------
761
+ const topWindow = sortedWindows[0];
762
+ const topBase = topWindow ? Math.max(...topWindow.indices.map(i => sentenceScores[i].base)) : 0;
763
+ const meanScore = bm25Scores.reduce((a, b) => a + b, 0) / bm25Scores.length;
764
+ // Signal 1: Score gap
765
+ const scoreGap = maxPossibleScore > 0 ? (topBase - meanScore) / maxPossibleScore : 0;
766
+ // Signal 2: Term coverage — what % of query terms appear in top window
767
+ // Also count synonym-mediated matches (at 0.7 weight)
768
+ const topWindowTokens = tokenize(topWindow?.text || '');
769
+ const directMatches = queryTerms.filter(t => topWindowTokens.includes(t)).length;
770
+ const matchedTerms = queryTerms.filter(t => {
771
+ if (topWindowTokens.includes(t))
772
+ return true;
773
+ // Check if any synonym of this term appears in the top window
774
+ const synonymsForTerm = expandWithSynonyms([t]);
775
+ return synonymsForTerm.some(e => !e.isOriginal && topWindowTokens.includes(e.term));
776
+ });
777
+ const synonymMatches = matchedTerms.length - directMatches;
778
+ const effectiveCoverage = queryTerms.length > 0
779
+ ? (directMatches + synonymMatches * 0.7) / queryTerms.length
780
+ : 0;
781
+ // Signal 3: Position signal — early in document is more reliable for factual Qs
782
+ const positionSignal = (topWindow?.startSentenceIdx ?? 999) < sentences.length * 0.2 ? 0.1 : 0;
783
+ // Signal 4: Answer type match — does the answer look like it answers the question type?
784
+ const typeMatch = hasExpectedEntityType(topWindow?.text || '', questionType) ? 0.20 : 0;
785
+ const rawConfidence = Math.min(1, Math.max(0, 0.1 + // reduced baseline (was 0.2)
786
+ scoreGap * 0.35 +
787
+ effectiveCoverage * 0.25 + // synonym-aware term coverage (was 0.30)
788
+ positionSignal +
789
+ typeMatch));
790
+ // Penalty: noise/metadata in top answer reduces confidence
791
+ const topAnswerText = (topWindow?.text || '').toLowerCase();
792
+ const noisePenalty = (/\bcs1[_\s]/i.test(topAnswerText) ||
793
+ /\bcategory:/i.test(topAnswerText) ||
794
+ /\b(archived|retrieved)\s+(from|on)\b/i.test(topAnswerText) ||
795
+ /\b(isbn|issn|doi|arxiv|bibcode|pmid)\b/i.test(topAnswerText) ||
796
+ (topAnswerText.match(/https?:\/\//g) || []).length > 2) ? 0.5 : 0;
797
+ // Fix #13: Penalty for UI chrome / navigation elements
798
+ const uiChromePenalty = (/\b(sign in|sign up|log in|log out|subscribe|newsletter|cookie|privacy policy|terms of service)\b/i.test(topAnswerText) ||
799
+ /\b(skip to|main menu|navigation|sidebar|footer|header|breadcrumb)\b/i.test(topAnswerText)) ? 0.3 : 0;
800
+ const confidence = Math.max(0, rawConfidence - noisePenalty - uiChromePenalty);
801
+ // ---------------------------------------------------------------------------
802
+ // Step 11: Try entity extraction for who/when questions (BM25 fallback)
803
+ // ---------------------------------------------------------------------------
804
+ let answerText = selectedPassages[0]?.context || selectedPassages[0]?.text || '';
805
+ // For who/when, try to surface a concise entity from the top passage
806
+ if ((questionType === 'who' || questionType === 'when') && selectedPassages[0]) {
807
+ const entity = extractEntity(selectedPassages[0].text, questionType);
808
+ if (entity && selectedPassages[0].text.includes(entity)) {
809
+ // Keep full passage text as answer (it contains the entity)
810
+ answerText = selectedPassages[0].text;
811
+ }
812
+ }
813
+ if (answerText.length > maxChars) {
814
+ answerText = answerText.slice(0, maxChars).replace(/\s+\S*$/, '') + '…';
815
+ }
816
+ // Trim total passages content to maxChars
817
+ let totalChars = 0;
818
+ const finalPassages = selectedPassages.map(p => {
819
+ const contextTrimmed = p.context.length + totalChars > maxChars
820
+ ? p.context.slice(0, Math.max(0, maxChars - totalChars)).replace(/\s+\S*$/, '') + '…'
821
+ : p.context;
822
+ totalChars += contextTrimmed.length;
823
+ return { text: p.text, score: p.score, context: contextTrimmed };
824
+ });
825
+ return {
826
+ question,
827
+ answer: answerText,
828
+ confidence: parseFloat(confidence.toFixed(4)),
829
+ passages: finalPassages,
830
+ source: url,
831
+ method: 'bm25',
832
+ };
833
+ }