@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,533 @@
1
+ /**
2
+ * Readability Engine
3
+ *
4
+ * Extracts the core article content from a web page — like Pocket, Instapaper,
5
+ * or Safari Reader Mode but deterministic, fast, and purpose-built for AI agents.
6
+ *
7
+ * Algorithm:
8
+ * 1. Noise removal — strip nav, footer, aside, ads, cookie banners, etc.
9
+ * 2. Candidate scoring — score block elements by text density, link density,
10
+ * paragraph count, and structural signals.
11
+ * 3. Best candidate selection — prefer <article>, then <main>, then [role="main"], then the highest-scoring div/section.
12
+ * 4. Post-selection cleaning — remove inline noise (share buttons, etc.).
13
+ * 5. Metadata extraction — title, author, date, site name from meta tags / bylines.
14
+ * 6. Markdown output — via existing rawHtmlToMarkdown().
15
+ */
16
+ import * as cheerio from 'cheerio';
17
+ import { rawHtmlToMarkdown } from './markdown.js';
18
+ import { cleanConcatenatedTitle } from './metadata.js';
19
+ // ─── Noise patterns ───────────────────────────────────────────────────────────
20
/**
 * Element names treated as page chrome rather than article content.
 * isNoise() looks tags up here after lowercasing the element's tag name.
 */
const NOISE_TAGS = new Set([
    'nav',
    'footer',
    'aside',
    'header',
    'script',
    'style',
    'noscript',
    'iframe',
    'form',
]);
25
/**
 * Class/id patterns that mark an element as page chrome.
 *
 * The regexes carry no `i` flag: they are meant to be tested against the
 * lowercased "class id" string produced by getClassAndId().
 */
const NOISE_CLASS_PATTERNS = [
    // Layout chrome
    /\bsidebar\b/, /\bmenu\b/, /\bnav(bar|igation)?\b/, /\bfooter\b/,
    // Engagement widgets
    /\bcomment/, /\bshare\b/, /\bsocial/, /\bwidget\b/,
    // Advertising and promotion
    /\bad(s|vert(isement)?|-unit)?\b/, /\bpromo\b/, /\bbanner(?!-content)/,
    // Cookie / consent prompts
    /\bcookie\b/, /\bconsent\b/,
    // Subscription prompts
    /\bnewsletter\b/, /\bsignup\b/, /\bsign-up\b/, /\bsubscri/,
    // Cross-promotion lists
    /\brelated\b/, /\brecommended\b/, /\bpopular\b/, /\btrending\b/,
    // Navigation aids
    /\bbreadcrumb/, /\bpagination\b/, /\btoolbar\b/,
    // Overlays and transient UI
    /\bmodal\b/, /\bpopup\b/, /\boverlay\b/, /\btoast\b/, /\bnotification\b/,
    // Accessibility skip links
    /\bskip-?link\b/,
];
61
/**
 * `role` attribute values that mark an element as page chrome.
 * isNoise() compares the lowercased attribute value against this set.
 */
const NOISE_ROLES = new Set([
    'navigation',
    'banner',
    'contentinfo',
    'complementary',
    'search',
]);
65
/**
 * Class/id patterns that mark an element as article content.
 * isNoise() checks these before the noise patterns, so a match here
 * protects the element from class/id- and role-based removal.
 */
const CONTENT_PATTERNS = [
    /\barticle/, /\bpost-?content/, /\bentry-?content/,
    /\bmain-?content/, /\bstory\b/, /\bpage-?content/,
    /\bcontent-?area\b/, /\bprose\b/, /\bmarkdown-?body\b/,
];
77
/**
 * Class/id patterns for small widgets (share bars, social icons, print/save
 * buttons) that cleanCandidate() strips from the already-selected candidate.
 */
const INLINE_NOISE_PATTERNS = [
    // Generic sharing / following
    /\bshare\b/, /\bsocial\b/, /\bfollow\b/,
    // Named networks
    /\btwitter\b/, /\bfacebook\b/, /\blinkedin\b/, /\binstagram\b/, /\bpinterest\b/,
    // Page actions
    /\bprint\b/, /\bsave\b/, /\bbookmark\b/,
];
91
+ // ─── Helpers ──────────────────────────────────────────────────────────────────
92
/**
 * Build the lowercased "<class> <id>" string used for pattern matching.
 * A missing attribute contributes an empty string; the single space
 * separator is always present.
 *
 * @param $el - Wrapped element exposing `.attr(name)`
 * @param _$ - Unused
 * @returns Lowercased class value, a space, then the lowercased id value
 */
function getClassAndId($el, _$) {
    const parts = ['class', 'id'].map((name) => ($el.attr(name) ?? '').toLowerCase());
    return parts.join(' ');
}
97
/**
 * Decide whether an element is page chrome that should be dropped.
 *
 * Order of checks:
 *  1. tag name in NOISE_TAGS          -> noise
 *  2. class/id matches CONTENT_PATTERNS -> not noise (protected)
 *  3. class/id matches NOISE_CLASS_PATTERNS -> noise
 *  4. `role` attribute in NOISE_ROLES -> noise
 *
 * @param el - Raw DOM element node
 * @param $ - Cheerio root used to wrap `el`
 * @returns true when the element should be removed
 */
function isNoise(el, $) {
    const tag = (el.tagName ?? '').toLowerCase();
    if (NOISE_TAGS.has(tag)) {
        return true;
    }
    const $node = $(el);
    const classAndId = getClassAndId($node, $);
    const matches = (pattern) => pattern.test(classAndId);
    // Content markers win over noise markers and over the role check.
    if (CONTENT_PATTERNS.some(matches)) {
        return false;
    }
    if (NOISE_CLASS_PATTERNS.some(matches)) {
        return true;
    }
    const role = ($node.attr('role') ?? '').toLowerCase();
    return NOISE_ROLES.has(role);
}
117
/**
 * Report whether an element is hidden via inline markup.
 *
 * An element counts as hidden when any of the following holds:
 *  - its inline `style` declares `display: none` (case-insensitive, with any
 *    amount of whitespace around the colon, e.g. `display : none`),
 *  - it carries a `hidden` attribute (any value, including empty),
 *  - its `aria-hidden` attribute is exactly `"true"`.
 *
 * Stylesheet-based hiding is not detected; only the element's own attributes
 * are inspected.
 *
 * @param $el - Wrapped element exposing `.attr(name)`
 * @returns true when the element is hidden by one of the rules above
 */
function isHidden($el) {
    const inlineStyle = $el.attr('style') ?? '';
    // CSS allows whitespace on both sides of the colon, so a plain substring
    // check for "display:none" / "display: none" misses valid declarations.
    if (/display\s*:\s*none/i.test(inlineStyle)) {
        return true;
    }
    if ($el.attr('hidden') !== undefined) {
        return true;
    }
    return $el.attr('aria-hidden') === 'true';
}
127
/**
 * Collect article metadata from the parsed document.
 *
 * Lookup order per field:
 *  - title:    og:title, twitter:title, <title>, first <h1>; then whitespace-
 *              collapsed and passed through cleanConcatenatedTitle()
 *  - author:   meta[name=author], meta[property=article:author],
 *              [rel=author] / [itemprop=author] text, common byline classes
 *  - date:     publish-date meta tags, first <time datetime>, JSON-LD
 *              `datePublished` (top-level objects and their `@graph` nodes)
 *  - siteName: og:site_name
 *  - language: primary subtag of <html lang>, else the Content-Language meta
 *
 * @param $ - Cheerio root for the whole document (call before noise removal
 *            so that byline and <time> elements are still present)
 * @returns `{ title, author, date, siteName, language }`; `title` is a string
 *          (possibly empty), every other field is a string or null
 */
function extractMeta($) {
    const squash = (value) => value.trim().replace(/\s+/g, ' ');
    // First non-empty `content` attribute among the given selectors, or null.
    const firstContent = (selectors) => {
        for (const selector of selectors) {
            const value = $(selector).attr('content');
            if (value) {
                return value;
            }
        }
        return null;
    };

    // ── Title ────────────────────────────────────────────────────────────────
    const rawTitle = firstContent(['meta[property="og:title"]', 'meta[name="twitter:title"]']) ||
        $('title').text() ||
        $('h1').first().text() ||
        '';
    const title = cleanConcatenatedTitle(squash(rawTitle));

    // ── Author ───────────────────────────────────────────────────────────────
    let author = firstContent(['meta[name="author"]', 'meta[property="article:author"]']);
    // Structured author markup. Only accept short text that plausibly is a name.
    // (Whitespace is already collapsed, so no newline check is needed.)
    if (!author) {
        for (const selector of ['[rel="author"]', '[itemprop="author"]']) {
            const text = squash($(selector).first().text());
            if (text.length > 1 && text.length < 60) {
                author = text;
                break;
            }
        }
    }
    // Byline class names.
    if (!author) {
        const bylineSelectors = [
            '.byline', '.author:not([class*="authority"])', '.post-author',
            '.article-author', '.entry-author', '[class*="byline"]',
        ];
        for (const selector of bylineSelectors) {
            const text = squash($(selector).first().text());
            if (text.length > 1 && text.length < 80) {
                // Strip the "By " prefix common in bylines
                author = text.replace(/^by\s+/i, '').trim();
                break;
            }
        }
    }
    // Reject values that look like scraped junk rather than a name.
    if (author && (author.split(/\s+/).length > 8 || /database|control|footer|sidebar/i.test(author))) {
        author = null;
    }
    if (author) {
        author = squash(author) || null;
    }

    // ── Date ─────────────────────────────────────────────────────────────────
    let date = firstContent([
        'meta[property="article:published_time"]',
        'meta[name="publishdate"]',
        'meta[name="publish_date"]',
        'meta[itemprop="datePublished"]',
    ]);
    if (!date) {
        const timeEl = $('time[datetime]').first();
        if (timeEl.length) {
            date = timeEl.attr('datetime') || timeEl.text().trim() || null;
        }
    }
    if (!date) {
        $('script[type="application/ld+json"]').each((_, el) => {
            if (date) {
                return;
            }
            let parsed;
            try {
                parsed = JSON.parse($(el).html() ?? '{}');
            }
            catch {
                return; // malformed JSON-LD: skip this script, try the next one
            }
            const roots = Array.isArray(parsed) ? parsed : [parsed];
            // Many publishers nest their nodes under "@graph"; search those too.
            const nodes = roots.flatMap((node) => node && Array.isArray(node['@graph']) ? [node, ...node['@graph']] : [node]);
            for (const node of nodes) {
                const published = node?.datePublished;
                // Only accept non-empty strings: anything else would break the
                // `.trim()` below (and later date formatting by callers).
                if (typeof published === 'string' && published.trim()) {
                    date = published;
                    break;
                }
            }
        });
    }
    if (date) {
        date = date.trim() || null;
    }

    // ── Site name / language ─────────────────────────────────────────────────
    const siteName = $('meta[property="og:site_name"]').attr('content')?.trim() ||
        null;
    const language = $('html').attr('lang')?.trim().split('-')[0] ||
        $('meta[http-equiv="Content-Language"]').attr('content')?.trim() ||
        null;
    return { title, author, date, siteName, language };
}
213
+ // ─── Noise removal ────────────────────────────────────────────────────────────
214
/**
 * Strip page chrome and hidden elements from the document in place.
 *
 * First removes everything matched by the hidden-element selectors, then
 * walks <body> in document order and removes every element for which
 * isNoise() or isHidden() is true. Children of an element scheduled for
 * removal are not visited. script/style/meta/link elements are skipped
 * without being removed.
 *
 * @param $ - Cheerio root; mutated
 */
function removeNoise($) {
    $('[aria-hidden="true"], [hidden]').remove();
    $('[style*="display:none"], [style*="display: none"]').remove();

    const passThroughTags = new Set(['script', 'style', 'meta', 'link']);
    const doomed = [];
    const bodyEl = $('body').get(0);
    // Explicit stack; children are pushed in reverse so nodes are visited in
    // the same pre-order a recursive walk would use.
    const pending = bodyEl ? [bodyEl] : [];
    while (pending.length > 0) {
        const node = pending.pop();
        if (node.type !== 'tag') {
            continue;
        }
        const tag = (node.tagName ?? '').toLowerCase();
        if (passThroughTags.has(tag)) {
            continue;
        }
        if (isNoise(node, $) || isHidden($(node))) {
            doomed.push(node);
            continue; // the whole subtree goes away with its root
        }
        const kids = node.children ?? [];
        for (let i = kids.length - 1; i >= 0; i--) {
            pending.push(kids[i]);
        }
    }
    doomed.forEach((node) => {
        $(node).remove();
    });
}
243
/**
 * Score an element as a potential article container.
 *
 * score = paragraphs * 3 + textLength / 100 - linkDensity * 100
 *         + textDensity * 20
 *         - 30 if class/id matches any NOISE_CLASS_PATTERNS entry
 *         + 20 if the element has a <main> or <article> ancestor
 *
 * Text length is measured on a clone with script/style/noscript removed;
 * link density is anchor text length over that text length (1 when the
 * element has no text).
 *
 * @param $el - Wrapped candidate element
 * @param $ - Cheerio root used for serialisation and wrapping
 * @returns The numeric score, or 0 when the element serialises to nothing
 */
function scoreCandidate($el, $) {
    const markupLength = ($.html($el) ?? '').length;
    if (markupLength === 0) {
        return 0;
    }
    // Measure text on a copy so the real DOM is left untouched.
    const stripped = $el.clone();
    stripped.find('script, style, noscript').remove();
    const textLength = (stripped.text() ?? '').trim().length;
    const textDensity = textLength / Math.max(markupLength, 1);

    let anchorTextLength = 0;
    $el.find('a').each((_, anchor) => {
        anchorTextLength += ($(anchor).text() ?? '').trim().length;
    });
    const linkDensity = textLength > 0 ? anchorTextLength / textLength : 1;

    const paragraphs = $el.find('p').length;

    let score = paragraphs * 3 + textLength / 100 - linkDensity * 100;
    score += textDensity * 20;

    const classAndId = getClassAndId($el, $);
    if (NOISE_CLASS_PATTERNS.some((pattern) => pattern.test(classAndId))) {
        score -= 30;
    }
    if ($el.parents('main, article').length > 0) {
        score += 20;
    }
    return score;
}
281
/**
 * Choose the element most likely to hold the article body.
 *
 * Priority:
 *  1. the highest-scoring <article> (when any exist)
 *  2. the first <main>
 *  3. the first [role="main"]
 *  4. the highest-scoring <div>/<section> that serialises to at least 200
 *     characters, has at least 100 characters of visible text and contains
 *     at least one <p>
 *
 * Ties are resolved in favour of the element that appears first in the
 * document.
 *
 * @param $ - Cheerio root (noise already removed by the caller)
 * @returns The chosen DOM element, or null when nothing qualifies
 */
function findBestCandidate($) {
    // Single pass for the best-scoring element of a selection; `accept` can
    // veto elements before they are scored. Strict `>` keeps the first of
    // equally scored elements.
    const pickHighest = ($set, accept) => {
        let best = null;
        let bestScore = -Infinity;
        $set.each((_, el) => {
            const $el = $(el);
            if (accept && !accept($el)) {
                return;
            }
            const score = scoreCandidate($el, $);
            if (score > bestScore) {
                bestScore = score;
                best = el;
            }
        });
        return best;
    };

    // Priority 1: <article>
    const bestArticle = pickHighest($('article'));
    if (bestArticle) {
        return bestArticle;
    }
    // Priority 2: <main>
    const main = $('main').first();
    if (main.length > 0) {
        return main.get(0);
    }
    // Priority 3: [role="main"]
    const roleMain = $('[role="main"]').first();
    if (roleMain.length > 0) {
        return roleMain.get(0);
    }
    // Priority 4: highest-scoring div/section with meaningful content
    const hasMeaningfulContent = ($el) => {
        // Skip tiny wrappers
        if (($.html($el) ?? '').length < 200) {
            return false;
        }
        const stripped = $el.clone();
        stripped.find('script, style, noscript').remove();
        if (stripped.text().trim().length < 100) {
            return false;
        }
        // Require at least one <p>
        return $el.find('p').length >= 1;
    };
    return pickHighest($('div, section'), hasMeaningfulContent);
}
339
+ // ─── Post-selection cleaning ──────────────────────────────────────────────────
340
/**
 * Tidy the selected candidate in place.
 *
 * 1. Removes descendants whose class/id matches INLINE_NOISE_PATTERNS, but
 *    only when they are a div/span/ul/button with fewer than 200 characters
 *    of trimmed text (so real article text is not dropped).
 * 2. Applies the content toggles: each `include*` option removes the
 *    corresponding elements only when it is strictly `false`.
 *
 * @param $candidate - Wrapped root of the candidate subtree; mutated
 * @param $ - Cheerio root that owns `$candidate`
 * @param options - `{ includeImages, includeLinks, includeCode, includeTables }`
 */
function cleanCandidate($candidate, $, options) {
    const removableTags = ['div', 'span', 'ul', 'button'];
    $candidate.find('*').each((_, node) => {
        const $node = $(node);
        const classAndId = getClassAndId($node, $);
        if (!INLINE_NOISE_PATTERNS.some((pattern) => pattern.test(classAndId))) {
            return;
        }
        const tag = node.tagName?.toLowerCase() ?? '';
        if (!removableTags.includes(tag)) {
            return;
        }
        // Short widget-like containers only; long ones may be article text.
        if ($node.text().trim().length < 200) {
            $node.remove();
        }
    });

    const { includeImages, includeLinks, includeCode, includeTables } = options;
    if (includeImages === false) {
        $candidate.find('img, picture, figure, [class*="image"]').remove();
    }
    if (includeLinks === false) {
        // Keep the anchor text, drop the link itself.
        $candidate.find('a').each((_, anchor) => {
            const $anchor = $(anchor);
            $anchor.replaceWith($anchor.text());
        });
    }
    if (includeCode === false) {
        $candidate.find('pre, code').remove();
    }
    if (includeTables === false) {
        $candidate.find('table').remove();
    }
}
377
+ // ─── Excerpt generation ───────────────────────────────────────────────────────
378
/**
 * Build a short excerpt from plain text.
 *
 * Returns the first two sentences (runs of text ending in one or more of
 * `.`, `!`, `?`) joined by a single space. When the text contains no
 * sentence terminator at all, returns the first 200 characters, trimmed.
 *
 * @param text - Plain text to summarise
 * @returns The excerpt; empty string for empty input
 */
function extractExcerpt(text) {
    const sentences = text.match(/[^.!?]+[.!?]+/g);
    if (!sentences || sentences.length === 0) {
        // Fallback: first 200 chars
        return text.slice(0, 200).trim();
    }
    // Every sentence after the first starts with the whitespace that followed
    // the previous terminator; trim each one so the join does not double it.
    return sentences
        .slice(0, 2)
        .map((sentence) => sentence.trim())
        .join(' ')
        .trim();
}
387
+ // ─── Reading time ─────────────────────────────────────────────────────────────
388
/**
 * Format an estimated reading time at 200 words per minute, rounded to the
 * nearest minute and never reported as less than one minute.
 *
 * @param wordCount - Number of words in the article
 * @returns A label of the form "<n> min read"
 */
function calcReadingTime(wordCount) {
    const rounded = Math.round(wordCount / 200);
    const minutes = rounded < 1 ? 1 : rounded;
    return minutes + ' min read';
}
392
+ // ─── Output post-processing ───────────────────────────────────────────────────
393
/**
 * Scrub residual noise from the generated markdown.
 *
 * Applies, in order: skip-to-content link removal, breadcrumb line removal,
 * cookie-notice removal (from the phrase to the end of its line), orphaned
 * link-reference removal, collapsing of repeated `---` dividers, and
 * collapsing of runs of four or more newlines down to three. The result is
 * trimmed.
 *
 * @param content - Markdown produced by the readability pipeline
 * @returns The cleaned markdown
 */
function cleanReadabilityOutput(content) {
    const rules = [
        // Skip-to-content links
        [/\[skip to (?:main )?content\]\([^)]*\)/gi, ''],
        // Standalone breadcrumb lines (e.g. "Home > Category > Page")
        [/^(?:Home|Main)\s*[>›»]\s*.*/gm, ''],
        // Cookie consent boilerplate
        [/(?:we use cookies|cookie (?:policy|settings|preferences)).*$/gim, ''],
        // Orphaned link references like "[something]: #"
        [/^\[.*?\]:\s*#?\s*$/gm, ''],
        // Consecutive dividers ("--- --- ---") become a single "---"
        [/(?:---\s*){2,}/g, '---\n'],
        // Runs of 4+ newlines shrink to 3 (i.e. two blank lines)
        [/\n{4,}/g, '\n\n\n'],
    ];
    const scrubbed = rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), content);
    return scrubbed.trim();
}
414
+ // ─── Main export ──────────────────────────────────────────────────────────────
415
/** Upper bound on the amount of HTML that is parsed (10 MiB of UTF-16 code units). */
const MAX_HTML_LENGTH = 10 * 1024 * 1024;

/**
 * Format a metadata date for the header line.
 *
 * Parseable dates are rendered as e.g. "January 15, 2024". Formatting is
 * pinned to UTC so that the output does not depend on the host timezone:
 * date-only ISO strings such as "2024-01-15" are parsed as UTC midnight and
 * would otherwise show the previous day on hosts west of UTC. Unparseable
 * values are returned unchanged.
 */
function formatMetaDate(rawDate) {
    try {
        const parsed = new Date(rawDate);
        if (Number.isNaN(parsed.getTime())) {
            return rawDate;
        }
        return parsed.toLocaleDateString('en-US', {
            year: 'numeric', month: 'long', day: 'numeric', timeZone: 'UTC',
        });
    }
    catch {
        return rawDate;
    }
}

/**
 * Extract clean, readable article content from raw HTML.
 *
 * Mimics browser Reader Mode but deterministic and purpose-built for AI agents.
 *
 * Pipeline: metadata extraction → noise removal → candidate selection →
 * candidate cleaning → markdown conversion → title/meta header → residual
 * noise cleanup → optional truncation.
 *
 * @param html - Raw HTML of the page. Input longer than 10 MiB is truncated;
 *               empty, whitespace-only or non-string input yields an empty result.
 * @param _url - Source URL. Accepted for API compatibility; not used here.
 * @param options - Extraction options: `includeImages`, `includeLinks`,
 *                  `includeCode`, `includeTables` (all default to true) and
 *                  `maxLength` (maximum length of `content` before the
 *                  "[Content truncated]" marker is appended).
 * @returns `{ title, author, date, siteName, content, excerpt, wordCount,
 *          readingTime, language }`. `wordCount` and `excerpt` are computed
 *          from the article markdown before the title/meta header is added.
 */
export function extractReadableContent(html, _url, options = {}) {
    const { includeImages = true, includeLinks = true, includeCode = true, includeTables = true, maxLength, } = options;
    // Treat null/undefined/non-string input like empty input instead of throwing.
    if (typeof html !== 'string') {
        html = '';
    }
    // Security: cap HTML size
    if (html.length > MAX_HTML_LENGTH) {
        html = html.slice(0, MAX_HTML_LENGTH);
    }
    // Handle empty input gracefully
    if (!html.trim()) {
        return {
            title: '',
            author: null,
            date: null,
            siteName: null,
            content: '',
            excerpt: '',
            wordCount: 0,
            readingTime: '1 min read',
            language: null,
        };
    }
    const $ = cheerio.load(html);
    // ── Step 1: Extract metadata BEFORE noise removal (bylines etc. must still exist) ──
    const meta = extractMeta($);
    // ── Step 2: Noise removal ──────────────────────────────────────────────────
    removeNoise($);
    // ── Step 3: Find best candidate ────────────────────────────────────────────
    const bestEl = findBestCandidate($);
    let candidateHtml;
    if (bestEl) {
        candidateHtml = $.html($(bestEl)) ?? '';
    }
    else {
        // Fallback: use cleaned body content
        candidateHtml = $('body').html() ?? $.html();
    }
    // ── Step 4: Post-selection cleaning ────────────────────────────────────────
    // The candidate is re-parsed into its own document so cleaning is scoped to it.
    const $candidate = cheerio.load(candidateHtml);
    const $root = $candidate('body');
    cleanCandidate($root, $candidate, { includeImages, includeLinks, includeCode, includeTables });
    const cleanedHtml = $candidate('body').html() ?? candidateHtml;
    // ── Step 5: Convert to markdown ────────────────────────────────────────────
    // The HTML has already been pruned above, so it is converted via rawHtmlToMarkdown.
    let content = rawHtmlToMarkdown(cleanedHtml);
    // ── Step 6: Build metadata header ──────────────────────────────────────────
    // Use the first H1 of the content when the meta title is missing or too short
    if (!meta.title || meta.title.length < 3) {
        const h1Match = content.match(/^#\s+(.+)$/m);
        if (h1Match) {
            meta.title = h1Match[1].trim();
        }
    }
    // Word count from the markdown with formatting characters blanked out
    const plainText = content.replace(/[#*_`\[\]\(\)>|-]/g, ' ').replace(/\s+/g, ' ').trim();
    const wordCount = plainText.split(/\s+/).filter(w => w.length > 0).length;
    const readingTime = calcReadingTime(wordCount);
    // Meta line: "*By Author · Date · N min read*" (reading time is always present)
    const metaParts = [];
    if (meta.author) {
        metaParts.push(`By ${meta.author}`);
    }
    if (meta.date) {
        metaParts.push(formatMetaDate(meta.date));
    }
    metaParts.push(readingTime);
    const metaLine = `*${metaParts.join(' · ')}*\n\n`;
    // Don't duplicate the title if it's already the first heading in content
    const contentStartsWithTitle = Boolean(meta.title) &&
        content.trimStart().startsWith(`# ${meta.title}`);
    if (contentStartsWithTitle) {
        // Inject the meta line right after the existing title heading. A replacer
        // function is used so that "$" sequences in author/date text are inserted
        // literally; leading whitespace and a heading without a trailing newline
        // are both tolerated.
        content = content.replace(/^\s*#\s+.+(?:\n|$)/, (heading) => (heading.endsWith('\n') ? heading : `${heading}\n`) + metaLine);
    }
    else {
        const titleLine = meta.title ? `# ${meta.title}\n${metaLine}` : metaLine;
        content = titleLine + content;
    }
    // ── Step 7: Clean up whitespace ─────────────────────────────────────────────
    content = content.replace(/\n{3,}/g, '\n\n').trim();
    // ── Step 7b: Remove residual noise (skip-links, breadcrumbs, cookie text) ──
    content = cleanReadabilityOutput(content);
    // ── Step 8: Apply maxLength ──────────────────────────────────────────────────
    if (maxLength && maxLength > 0 && content.length > maxLength) {
        content = content.slice(0, maxLength).trim() + '\n\n[Content truncated]';
    }
    // ── Step 9: Generate excerpt ─────────────────────────────────────────────────
    // Taken from the plain article text (no markdown formatting, no header)
    const excerpt = extractExcerpt(plainText);
    return {
        title: meta.title,
        author: meta.author,
        date: meta.date,
        siteName: meta.siteName,
        content,
        excerpt,
        wordCount,
        readingTime,
        language: meta.language,
    };
}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * WebPeel Deep Research Agent
3
+ *
4
+ * Autonomously searches the web, fetches top sources, filters content with
5
+ * BM25, optionally follows promising links, and synthesizes a comprehensive
6
+ * report using an LLM.
7
+ *
8
+ * Design principle: orchestrate existing modules (peel, bm25-filter,
9
+ * llm-extract) — don't reinvent anything.
10
+ */
11
/**
 * Options accepted by `research()`. Only `query` is required; the defaults
 * quoted below are the ones stated by this declaration file.
 */
export interface ResearchOptions {
    /** Research question or topic */
    query: string;
    /** Maximum number of sources to consult. Default: 5 */
    maxSources?: number;
    /** Maximum depth of link-following. Default: 1 (just search results; 2+ follows links) */
    maxDepth?: number;
    /** LLM API key for synthesis */
    apiKey?: string;
    /** LLM model for synthesis. Default: gpt-4o-mini */
    model?: string;
    /** LLM base URL. Default: https://api.openai.com/v1 */
    baseUrl?: string;
    /** Maximum total time in ms. Default: 60000 (1 minute) */
    timeout?: number;
    /** Output format: 'report' (markdown synthesis) or 'sources' (raw extracted data). Default: 'report' */
    outputFormat?: 'report' | 'sources';
    /** Optional callback for progress updates; receives one `ResearchStep` per update */
    onProgress?: (step: ResearchStep) => void;
}
31
/** Progress update passed to `ResearchOptions.onProgress`. */
export interface ResearchStep {
    /** Which stage of the research run this update belongs to */
    phase: 'searching' | 'fetching' | 'extracting' | 'following' | 'synthesizing';
    /** Human-readable description of the update */
    message: string;
    /** Optional count of sources found — NOTE(review): presumably a running total; confirm against the implementation */
    sourcesFound?: number;
    /** Optional count of sources fetched — NOTE(review): presumably a running total; confirm against the implementation */
    sourcesFetched?: number;
}
37
/** One consulted source, as listed in `ResearchResult.sources`. */
export interface ResearchSource {
    /** URL of the source */
    url: string;
    /** Title of the source */
    title: string;
    /** Key findings from this source */
    findings: string;
    /** Relevance score (0-1) */
    relevance: number;
}
45
/** Value the promise returned by `research()` resolves to. */
export interface ResearchResult {
    /** Synthesized research report (markdown) */
    report: string;
    /** Sources consulted */
    sources: ResearchSource[];
    /** Total sources found vs consulted */
    totalSourcesFound: number;
    sourcesConsulted: number;
    /** Time taken in ms */
    elapsed: number;
    /** Tokens used for synthesis; optional */
    tokensUsed?: {
        input: number;
        output: number;
    };
    /** Estimated cost in USD; optional */
    cost?: number;
}
63
/**
 * Conduct autonomous multi-step web research on a topic.
 *
 * @param options - Research settings; see `ResearchOptions` (only `query` is required)
 * @returns A promise resolving to a `ResearchResult`
 */
export declare function research(options: ResearchOptions): Promise<ResearchResult>;