@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,425 @@
1
+ /**
2
+ * Content Density Pruner
3
+ *
4
+ * Two-pass pruning to reduce HTML before markdown conversion:
5
+ *
6
+ * Pass 1 — Semantic removal: strip elements whose tag or class/id clearly
7
+ * mark them as page chrome (nav, footer, sidebar, cookie banners, ads).
8
+ *
9
+ * Pass 2 — Density scoring: score remaining block elements by text density,
10
+ * link density, tag importance, and word count. Remove low-scorers.
11
+ *
12
+ * Inspired by Crawl4AI's PruningContentFilter — targets 40-60% token savings.
13
+ */
14
+ import * as cheerio from 'cheerio';
15
+ // -----------------------------------------------------------------------
16
+ // Pass 1 — Semantic removal: tags and class/id patterns
17
+ // -----------------------------------------------------------------------
18
+ /** Tags that are almost always page chrome, not article content. */
19
+ const CHROME_TAGS = new Set([
20
+ 'nav', 'footer', 'aside', 'noscript',
21
+ ]);
22
+ /**
23
+ * Class/id patterns that indicate page chrome.
24
+ * Tested against lowercased class/id strings.
25
+ */
26
+ const CHROME_PATTERNS = [
27
+ /\bsidebar\b/,
28
+ /\bcookie/,
29
+ /\bbanner\b/,
30
+ /\b(ad|ads|advert)\b/,
31
+ /\bpopup\b/,
32
+ /\bmodal\b/,
33
+ /\boverlay\b/,
34
+ /\bsocial/,
35
+ /\bshare\b/,
36
+ /\bbreadcrumb/,
37
+ /\bskip-?link/,
38
+ /\bfootnote/,
39
+ /\brelated-?(post|article)/,
40
+ /\bnewsletter/,
41
+ /\bsubscri/,
42
+ /\bcomment/,
43
+ /\b(sign-?up|sign-?in|log-?in)\b/,
44
+ /\btoc\b/,
45
+ /\btable-?of-?contents\b/,
46
+ /\bgdpr\b/,
47
+ /\bconsent\b/,
48
+ // Q&A sites (Stack Overflow, StackExchange)
49
+ /\bvote\b/,
50
+ /\bpost-?menu/,
51
+ /\bjs-vote/,
52
+ /\buser-?card/,
53
+ /\buser-?info/,
54
+ /\bpost-?tag/,
55
+ /\bquestion-?stats/,
56
+ // Social/sharing UI
57
+ /\bshare-?(button|link|panel|menu|bar)/,
58
+ /\bfollow-?button/,
59
+ /\breaction/,
60
+ /\blike-?button/,
61
+ /\bupvote/,
62
+ /\bdownvote/,
63
+ // Edit/action UI
64
+ /\bedit-?(link|button|post)/,
65
+ /\breport-?(link|button)/,
66
+ /\bflag-?(link|button)/,
67
+ // Generic site chrome
68
+ /\btop-?bar/,
69
+ /\bsite-?header/,
70
+ /\bpage-?header/,
71
+ /\bsticky-?header/,
72
+ /\bnotice\b/,
73
+ /\balert\b/,
74
+ /\btoast\b/,
75
+ /\bsnackbar/,
76
+ /\bbottom-?bar/,
77
+ /\bfloating/,
78
+ /\bfixed-?bottom/,
79
+ /\bback-?to-?top/,
80
+ // Interactive UI elements (non-content)
81
+ /\bquiz\b/,
82
+ /\bquestionnaire\b/,
83
+ /\btoggle(?!-content|-body|-text)\b/,
84
+ /\bcarousel\b/,
85
+ /\baccordion(?!-content|-body|-text)\b/,
86
+ /\bstepper\b/,
87
+ /\bpagination\b/,
88
+ /\btabs-?(?:list|nav|bar)\b/,
89
+ /\bcookie-?(?:banner|bar|notice|consent|popup)\b/,
90
+ ];
91
+ /**
92
+ * Tags we never remove (they likely wrap main content).
93
+ * We recurse into them but never strip the element itself.
94
+ */
95
+ const PROTECTED_TAGS = new Set(['main', 'article', 'body']);
96
+ /**
97
+ * Tags we never remove during density scoring (Pass 2).
98
+ * Headings, paragraphs, and semantic content elements should survive
99
+ * even if they're small — they carry essential meaning.
100
+ */
101
+ const DENSITY_SAFE_TAGS = new Set([
102
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
103
+ 'p', 'pre', 'code', 'blockquote', 'figcaption',
104
+ 'main', 'article', 'body',
105
+ // Table structural elements — pruner must not remove these or Turndown GFM
106
+ // can't convert tables and falls back to raw HTML output.
107
+ 'table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td',
108
+ ]);
109
+ /**
110
+ * Class/id patterns that protect an element from removal.
111
+ */
112
+ const CONTENT_PATTERNS = [
113
+ /\barticle/,
114
+ /\bpost-?content/,
115
+ /\bentry-?content/,
116
+ /\bmain-?content/,
117
+ /\bstory/,
118
+ /\bblog/,
119
+ /\bpage-?content/,
120
+ /\bcontent-?area/,
121
+ // Wikipedia/MediaWiki data tables — always content, never chrome
122
+ /\bwikitable\b/,
123
+ /\bmw-parser-output\b/,
124
+ ];
125
/**
 * Decide whether an element should be treated as page chrome in Pass 1.
 *
 * Checks, in order:
 *   1. the tag name against CHROME_TAGS (match → chrome);
 *   2. the lowercased "class id" string against CONTENT_PATTERNS
 *      (match → NOT chrome, which also short-circuits the role check);
 *   3. the same string against CHROME_PATTERNS (match → chrome);
 *   4. the lowercased `role` attribute against a fixed list of landmark roles.
 *
 * @param el - DOM element node to classify
 * @param $ - cheerio instance used to read the element's attributes
 * @returns true when the element is considered chrome
 */
function isChromeBySemantic(el, $) {
    const tag = el.tagName?.toLowerCase() ?? '';
    if (CHROME_TAGS.has(tag)) {
        return true;
    }
    const readAttr = (name) => ($(el).attr(name) ?? '').toLowerCase();
    const signature = readAttr('class') + ' ' + readAttr('id');
    // Content markers win over chrome markers
    if (CONTENT_PATTERNS.some((pattern) => pattern.test(signature))) {
        return false;
    }
    if (CHROME_PATTERNS.some((pattern) => pattern.test(signature))) {
        return true;
    }
    // Fall back to the role attribute
    const chromeRoles = ['navigation', 'banner', 'complementary', 'contentinfo', 'search'];
    return chromeRoles.includes(readAttr('role'));
}
148
+ // -----------------------------------------------------------------------
149
+ // Pass 2 — Density scoring
150
+ // -----------------------------------------------------------------------
151
+ /** Tag importance scores for density scoring (-2 to +3) */
152
+ const TAG_IMPORTANCE = {
153
+ article: 3, main: 3,
154
+ p: 2, h1: 2, h2: 2, h3: 2, h4: 2, h5: 2, h6: 2,
155
+ blockquote: 2, pre: 2, code: 2, figure: 2, figcaption: 2,
156
+ section: 1, td: 1, th: 1, li: 1, dd: 1, dt: 1,
157
+ div: 0, span: 0, table: 0, ul: 0, ol: 0, dl: 0,
158
+ aside: -1, header: -1, form: -1,
159
+ nav: -2, footer: -2,
160
+ };
161
/**
 * Rescale a raw tag-importance score from the -2..+3 range onto 0..1.
 *
 * @param rawScore - raw importance value (expected within -2..+3)
 * @returns the score shifted by 2 and divided by 5
 */
function normalizeTagScore(rawScore) {
    const lowestRaw = -2;
    const rawSpan = 5; // distance between -2 and +3
    return (rawScore - lowestRaw) / rawSpan;
}
164
/** Tags that never carry scoreable content; skipped outright by collectBlocks. */
const COLLECT_SKIPPED_TAGS = new Set(['script', 'style', 'link', 'meta']);
/** Container tags that collectBlocks descends into when they exceed the leaf size. */
const COLLECT_WRAPPER_TAGS = new Set([
    'div', 'section', 'table', 'tbody', 'thead', 'tr',
    'center', 'details', 'summary',
]);
/**
 * Collect scoreable blocks from a DOM tree.
 *
 * Strategy: walk the tree top-down. For each element child of `parent`:
 *   - Protected elements (PROTECTED_TAGS) are always recursed into.
 *   - Wrapper elements whose serialized HTML is longer than `maxLeafSize`
 *     are recursed into instead of being scored.
 *   - Everything else is scored as one unit and appended to `blocks`.
 *
 * The score is a weighted sum of text density (0.35), inverse link density
 * (0.25), normalized tag importance (0.2), a logarithmic word-count bonus
 * (0.1) and a constant 0.1 baseline.
 *
 * Link density is measured on the same stripped clone as the visible text,
 * and an element that is itself an `<a>` counts all of its text as link
 * text (`find('a')` only sees descendants, so it would otherwise report 0).
 *
 * @param $ - cheerio instance bound to the document
 * @param parent - node whose children are examined
 * @param blocks - output array; scored blocks are pushed onto it
 * @param maxLeafSize - HTML length above which wrappers are recursed into
 */
function collectBlocks($, parent, blocks, maxLeafSize) {
    const children = 'children' in parent ? parent.children : [];
    for (const child of children) {
        if (child.type !== 'tag')
            continue;
        const el = child;
        const tagName = el.tagName?.toLowerCase() ?? '';
        if (COLLECT_SKIPPED_TAGS.has(tagName))
            continue;
        const $el = $(el);
        const htmlLen = ($.html($el) ?? '').length;
        // Skip extremely tiny elements (bare tags like <br>)
        if (htmlLen < 10)
            continue;
        const mustDescend = PROTECTED_TAGS.has(tagName)
            || (COLLECT_WRAPPER_TAGS.has(tagName) && htmlLen > maxLeafSize);
        if (mustDescend) {
            // Too large or protected — score the children instead
            collectBlocks($, el, blocks, maxLeafSize);
            continue;
        }
        // Work on a clone so stripping invisible nodes does not touch the document
        const clone = $el.clone();
        clone.find('script, style, noscript, svg, path').remove();
        const visibleText = clone.text() ?? '';
        const trimmedText = visibleText.trim();
        const visibleTextLen = trimmedText.length;
        const textDensity = Math.min(visibleTextLen / Math.max(htmlLen, 1), 1.0);
        let linkTextLen = 0;
        if (tagName === 'a') {
            // The block itself is a link: all of its text is link text
            linkTextLen = visibleTextLen;
        }
        else {
            clone.find('a').each((_i, a) => {
                linkTextLen += ($(a).text() ?? '').trim().length;
            });
        }
        const linkDensity = visibleTextLen > 0
            ? Math.min(linkTextLen / visibleTextLen, 1.0)
            : 0;
        const normalizedTag = normalizeTagScore(TAG_IMPORTANCE[tagName] ?? 0);
        const wordCount = trimmedText.split(/\s+/).filter(w => w.length > 0).length;
        // log-scaled so that ~1000 words saturates the bonus at 1.0
        const wordBonus = wordCount > 0
            ? Math.min(Math.log(wordCount + 1) / Math.log(1000), 1.0)
            : 0;
        const score = (textDensity * 0.35 +
            (1 - linkDensity) * 0.25 +
            normalizedTag * 0.2 +
            wordBonus * 0.1 +
            0.1 // baseline position score (removed position bias — not useful for deep nesting)
        );
        blocks.push({
            element: el,
            tagName,
            htmlLength: htmlLen,
            visibleText,
            score,
        });
    }
}
235
+ // -----------------------------------------------------------------------
236
+ // Main export
237
+ // -----------------------------------------------------------------------
238
/**
 * Prune low-value HTML blocks in up to three stages:
 *   0. Regex pre-pass (inputs longer than 20 000 characters only): drop
 *      <nav>/<footer>/<aside> spans before the DOM is built.
 *   1. Semantic removal by tag, class/id pattern and role.
 *   2. Density scoring of the remaining blocks.
 *
 * `nodesRemoved` counts removals from all three stages, including the
 * spans dropped by the regex pre-pass.
 *
 * @param html - Raw HTML to prune
 * @param options - Pruning configuration
 * @param options.threshold - score cut-off used when `dynamic` is false (default 0.3)
 * @param options.minWords - blocks with fewer words are removal candidates (default 3)
 * @param options.dynamic - when true (default) the cut-off is half of the best block score
 * @returns Pruned HTML with stats: `{ html, nodesRemoved, reductionPercent }`
 */
export function pruneContent(html, options = {}) {
    const { threshold = 0.3, minWords = 3, dynamic = true, } = options;
    const originalLength = html.length;
    if (!html.trim()) {
        return { html, nodesRemoved: 0, reductionPercent: 0 };
    }
    let nodesRemoved = 0;
    let workingHtml = html;
    // =====================================================================
    // Pass 0: Regex pre-pass — strip obvious chrome BEFORE cheerio parse
    // =====================================================================
    // For large HTML a fast regex pass removes nav/footer/aside spans before
    // we load into cheerio, saving DOM parse time. The match is lazy
    // ([\s\S]*?), so each span ends at the first closing tag of that kind.
    if (workingHtml.length > 20000) {
        const dropAndCount = () => {
            nodesRemoved++;
            return '';
        };
        workingHtml = workingHtml
            .replace(/<nav(\s[^>]*)?>[\s\S]*?<\/nav>/gi, dropAndCount)
            .replace(/<footer(\s[^>]*)?>[\s\S]*?<\/footer>/gi, dropAndCount)
            .replace(/<aside(\s[^>]*)?>[\s\S]*?<\/aside>/gi, dropAndCount);
        // No safe way to strip noise <div>s by regex (nested divs break simple patterns).
        // Cheerio's semantic pass handles them reliably in Pass 1.
    }
    const $ = cheerio.load(workingHtml);
    // =====================================================================
    // Pass 1: Semantic removal
    // =====================================================================
    // Walk top-down from <body>; collect entire subtrees that are clearly
    // chrome. Protected tags are always descended into; other elements are
    // descended into while depth < 6.
    const toRemoveSemantic = [];
    function walkForChrome(parent, depth) {
        const children = 'children' in parent ? parent.children : [];
        for (const child of children) {
            if (child.type !== 'tag')
                continue;
            const el = child;
            const tagName = el.tagName?.toLowerCase() ?? '';
            if (tagName === 'script' || tagName === 'style')
                continue;
            if (PROTECTED_TAGS.has(tagName)) {
                // Recurse into protected — there might be chrome inside <article>
                walkForChrome(el, depth + 1);
                continue;
            }
            if (isChromeBySemantic(el, $)) {
                toRemoveSemantic.push(el);
                continue; // don't recurse into something we'll remove
            }
            // Recurse up to a reasonable depth
            if (depth < 6) {
                walkForChrome(el, depth + 1);
            }
        }
    }
    const body = $('body').get(0);
    if (body) {
        walkForChrome(body, 0);
    }
    // Removal is deferred until after the walk so the tree is not mutated mid-traversal
    for (const el of toRemoveSemantic) {
        $(el).remove();
        nodesRemoved++;
    }
    // =====================================================================
    // Pass 2: Density scoring (on the remaining HTML)
    // =====================================================================
    const postPass1Length = $.html().length;
    if (postPass1Length > 100 && body) {
        const blocks = [];
        // Max leaf size: ~5KB or 30% of remaining content (whichever is smaller)
        // This ensures we find leaf blocks even in small documents.
        const maxLeafSize = Math.min(5000, Math.ceil(postPass1Length * 0.3));
        collectBlocks($, body, blocks, maxLeafSize);
        if (blocks.length >= 2) {
            // reduce() instead of Math.max(...scores): no argument-count limit
            const bestScore = blocks.reduce((best, b) => Math.max(best, b.score), -Infinity);
            // Dynamic mode: blocks scoring below 50% of the best block are candidates
            const effectiveThreshold = dynamic ? bestScore * 0.5 : threshold;
            // Safety: retain at least 40% of post-pass1 content
            const minRetainLength = Math.ceil(postPass1Length * 0.4);
            // Sort ascending by score — remove worst first (sort is stable)
            const worstFirst = [...blocks].sort((a, b) => a.score - b.score);
            const toRemoveDensity = new Set();
            let removedLength = 0;
            for (const b of worstFirst) {
                if (PROTECTED_TAGS.has(b.tagName) || DENSITY_SAFE_TAGS.has(b.tagName))
                    continue;
                const words = b.visibleText.trim().split(/\s+/).filter(w => w.length > 0);
                const isTiny = words.length < minWords;
                const isLow = b.score < effectiveThreshold;
                if (!isTiny && !isLow)
                    continue;
                // Check safety floor
                const remaining = postPass1Length - (removedLength + b.htmlLength);
                if (remaining < minRetainLength)
                    continue;
                toRemoveDensity.add(b.element);
                removedLength += b.htmlLength;
            }
            for (const el of toRemoveDensity) {
                $(el).remove();
                nodesRemoved++;
            }
        }
    }
    const resultHtml = $.html() ?? workingHtml;
    const resultLength = resultHtml.length;
    // Clamped at 0: the serialized document can be longer than a fragment input
    const reductionPercent = originalLength > 0
        ? Math.max(0, Math.round(((originalLength - resultLength) / originalLength) * 100))
        : 0;
    return {
        html: resultHtml,
        nodesRemoved,
        reductionPercent,
    };
}
371
+ // ---------------------------------------------------------------------------
372
+ // Markdown post-processing — remove UI noise leaked into markdown output
373
+ // ---------------------------------------------------------------------------
374
/** UI button labels that should be removed when they appear as standalone lines */
const UI_BUTTON_LABELS = /^(load more|headlines only|show more|read more|show less|collapse|expand|view more|view less|see more|see less|more stories|more articles|sign up|subscribe|log in|sign in|follow us|get started|click here|learn more)$/i;
/** An image with no alt text (empty brackets): `![](url)` */
const EMPTY_IMAGE_RE = /^\!\[\]\([^)]+\)$/;
/** A list item whose only content is an image with no alt text */
const EMPTY_IMAGE_LIST_ITEM_RE = /^[-*+]\s+\!\[\]\([^)]+\)$/;
/** A horizontal rule written as "* * *", "---" or "___" (optional inner whitespace) */
const HR_LINE_RE = /^(\*\s*\*\s*\*|\-\s*\-\s*\-|_\s*_\s*_)$/;
/** Opening/closing marker of a fenced code block: three or more backticks or tildes */
const CODE_FENCE_RE = /^(`{3,}|~{3,})/;
/**
 * Post-process markdown output to remove UI elements that leak through
 * from content scrapers (buttons, empty images, consecutive hr separators).
 *
 * Lines inside fenced code blocks (``` or ~~~) are passed through
 * untouched, so code samples containing e.g. `---` or "subscribe" survive.
 *
 * @param markdown - Raw markdown string
 * @returns Cleaned markdown string (falsy input is returned unchanged)
 */
export function pruneMarkdown(markdown) {
    if (!markdown)
        return markdown;
    const result = [];
    let consecutiveHrCount = 0;
    // Marker that opened the current fence (e.g. "```"), or null outside a fence
    let openFence = null;
    for (const line of markdown.split('\n')) {
        const trimmed = line.trim();
        const fenceMatch = CODE_FENCE_RE.exec(trimmed);
        if (openFence !== null) {
            // Inside a code block: keep everything verbatim
            result.push(line);
            const closesFence = fenceMatch !== null
                && fenceMatch[1][0] === openFence[0]
                && fenceMatch[1].length >= openFence.length
                && trimmed.length === fenceMatch[1].length;
            if (closesFence) {
                openFence = null;
            }
            continue;
        }
        if (fenceMatch !== null) {
            const marker = fenceMatch[1];
            // A backtick fence's info string cannot contain backticks; a line
            // such as ```code``` is inline code, not a fence opener.
            const isInlineCode = marker[0] === '`' && trimmed.slice(marker.length).includes('`');
            if (!isInlineCode) {
                openFence = marker;
                consecutiveHrCount = 0;
                result.push(line);
                continue;
            }
        }
        // Remove lines that are just UI button labels (standalone, not in a heading/list)
        if (UI_BUTTON_LABELS.test(trimmed)) {
            continue;
        }
        // Remove empty images (no alt text), bare or as the sole content of a list item.
        // Images with alt text are kept.
        if (EMPTY_IMAGE_RE.test(trimmed) || EMPTY_IMAGE_LIST_ITEM_RE.test(trimmed)) {
            continue;
        }
        // Consecutive HR separators: keep the first one, drop the rest
        if (HR_LINE_RE.test(trimmed)) {
            consecutiveHrCount++;
            if (consecutiveHrCount > 1) {
                continue; // skip duplicate hr
            }
        }
        else if (trimmed !== '') {
            // Blank lines between rules do not break the run; real content does
            consecutiveHrCount = 0;
        }
        result.push(line);
    }
    return result.join('\n');
}
@@ -0,0 +1,60 @@
1
+ /**
2
+ * In-memory cookie cache with TTL.
3
+ *
4
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
5
+ * Cookies from challenge solves are cached here so future requests to the same
6
+ * domain skip the challenge entirely.
7
+ *
8
+ * Design goals:
9
+ * - Zero dependencies (plain Map + setTimeout)
10
+ * - In-memory only — no disk/DB persistence
11
+ * - TTL per entry (default 30 min, matching cf_clearance lifetime)
12
+ * - Thread-safe for single-process Node.js (event loop is single-threaded)
13
+ */
14
+ export interface CachedCookies {
15
+ /** Raw "Cookie: ..." header value (semicolon-separated) */
16
+ cookieHeader: string;
17
+ /** Individual cookie strings (e.g. ["cf_clearance=abc; Path=/", ...]) */
18
+ cookies: string[];
19
+ /** Unix timestamp (ms) when this cache entry expires */
20
+ expiresAt: number;
21
+ /** The domain these cookies are for */
22
+ domain: string;
23
+ }
24
+ /**
25
+ * Store cookies for a domain.
26
+ *
27
+ * @param domain Hostname (e.g. "example.com" or "sub.example.com")
28
+ * @param cookies Array of Set-Cookie header values or cookie strings
29
+ * @param ttlMs Time-to-live in ms (default: 30 min)
30
+ */
31
+ export declare function cacheCookies(domain: string, cookies: string[], ttlMs?: number): void;
32
+ /**
33
+ * Retrieve cached cookies for a domain (or its parent domain).
34
+ * Returns null if no valid (non-expired) entry exists.
35
+ *
36
+ * @param domain Hostname to look up
37
+ */
38
+ export declare function getCachedCookies(domain: string): CachedCookies | null;
39
+ /**
40
+ * Build a Cookie request header value from a URL.
41
+ * Returns undefined if no cached cookies exist.
42
+ */
43
+ export declare function getCookieHeader(url: string): string | undefined;
44
+ /**
45
+ * Cache cookies from a URL's perspective.
46
+ * Extracts domain from URL automatically.
47
+ */
48
+ export declare function cacheCookiesForUrl(url: string, cookies: string[], ttlMs?: number): void;
49
+ /**
50
+ * Invalidate (remove) cached cookies for a domain.
51
+ */
52
+ export declare function invalidateCookies(domain: string): void;
53
+ /**
54
+ * Return the number of cached domains (for diagnostics).
55
+ */
56
+ export declare function getCacheSize(): number;
57
+ /**
58
+ * Clear ALL cached cookies. Mainly for tests.
59
+ */
60
+ export declare function clearCookieCache(): void;
@@ -0,0 +1,163 @@
1
+ /**
2
+ * In-memory cookie cache with TTL.
3
+ *
4
+ * Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
5
+ * Cookies from challenge solves are cached here so future requests to the same
6
+ * domain skip the challenge entirely.
7
+ *
8
+ * Design goals:
9
+ * - Zero dependencies (plain Map + setTimeout)
10
+ * - In-memory only — no disk/DB persistence
11
+ * - TTL per entry (default 30 min, matching cf_clearance lifetime)
12
+ * - Thread-safe for single-process Node.js (event loop is single-threaded)
13
+ */
14
+ // ── Internal store ────────────────────────────────────────────────────────────
15
+ const store = new Map();
16
+ let cleanupTimer = null;
17
+ /** Default TTL: 30 minutes (cf_clearance lasts 30 min) */
18
+ const DEFAULT_TTL_MS = 30 * 60 * 1000;
19
+ // ── Public API ────────────────────────────────────────────────────────────────
20
/**
 * Store cookies for a domain.
 *
 * Nothing is stored when `cookies` is missing or empty, or when no usable
 * "name=value" pair can be extracted from it. The array is copied before
 * being cached, so later mutation by the caller cannot make the stored
 * `cookies` disagree with the stored `cookieHeader`.
 *
 * @param domain  Hostname (e.g. "example.com" or "sub.example.com")
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Time-to-live in ms (default: 30 min)
 */
export function cacheCookies(domain, cookies, ttlMs = DEFAULT_TTL_MS) {
    if (!cookies?.length)
        return;
    const normalizedDomain = normalizeDomain(domain);
    const cookieHeader = buildCookieHeader(cookies);
    // An empty header would be served as a hit but carries no cookies
    if (!cookieHeader)
        return;
    store.set(normalizedDomain, {
        cookieHeader,
        cookies: [...cookies],
        expiresAt: Date.now() + ttlMs,
        domain: normalizedDomain,
    });
    // Start periodic cleanup if not already running
    startCleanup();
}
42
/**
 * Look up cached cookies for a hostname.
 *
 * The normalized hostname is tried first, then its parent domain (one label
 * stripped). An entry found past its expiry is deleted from the store and
 * the search continues with the next candidate.
 *
 * @param domain Hostname to look up
 * @returns the live cache entry, or null when none exists
 */
export function getCachedCookies(domain) {
    const host = normalizeDomain(domain);
    for (const key of [host, getParentDomain(host)]) {
        if (!key)
            continue;
        const hit = store.get(key);
        if (!hit)
            continue;
        if (hit.expiresAt > Date.now()) {
            return hit;
        }
        // Stale — evict and fall through to the next candidate
        store.delete(key);
    }
    return null;
}
64
/**
 * Resolve the Cookie request header value to send for a URL.
 *
 * @param url Absolute URL whose hostname is used for the cache lookup
 * @returns the cached header value, or undefined when the URL cannot be
 *          parsed or nothing is cached for its host
 */
export function getCookieHeader(url) {
    try {
        const { hostname } = new URL(url);
        return getCachedCookies(hostname)?.cookieHeader;
    }
    catch {
        return undefined;
    }
}
78
/**
 * Cache cookies under the hostname of the given URL.
 * Any error raised while parsing the URL or caching is swallowed.
 *
 * @param url     Absolute URL the cookies were received for
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Time-to-live in ms (default: 30 min)
 */
export function cacheCookiesForUrl(url, cookies, ttlMs = DEFAULT_TTL_MS) {
    try {
        const { hostname } = new URL(url);
        cacheCookies(hostname, cookies, ttlMs);
    }
    catch {
        // Invalid URL — ignore
    }
}
91
/**
 * Drop the cache entry stored under the normalized form of `domain`.
 * Only that exact key is removed; an entry stored under the parent domain
 * is left in place.
 *
 * @param domain Hostname whose entry should be removed
 */
export function invalidateCookies(domain) {
    store.delete(normalizeDomain(domain));
}
98
/**
 * Report how many domain entries the store currently holds (for
 * diagnostics). Expired entries that have not been evicted yet are included.
 *
 * @returns number of entries in the store
 */
export function getCacheSize() {
    const { size } = store;
    return size;
}
104
/**
 * Empty the whole cookie store and stop the periodic cleanup timer if one
 * is running. Mainly for tests.
 */
export function clearCookieCache() {
    store.clear();
    if (!cleanupTimer) {
        return;
    }
    clearInterval(cleanupTimer);
    cleanupTimer = null;
}
114
+ // ── Helpers ───────────────────────────────────────────────────────────────────
115
/**
 * Normalize a hostname for use as a cache key: lowercase it and drop a
 * single leading "www." label.
 *
 * @param domain Hostname to normalize
 * @returns the normalized hostname
 */
function normalizeDomain(domain) {
    const lowered = domain.toLowerCase();
    return lowered.startsWith('www.') ? lowered.slice('www.'.length) : lowered;
}
119
/** Dotted-quad IPv4 literal, e.g. "192.168.1.10" */
const IPV4_LITERAL_RE = /^\d{1,3}(?:\.\d{1,3}){3}$/;
/**
 * Get the parent domain of a hostname by stripping its first label.
 *
 * Returns null when there is no meaningful parent: hostnames with two or
 * fewer labels, and IPv4 literals (dropping the first octet of an address
 * would yield an unrelated "host" such as "168.1.10").
 *
 * @param domain Normalized hostname
 * @returns the parent domain, or null
 */
function getParentDomain(domain) {
    if (IPV4_LITERAL_RE.test(domain))
        return null; // IP addresses have no parent domain
    const labels = domain.split('.');
    if (labels.length <= 2)
        return null; // Already a root domain
    return labels.slice(1).join('.');
}
126
/**
 * Convert an array of Set-Cookie values or raw cookie strings into a single
 * Cookie header value of the form "name=value; name2=value2".
 *
 * Only the leading "name=value" part of each entry is used; attributes such
 * as Path/Secure/HttpOnly are dropped. When the same cookie name occurs more
 * than once, the last occurrence wins (keeping the position of the first),
 * so a refreshed cookie is not sent alongside its stale predecessor.
 * Non-string and blank entries are skipped.
 *
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @returns the joined header value ("" when nothing usable was found)
 */
function buildCookieHeader(cookies) {
    const pairsByName = new Map();
    for (const cookie of cookies) {
        if (typeof cookie !== 'string')
            continue;
        // Set-Cookie format: "name=value; Path=/; Secure; HttpOnly; ..."
        const pair = cookie.split(';')[0].trim();
        if (!pair)
            continue;
        const eqIndex = pair.indexOf('=');
        // A pair without "=" is keyed by its whole text
        const name = eqIndex === -1 ? pair : pair.slice(0, eqIndex).trim();
        pairsByName.set(name, pair);
    }
    return [...pairsByName.values()].join('; ');
}
142
/**
 * Start the periodic sweep that evicts expired entries, unless one is
 * already running. The sweep runs every 5 minutes, stops itself once the
 * store is empty, and is unref'd (where supported) so it does not keep the
 * Node.js process alive.
 */
function startCleanup() {
    if (cleanupTimer) {
        return;
    }
    const sweep = () => {
        const cutoff = Date.now();
        for (const [key, { expiresAt }] of store) {
            if (expiresAt <= cutoff) {
                store.delete(key);
            }
        }
        // Stop the timer if the cache is empty
        if (store.size === 0 && cleanupTimer) {
            clearInterval(cleanupTimer);
            cleanupTimer = null;
        }
    };
    cleanupTimer = setInterval(sweep, 5 * 60 * 1000); // Run every 5 minutes
    // Don't block Node.js process exit
    if (cleanupTimer && typeof cleanupTimer.unref === 'function') {
        cleanupTimer.unref();
    }
}