@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,1666 @@
1
+ /**
2
+ * WebPeel pipeline stages
3
+ *
4
+ * Each stage is an exported async function that reads from / writes to the
5
+ * mutable PipelineContext. The stages are called in order by peel().
6
+ */
7
+ import { createHash } from 'crypto';
8
+ import { smartFetch } from './strategies.js';
9
+ import { htmlToMarkdown, htmlToText, cleanForAI, estimateTokens, selectContent, detectMainContent, calculateQuality, truncateToTokenBudget, filterByTags, cleanMarkdownNoise, } from './markdown.js';
10
+ import { pruneContent, pruneMarkdown } from './content-pruner.js';
11
+ import { distillToBudget } from './budget.js';
12
+ import { extractMetadata, extractLinks, extractImages } from './metadata.js';
13
+ import { autoScroll as runAutoScroll } from './actions.js';
14
+ import { extractStructured } from './extract.js';
15
+ import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
16
+ import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
17
+ import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
18
+ import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
19
+ import { extractReadableContent } from './readability.js';
20
+ import { quickAnswer as runQuickAnswer } from './quick-answer.js';
21
+ import { Timer } from './timing.js';
22
+ import { chunkContent } from './chunker.js';
23
+ import { splitIntoBlocks, scoreBM25 } from './bm25-filter.js';
24
+ import { BlockedError } from '../types.js';
25
+ import { Errors } from '../errors.js';
26
+ import { sanitizeForLLM } from './prompt-guard.js';
27
+ import { getSourceCredibility } from './source-credibility.js';
28
+ import { createLogger } from './logger.js';
29
+ import { detectAuthWall } from './auth-detection.js';
30
+ import { buildAcceptLanguageHeader, detectLanguageFromUrl } from './language-detect.js';
31
+ const log = createLogger('pipeline');
32
+ // ---------------------------------------------------------------------------
33
+ // Hook-aware wrappers — route through premium hooks, fall back to basic stubs
34
+ // ---------------------------------------------------------------------------
35
/**
 * Report whether a domain extractor is registered for the given URL.
 * The premium hook, when one is installed, takes precedence over the
 * bundled ee/domain-extractors lookup.
 */
function hasDomainExtractor(url) {
    // Pick the lookup once, then apply the same "non-null means found" test.
    const lookup = getDomainExtractorHook() || getDomainExtractor;
    return lookup(url) !== null;
}
45
/**
 * Perform domain-specific extraction for a page (HTML plus its URL).
 * Delegates to the premium hook when present, otherwise to the bundled
 * ee/domain-extractors implementation.
 */
async function runDomainExtract(html, url) {
    const extract = getDomainExtractHook() || extractDomainData;
    return extract(html, url);
}
55
/**
 * Build a fresh PipelineContext for one peel() run.
 *
 * Every field starts at its default; later stages overwrite them in place.
 * Mutable containers (metadata, links, warnings) are created per call so
 * contexts never share state.
 */
export function createContext(url, options) {
    const timer = new Timer();
    const startTime = Date.now();
    // Flat option fields: overwritten by normalizeOptions()
    const normalizedDefaults = {
        render: false,
        stealth: false,
        wait: 0,
        format: 'markdown',
        timeout: 30000,
        userAgent: undefined,
        screenshot: false,
        screenshotFullPage: false,
        selector: undefined,
        exclude: undefined,
        includeTags: undefined,
        excludeTags: undefined,
        headers: undefined,
        cookies: undefined,
        raw: false,
        actions: undefined,
        extract: undefined,
        maxTokens: undefined,
        extractImagesFlag: false,
        profileDir: undefined,
        headed: false,
        storageState: undefined,
        proxy: undefined,
        fullPage: false,
        autoScrollOpts: undefined,
    };
    // Results: contentType comes from detectContentType(), the rest from parseContent()
    const resultDefaults = {
        contentType: 'html',
        content: '',
        title: '',
        metadata: {},
        links: [],
        quality: 0,
        // Link count is filled by parseContent() / buildResult
        linkCount: 0,
        // Set once a domain API has already produced the content
        domainApiHandled: false,
        // Warnings accumulated across stages
        warnings: [],
    };
    return {
        url,
        options,
        timer,
        startTime,
        ...normalizedDefaults,
        ...resultDefaults,
    };
}
104
+ // ---------------------------------------------------------------------------
105
+ // Stage 1: normalizeOptions
106
+ // ---------------------------------------------------------------------------
107
/**
 * Stage 1 — resolve all PeelOptions values into flat context fields with defaults applied.
 *
 * - Applies agent-mode defaults (budget 4000, markdown format) to options left unset.
 * - Copies each option onto `ctx`, normalizing `autoScroll` (`true` becomes `{}`).
 * - Injects an Accept-Language header built from `location.languages`, or from the
 *   language detected from the URL, unless the caller already supplied that header
 *   under any casing (header names are case-insensitive).
 * - Forces `ctx.render = true` when screenshot / stealth / actions / branding /
 *   designAnalysis / autoScroll is requested, or when the URL matches a known SPA.
 *
 * @param ctx Mutable PipelineContext. Reads `ctx.options` and `ctx.url`; writes the
 *            normalized fields in place. Returns nothing.
 */
export function normalizeOptions(ctx) {
    const opts = ctx.options;
    // Apply agent-mode defaults (can be overridden by explicit options).
    // These are written onto the options object itself, so they are visible to
    // anything that reads ctx.options afterwards.
    if (opts.agentMode) {
        if (opts.budget === undefined)
            opts.budget = 4000;
        if (opts.format === undefined)
            opts.format = 'markdown';
    }
    const { render = false, stealth = false, wait = 0, format = 'markdown', timeout = 30000, userAgent, screenshot = false, screenshotFullPage = false, selector, exclude, includeTags, excludeTags, headers, cookies, raw = false, actions, extract, maxTokens, images: extractImagesFlag = false, profileDir, headed = false, storageState, proxy, fullPage = false, autoScroll: autoScrollOption, } = opts;
    // Normalize autoScroll option: `true` means "use defaults", an object is passed through
    const autoScrollOpts = autoScrollOption
        ? (typeof autoScrollOption === 'boolean' ? {} : autoScrollOption)
        : undefined;
    ctx.render = render;
    ctx.stealth = stealth;
    ctx.wait = wait;
    ctx.format = format;
    ctx.timeout = timeout;
    ctx.userAgent = userAgent;
    ctx.screenshot = screenshot;
    ctx.screenshotFullPage = screenshotFullPage;
    ctx.selector = selector;
    ctx.exclude = exclude;
    ctx.includeTags = includeTags;
    ctx.excludeTags = excludeTags;
    // Inject Accept-Language header when location.languages is specified,
    // or auto-detect from the URL when no languages are explicitly set.
    // This ensures both HTTP and browser requests use the correct language.
    const requestedLangs = opts.location?.languages;
    let languages;
    if (requestedLangs && requestedLangs.length > 0) {
        languages = requestedLangs;
    }
    else {
        const detectedLang = detectLanguageFromUrl(ctx.url);
        languages = detectedLang ? [detectedLang] : undefined;
    }
    // A caller-supplied header wins regardless of how its name is cased; injecting a
    // second, differently-cased key would send two conflicting Accept-Language fields.
    const callerSetAcceptLanguage = headers != null &&
        Object.keys(headers).some((name) => name.toLowerCase() === 'accept-language');
    ctx.headers = languages && !callerSetAcceptLanguage
        ? { 'Accept-Language': buildAcceptLanguageHeader(languages), ...headers }
        : headers;
    ctx.cookies = cookies;
    ctx.raw = raw;
    ctx.actions = actions;
    ctx.extract = extract;
    ctx.maxTokens = maxTokens;
    ctx.extractImagesFlag = extractImagesFlag;
    ctx.profileDir = profileDir;
    ctx.headed = headed;
    ctx.storageState = storageState;
    ctx.proxy = proxy;
    ctx.fullPage = fullPage;
    ctx.autoScrollOpts = autoScrollOpts;
    // NOTE: PDFs/DOCX are now handled via simpleFetch + document parser.
    // No need to force browser rendering for them.
    // Any of these features forces render mode.
    const needsBrowser = screenshot ||
        stealth ||
        (actions && actions.length > 0) ||
        opts.branding ||
        opts.designAnalysis ||
        autoScrollOpts;
    if (needsBrowser) {
        ctx.render = true;
    }
    // Auto-detect SPAs that require browser rendering (no --render flag needed).
    // This list is NOT proprietary — every developer knows these sites are SPAs.
    // The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
    if (!ctx.render) {
        const spaDomainsHook = getSPADomainsHook();
        const spaPatternsHook = getSPAPatternsHook();
        // Full SPA domain list — always available (npm + server)
        const DEFAULT_SPA_DOMAINS = new Set([
            // Search & travel
            'www.google.com',
            'flights.google.com',
            // Travel & hospitality
            'www.airbnb.com',
            'www.booking.com',
            'www.expedia.com',
            'www.kayak.com',
            'www.skyscanner.com',
            'www.tripadvisor.com',
            // Jobs
            'www.indeed.com',
            'www.glassdoor.com',
            // Real estate
            'www.zillow.com',
            // Prediction markets (extractor handles specific paths; browser render for unknown paths)
            'polymarket.com',
            'www.polymarket.com',
            // Our own dashboard
            'app.webpeel.dev',
        ]);
        const DEFAULT_SPA_PATTERNS = [
            /google\.com\/travel/,
            /google\.com\/maps/,
            /google\.com\/shopping/,
        ];
        // When a premium hook is installed, its return value is used in place of the
        // default list; otherwise the full default list applies.
        const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
        const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
        try {
            const hostname = new URL(ctx.url).hostname;
            if (SPA_DOMAINS.has(hostname)) {
                ctx.render = true;
                log.debug(`Auto-enabling render: SPA domain detected (${hostname})`);
            }
            else if (SPA_URL_PATTERNS.some(p => p.test(ctx.url))) {
                ctx.render = true;
                log.debug(`Auto-enabling render: SPA URL pattern matched`);
            }
        }
        catch {
            // Invalid URL — skip SPA detection
        }
    }
}
250
+ // ---------------------------------------------------------------------------
251
+ // Stage 2: handleYouTube
252
+ // ---------------------------------------------------------------------------
253
/**
 * Stage 2 — if the URL is a YouTube URL, attempt transcript extraction.
 *
 * Builds a markdown document (title, channel/duration/views/publish-date header,
 * optional summary, key points and chapters, then the full transcript) from the
 * transcript object returned by getYouTubeTranscript().
 *
 * @param ctx PipelineContext; reads `ctx.url`, `ctx.options.language` and
 *            `ctx.options.location.languages`.
 * @returns A PeelResult on success, or null to fall through to the normal pipeline
 *          (URL is not a YouTube video, or transcript extraction failed).
 */
export async function handleYouTube(ctx) {
    const ytVideoId = parseYouTubeUrl(ctx.url);
    if (!ytVideoId)
        return null;
    const ytStartTime = Date.now();
    try {
        const transcript = await getYouTubeTranscript(ctx.url, {
            language: ctx.options.language ?? ctx.options.location?.languages?.[0]?.split('-')[0] ?? 'en',
        });
        // Format view count (abbreviated to K / M with at most one decimal)
        let viewStr = '';
        if (transcript.viewCount) {
            const v = Number.parseInt(transcript.viewCount, 10);
            if (!Number.isNaN(v)) {
                if (v >= 1_000_000)
                    viewStr = `${(v / 1_000_000).toFixed(1).replace(/\.0$/, '')}M views`;
                else if (v >= 1_000)
                    viewStr = `${(v / 1_000).toFixed(1).replace(/\.0$/, '')}K views`;
                else
                    viewStr = `${v.toLocaleString()} views`;
            }
        }
        // Format publish date
        let publishStr = '';
        if (transcript.publishDate) {
            const published = new Date(transcript.publishDate);
            // `new Date()` does not throw on unparseable input; it yields an Invalid Date
            // whose timestamp is NaN. Fall back to the raw value in that case instead of
            // emitting the literal text "Invalid Date".
            publishStr = Number.isNaN(published.getTime())
                ? transcript.publishDate
                : published.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
        }
        // Build header metadata line
        const headerParts = [`**Channel:** ${transcript.channel}`];
        if (transcript.duration && transcript.duration !== '0:00')
            headerParts.push(`**Duration:** ${transcript.duration}`);
        if (viewStr)
            headerParts.push(`**${viewStr}**`);
        if (publishStr)
            headerParts.push(`**Published:** ${publishStr}`);
        /**
         * Strip music note symbols from YouTube auto-caption text.
         * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵
         */
        const cleanMusicNotes = (text) => text
            .replace(/\[[♪🎵]+\]/g, '')
            .replace(/♪\s*([^♪]*?)\s*♪/g, (_, inner) => inner.trim())
            .replace(/[♪🎵]+/g, '')
            .replace(/\s{2,}/g, ' ')
            .trim();
        // Add paragraph breaks to transcript for readability:
        // a sentence terminator followed by a capital letter starts a new paragraph.
        let readableText = cleanMusicNotes(transcript.fullText);
        readableText = readableText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
        readableText = readableText.replace(/\n{3,}/g, '\n\n');
        // Build a clean markdown representation of the video + transcript
        const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
        if (transcript.summary) {
            let summaryText = cleanMusicNotes(transcript.summary);
            summaryText = summaryText.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');
            parts.push(`## Summary\n\n${summaryText}`);
        }
        if (transcript.keyPoints && transcript.keyPoints.length > 0) {
            const cleanedKps = transcript.keyPoints.map((kp) => cleanMusicNotes(kp)).filter((kp) => kp.length > 0);
            if (cleanedKps.length > 0) {
                parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
            }
        }
        if (transcript.chapters && transcript.chapters.length > 0) {
            parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
        }
        parts.push(`## Full Transcript\n\n${readableText}`);
        const videoInfoContent = parts.join('\n\n');
        const elapsed = Date.now() - ytStartTime;
        const tokens = estimateTokens(videoInfoContent);
        // Short content fingerprint: first 16 hex chars of the SHA-256 digest
        const fingerprint = createHash('sha256').update(videoInfoContent).digest('hex').slice(0, 16);
        return {
            url: `https://www.youtube.com/watch?v=${ytVideoId}`,
            title: transcript.title,
            content: videoInfoContent,
            metadata: {
                description: `YouTube video by ${transcript.channel}, duration ${transcript.duration}`,
                author: transcript.channel,
            },
            links: [`https://www.youtube.com/watch?v=${ytVideoId}`],
            tokens,
            method: 'simple',
            elapsed,
            contentType: 'youtube',
            quality: 1.0,
            fingerprint,
            extracted: undefined,
            structured: transcript,
        };
    }
    catch (ytError) {
        // If transcript extraction fails (no captions, page changed, etc.),
        // fall through to the normal HTML fetch pipeline. Log the reason so the
        // fallback is diagnosable rather than silent.
        log.debug('YouTube transcript extraction failed, falling back to HTML fetch:', ytError instanceof Error ? ytError.message : ytError);
        return null;
    }
}
358
+ // ---------------------------------------------------------------------------
359
+ // Stage 3: fetchContent
360
+ // ---------------------------------------------------------------------------
361
/**
 * Stage 3 — fetch the URL and store the outcome in `ctx.fetchResult`.
 *
 * Order of attempts:
 *  1. Domain extractor first-pass (skipped when `options.noDomainApi` is set). On
 *     success the context is populated directly and no page fetch happens.
 *  2. `smartFetch()` with the normalized context options.
 *  3. If the fetch throws: domain extractor as fallback, then (for BlockedError only)
 *     the search-engine fallback, then a synthetic "Access Blocked" page.
 *     Timeout-looking errors are rethrown as `Errors.fetchTimeout`; anything else
 *     is rethrown unchanged.
 *
 * After a successful fetch it optionally auto-scrolls the kept-open page, records
 * `ctx.rawHtmlSize`, and tries to solve a detected challenge page when a solver
 * is enabled through the environment.
 *
 * @param ctx Mutable PipelineContext; written in place.
 * @returns Promise that resolves to undefined once the context is populated.
 * @throws The original fetch error, or `Errors.fetchTimeout(...)`, when no fallback applies.
 */
export async function fetchContent(ctx) {
    const needsBranding = ctx.options.branding && ctx.render;
    const needsAutoScroll = !!ctx.autoScrollOpts && ctx.render;
    const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
    // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML)
    // This avoids expensive browser fetches that often get blocked
    // Skip if noDomainApi is set — user wants raw page content, not API shortcut
    if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
        try {
            ctx.timer.mark('domainApiFirst');
            const ddResult = await runDomainExtract('', ctx.url);
            ctx.timer.end('domainApiFirst');
            // Very short extractor output is treated as a miss
            if (ddResult && ddResult.cleanContent.length > 50) {
                ctx.domainData = ddResult;
                ctx.content = ddResult.cleanContent;
                // Capture raw HTML size from the extractor (e.g. Wikipedia mobile-html size)
                if (ddResult.rawHtmlSize && ddResult.rawHtmlSize > 0) {
                    ctx.rawHtmlSize = ddResult.rawHtmlSize;
                }
                else {
                    // For API-first extractors (HN, Reddit, GitHub), the raw HTML page is typically
                    // 6-10x larger than the extracted content. Estimate conservatively at 7x.
                    ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
                }
                // Create minimal fetchResult so downstream stages don't crash
                ctx.fetchResult = {
                    html: ddResult.cleanContent,
                    url: ctx.url,
                    status: 200,
                    contentType: 'text/html',
                    method: 'domain-api',
                };
                ctx.title = ddResult.structured?.title || '';
                ctx.quality = 0.95; // High quality — structured API data
                // Compute basic metadata so downstream stages have wordCount etc.
                const domainWordCount = ddResult.cleanContent.split(/\s+/).filter(Boolean).length;
                ctx.metadata = {
                    ...(ctx.metadata || {}),
                    title: ddResult.structured?.title || ctx.title,
                    description: ddResult.structured?.description || ddResult.structured?.extract || '',
                    wordCount: domainWordCount,
                    language: ddResult.structured?.language || ctx.options.location?.languages?.[0]?.split('-')[0] || 'en',
                };
                ctx.domainApiHandled = true;
                return; // Skip browser fetch entirely
            }
        }
        catch (e) {
            // Domain API failed — fall through to normal fetch
            const errMsg = e instanceof Error ? e.message : String(e);
            log.warn('domain API first-pass failed, falling back to fetch:', errMsg);
            ctx.warnings.push(`Domain API extraction failed: ${errMsg}`);
        }
    }
    ctx.timer.mark('fetch');
    let fetchResult;
    try {
        fetchResult = await smartFetch(ctx.url, {
            forceBrowser: ctx.render,
            stealth: ctx.stealth,
            waitMs: ctx.wait,
            userAgent: ctx.userAgent,
            timeoutMs: ctx.timeout,
            screenshot: ctx.screenshot,
            screenshotFullPage: ctx.screenshotFullPage,
            headers: ctx.headers,
            cookies: ctx.cookies,
            actions: ctx.actions,
            // The page must stay open for any post-fetch step that drives it
            keepPageOpen: needsBranding || needsAutoScroll || needsDesignAnalysis,
            profileDir: ctx.profileDir,
            headed: ctx.headed,
            storageState: ctx.storageState,
            proxy: ctx.proxy,
            proxies: ctx.options.proxies,
            device: ctx.options.device,
            viewportWidth: ctx.options.viewportWidth,
            viewportHeight: ctx.options.viewportHeight,
            deviceScaleFactor: ctx.options.deviceScaleFactor,
            waitUntil: ctx.options.waitUntil,
            waitSelector: ctx.options.waitSelector,
            blockResources: ctx.options.blockResources,
            cloaked: ctx.options.cloaked,
            cycle: ctx.options.cycle,
            tls: ctx.options.tls,
            noEscalate: ctx.options.noEscalate,
        });
    }
    catch (fetchError) {
        // If fetch failed but we have a domain extractor, try it as fallback
        // Respect noDomainApi flag even in error fallback path
        if (hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi) {
            try {
                const ddResult = await runDomainExtract('', ctx.url);
                if (ddResult && ddResult.cleanContent.length > 50) {
                    ctx.timer.end('fetch');
                    ctx.domainData = ddResult;
                    ctx.content = ddResult.cleanContent;
                    if (ddResult.rawHtmlSize && ddResult.rawHtmlSize > 0) {
                        ctx.rawHtmlSize = ddResult.rawHtmlSize;
                    }
                    else {
                        // Estimate raw HTML size for API-first extractors (7x compression factor)
                        ctx.rawHtmlSize = ddResult.cleanContent.length * 7;
                    }
                    ctx.fetchResult = {
                        html: ddResult.cleanContent,
                        url: ctx.url,
                        status: 200,
                        contentType: 'text/html',
                        method: 'domain-api-fallback',
                    };
                    ctx.title = ddResult.structured?.title || '';
                    ctx.quality = 0.90;
                    const fallbackWordCount = ddResult.cleanContent.split(/\s+/).filter(Boolean).length;
                    ctx.metadata = { ...(ctx.metadata || {}), title: ddResult.structured?.title || ctx.title, wordCount: fallbackWordCount, language: ddResult.structured?.language || ctx.options.location?.languages?.[0]?.split('-')[0] || 'en' };
                    ctx.domainApiHandled = true;
                    return;
                }
            }
            catch (e) {
                // Domain API also failed — keep going so the original error is surfaced
                log.debug('domain API fallback after fetch failure also failed:', e instanceof Error ? e.message : e);
            }
        }
        // Search-as-proxy fallback for blocked requests (BlockedError before pipeline)
        // When all fetch strategies fail with a bot-protection block, try DDG search
        // to get the title/snippet from the search engine's cached version.
        if (fetchError instanceof BlockedError) {
            try {
                // @ts-ignore — proprietary module, gitignored
                const { searchFallback } = await import('./search-fallback.js');
                const searchResult = await searchFallback(ctx.url);
                // If DDG/primary returned very little, also try Bing for richer snippets
                if (!searchResult.cachedContent || searchResult.cachedContent.length < 400) {
                    try {
                        const { simpleFetch } = await import('./http-fetch.js');
                        const bingUrl = `https://www.bing.com/search?q=${encodeURIComponent(ctx.url)}`;
                        const bingResult = await simpleFetch(bingUrl, ctx.userAgent, 8000);
                        if (bingResult.html && bingResult.html.length > 500) {
                            const snippetMatch = bingResult.html.match(/<p[^>]*class="[^"]*snippet[^"]*"[^>]*>(.*?)<\/p>/gi);
                            if (snippetMatch) {
                                // Strip tags from each matched snippet paragraph
                                const bingSnippet = snippetMatch.map(s => s.replace(/<[^>]+>/g, '')).join('\n');
                                searchResult.cachedContent = (searchResult.cachedContent || '') + '\n\n---\n*Additional context from Bing:*\n' + bingSnippet;
                            }
                        }
                    }
                    catch { /* Bing fallback is best-effort */ }
                }
                if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
                    ctx.timer.end('fetch');
                    ctx.content = searchResult.cachedContent;
                    ctx.title = searchResult.title || ctx.title;
                    ctx.quality = 0.4;
                    ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
                    ctx.fetchResult = {
                        html: searchResult.cachedContent,
                        url: ctx.url,
                        status: 0,
                        contentType: 'text/markdown',
                        method: 'search-fallback',
                    };
                    ctx.metadata = {
                        ...(ctx.metadata || {}),
                        title: searchResult.title || ctx.title,
                        blocked: true,
                        fallbackSource: searchResult.source,
                    };
                    return;
                }
            }
            catch (e) {
                // Search fallback also failed (or the optional module is absent) —
                // continue to the blocked-page fallback below.
                log.debug('search fallback failed:', e instanceof Error ? e.message : e);
            }
        }
        // Enhance error messages with actionable advice
        if (fetchError instanceof BlockedError) {
            // Instead of crashing, return a helpful response with the block info
            ctx.timer.end('fetch');
            let host;
            try {
                // Strip only a leading "www." label; a plain string replace would also
                // hit "www." occurring later in the hostname.
                host = new URL(ctx.url).hostname.replace(/^www\./, '');
            }
            catch {
                // Never let URL parsing mask the block we are reporting
                host = ctx.url;
            }
            ctx.content = `# ⚠️ ${host} — Access Blocked\n\nThis site uses advanced bot protection and blocked our request.\n\n**What you can try:**\n- Use a browser profile with saved login: \`webpeel login ${host}\`\n- Try an alternative site that provides similar data\n\n*Direct link: [Open in browser](${ctx.url})*`;
            ctx.title = `${host} — Blocked`;
            ctx.quality = 0.2;
            ctx.warnings.push('Site blocked automated access. Showing fallback content.');
            ctx.fetchResult = {
                html: ctx.content,
                url: ctx.url,
                status: 403,
                contentType: 'text/markdown',
                method: 'blocked-fallback',
            };
            return;
        }
        const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
        if (errMsg.toLowerCase().includes('timeout') || errMsg.toLowerCase().includes('timed out') || errMsg.includes('AbortError')) {
            const ms = ctx.timeout ?? 30000;
            throw Errors.fetchTimeout(ctx.url, ms);
        }
        throw fetchError;
    }
    const fetchDuration = ctx.timer.end('fetch');
    // Fast path: if a plain HTTP fetch completed quickly with real HTML content,
    // mark it so post-processing can skip expensive heuristics (challenge detection).
    // Only applies to non-browser fetches that succeeded with HTML content.
    if (fetchDuration < 500 &&
        !ctx.render &&
        fetchResult.statusCode === 200 &&
        (fetchResult.contentType || '').includes('html') &&
        (fetchResult.html?.length || 0) > 200) {
        ctx.fastPath = true;
    }
    // Auto-scroll to load lazy content, then grab fresh HTML
    if (needsAutoScroll && fetchResult.page) {
        try {
            await runAutoScroll(fetchResult.page, ctx.autoScrollOpts);
            // Capture refreshed HTML after scrolling
            fetchResult.html = await fetchResult.page.content();
        }
        catch (e) {
            // Non-fatal: auto-scroll failed, continuing with whatever HTML we have
            log.debug('auto-scroll failed:', e instanceof Error ? e.message : e);
        }
        finally {
            // Close page unless branding or design analysis also needs it
            if (!needsBranding && !needsDesignAnalysis) {
                try {
                    await fetchResult.page.close().catch(() => { });
                    if (fetchResult.browser) {
                        await fetchResult.browser.close().catch(() => { });
                    }
                }
                catch (e) {
                    // Non-fatal: page/browser cleanup after auto-scroll
                    log.debug('page/browser cleanup after auto-scroll:', e instanceof Error ? e.message : e);
                }
                fetchResult.page = undefined;
            }
        }
    }
    // Capture raw HTML size BEFORE any processing (accurate measurement of original content)
    ctx.rawHtmlSize = fetchResult.html?.length || 0;
    ctx.fetchResult = fetchResult;
    // Attempt to solve challenge/CAPTCHA page when detected
    if (fetchResult.challengeDetected) {
        const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
        // Only attempt solve if we have a browser worker URL or local solving is explicitly enabled
        const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
        if (canSolve) {
            try {
                const { solveChallenge } = await import('../ee/challenge-solver.js');
                const { detectChallenge } = await import('./challenge-detection.js');
                const rawHtml = fetchResult.html || '';
                const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
                const challengeType = detectionResult.type || 'generic-block';
                const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
                    timeout: 15000,
                });
                if (solveResult.solved && solveResult.html) {
                    fetchResult.html = solveResult.html;
                    fetchResult.challengeDetected = false;
                    log.debug(`Challenge solved (${challengeType}) for ${ctx.url}`);
                }
                else {
                    ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
                }
            }
            catch (e) {
                ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
                log.debug('Challenge solve failed:', e instanceof Error ? e.message : e);
            }
        }
        else {
            ctx.warnings.push('Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.');
        }
    }
}
636
+ // ---------------------------------------------------------------------------
637
+ // Stage 4: detectContentType
638
+ // ---------------------------------------------------------------------------
639
/**
 * Stage 4 — detect and set `ctx.contentType` from the response content type,
 * the final URL's extension, and (when no content type was sent) the body itself.
 *
 * Resulting value is one of 'image', 'document', 'html', 'json', 'xml' or 'text';
 * anything unrecognized defaults to 'html'. Also sets `ctx.serverMarkdown` when the
 * server returned `text/markdown`.
 *
 * No-op when a domain API already produced the content (`ctx.domainApiHandled`).
 *
 * @param ctx Mutable PipelineContext; reads `ctx.fetchResult`.
 */
export function detectContentType(ctx) {
    // Skip HTML parsing stages — domain API already provided clean content
    if (ctx.domainApiHandled)
        return;
    const fetchResult = ctx.fetchResult;
    const ct = (fetchResult.contentType || '').toLowerCase();
    const urlLower = (fetchResult.url || '').toLowerCase();
    // `html` may be absent on a fetch result (other stages read it with `?.`),
    // so never dereference it directly when sniffing the body.
    const body = fetchResult.html || '';
    // Check for binary document types (PDF/DOCX)
    const isDocument = isPdfContentType(ct) || isDocxContentType(ct) ||
        urlLower.endsWith('.pdf') || urlLower.endsWith('.docx');
    // Check for image types (for OCR text extraction)
    const IMAGE_URL_EXTS = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.tiff', '.tif', '.bmp'];
    const isImage = !isDocument && (ct.startsWith('image/') ||
        IMAGE_URL_EXTS.some(ext => urlLower.endsWith(ext)));
    // 'html' also matches xhtml content types; with no content type at all, sniff the body
    const isHTML = !isDocument && !isImage && (ct.includes('html') || (!ct && body.trimStart().startsWith('<')));
    const isJSON = !isDocument && !isImage && ct.includes('json');
    const isXML = !isDocument && !isImage && (ct.includes('xml') || ct.includes('rss') || ct.includes('atom'));
    const isPlainText = !isDocument && !isImage && (ct.includes('text/plain') || ct.includes('text/markdown') || ct.includes('text/csv') || ct.includes('text/css') || ct.includes('javascript'));
    // Priority order matters: e.g. application/xhtml+xml is html, not xml
    ctx.contentType = isImage ? 'image' : isDocument ? 'document' : isHTML ? 'html' : isJSON ? 'json' : isXML ? 'xml' : isPlainText ? 'text' : 'html';
    // Flag when the server returned pre-rendered markdown — no HTML parsing needed
    if (ct.includes('text/markdown')) {
        ctx.serverMarkdown = true;
    }
}
666
+ // ---------------------------------------------------------------------------
667
+ // Stage 5: parseContent
668
+ // ---------------------------------------------------------------------------
669
+ /**
670
+ * Parse content from fetchResult based on the detected contentType.
671
+ * Sets ctx.content, ctx.title, ctx.metadata, ctx.links, ctx.quality, ctx.prunedPercent.
672
+ */
673
+ export async function parseContent(ctx) {
674
+ // Skip HTML parsing stages — domain API already provided clean content
675
+ if (ctx.domainApiHandled)
676
+ return;
677
+ const fetchResult = ctx.fetchResult;
678
+ const { contentType, format, fullPage, raw, selector, exclude, includeTags, excludeTags } = ctx;
679
+ const hasBuffer = !!fetchResult.buffer;
680
+ // === Image alt-text enhancement (opt-in, heuristic) ===
681
+ // Runs before any conversion so both lite mode and standard mode benefit.
682
+ if (ctx.options.captionImages && contentType === 'html' && fetchResult.html) {
683
+ ctx.timer.mark('captionImages');
684
+ const { enhanceImageAltText } = await import('./image-caption.js');
685
+ fetchResult.html = enhanceImageAltText(fetchResult.html);
686
+ ctx.timer.end('captionImages');
687
+ }
688
+ if (contentType === 'image' && hasBuffer) {
689
+ // === OCR pipeline — extract text from images using Tesseract.js ===
690
+ ctx.timer.mark('ocr');
691
+ const { extractTextFromImage } = await import('./ocr.js');
692
+ const ocrText = await extractTextFromImage(fetchResult.buffer);
693
+ ctx.timer.end('ocr');
694
+ if (ocrText.length > 0) {
695
+ ctx.content = `# OCR Text Extraction\n\n${ocrText}`;
696
+ }
697
+ else {
698
+ ctx.content = '# OCR Text Extraction\n\n*(No text detected in image)*';
699
+ }
700
+ ctx.title = '';
701
+ ctx.metadata = { url: fetchResult.url, title: '' };
702
+ ctx.quality = ocrText.length > 10 ? 0.8 : 0.1;
703
+ }
704
+ else if (contentType === 'document' && hasBuffer) {
705
+ // Document parsing pipeline (PDF/DOCX)
706
+ // 'clean' maps to 'markdown' for extraction; cleanForAI is applied in buildResult
707
+ const docFormat = format === 'clean' ? 'markdown' : format;
708
+ const docResult = await extractDocumentToFormat(fetchResult.buffer, {
709
+ url: fetchResult.url,
710
+ contentType: fetchResult.contentType,
711
+ format: docFormat,
712
+ });
713
+ ctx.content = docResult.content;
714
+ ctx.title = docResult.metadata.title;
715
+ ctx.metadata = docResult.metadata;
716
+ ctx.quality = 1.0; // Documents are inherently structured content
717
+ }
718
+ else if (contentType === 'html') {
719
+ // === Lite mode — minimal processing, maximum speed ===
720
+ // Skips pruning, metadata, quality scoring, JSON-LD. Just fetch → markdown.
721
+ if (ctx.options.lite) {
722
+ let liteHtml = fetchResult.html;
723
+ if (selector) {
724
+ liteHtml = selectContent(liteHtml, selector, exclude);
725
+ }
726
+ ctx.timer.mark('convert');
727
+ switch (format) {
728
+ case 'html':
729
+ ctx.content = liteHtml;
730
+ break;
731
+ case 'text':
732
+ ctx.content = htmlToText(liteHtml);
733
+ break;
734
+ case 'clean':
735
+ ctx.content = cleanForAI(htmlToMarkdown(liteHtml, { raw, prune: false }));
736
+ break;
737
+ default:
738
+ ctx.content = htmlToMarkdown(liteHtml, { raw, prune: false });
739
+ break;
740
+ }
741
+ ctx.timer.end('convert');
742
+ ctx.title = liteHtml.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.trim() || '';
743
+ ctx.quality = 0.5; // Unknown quality in lite mode
744
+ return;
745
+ }
746
+ // === JSON-LD extraction — first-class content source ===
747
+ // Many sites (recipes, products, articles) embed structured data that's
748
+ // more reliable than DOM parsing, especially on JS-heavy SPAs.
749
+ if (!raw && !selector) {
750
+ const { extractJsonLd } = await import('./json-ld.js');
751
+ const jsonLdResult = extractJsonLd(fetchResult.html);
752
+ if (jsonLdResult && jsonLdResult.found && jsonLdResult.content.length > 100) {
753
+ ctx.content = jsonLdResult.content;
754
+ ctx.title = jsonLdResult.title || ctx.title;
755
+ ctx.jsonLdType = jsonLdResult.type;
756
+ ctx.quality = 0.95; // Structured data is high quality
757
+ // Still extract metadata and links from HTML
758
+ ctx.timer.mark('metadata');
759
+ const meta = extractMetadata(fetchResult.html, fetchResult.url);
760
+ ctx.metadata = meta.metadata;
761
+ if (!ctx.title)
762
+ ctx.title = meta.title;
763
+ const htmlForLinks = fetchResult.html.length > 100000
764
+ ? fetchResult.html.slice(0, 100000)
765
+ : fetchResult.html;
766
+ ctx.links = extractLinks(htmlForLinks, fetchResult.url);
767
+ ctx.linkCount = ctx.links.length;
768
+ ctx.timer.end('metadata');
769
+ return;
770
+ }
771
+ }
772
+ // === Readable mode fast-path ===
773
+ // Run readability on raw HTML directly, skipping expensive prune + convert stages.
774
+ // Readability handles its own noise removal and outputs markdown, making prune/convert redundant.
775
+ if (ctx.options.readable && !raw && !selector && !fullPage) {
776
+ // Run readability and metadata extraction in parallel
777
+ const [readResult, metaResult] = await Promise.all([
778
+ Promise.resolve().then(() => {
779
+ ctx.timer.mark('readability');
780
+ const result = extractReadableContent(fetchResult.html, fetchResult.url);
781
+ ctx.timer.end('readability');
782
+ return result;
783
+ }),
784
+ Promise.resolve().then(() => {
785
+ ctx.timer.mark('metadata');
786
+ const meta = extractMetadata(fetchResult.html, fetchResult.url);
787
+ const htmlForLinks = fetchResult.html.length > 100000
788
+ ? fetchResult.html.slice(0, 100000)
789
+ : fetchResult.html;
790
+ const links = extractLinks(htmlForLinks, fetchResult.url);
791
+ ctx.timer.end('metadata');
792
+ return { meta, links };
793
+ }),
794
+ ]);
795
+ // Quality check: if readability result is < 15% of the HTML body text, it likely failed
796
+ // (picked footnotes, sidebar, or wrong section as "main content" — e.g. aosabook.org)
797
+ const htmlTextLen = fetchResult.html.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim().length;
798
+ const readableLen = readResult.content?.length || 0;
799
+ const readabilityFailed = htmlTextLen > 2000 && readableLen > 0 && readableLen < htmlTextLen * 0.15;
800
+ if (readabilityFailed) {
801
+ log.debug(`Readability returned only ${Math.round(readableLen / htmlTextLen * 100)}% of content — falling through to standard extraction`);
802
+ // Don't return early — fall through to standard HTML pipeline below
803
+ }
804
+ else {
805
+ ctx.readabilityResult = readResult;
806
+ ctx.content = readResult.content;
807
+ ctx.title = readResult.title || metaResult.meta.title || ctx.title;
808
+ ctx.metadata = {
809
+ ...metaResult.meta.metadata,
810
+ title: readResult.title || metaResult.meta.title,
811
+ ...(readResult.author ? { author: readResult.author } : {}),
812
+ ...(readResult.date ? { publishedDate: readResult.date } : {}),
813
+ };
814
+ ctx.links = metaResult.links;
815
+ ctx.linkCount = metaResult.links.length;
816
+ ctx.quality = readResult.content.length > 200 ? 0.95 : 0.5;
817
+ return;
818
+ }
819
+ }
820
+ // Standard HTML pipeline
821
+ let html = fetchResult.html;
822
+ // Apply include/exclude tags filtering first (before selector)
823
+ if (includeTags || excludeTags) {
824
+ html = filterByTags(html, includeTags, excludeTags);
825
+ }
826
+ if (selector) {
827
+ html = selectContent(html, selector, exclude);
828
+ }
829
+ else if (exclude?.length) {
830
+ // Apply exclude selectors even without a specific selector
831
+ const cheerio = await import('cheerio');
832
+ const $doc = cheerio.load(html);
833
+ exclude.forEach(sel => $doc(sel).remove());
834
+ html = $doc.html() || html;
835
+ }
836
+ // Smart main content detection (unless raw or selector specified)
837
+ let contentHtml = html;
838
+ if (!raw && !selector) {
839
+ const detected = detectMainContent(html);
840
+ if (detected.detected) {
841
+ contentHtml = detected.html;
842
+ }
843
+ }
844
+ const metadataTask = Promise.resolve().then(() => {
845
+ ctx.timer.mark('metadata');
846
+ const meta = extractMetadata(html, fetchResult.url);
847
+ // When budget is set, use pre-truncated HTML for link extraction (faster)
848
+ const htmlForLinks = (ctx.options.budget && ctx.options.budget > 0 && html.length > 100000)
849
+ ? html.slice(0, 100000)
850
+ : html;
851
+ const result = {
852
+ title: meta.title,
853
+ metadata: meta.metadata,
854
+ links: extractLinks(htmlForLinks, fetchResult.url),
855
+ };
856
+ ctx.timer.end('metadata');
857
+ return result;
858
+ });
859
+ // Content density pruning — runs on HTML before markdown conversion.
860
+ // Removes low-value blocks (sidebars, footers, ads) CSS selectors miss.
861
+ // OFF when fullPage=true, format !== markdown, or content is small (< 20K chars — overhead not worth it).
862
+ if (format === 'markdown' && !fullPage && contentHtml.length >= 20000) {
863
+ ctx.timer.mark('prune');
864
+ const pruned = pruneContent(contentHtml, { dynamic: true });
865
+ ctx.timer.end('prune');
866
+ contentHtml = pruned.html;
867
+ if (pruned.nodesRemoved > 0) {
868
+ ctx.prunedPercent = pruned.reductionPercent;
869
+ }
870
+ }
871
+ // OPTIMIZATION: When budget is set, pre-truncate HTML before markdown conversion.
872
+ // Converting 332K chars → markdown takes ~450ms. If budget=4000 tokens (~16K chars),
873
+ // we only need ~50K chars of HTML (3x overhead for tags/attributes).
874
+ // This cuts convert time from ~450ms to ~30ms on large pages.
875
+ let htmlForConvert = contentHtml;
876
+ // Skip pre-truncation when question is specified — QA needs full content to find answers
877
+ // that may be deep in the article (e.g., "Who coined AI?" → History section of Wikipedia)
878
+ const hasQuestion = !!ctx.options.question;
879
+ if (!hasQuestion && ctx.options.budget && ctx.options.budget > 0 && contentHtml.length > 50000) {
880
+ const estimatedCharsNeeded = ctx.options.budget * 12; // ~12 chars HTML per output token
881
+ const minChars = Math.max(estimatedCharsNeeded, 50000); // at least 50K to ensure quality
882
+ if (contentHtml.length > minChars) {
883
+ // Truncate at a block boundary (</p>, </div>, </li>, </tr>) to avoid broken HTML
884
+ const truncPoint = contentHtml.lastIndexOf('</', minChars);
885
+ if (truncPoint > minChars * 0.8) {
886
+ // Find the end of this closing tag
887
+ const tagEnd = contentHtml.indexOf('>', truncPoint);
888
+ htmlForConvert = contentHtml.slice(0, tagEnd > 0 ? tagEnd + 1 : minChars);
889
+ }
890
+ else {
891
+ htmlForConvert = contentHtml.slice(0, minChars);
892
+ }
893
+ if (process.env.DEBUG) {
894
+ log.debug(`budget pre-truncate: ${contentHtml.length} → ${htmlForConvert.length} chars`);
895
+ }
896
+ }
897
+ }
898
+ const contentTask = Promise.resolve().then(() => {
899
+ ctx.timer.mark('convert');
900
+ let converted;
901
+ switch (format) {
902
+ case 'html':
903
+ converted = htmlForConvert;
904
+ break;
905
+ case 'text':
906
+ converted = htmlToText(htmlForConvert);
907
+ break;
908
+ case 'clean': {
909
+ // First convert to markdown, then strip link syntax
910
+ const md = htmlToMarkdown(htmlForConvert, { raw, prune: false });
911
+ converted = cleanForAI(md);
912
+ break;
913
+ }
914
+ case 'markdown':
915
+ default:
916
+ // prune:false — already pruned above; avoid double-pruning in htmlToMarkdown
917
+ converted = htmlToMarkdown(htmlForConvert, { raw, prune: false });
918
+ break;
919
+ }
920
+ ctx.timer.end('convert');
921
+ return converted;
922
+ });
923
+ const [metaResult, convertedContent] = await Promise.all([metadataTask, contentTask]);
924
+ ctx.title = metaResult.title;
925
+ ctx.metadata = metaResult.metadata;
926
+ ctx.links = metaResult.links;
927
+ ctx.content = convertedContent;
928
+ // Safety net: if budget pre-truncation produced thin content but the full HTML
929
+ // has substantial content, redo conversion WITHOUT pre-truncation.
930
+ // This catches pages where the actual content is in the second half of the HTML
931
+ // (common for listing/index pages, SPAs with shell-first layouts).
932
+ if (htmlForConvert !== contentHtml && convertedContent.length < 200 && contentHtml.length > 20000) {
933
+ if (process.env.DEBUG) {
934
+ log.debug(`budget pre-truncation produced thin content (${convertedContent.length} chars from ${htmlForConvert.length} HTML). Retrying with full HTML (${contentHtml.length} chars).`);
935
+ }
936
+ ctx.timer.mark('convert-retry');
937
+ let retryConverted;
938
+ switch (format) {
939
+ case 'html':
940
+ retryConverted = contentHtml;
941
+ break;
942
+ case 'text':
943
+ retryConverted = htmlToText(contentHtml);
944
+ break;
945
+ case 'clean':
946
+ retryConverted = cleanForAI(htmlToMarkdown(contentHtml, { raw, prune: false }));
947
+ break;
948
+ case 'markdown':
949
+ default:
950
+ retryConverted = htmlToMarkdown(contentHtml, { raw, prune: false });
951
+ break;
952
+ }
953
+ ctx.timer.end('convert-retry');
954
+ ctx.content = retryConverted;
955
+ }
956
+ // Clean up markdown noise (empty links, excess newlines, trailing spaces)
957
+ if (format === 'markdown') {
958
+ ctx.content = cleanMarkdownNoise(ctx.content);
959
+ ctx.content = pruneMarkdown(ctx.content);
960
+ }
961
+ ctx.quality = calculateQuality(ctx.content, fetchResult.html);
962
+ }
963
+ else if (contentType === 'json') {
964
+ // JSON content — format nicely
965
+ try {
966
+ const parsed = JSON.parse(fetchResult.html);
967
+ ctx.content = JSON.stringify(parsed, null, 2);
968
+ ctx.title = 'JSON Response';
969
+ // Extract any URLs from JSON for links
970
+ const urlRegex = /https?:\/\/[^\s"'`,\]})]+/g;
971
+ const found = ctx.content.match(urlRegex) || [];
972
+ ctx.links = [...new Set(found)];
973
+ }
974
+ catch (e) {
975
+ // Non-fatal: JSON parse failed, treating as malformed
976
+ log.debug('JSON parse failed:', e instanceof Error ? e.message : e);
977
+ ctx.content = fetchResult.html;
978
+ ctx.title = 'JSON Response (malformed)';
979
+ }
980
+ ctx.quality = 1.0; // JSON is structured, always "clean"
981
+ }
982
+ else if (contentType === 'xml') {
983
+ // XML/RSS/Atom — convert to readable format
984
+ try {
985
+ const $ = (await import('cheerio')).load(fetchResult.html, { xml: true });
986
+ // Check if RSS/Atom feed
987
+ const items = $('item, entry');
988
+ if (items.length > 0) {
989
+ ctx.title = $('channel > title, feed > title').first().text() || 'RSS/Atom Feed';
990
+ const feedItems = [];
991
+ items.each((_, el) => {
992
+ const itemTitle = $(el).find('title').first().text();
993
+ const itemLink = $(el).find('link').first().text() || $(el).find('link').first().attr('href') || '';
994
+ const itemDesc = $(el).find('description, summary, content').first().text().slice(0, 200);
995
+ feedItems.push(`## ${itemTitle}\n${itemLink}\n${itemDesc}`);
996
+ if (itemLink)
997
+ ctx.links.push(itemLink);
998
+ });
999
+ ctx.content = `# ${ctx.title}\n\n${feedItems.join('\n\n---\n\n')}`;
1000
+ }
1001
+ else {
1002
+ ctx.content = fetchResult.html;
1003
+ ctx.title = $('title').first().text() || 'XML Document';
1004
+ }
1005
+ }
1006
+ catch (e) {
1007
+ // Non-fatal: XML/RSS parse failed, using raw content
1008
+ log.debug('XML/RSS parse failed:', e instanceof Error ? e.message : e);
1009
+ ctx.content = fetchResult.html;
1010
+ ctx.title = 'XML Document';
1011
+ }
1012
+ ctx.quality = 0.9;
1013
+ }
1014
+ else {
1015
+ // Plain text, CSS, JS, etc — return as-is
1016
+ ctx.content = fetchResult.html;
1017
+ ctx.title = fetchResult.url.split('/').pop() || 'Text Document';
1018
+ // Extract URLs from plain text
1019
+ const urlRegex = /https?:\/\/[^\s"'`,\]})]+/g;
1020
+ const found = ctx.content.match(urlRegex) || [];
1021
+ ctx.links = [...new Set(found)];
1022
+ ctx.quality = 1.0;
1023
+ }
1024
+ // --- Auth wall detection ---
1025
+ // Run after content extraction. Only check when content is sparse OR quality is low,
1026
+ // and we're not already in a blocked state, and we have HTML to analyze.
1027
+ if (ctx.fetchResult?.html &&
1028
+ !ctx.metadata?.blocked &&
1029
+ !ctx.authRequired &&
1030
+ (ctx.content.length < 800 || (ctx.quality ?? 1) < 0.3)) {
1031
+ const authCheck = detectAuthWall(ctx.fetchResult.html, ctx.url, ctx.fetchResult.statusCode ?? ctx.fetchResult.status);
1032
+ if (authCheck.isAuthWall) {
1033
+ ctx.authRequired = true;
1034
+ const host = (() => { try {
1035
+ return new URL(ctx.url).hostname.replace('www.', '');
1036
+ }
1037
+ catch {
1038
+ return ctx.url;
1039
+ } })();
1040
+ ctx.warnings.push(`Authentication required. This page is behind a login wall. ` +
1041
+ `Use a browser profile: webpeel profile create ${host} && webpeel "${ctx.url}" --profile ${host}`);
1042
+ }
1043
+ }
1044
+ }
1045
+ // ---------------------------------------------------------------------------
1046
+ // Stage 6: postProcess
1047
+ // ---------------------------------------------------------------------------
1048
/**
 * Stage 6: run every post-processing step, in sequence, on the pipeline context.
 *
 * Steps, in execution order:
 *   1. readability (only when the parse stage has not already stored a result),
 *   2. image extraction,
 *   3. structured extraction (LLM-based or CSS-based),
 *   4. quick answer,
 *   5. maxTokens truncation,
 *   6. budget distillation (with a head-truncation fallback),
 *   7. domain-aware extraction,
 *   8. challenge / bot-protection detection, with solver and search fallback,
 *   9. domain verification (best-effort),
 *  10. zero-content safety net.
 *
 * Lite mode (`ctx.options.lite`) skips everything.
 *
 * @param {object} ctx - Mutable pipeline context. Reads (among others) `fetchResult`,
 *   `contentType`, `options`, `content`, `metadata`, `format`, `url`; writes `content`,
 *   `title`, `metadata`, `quality`, `warnings` and several result fields.
 * @returns {Promise<void>} Resolves once `ctx` has been updated in place.
 */
export async function postProcess(ctx) {
    const fetchResult = ctx.fetchResult;
    const { contentType, options } = ctx;
    const isHTML = contentType === 'html';
    // Lite mode — skip all post-processing (no readability, no QA, no budget, no domain extract)
    if (options.lite)
        return;
    // Readability mode — skip if a readability result is already stored on the context.
    // Also skip if selector was used — user explicitly chose content, don't override with readability
    if (options.readable && isHTML && fetchResult.html && !ctx.readabilityResult && !ctx.selector) {
        ctx.timer.mark('readability');
        try {
            const readResult = extractReadableContent(fetchResult.html, fetchResult.url);
            // Quality check: if readability result is < 15% of full content, it likely failed
            // (picked footnotes, sidebar, or wrong section as "main content" — e.g. aosabook.org)
            const fullContentLen = ctx.content?.length || 0;
            const readableLen = readResult.content?.length || 0;
            if (fullContentLen > 0 && readableLen > 0 && readableLen < fullContentLen * 0.15) {
                // Readability failed — keep the full content (already in ctx.content)
                log.debug(`Readability returned only ${Math.round(readableLen / fullContentLen * 100)}% of content — using full extraction instead`);
            }
            else {
                ctx.readabilityResult = readResult;
                ctx.content = readResult.content;
                ctx.metadata = {
                    ...ctx.metadata,
                    title: readResult.title || ctx.metadata?.title,
                    // Only override author/date when readability actually found them, so that
                    // values already present in ctx.metadata are not clobbered with undefined.
                    ...(readResult.author ? { author: readResult.author } : {}),
                    ...(readResult.date ? { publishedDate: readResult.date } : {}),
                };
                ctx.title = readResult.title || ctx.title;
            }
        }
        catch (readErr) {
            // Readability can crash on complex DOMs (e.g. Amazon) — gracefully fall back to standard content
            log.debug('Readability failed, using standard content:', readErr instanceof Error ? readErr.message : readErr);
        }
        ctx.timer.end('readability');
    }
    // Extract images if requested
    if (ctx.extractImagesFlag && isHTML) {
        ctx.imagesList = extractImages(fetchResult.html, fetchResult.url);
    }
    // Extract structured data if requested
    if (ctx.extract && isHTML) {
        if (ctx.extract.llmApiKey && (ctx.extract.prompt || ctx.extract.schema)) {
            // LLM-powered extraction
            const { extractWithLLM } = await import('./extract.js');
            ctx.extracted = await extractWithLLM(ctx.content, ctx.extract);
        }
        else if (ctx.extract.selectors || ctx.extract.schema) {
            // CSS-based extraction (existing)
            ctx.extracted = extractStructured(fetchResult.html, ctx.extract);
        }
    }
    // Quick answer (LLM-free) — tries the extracted content first, then falls back to
    // the full raw HTML text if confidence is low (catches answers deep in the document
    // that earlier stages may have removed).
    if (options.question && ctx.content) {
        ctx.timer.mark('quickAnswer');
        let qa = runQuickAnswer({
            question: options.question,
            content: ctx.content,
            url: fetchResult.url,
        });
        // If confidence is below 0.91 and the raw HTML is more than twice the size of the
        // extracted content, try again on the full text.
        if (qa.confidence < 0.91 && fetchResult.html && fetchResult.html.length > ctx.content.length * 2) {
            const { htmlToText } = await import('./markdown.js');
            const fullText = htmlToText(fetchResult.html);
            const qaFull = runQuickAnswer({
                question: options.question,
                content: fullText,
                url: fetchResult.url,
            });
            // Use the full-text answer if it's more confident
            if (qaFull.confidence > qa.confidence) {
                qa = qaFull;
            }
        }
        ctx.timer.end('quickAnswer');
        ctx.quickAnswerResult = qa;
    }
    // Truncate to token budget if requested (simple truncation)
    if (ctx.maxTokens && ctx.maxTokens > 0) {
        ctx.content = truncateToTokenBudget(ctx.content, ctx.maxTokens);
    }
    // Smart budget distillation — applied AFTER maxTokens truncation.
    // Skipped when ctx.domainData is already set (domain-extracted content).
    if (options.budget && options.budget > 0 && !ctx.domainData) {
        const budgetFormat = ctx.contentType === 'json' ? 'json' :
            ctx.format === 'text' ? 'text' : 'markdown';
        const originalContent = ctx.content;
        ctx.timer.mark('budget');
        let budgetedContent = distillToBudget(ctx.content, options.budget, budgetFormat);
        ctx.timer.end('budget');
        if (process.env.DEBUG) {
            log.debug(`budget result: ${originalContent.length} → ${budgetedContent.length} chars`);
        }
        // Safety net: if distillation stripped too much (< 10% of original)
        // on a substantial page, fall back to simple head truncation.
        if (budgetedContent.length < originalContent.length * 0.10 && originalContent.length > 500) {
            // Remember what distillation produced before it is replaced, so the debug
            // log below reports the distilled size rather than the truncated size.
            const distilledLength = budgetedContent.length;
            const estimatedChars = options.budget * 4; // rough: 1 token ≈ 4 chars
            // Trim at a word boundary to avoid cutting mid-word
            let truncated = originalContent.slice(0, estimatedChars);
            const lastSpace = truncated.lastIndexOf(' ');
            if (lastSpace > estimatedChars * 0.8) {
                truncated = truncated.slice(0, lastSpace);
            }
            budgetedContent = truncated;
            ctx.budgetFallback = true;
            ctx.warnings.push('Content was truncated to fit budget using head truncation (BM25 distillation produced insufficient content)');
            if (process.env.DEBUG) {
                log.debug(`budget distillation fallback: BM25 produced ${distilledLength} chars (< 10% of ${originalContent.length}), using head truncation`);
            }
        }
        ctx.content = budgetedContent;
    }
    // Domain-aware structured extraction.
    // Fires when the URL has a domain extractor and no earlier stage handled it.
    // Replaces content with the extractor's clean content.
    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
        try {
            ctx.timer.mark('domainExtract');
            // Try raw HTML first, then fall back to the already-processed content
            let ddResult = await runDomainExtract(fetchResult.html, fetchResult.url);
            if (!ddResult && ctx.content) {
                ddResult = await runDomainExtract(ctx.content, fetchResult.url);
            }
            ctx.timer.end('domainExtract');
            if (ddResult) {
                ctx.domainData = ddResult;
                ctx.content = ddResult.cleanContent;
                // Update title from domain extractor (takes precedence over HTML page title)
                if (ddResult.structured?.title) {
                    ctx.title = ddResult.structured.title;
                }
            }
        }
        catch (e) {
            // Domain extraction failure is non-fatal; continue with normal content
            const errMsg2 = e instanceof Error ? e.message : String(e);
            log.warn('domain extraction (second pass) failed:', errMsg2);
            ctx.warnings.push(`Domain extraction (second pass) failed: ${errMsg2}`);
        }
    }
    // === Challenge / bot-protection page detection (post-extraction) ===
    // Runs detectChallenge() on the raw HTML (not on extracted markdown) unless the
    // context is flagged as fast-path. Very thin content from stealth/browser fetches
    // is also treated as a challenge page.
    if (!ctx.fastPath && ctx.fetchResult?.html) {
        const { detectChallenge } = await import('./challenge-detection.js');
        const rawHtml = ctx.fetchResult.html;
        const statusCode = ctx.fetchResult.statusCode ?? ctx.fetchResult.status;
        const postExtractChallenge = detectChallenge(rawHtml, statusCode);
        // Also flag very thin browser/stealth results (non-empty content under 100 chars)
        const isThinBrowserResult = ctx.content
            && ctx.content.length < 100
            && (ctx.stealth || ctx.fetchResult?.method === 'stealth' || ctx.fetchResult?.method === 'browser');
        const isChallengeContent = (postExtractChallenge.isChallenge && postExtractChallenge.confidence >= 0.7)
            || isThinBrowserResult;
        if (isChallengeContent) {
            const challengeType = postExtractChallenge.type || 'generic-block';
            log.debug(`Post-extraction challenge detected: ${challengeType} (confidence: ${postExtractChallenge.confidence.toFixed(2)}) for ${ctx.url}`);
            ctx.warnings.push('Bot protection detected. Content is a challenge page, not the actual page content.');
            if (ctx.metadata) {
                ctx.metadata.blocked = true;
                ctx.metadata.challengeDetected = true;
            }
            // Try challenge solver first (if browser worker available or local solve enabled)
            let solvedViaChallengeSolver = false;
            const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
            const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
            if (canSolve) {
                try {
                    const { solveChallenge } = await import('../ee/challenge-solver.js');
                    const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
                        timeout: 15000,
                    });
                    if (solveResult.solved && solveResult.html) {
                        // Re-parse the solved HTML, honouring the requested output format.
                        // 'clean' converts to markdown first and then cleans it, matching the
                        // other cleanForAI call sites; 'html' returns the HTML unchanged.
                        const { htmlToMarkdown, htmlToText, cleanForAI } = await import('./markdown.js');
                        const fmt = ctx.format || 'markdown';
                        if (fmt === 'html') {
                            ctx.content = solveResult.html;
                        }
                        else if (fmt === 'text') {
                            ctx.content = htmlToText(solveResult.html);
                        }
                        else if (fmt === 'clean') {
                            ctx.content = cleanForAI(htmlToMarkdown(solveResult.html));
                        }
                        else {
                            ctx.content = htmlToMarkdown(solveResult.html);
                        }
                        ctx.fetchResult.html = solveResult.html;
                        if (ctx.metadata) {
                            ctx.metadata.blocked = false;
                            ctx.metadata.challengeDetected = false;
                            ctx.metadata.challengeSolved = true;
                        }
                        solvedViaChallengeSolver = true;
                        log.debug(`Content-level challenge solved for ${ctx.url}`);
                    }
                }
                catch (e) {
                    log.debug('Content-level challenge solve failed:', e instanceof Error ? e.message : e);
                }
            }
            // Fall back to search fallback if challenge solve didn't work
            if (!solvedViaChallengeSolver) {
                try {
                    // @ts-ignore — proprietary module, gitignored
                    const { searchFallback } = await import('./search-fallback.js');
                    const searchResult = await searchFallback(ctx.url);
                    if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
                        ctx.content = searchResult.cachedContent;
                        ctx.title = searchResult.title || ctx.title;
                        ctx.quality = 0.4;
                        ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
                        if (ctx.metadata) {
                            ctx.metadata.fallbackSource = searchResult.source;
                        }
                    }
                }
                catch (e) {
                    // Search fallback failed — continue with challenge page content
                    log.debug('Search fallback failed:', e instanceof Error ? e.message : e);
                }
            }
        }
    }
    // === Active domain verification ===
    // Runs for every URL. Best-effort: any failure (module load, synchronous throw,
    // or rejected promise) leaves ctx.domainVerification as null instead of aborting.
    try {
        const { verifyDomain } = await import('./domain-verify.js');
        const existingHeaders = ctx.fetchResult?.responseHeaders || undefined;
        ctx.domainVerification = await verifyDomain(ctx.url, existingHeaders);
    }
    catch (e) {
        log.debug('Domain verification failed:', e instanceof Error ? e.message : e);
        ctx.domainVerification = null;
    }
    // === Zero-token safety net ===
    // NEVER return empty content. If pipeline produced nothing, fall back.
    if (!ctx.content || ctx.content.trim().length === 0) {
        ctx.warnings.push('Primary extraction failed; content sourced from fallback (meta description or raw HTML)');
        // Try 1: JSON-LD (may not have been tried if selector/raw was used)
        if (fetchResult.html) {
            const { extractJsonLd } = await import('./json-ld.js');
            const jsonLd = extractJsonLd(fetchResult.html);
            if (jsonLd?.content && jsonLd.content.length > 50) {
                ctx.content = jsonLd.content;
                ctx.title = jsonLd.title || ctx.title;
                ctx.jsonLdType = jsonLd.type;
                ctx.quality = 0.90;
                return;
            }
        }
        // Try 2: Meta description + title as minimal content
        const metaDesc = ctx.metadata?.description || ctx.metadata?.ogDescription;
        const pageTitle = ctx.title || ctx.metadata?.title;
        if (metaDesc || pageTitle) {
            const parts = [];
            if (pageTitle)
                parts.push(`# ${pageTitle}\n`);
            if (metaDesc)
                parts.push(metaDesc);
            ctx.content = parts.join('\n');
            ctx.quality = 0.3; // Low quality — we only got metadata
            return;
        }
        // Try 3: Raw text from HTML (strip all tags)
        if (fetchResult.html && fetchResult.html.length > 100) {
            const { htmlToText } = await import('./markdown.js');
            const rawText = htmlToText(fetchResult.html);
            if (rawText.trim().length > 50) {
                ctx.content = rawText.slice(0, 10000); // Cap at 10K chars
                ctx.quality = 0.2; // Very low quality
                return;
            }
        }
        // Try 4: Search-as-proxy fallback (when page appears blocked)
        try {
            // @ts-ignore — proprietary module, gitignored
            const { searchFallback } = await import('./search-fallback.js');
            const searchResult = await searchFallback(ctx.url);
            if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
                ctx.content = searchResult.cachedContent;
                ctx.title = searchResult.title || ctx.title;
                ctx.quality = 0.4; // Low quality — it's a search snippet, not the full page
                ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
                if (ctx.metadata) {
                    ctx.metadata.blocked = true;
                    ctx.metadata.fallbackSource = searchResult.source;
                }
                return;
            }
        }
        catch (e) {
            // Search fallback failed — nothing left to try; content stays empty
            log.debug('Search fallback failed:', e instanceof Error ? e.message : e);
        }
    }
}
1357
+ // ---------------------------------------------------------------------------
1358
+ // Stage 7: finalize
1359
+ // ---------------------------------------------------------------------------
1360
/**
 * Stage 7: finishing touches on the pipeline context.
 *
 * Performs, in order: screenshot → base64 conversion, branding extraction and
 * design analysis (both reuse the kept-open browser page), content change
 * tracking, and AI summary generation. Each optional step is best-effort:
 * failures are logged and do not reject.
 *
 * The browser page (and browser, when present) is closed exactly once, after
 * BOTH page-based extractions have had a chance to run. Closing it right after
 * branding would leave design analysis with a closed page whenever both
 * options are enabled.
 *
 * @param {object} ctx - Mutable pipeline context. Reads `fetchResult`, `options`,
 *   `render`, `content`; writes `screenshotBase64`, `brandingProfile`,
 *   `designAnalysisResult`, `changeResult`, `summaryText`.
 * @returns {Promise<void>} Resolves once `ctx` has been updated in place.
 */
export async function finalize(ctx) {
    const fetchResult = ctx.fetchResult;
    const { options } = ctx;
    // Convert screenshot buffer to base64 if present
    ctx.screenshotBase64 = fetchResult.screenshot?.toString('base64');
    // Page-based extractions (reuse the existing browser page when available)
    const page = fetchResult.page;
    const wantsBranding = Boolean(options.branding && ctx.render && page);
    const wantsDesignAnalysis = Boolean(options.designAnalysis && ctx.render && page);
    if (wantsBranding || wantsDesignAnalysis) {
        try {
            if (wantsBranding) {
                try {
                    const { extractBranding } = await import('./branding.js');
                    ctx.brandingProfile = await extractBranding(page);
                }
                catch (error) {
                    log.error('Branding extraction failed:', error);
                }
            }
            if (wantsDesignAnalysis) {
                try {
                    const { extractDesignAnalysis } = await import('./design-analysis.js');
                    ctx.designAnalysisResult = await extractDesignAnalysis(page);
                }
                catch (error) {
                    log.error('Design analysis extraction failed:', error);
                }
            }
        }
        finally {
            // Clean up the kept-open page and browser — once, after all page consumers ran
            try {
                await page.close().catch(() => { });
                if (fetchResult.browser) {
                    await fetchResult.browser.close().catch(() => { });
                }
            }
            catch (e) {
                // Non-fatal: page/browser cleanup after the last page-based extraction
                const cleanupLabel = wantsDesignAnalysis
                    ? 'page/browser cleanup after design analysis:'
                    : 'page/browser cleanup after branding:';
                log.debug(cleanupLabel, e instanceof Error ? e.message : e);
            }
        }
    }
    // Track content changes if requested
    if (options.changeTracking) {
        try {
            // Short fingerprint: first 16 hex chars of the SHA-256 of the content
            const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
            const { trackChange } = await import('./change-tracking.js');
            ctx.changeResult = await trackChange(fetchResult.url, ctx.content, fingerprint);
        }
        catch (error) {
            log.error('Change tracking failed:', error);
        }
    }
    // Generate AI summary if requested
    if (options.summary && options.llm) {
        try {
            const { summarizeContent } = await import('./summarize.js');
            // `summary` may be an object carrying maxLength; otherwise default to 150 words
            const maxLength = typeof options.summary === 'object' && options.summary.maxLength
                ? options.summary.maxLength
                : 150;
            ctx.summaryText = await summarizeContent(ctx.content, {
                apiKey: options.llm.apiKey,
                model: options.llm.model,
                apiBase: options.llm.baseUrl,
                maxWords: maxLength,
            });
        }
        catch (error) {
            log.error('Summary generation failed:', error);
        }
    }
}
1445
+ // ---------------------------------------------------------------------------
1446
+ // Stage 8: buildResult
1447
+ // ---------------------------------------------------------------------------
1448
/**
 * Assemble the final PeelResult from the pipeline context.
 *
 * Besides copying fields off `ctx`, this stage:
 *  - runs `sanitizeForLLM` over the content and swaps in the sanitized text
 *    when an injection is detected;
 *  - builds the `trust` object from source credibility, optional domain
 *    verification signals and optional safe-browsing data;
 *  - derives `freshness` from the fetch response headers;
 *  - records a thin-content note and picks at most one top-level `warning`;
 *  - applies the `clean` format, then derives token metrics, highlights and
 *    RAG chunks from the final text.
 *
 * Side effects on `ctx`: may reassign `ctx.content`, push onto `ctx.warnings`,
 * and set `ctx.metadata.contentQuality` / `ctx.metadata.blocked`.
 *
 * @param {object} ctx - Pipeline context populated by the earlier stages.
 * @returns {object} The assembled result object.
 */
export function buildResult(ctx) {
    const fetchResult = ctx.fetchResult;
    const elapsed = Date.now() - ctx.startTime;
    // --- Trust & Safety ---
    // Run prompt injection scan on final content
    const sanitizeResult = sanitizeForLLM(ctx.content);
    // If injection was detected, use the cleaned content
    if (sanitizeResult.injectionDetected) {
        ctx.content = sanitizeResult.content;
        ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
    }
    // Assess source credibility
    const credibility = getSourceCredibility(ctx.url);
    // Merge active domain verification signals (if available)
    const dv = ctx.domainVerification ?? null;
    const verificationBonus = dv?.verificationScore ?? 0;
    const finalCredibilityScore = Math.min(100, credibility.score + verificationBonus);
    // Merge signals/warnings from active verification into credibility
    const mergedSignals = [
        ...(credibility.signals ?? []),
        ...(dv?.signals ?? []),
    ];
    const mergedCredWarnings = [
        ...(credibility.warnings ?? []),
        ...(dv?.warnings ?? []),
    ];
    // Compute composite trust score from source credibility (0-100) + content safety
    let trustScore = finalCredibilityScore / 100; // normalize 0-100 → 0-1
    if (sanitizeResult.injectionDetected) {
        trustScore -= 0.3;
    }
    if ((ctx.quality ?? 1.0) < 0.5) {
        trustScore -= 0.1;
    }
    // Clamp to [0, 1] and round to two decimals
    trustScore = Math.round(Math.max(0, Math.min(1, trustScore)) * 100) / 100;
    // Build trust warnings
    const trustWarnings = [...mergedCredWarnings];
    if (credibility.tier === 'new') {
        trustWarnings.push('Domain has limited verifiable presence — exercise caution.');
    }
    if (credibility.tier === 'suspicious') {
        trustWarnings.push('Domain shows suspicious signals — treat content with caution.');
    }
    if (sanitizeResult.injectionDetected) {
        trustWarnings.push(`Prompt injection detected: ${sanitizeResult.detectedPatterns.join(', ')}`);
    }
    if (sanitizeResult.strippedChars > 0) {
        trustWarnings.push(`Stripped ${sanitizeResult.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
    }
    // Build verification sub-object (compact version for PeelResult)
    const verificationData = dv ? {
        tls: dv.tls ? { valid: dv.tls.valid, issuer: dv.tls.issuer, daysRemaining: dv.tls.daysRemaining } : null,
        dns: dv.dns ? { hasMx: dv.dns.hasMx, hasDmarc: dv.dns.hasDmarc, hasSpf: dv.dns.hasSpf } : null,
        headers: dv.headers ? { hsts: dv.headers.hsts, csp: dv.headers.csp, server: dv.headers.server } : null,
    } : undefined;
    // Include safe browsing data in trust object
    const sb = ctx.safeBrowsingResult;
    const trust = {
        source: {
            tier: credibility.tier,
            score: finalCredibilityScore,
            label: credibility.label,
            signals: mergedSignals,
            warnings: mergedCredWarnings,
            ...(verificationData ? { verification: verificationData } : {}),
        },
        contentSafety: {
            clean: !sanitizeResult.injectionDetected,
            injectionDetected: sanitizeResult.injectionDetected,
            detectedPatterns: sanitizeResult.detectedPatterns,
            strippedCount: sanitizeResult.strippedChars,
        },
        ...(sb ? {
            safeBrowsing: { safe: sb.safe, threats: sb.threats, source: sb.source },
        } : {}),
        ...(sb?.threatFeeds ? {
            threatFeeds: {
                safe: sb.threatFeeds.safe,
                threats: sb.threatFeeds.threats,
                source: sb.threatFeeds.source,
                ...(sb.threatFeeds.details ? { details: sb.threatFeeds.details } : {}),
            },
        } : {}),
        score: trustScore,
        warnings: trustWarnings,
    };
    // Hash the content before any clean-format rewrite below, so the
    // fingerprint does not vary with ctx.format.
    const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
    // Build freshness from fetchResult response headers
    const responseHeaders = fetchResult.responseHeaders;
    const freshness = {
        ...(responseHeaders?.['last-modified'] ? { lastModified: responseHeaders['last-modified'] } : {}),
        ...(responseHeaders?.['etag'] ? { etag: responseHeaders['etag'] } : {}),
        fetchedAt: new Date().toISOString(),
        ...(responseHeaders?.['cache-control'] ? { cacheControl: responseHeaders['cache-control'] } : {}),
    };
    // Detect and warn about potential content issues
    let warning;
    const contentLen = ctx.content.length;
    const htmlLen = fetchResult.html?.length ?? 0;
    // Add contentQuality metadata for thin content (< 100 words)
    const wordCount = ctx.content.trim().split(/\s+/).filter((w) => w.length > 0).length;
    if (wordCount < 100 && wordCount > 0) {
        ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
        if (ctx.metadata) {
            ctx.metadata.contentQuality = 'thin';
        }
    }
    if (contentLen < 100 && htmlLen > 1000) {
        warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
    }
    else if (ctx.budgetFallback) {
        warning = 'Budget distillation was unable to identify key content. Showing first portion of page instead. This may be a listing or index page — try fetching without a budget for full content.';
    }
    else if (contentLen < 50) {
        // Check if this looks like a blocked request: stealth or browser
        // rendering was already used (or requested) and still yielded almost nothing.
        const triedStealth = fetchResult.method === 'stealth' || ctx.options.stealth;
        const triedBrowser = fetchResult.method === 'browser' || ctx.options.render;
        if (triedStealth || triedBrowser) {
            warning = 'This site appears to use bot protection (Cloudflare, Akamai, PerimeterX). Try: --cloaked flag, a residential proxy (--proxy), or check if the URL requires authentication.';
            // Set blocked flag in metadata
            if (ctx.metadata) {
                ctx.metadata.blocked = true;
            }
        }
        else {
            warning = 'Very little content extracted. The page may require JavaScript rendering (try --render), be behind a login wall, or use bot protection.';
        }
    }
    // Apply clean format if requested (after the content-quality checks above)
    if (ctx.format === 'clean' && ctx.content) {
        ctx.content = cleanForAI(ctx.content);
    }
    // Token metrics are taken AFTER the clean-format step so that `tokens`
    // describes the exact text returned in `content`.
    const tokens = estimateTokens(ctx.content);
    // Token savings metrics — only when raw HTML size was captured (from actual fetch or domain extractor)
    const rawHtmlSize = ctx.rawHtmlSize ?? 0;
    const rawTokenEstimate = rawHtmlSize > 0 ? Math.round(rawHtmlSize / 4) : undefined;
    const tokenSavingsPercent = rawTokenEstimate !== undefined && rawTokenEstimate > 0
        ? Math.max(0, Math.round((1 - tokens / rawTokenEstimate) * 100))
        : undefined;
    // Query-dependent highlights (BM25-powered)
    let highlights;
    let highlightedContent;
    if (ctx.options.highlightQuery && ctx.content) {
        const highlightMaxChars = ctx.options.highlightMaxChars ?? 1000;
        const queryTerms = ctx.options.highlightQuery
            .toLowerCase()
            .replace(/[^\w\s]/g, ' ')
            .split(/\s+/)
            .filter((t) => t.length > 0);
        if (queryTerms.length > 0) {
            const blocks = splitIntoBlocks(ctx.content);
            const scores = scoreBM25(blocks, queryTerms);
            // Pair blocks with scores and sort by score descending
            const scored = blocks.map((block, i) => ({ text: block.raw, score: scores[i], index: i }));
            scored.sort((a, b) => b.score - a.score);
            // Take top blocks until highlightMaxChars is reached
            const selected = [];
            let totalChars = 0;
            for (const item of scored) {
                if (item.score <= 0) {
                    break; // list is sorted, so everything after this is zero-score too
                }
                // Always keep at least one block, even if it alone exceeds the budget
                if (totalChars + item.text.length > highlightMaxChars && selected.length > 0) {
                    break;
                }
                selected.push({ text: item.text, score: Math.round(item.score * 10000) / 10000 });
                totalChars += item.text.length;
            }
            if (selected.length > 0) {
                highlights = selected;
                highlightedContent = selected.map((h) => h.text).join('\n\n');
            }
        }
    }
    // Chunking for RAG pipelines
    let ragChunks;
    if (ctx.options.chunk) {
        // `chunk: true` means "use chunker defaults"; an object carries explicit options
        const chunkOpts = typeof ctx.options.chunk === 'object'
            ? ctx.options.chunk
            : {};
        const chunkResult = chunkContent(ctx.content, chunkOpts);
        ragChunks = chunkResult.chunks;
    }
    return {
        url: fetchResult.url,
        title: ctx.title,
        content: ctx.content,
        metadata: ctx.metadata,
        links: ctx.links,
        tokens,
        method: ctx.contentType === 'image' ? 'ocr' : fetchResult.method === 'cached' ? 'simple' : fetchResult.method,
        elapsed,
        screenshot: ctx.screenshotBase64,
        contentType: ctx.contentType,
        quality: ctx.quality,
        fingerprint,
        extracted: ctx.extracted,
        branding: ctx.brandingProfile,
        designAnalysis: ctx.designAnalysisResult,
        changeTracking: ctx.changeResult,
        summary: ctx.summaryText,
        images: ctx.imagesList,
        linkCount: ctx.links.length,
        freshness,
        ...(warning !== undefined ? { warning } : {}),
        ...(ctx.metadata && ctx.metadata.blocked ? { blocked: true } : {}),
        ...(ctx.authRequired ? { authRequired: true } : {}),
        ...(ctx.prunedPercent !== undefined ? { prunedPercent: ctx.prunedPercent } : {}),
        ...(ctx.domainData !== undefined ? { domainData: ctx.domainData } : {}),
        ...(ctx.readabilityResult !== undefined ? { readability: ctx.readabilityResult } : {}),
        ...(ctx.quickAnswerResult !== undefined ? { quickAnswer: ctx.quickAnswerResult } : {}),
        timing: ctx.timer.toTiming(),
        ...(ctx.jsonLdType !== undefined ? { jsonLdType: ctx.jsonLdType } : {}),
        ...(ctx.warnings.length > 0 ? { warnings: ctx.warnings } : {}),
        ...(ragChunks !== undefined ? { chunks: ragChunks } : {}),
        ...(highlights !== undefined ? { highlights } : {}),
        ...(highlightedContent !== undefined ? { highlightedContent } : {}),
        ...(ctx.serverMarkdown ? { serverMarkdown: true } : {}),
        ...(rawTokenEstimate !== undefined ? { rawTokenEstimate } : {}),
        ...(tokenSavingsPercent !== undefined ? { tokenSavingsPercent } : {}),
        ...(fetchResult.autoInteract !== undefined ? { autoInteract: fetchResult.autoInteract } : {}),
        trust,
    };
}