@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,1345 @@
1
+ /**
2
+ * Fetch commands: default URL handler, read, pipe
3
+ */
4
+ import ora from 'ora';
5
+ import { writeFileSync, readFileSync, existsSync } from 'fs';
6
+ import { getProfilePath, loadStorageState, touchProfile } from '../../core/profiles.js';
7
+ import { shouldForceBrowser } from '../../core/strategies.js';
8
+ import { peel, cleanup } from '../../index.js';
9
+ import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
10
+ import { getCache, setCache, parseTTL } from '../../cache.js';
11
+ import { estimateTokens, htmlToMarkdown } from '../../core/markdown.js';
12
+ import { distillToBudget, budgetListings } from '../../core/budget.js';
13
+ import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buildEnvelope, classifyErrorCode, formatListingsCsv, normaliseExtractedToRows, } from '../utils.js';
14
+ // ─── readStdin ────────────────────────────────────────────────────────────────
15
/**
 * Drain `process.stdin` and return everything it produced as a single string.
 *
 * Each chunk is kept as a Buffer and the pieces are joined *before* decoding,
 * so a multi-byte UTF-8 sequence that straddles a chunk boundary is still
 * decoded as one character.
 *
 * @returns {Promise<string>} The complete stdin contents decoded as UTF-8
 *   (an empty string when stdin yields no data).
 */
async function readStdin() {
    const received = [];
    let totalBytes = 0;
    for await (const piece of process.stdin) {
        // Buffer.from accepts both Buffer and string chunks.
        const asBuffer = Buffer.from(piece);
        received.push(asBuffer);
        totalBytes += asBuffer.length;
    }
    const joined = Buffer.concat(received, totalBytes);
    return joined.toString('utf-8');
}
22
+ // ─── runStdin ─────────────────────────────────────────────────────────────────
23
+ // Read HTML from stdin, convert to markdown, and output
24
/**
 * Convert HTML piped in on stdin to markdown and print it on stdout.
 *
 * The input is converted with `htmlToMarkdown` using `{ raw: false, prune: true }`.
 * With `options.json` set, the output is a single JSON line of the form
 * `{ success, content, tokens }`, where `tokens` comes from `estimateTokens`;
 * otherwise the markdown itself is written. Either way a trailing newline is
 * appended.
 *
 * The process exits with code 1 (after writing a message to stderr) when stdin
 * is empty or whitespace-only, or when any step throws.
 *
 * @param {{ json?: boolean }} options - CLI options; only `json` is read here.
 * @returns {Promise<void>}
 */
async function runStdin(options) {
    try {
        const rawInput = await readStdin();
        if (rawInput.trim().length === 0) {
            process.stderr.write('Error: No input received on stdin\n');
            process.exit(1);
        }
        const converted = htmlToMarkdown(rawInput, { raw: false, prune: true });
        // Pick the payload first, then emit it with one write.
        const payload = options.json
            ? JSON.stringify({ success: true, content: converted, tokens: estimateTokens(converted) })
            : converted;
        process.stdout.write(payload + '\n');
    }
    catch (err) {
        process.stderr.write(`Error: ${err.message}\n`);
        process.exit(1);
    }
}
45
+ // ─── runFetch ─────────────────────────────────────────────────────────────────
46
+ // Main fetch handler — shared with the `pipe` and `ask` subcommands
47
+ export async function runFetch(url, options) {
48
+ // --silent: suppress all log output (set env var before any logger fires)
49
+ if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
50
+ process.env.WEBPEEL_LOG_LEVEL = 'silent';
51
+ }
52
+ // --content-only: override all output flags — we just want raw content
53
+ if (options.contentOnly) {
54
+ options.silent = true;
55
+ // Disable json/text/html — we output content directly
56
+ options.json = false;
57
+ options.html = false;
58
+ options.text = false;
59
+ }
60
+ // Handle --format flag: maps to existing boolean flags
61
+ if (options.format) {
62
+ const fmt = options.format.toLowerCase();
63
+ if (fmt === 'text')
64
+ options.text = true;
65
+ else if (fmt === 'html')
66
+ options.html = true;
67
+ else if (fmt === 'json')
68
+ options.json = true;
69
+ else if (fmt === 'markdown' || fmt === 'md') { /* default, do nothing */ }
70
+ else {
71
+ console.error(`Unknown format: ${options.format}. Use: text, markdown, html, or json`);
72
+ process.exit(1);
73
+ }
74
+ }
75
+ // Smart defaults: when piped (not a TTY), default to silent JSON + budget
76
+ // BUT respect explicit --format flag (user chose the output format)
77
+ // AND respect --content-only (raw content output, no JSON wrapper)
78
+ const isPiped = !process.stdout.isTTY;
79
+ const hasExplicitFormat = options.format && ['text', 'html', 'markdown', 'md'].includes(options.format.toLowerCase());
80
+ if (isPiped && !options.html && !options.text && !hasExplicitFormat && !options.contentOnly) {
81
+ if (!options.json)
82
+ options.json = true;
83
+ if (!options.silent)
84
+ options.silent = true;
85
+ // Auto-enable readability for AI consumers — clean content by default
86
+ if (!options.readable && !options.fullNav) {
87
+ options.readable = true;
88
+ }
89
+ // Auto token budget for piped mode (AI consumers want concise content)
90
+ if (options.budget === undefined && !options.fullContent && !options.raw && !options.full) {
91
+ options.budget = 4000;
92
+ }
93
+ }
94
+ // --full alias: sets raw + fullContent
95
+ if (options.full) {
96
+ options.raw = true;
97
+ options.fullContent = true;
98
+ }
99
+ // Smart defaults for terminal (interactive) mode
100
+ const isTerminal = process.stdout.isTTY && !isPiped;
101
+ if (isTerminal && !options.raw && !options.html && !options.text) {
102
+ // Auto-readable: clean content by default (like browser Reader Mode)
103
+ if (!options.readable && !options.fullNav && !options.selector) {
104
+ options.readable = true;
105
+ }
106
+ // Default token budget: don't flood the terminal with 20K tokens
107
+ if (options.budget === undefined && !options.fullContent && !options.raw) {
108
+ options.budget = 4000;
109
+ }
110
+ }
111
+ // --agent sets sensible defaults for AI agents; explicit flags override
112
+ if (options.agent) {
113
+ if (!options.json)
114
+ options.json = true;
115
+ if (!options.silent)
116
+ options.silent = true;
117
+ if (!options.extractAll)
118
+ options.extractAll = true;
119
+ if (options.budget === undefined)
120
+ options.budget = 4000;
121
+ // Agent mode = clean content by default
122
+ if (!options.readable && !options.fullNav) {
123
+ options.readable = true;
124
+ }
125
+ }
126
+ const isJson = options.json;
127
+ // --- --list-schemas: print all available schemas and exit ---
128
+ if (options.listSchemas) {
129
+ const { loadBundledSchemas } = await import('../../core/schema-extraction.js');
130
+ const schemas = loadBundledSchemas();
131
+ if (isJson) {
132
+ await writeStdout(JSON.stringify(schemas.map(s => ({
133
+ name: s.name,
134
+ version: s.version,
135
+ domains: s.domains,
136
+ urlPatterns: s.urlPatterns,
137
+ })), null, 2) + '\n');
138
+ }
139
+ else {
140
+ console.log(`\nAvailable extraction schemas (${schemas.length}):\n`);
141
+ for (const s of schemas) {
142
+ console.log(` ${s.name} (v${s.version})`);
143
+ console.log(` Domains: ${s.domains.join(', ')}`);
144
+ if (s.urlPatterns && s.urlPatterns.length > 0) {
145
+ console.log(` URL patterns: ${s.urlPatterns.join(', ')}`);
146
+ }
147
+ console.log('');
148
+ }
149
+ }
150
+ process.exit(0);
151
+ }
152
+ // --- #4b: Read URL from stdin (pipe mode) if no URL argument provided ---
153
+ if ((!url || url.trim() === '') && !process.stdin.isTTY) {
154
+ try {
155
+ const stdinData = await readStdin();
156
+ const stdinUrl = stdinData.trim().split('\n')[0].trim();
157
+ if (stdinUrl && (stdinUrl.startsWith('http://') || stdinUrl.startsWith('https://'))) {
158
+ url = stdinUrl;
159
+ }
160
+ }
161
+ catch { /* ignore stdin read errors */ }
162
+ }
163
+ // --- #5: Concise error for missing URL (no help dump) ---
164
+ if (!url || url.trim() === '') {
165
+ if (isJson) {
166
+ await writeStdout(JSON.stringify({ success: false, error: { type: 'invalid_request', message: 'URL is required' } }) + '\n');
167
+ }
168
+ else {
169
+ console.error('Error: URL is required');
170
+ console.error('Usage: webpeel <url> [options]');
171
+ console.error('Run "webpeel --help" for full usage.');
172
+ }
173
+ process.exit(1);
174
+ }
175
+ // --- #6: Helper to output JSON errors and exit ---
176
+ function exitWithJsonError(message, code) {
177
+ if (isJson) {
178
+ process.stdout.write(JSON.stringify({
179
+ success: false,
180
+ error: { type: code.toLowerCase(), message },
181
+ }) + '\n');
182
+ }
183
+ else {
184
+ console.error(`Error: ${message}`);
185
+ }
186
+ process.exit(1);
187
+ }
188
+ // SECURITY: Enhanced URL validation
189
+ if (url.length > 2048) {
190
+ exitWithJsonError('URL too long (max 2048 characters)', 'INVALID_URL');
191
+ }
192
+ // Check for control characters
193
+ if (/[\x00-\x1F\x7F]/.test(url)) {
194
+ exitWithJsonError('URL contains invalid control characters', 'INVALID_URL');
195
+ }
196
+ // Validate URL format
197
+ try {
198
+ const parsed = new URL(url);
199
+ if (!['http:', 'https:'].includes(parsed.protocol)) {
200
+ exitWithJsonError('Only HTTP and HTTPS protocols are allowed', 'INVALID_URL');
201
+ }
202
+ }
203
+ catch {
204
+ // Check if it looks like a command/verb the user typed by mistake
205
+ const commonVerbs = ['fetch', 'get', 'scrape', 'read', 'download', 'curl', 'wget', 'peel'];
206
+ if (commonVerbs.includes(url.toLowerCase())) {
207
+ exitWithJsonError(`Did you mean: webpeel "${process.argv[3] || '<url>'}"?\nThe URL goes directly after webpeel — no verb needed.\nExample: webpeel "https://example.com" --json`, 'INVALID_URL');
208
+ }
209
+ else {
210
+ exitWithJsonError(`Invalid URL: "${url}"\nMake sure to include the protocol (https://)\nExample: webpeel "https://${url}" --json`, 'INVALID_URL');
211
+ }
212
+ }
213
+ const useStealth = options.stealth || false;
214
+ // Check usage quota
215
+ const usageCheck = await checkUsage();
216
+ if (!usageCheck.allowed) {
217
+ if (isJson) {
218
+ await writeStdout(JSON.stringify({ success: false, error: { type: 'rate_limited', message: usageCheck.message } }) + '\n');
219
+ process.exit(1);
220
+ }
221
+ console.error(usageCheck.message);
222
+ process.exit(1);
223
+ }
224
+ // ── --export: YouTube transcript download (early exit) ────────────────
225
+ if (options.export) {
226
+ const exportFmt = options.export.toLowerCase();
227
+ const validExportFmts = ['srt', 'txt', 'md', 'json'];
228
+ if (!validExportFmts.includes(exportFmt)) {
229
+ console.error(`Error: --export format must be one of: ${validExportFmts.join(', ')}`);
230
+ process.exit(1);
231
+ }
232
+ const exportCfg = loadConfig();
233
+ const exportApiKey = exportCfg.apiKey || process.env.WEBPEEL_API_KEY;
234
+ const exportApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
235
+ if (!exportApiKey) {
236
+ console.error('No API key configured. Run: webpeel auth <your-key>');
237
+ console.error('Get a free key at: https://app.webpeel.dev/keys');
238
+ process.exit(2);
239
+ }
240
+ const lang = options.language || 'en';
241
+ const exportUrl = `${exportApiUrl}/v1/transcript/export?url=${encodeURIComponent(url)}&format=${exportFmt}&language=${lang}`;
242
+ const exportRes = await fetch(exportUrl, {
243
+ headers: { 'Authorization': `Bearer ${exportApiKey}` },
244
+ signal: AbortSignal.timeout(options.timeout ?? 90000),
245
+ });
246
+ if (!exportRes.ok) {
247
+ const errBody = await exportRes.text().catch(() => '');
248
+ try {
249
+ const errJson = JSON.parse(errBody);
250
+ const msg = errJson?.error?.message || errJson?.message || exportRes.statusText;
251
+ console.error(`Export failed (${exportRes.status}): ${msg}`);
252
+ }
253
+ catch {
254
+ console.error(`Export failed (${exportRes.status}): ${exportRes.statusText}`);
255
+ }
256
+ process.exit(1);
257
+ }
258
+ const exportContent = await exportRes.text();
259
+ if (options.output) {
260
+ writeFileSync(options.output, exportContent, 'utf-8');
261
+ if (!options.silent) {
262
+ console.error(`Transcript saved to: ${options.output}`);
263
+ }
264
+ }
265
+ else {
266
+ process.stdout.write(exportContent);
267
+ if (!exportContent.endsWith('\n'))
268
+ process.stdout.write('\n');
269
+ }
270
+ await cleanup();
271
+ process.exit(0);
272
+ }
273
+ // Check cache first (before spinner/network)
274
+ // Default: 5m TTL for all CLI fetches unless --no-cache is set
275
+ let cacheTtlMs;
276
+ const cacheDisabled = options.cache === false; // --no-cache sets options.cache to false
277
+ const explicitTtl = typeof options.cache === 'string' ? options.cache : undefined;
278
+ if (!cacheDisabled) {
279
+ const ttlStr = explicitTtl || '5m';
280
+ try {
281
+ cacheTtlMs = parseTTL(ttlStr);
282
+ }
283
+ catch (e) {
284
+ exitWithJsonError(e.message, 'FETCH_FAILED');
285
+ }
286
+ const cacheOptions = {
287
+ render: options.render,
288
+ stealth: options.stealth,
289
+ selector: options.selector,
290
+ format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
291
+ budget: null, // Budget excluded from cache key — cache stores full content
292
+ readable: options.readable || false,
293
+ noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
294
+ };
295
+ const cachedResult = getCache(url, cacheOptions);
296
+ if (cachedResult) {
297
+ if (!options.silent) {
298
+ console.error(`\x1b[36m⚡ Cache hit\x1b[0m (TTL: ${ttlStr})`);
299
+ }
300
+ // Apply budget to cached content (cache stores full, budget is post-process)
301
+ if (options.budget && options.budget > 0 && cachedResult.content) {
302
+ const fmt = options.text ? 'text' : 'markdown';
303
+ cachedResult.content = distillToBudget(cachedResult.content, options.budget, fmt);
304
+ cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
305
+ }
306
+ // LLM extraction from cached content
307
+ if (options.llmExtract || options.extractSchema) {
308
+ const { extractWithLLM } = await import('../../core/llm-extract.js');
309
+ const llmCfgCached = loadConfig();
310
+ const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
311
+ if (!llmApiKeyCached) {
312
+ console.error('Error: LLM extraction requires an API key.\nSet OPENAI_API_KEY environment variable or use --llm-key <key>');
313
+ process.exit(1);
314
+ }
315
+ const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
316
+ const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
317
+ const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
318
+ // Parse schema if provided
319
+ let llmSchemaCached;
320
+ if (options.extractSchema) {
321
+ let schemaStr = options.extractSchema;
322
+ if (schemaStr.startsWith('@')) {
323
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
324
+ }
325
+ try {
326
+ llmSchemaCached = JSON.parse(schemaStr);
327
+ }
328
+ catch {
329
+ console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
330
+ process.exit(1);
331
+ }
332
+ }
333
+ const llmResultCached = await extractWithLLM({
334
+ content: cachedResult.content,
335
+ instruction: llmInstructionCached,
336
+ schema: llmSchemaCached,
337
+ apiKey: llmApiKeyCached,
338
+ model: llmModelCached,
339
+ baseUrl: llmBaseUrlCached,
340
+ });
341
+ await writeStdout(JSON.stringify(llmResultCached.items, null, 2) + '\n');
342
+ if (!options.silent) {
343
+ const { input, output } = llmResultCached.tokensUsed;
344
+ const costStr = llmResultCached.cost !== undefined ? ` | Est. cost: $${llmResultCached.cost.toFixed(6)}` : '';
345
+ console.error(`\n🤖 LLM extraction: ${llmResultCached.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResultCached.model}`);
346
+ }
347
+ process.exit(0);
348
+ }
349
+ // --- LLM-free Quick Answer (also on cached content) ---
350
+ if (options.question && cachedResult.content) {
351
+ const { quickAnswer } = await import('../../core/quick-answer.js');
352
+ const qa = quickAnswer({
353
+ question: options.question,
354
+ content: cachedResult.content,
355
+ url: cachedResult.url,
356
+ });
357
+ cachedResult.quickAnswer = qa;
358
+ if (!isJson) {
359
+ const conf = (qa.confidence * 100).toFixed(0);
360
+ await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
361
+ if (qa.answer) {
362
+ await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
363
+ }
364
+ else {
365
+ await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
366
+ }
367
+ if (qa.passages && qa.passages.length > 1) {
368
+ await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
369
+ for (const p of qa.passages.slice(1, 4)) {
370
+ await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
371
+ }
372
+ }
373
+ await writeStdout('\n');
374
+ await cleanup();
375
+ process.exit(0);
376
+ }
377
+ }
378
+ // --- BM25 Schema Template Extraction (cached path) ---
379
+ if (options.schema && cachedResult.content) {
380
+ const { getSchemaTemplate: getSchTmplCached } = await import('../../core/schema-templates.js');
381
+ const schTemplateCached = getSchTmplCached(options.schema);
382
+ if (schTemplateCached) {
383
+ const { quickAnswer: qaCached } = await import('../../core/quick-answer.js');
384
+ const { smartExtractSchemaFields: smartExtractCached } = await import('../../core/schema-postprocess.js');
385
+ const extractedCached = smartExtractCached(cachedResult.content, schTemplateCached.fields, qaCached, {
386
+ pageTitle: cachedResult.title,
387
+ pageUrl: cachedResult.url,
388
+ metadata: cachedResult.metadata,
389
+ });
390
+ cachedResult.extracted = extractedCached;
391
+ }
392
+ }
393
+ if (options.contentOnly) {
394
+ await writeStdout(cachedResult.content + '\n');
395
+ }
396
+ else {
397
+ await outputResult(cachedResult, options, { cached: true });
398
+ }
399
+ process.exit(0);
400
+ }
401
+ }
402
+ // --progress: show escalation steps on stderr (overrides spinner)
403
+ let progressInterval;
404
+ const progressStart = Date.now();
405
+ if (options.progress) {
406
+ process.stderr.write(`[simple] Fetching ${url}...\n`);
407
+ // Show escalation hints based on elapsed time (best-effort approximations)
408
+ const progressSteps = [
409
+ { afterMs: 2500, message: '[simple] Waiting for response...' },
410
+ { afterMs: 6000, message: '[browser] Simple too slow — escalating to browser render...' },
411
+ { afterMs: 12000, message: '[browser] Rendering with Chromium...' },
412
+ { afterMs: 20000, message: '[stealth] Escalating to stealth mode...' },
413
+ ];
414
+ let stepIdx = 0;
415
+ progressInterval = setInterval(() => {
416
+ const elapsed = Date.now() - progressStart;
417
+ while (stepIdx < progressSteps.length && elapsed >= progressSteps[stepIdx].afterMs) {
418
+ process.stderr.write(`${progressSteps[stepIdx].message}\n`);
419
+ stepIdx++;
420
+ }
421
+ }, 500);
422
+ }
423
+ // Suppress spinner when --progress is active (progress lines replace it)
424
+ const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
425
+ // Auto progress: after 3 s, update spinner text with elapsed time + method hints
426
+ // Updated every 2 s so the user knows we're still working.
427
+ const autoProgressStart = Date.now();
428
+ const autoProgressSteps = [
429
+ { afterMs: 3000, text: '⏳ Fetching... (slow response)' },
430
+ { afterMs: 6000, text: '⏳ Fetching with browser... ({s}s)' },
431
+ { afterMs: 12000, text: '⏳ Fetching with browser... ({s}s — stealth may be needed)' },
432
+ { afterMs: 20000, text: '⏳ Fetching with stealth browser + proxy... ({s}s)' },
433
+ ];
434
+ let autoProgressStepIdx = 0;
435
+ const autoProgressInterval = spinner ? setInterval(() => {
436
+ const elapsed = Date.now() - autoProgressStart;
437
+ const secs = Math.round(elapsed / 1000);
438
+ while (autoProgressStepIdx < autoProgressSteps.length &&
439
+ elapsed >= autoProgressSteps[autoProgressStepIdx].afterMs) {
440
+ autoProgressStepIdx++;
441
+ }
442
+ if (autoProgressStepIdx > 0 && spinner) {
443
+ const tmpl = autoProgressSteps[autoProgressStepIdx - 1].text;
444
+ spinner.text = tmpl.replace('{s}', String(secs));
445
+ }
446
+ }, 2000) : null;
447
+ try {
448
+ // Validate options
449
+ if (options.wait && (options.wait < 0 || options.wait > 60000)) {
450
+ throw Object.assign(new Error('Wait time must be between 0 and 60000ms'), { _code: 'FETCH_FAILED' });
451
+ }
452
+ // Parse custom headers
453
+ let headers;
454
+ if (options.header && options.header.length > 0) {
455
+ headers = {};
456
+ for (const header of options.header) {
457
+ const colonIndex = header.indexOf(':');
458
+ if (colonIndex === -1) {
459
+ throw Object.assign(new Error(`Invalid header format: ${header}. Expected "Key: Value"`), { _code: 'FETCH_FAILED' });
460
+ }
461
+ const key = header.slice(0, colonIndex).trim();
462
+ const value = header.slice(colonIndex + 1).trim();
463
+ headers[key] = value;
464
+ }
465
+ }
466
+ // Parse actions
467
+ let actions;
468
+ if (options.action && options.action.length > 0) {
469
+ try {
470
+ actions = parseActions(options.action);
471
+ }
472
+ catch (e) {
473
+ throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
474
+ }
475
+ }
476
+ // --extract-schema auto-enables JSON output
477
+ if (options.extractSchema) {
478
+ options.json = true;
479
+ }
480
+ // Parse extract
481
+ let extract;
482
+ if (options.llmExtract || options.extractSchema) {
483
+ // LLM-based extraction is handled post-fetch (after peel returns markdown).
484
+ // Early-validate that an API key is available so we fail fast.
485
+ const llmCfg = loadConfig();
486
+ const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
487
+ if (!llmApiKey) {
488
+ throw Object.assign(new Error('LLM extraction requires an API key.\n' +
489
+ 'Set OPENAI_API_KEY environment variable or use --llm-key <key>'), { _code: 'FETCH_FAILED' });
490
+ }
491
+ // Do NOT set extract here — peel runs normally, LLM extraction happens below.
492
+ }
493
+ else if (options.extract) {
494
+ // Smart extract: detect schema format vs CSS selectors
495
+ let extractJson;
496
+ try {
497
+ extractJson = JSON.parse(options.extract);
498
+ }
499
+ catch {
500
+ throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
501
+ }
502
+ // If all values are type names (string/boolean/number/array/object),
503
+ // treat as structured schema extraction (routed to extractStructured after fetch).
504
+ // Otherwise treat as CSS selector map.
505
+ const { isTypeSchema } = await import('../../core/structured-extract.js');
506
+ if (isTypeSchema(extractJson)) {
507
+ // Mark for post-fetch structured extraction (handled below)
508
+ options._structuredSchema = extractJson;
509
+ }
510
+ else {
511
+ // CSS-based extraction
512
+ extract = { selectors: extractJson };
513
+ }
514
+ }
515
+ // Validate maxTokens
516
+ if (options.maxTokens !== undefined) {
517
+ if (isNaN(options.maxTokens) || options.maxTokens < 100) {
518
+ throw Object.assign(new Error('--max-tokens must be at least 100'), { _code: 'FETCH_FAILED' });
519
+ }
520
+ }
521
+ // Parse include-tags and exclude-tags
522
+ let includeTags;
523
+ let excludeTags;
524
+ if (options.onlyMainContent) {
525
+ includeTags = ['main', 'article'];
526
+ }
527
+ else if (options.includeTags) {
528
+ includeTags = options.includeTags.split(',').map((t) => t.trim());
529
+ }
530
+ if (options.excludeTags) {
531
+ excludeTags = options.excludeTags.split(',').map((t) => t.trim());
532
+ }
533
+ // Build location options
534
+ let locationOptions;
535
+ if (options.location || options.language) {
536
+ locationOptions = {};
537
+ if (options.location) {
538
+ locationOptions.country = options.location;
539
+ }
540
+ if (options.language) {
541
+ locationOptions.languages = [options.language];
542
+ }
543
+ }
544
+ // ── Resolve --profile: name → path + storage state ─────────────────
545
+ let resolvedProfileDir;
546
+ let resolvedStorageState;
547
+ let resolvedProfileName;
548
+ if (options.profile) {
549
+ const profilePath = getProfilePath(options.profile);
550
+ if (profilePath) {
551
+ // It's a named profile in ~/.webpeel/profiles/
552
+ resolvedProfileDir = profilePath;
553
+ resolvedStorageState = loadStorageState(options.profile) ?? undefined;
554
+ resolvedProfileName = options.profile;
555
+ }
556
+ else if (existsSync(options.profile)) {
557
+ // It's a raw directory path (backward compat)
558
+ resolvedProfileDir = options.profile;
559
+ }
560
+ else {
561
+ exitWithJsonError(`Profile "${options.profile}" not found. Run "webpeel profile list" to see available profiles.`, 'PROFILE_NOT_FOUND');
562
+ }
563
+ }
564
+ // Build peel options
565
+ // --stealth auto-enables --render (stealth requires browser)
566
+ // --action auto-enables --render (actions require browser)
567
+ // --scroll-extract implies --render (needs browser)
568
+ //
569
+ // Bare --scroll-extract (no number) → smart autoScroll (detects stable height)
570
+ // --scroll-extract N (with number) → legacy fixed N scrolls via actions
571
+ const scrollExtractRaw = options.scrollExtract;
572
+ const isAutoScroll = scrollExtractRaw !== undefined && typeof scrollExtractRaw !== 'number';
573
+ const scrollExtractCount = isAutoScroll
574
+ ? 0
575
+ : (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
576
+ const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
577
+ || (options.device && options.device !== 'desktop')
578
+ || !!options.viewport
579
+ || !!options.waitUntil
580
+ || !!options.waitSelector
581
+ || !!options.blockResources
582
+ || !!options.screenshot // Auto-enable render for screenshot (needs browser)
583
+ || false;
584
+ // Inject scroll actions when --scroll-extract N (fixed count) is used
585
+ if (scrollExtractCount > 0) {
586
+ const scrollActions = [];
587
+ for (let i = 0; i < scrollExtractCount; i++) {
588
+ scrollActions.push({ type: 'scroll', to: 'bottom' });
589
+ scrollActions.push({ type: 'wait', ms: 1500 });
590
+ }
591
+ actions = actions ? [...actions, ...scrollActions] : scrollActions;
592
+ }
593
+ const peelOptions = {
594
+ render: useRender,
595
+ stealth: options.stealth || false,
596
+ wait: options.wait || 0,
597
+ timeout: options.timeout,
598
+ userAgent: options.ua,
599
+ screenshot: options.screenshot !== undefined,
600
+ screenshotFullPage: options.fullPage || false,
601
+ selector: options.selector,
602
+ exclude: options.exclude,
603
+ includeTags,
604
+ excludeTags,
605
+ headers,
606
+ cookies: options.cookie,
607
+ raw: options.raw || false,
608
+ noDomainApi: options.skipDomainApi || false,
609
+ lite: options.lite || false,
610
+ actions,
611
+ maxTokens: options.maxTokens,
612
+ // Note: budget is applied AFTER caching (so cache stores full content)
613
+ // We pass it to peel() for programmatic API compatibility, but the CLI
614
+ // also applies it post-fetch (see below) to ensure cache stores full result.
615
+ extract,
616
+ images: options.images || false,
617
+ location: locationOptions,
618
+ profileDir: resolvedProfileDir,
619
+ headed: options.headed || false,
620
+ storageState: resolvedStorageState,
621
+ proxy: options.proxy,
622
+ proxies: options.proxies,
623
+ fullPage: options.fullContent || false,
624
+ readable: options.readable || false,
625
+ // Smart auto-scroll (bare --scroll-extract flag)
626
+ autoScroll: isAutoScroll
627
+ ? { timeout: options.scrollExtractTimeout }
628
+ : undefined,
629
+ device: options.device,
630
+ viewportWidth: options.viewport ? options.viewport.width : undefined,
631
+ viewportHeight: options.viewport ? options.viewport.height : undefined,
632
+ deviceScaleFactor: options.scale,
633
+ waitUntil: options.waitUntil,
634
+ waitSelector: options.waitSelector,
635
+ blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
636
+ cloaked: options.cloaked ? true : undefined,
637
+ cycle: options.cycle ? true : undefined,
638
+ tls: (options.tls || options.cycle) ? true : undefined,
639
+ highlightQuery: options.highlightQuery,
640
+ highlightMaxChars: options.highlightMaxChars,
641
+ };
642
+ if (options.cloaked) {
643
+ peelOptions.render = true; // CloakBrowser is a browser
644
+ }
645
+ // Add chunk option if requested
646
+ if (options.chunk) {
647
+ peelOptions.chunk = {
648
+ maxTokens: options.chunkSize || 512,
649
+ overlap: options.chunkOverlap || 50,
650
+ strategy: options.chunkStrategy || 'section',
651
+ };
652
+ }
653
+ // Add summary option if requested
654
+ if (options.summary) {
655
+ const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
656
+ if (!llmApiKey) {
657
+ throw Object.assign(new Error('--summary requires --llm-key or OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
658
+ }
659
+ peelOptions.summary = true;
660
+ peelOptions.llm = {
661
+ apiKey: llmApiKey,
662
+ model: process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini',
663
+ baseUrl: process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1',
664
+ };
665
+ }
666
+ // Determine format
667
+ if (options.html) {
668
+ peelOptions.format = 'html';
669
+ }
670
+ else if (options.text) {
671
+ peelOptions.format = 'text';
672
+ }
673
+ else if (options.clean) {
674
+ peelOptions.format = 'clean';
675
+ // --clean implies readable mode (article content only, no navs/footers)
676
+ peelOptions.readable = true;
677
+ }
678
+ else {
679
+ peelOptions.format = 'markdown';
680
+ }
681
+ // Fetch the page — route through API if key is configured, otherwise require auth
682
+ const fetchCfg = loadConfig();
683
+ const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
684
+ const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
685
+ // Features that require a local browser and cannot be delegated to the remote API.
686
+ // Also include domains (like amazon.com) that require stealth/browser rendering —
687
+ // the remote API won't render them correctly without special flags, so route locally.
688
+ const domainNeedsLocalBrowser = !!(shouldForceBrowser(url));
689
+ const needsLocalBrowser = !!(peelOptions.screenshot ||
690
+ peelOptions.actions?.length ||
691
+ peelOptions.profileDir ||
692
+ peelOptions.headed ||
693
+ peelOptions.storageState ||
694
+ peelOptions.cloaked ||
695
+ domainNeedsLocalBrowser);
696
+ let result;
697
+ if (fetchApiKey && !needsLocalBrowser) {
698
+ // Use the WebPeel API — no local Playwright needed
699
+ result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
700
+ }
701
+ else {
702
+ // No API key — fall back to local peel() mode (runs locally, no API needed)
703
+ if (spinner)
704
+ spinner.text = 'Fetching locally (no API key)…';
705
+ const startLocal = Date.now();
706
+ const { peel } = await import('../../index.js');
707
+ const localResult = await peel(url, peelOptions);
708
+ const elapsed = Date.now() - startLocal;
709
+ // Normalize to the shape fetchViaApi returns
710
+ result = {
711
+ ...localResult,
712
+ elapsed: localResult.elapsed ?? elapsed,
713
+ method: localResult.method ?? 'local',
714
+ tokens: localResult.tokens ?? Math.ceil((localResult.content?.length ?? 0) / 4),
715
+ cached: false,
716
+ };
717
+ }
718
+ // Update lastUsed timestamp for named profiles
719
+ if (resolvedProfileName) {
720
+ touchProfile(resolvedProfileName);
721
+ }
722
+ // Stop progress intervals and show final result
723
+ if (progressInterval) {
724
+ clearInterval(progressInterval);
725
+ progressInterval = undefined;
726
+ }
727
+ if (autoProgressInterval)
728
+ clearInterval(autoProgressInterval);
729
+ if (options.progress) {
730
+ const method = result.method || 'simple';
731
+ const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
732
+ const tokenCount = (result.tokens || 0).toLocaleString();
733
+ // Show escalation arrow if browser/stealth was needed
734
+ if (method !== 'simple') {
735
+ process.stderr.write(`[simple] → [${method}] escalated\n`);
736
+ }
737
+ process.stderr.write(`[${method}] Done — ${tokenCount} tokens in ${elapsedSec}s\n`);
738
+ }
739
+ else if (spinner) {
740
+ const domainTag = result.domainData
741
+ ? ` [${result.domainData.domain}:${result.domainData.type}]`
742
+ : '';
743
+ spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
744
+ // Smart hints — suggest features the user might not know about
745
+ if (!options.silent && !options.json && !options.skipDomainApi) {
746
+ if (result.method === 'domain-api') {
747
+ const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
748
+ console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
749
+ }
750
+ }
751
+ if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
752
+ console.error(`\x1b[33m💡 Tip: Very little content extracted. This may be a JavaScript-rendered page.\x1b[0m`);
753
+ console.error(`\x1b[33m Try: webpeel "${url}" --render\x1b[0m`);
754
+ console.error(`\x1b[33m For infinite scroll/SPAs: --action 'scroll:bottom' --action 'wait:2000'\x1b[0m`);
755
+ console.error(`\x1b[33m Or use --stealth if the site blocks bots.\x1b[0m`);
756
+ }
757
+ // Auth wall detection hint
758
+ if (!options.json && result.authRequired) {
759
+ let authHost = url;
760
+ try {
761
+ authHost = new URL(url).hostname.replace('www.', '');
762
+ }
763
+ catch { /* ignore */ }
764
+ console.error('');
765
+ console.error('\x1b[33m🔐 This page requires authentication.\x1b[0m');
766
+ console.error(`\x1b[36m 1. Create a login profile: webpeel profile create ${authHost}\x1b[0m`);
767
+ console.error('\x1b[36m 2. Log in to the site in the browser that opens\x1b[0m');
768
+ console.error('\x1b[36m 3. Press Ctrl+C when done\x1b[0m');
769
+ console.error(`\x1b[36m 4. Re-run with: webpeel "${url}" --profile ${authHost}\x1b[0m`);
770
+ console.error('');
771
+ }
772
+ }
773
+ // Trust & safety warnings — shown prominently in non-JSON mode
774
+ if (!options.silent && !options.json) {
775
+ const trustData = result.trust;
776
+ const sbData = result.safeBrowsing;
777
+ // Unsafe: safe browsing threats detected
778
+ const allThreats = [
779
+ ...(sbData?.threats ?? []),
780
+ ...(trustData?.threatFeeds?.threats ?? []),
781
+ ].filter((t, i, a) => a.indexOf(t) === i);
782
+ if (sbData && !sbData.safe && allThreats.length > 0) {
783
+ console.error(`\x1b[31m🚨 UNSAFE — Threats detected: ${allThreats.join(', ')}\x1b[0m`);
784
+ }
785
+ else if (trustData?.threatFeeds && !trustData.threatFeeds.safe && trustData.threatFeeds.threats.length > 0) {
786
+ console.error(`\x1b[31m🚨 UNSAFE — Threat feeds flagged: ${trustData.threatFeeds.threats.join(', ')}\x1b[0m`);
787
+ if (trustData.threatFeeds.details) {
788
+ console.error(`\x1b[31m ${trustData.threatFeeds.details}\x1b[0m`);
789
+ }
790
+ }
791
+ else if (trustData && trustData.score < 0.5) {
792
+ // Low trust score
793
+ const tier = trustData.source?.tier ?? 'unknown';
794
+ const label = trustData.source?.label ?? '';
795
+ const reason = tier === 'suspicious'
796
+ ? 'Domain shows suspicious signals'
797
+ : tier === 'new'
798
+ ? 'Domain has limited verifiable presence'
799
+ : label || 'Low credibility domain';
800
+ console.error(`\x1b[33m⚠️ Low trust score (${trustData.score.toFixed(2)}) — ${reason}\x1b[0m`);
801
+ }
802
+ // Show any trust warnings
803
+ if (trustData?.warnings && trustData.warnings.length > 0) {
804
+ for (const warn of trustData.warnings) {
805
+ console.error(`\x1b[33m⚠️ ${warn}\x1b[0m`);
806
+ }
807
+ }
808
+ }
809
+ // Show metadata header
810
+ const pageTitle = result.metadata?.title || result.title;
811
+ if (!options.silent && !options.json && pageTitle) {
812
+ const parts = [];
813
+ if (result.metadata?.author)
814
+ parts.push(`by ${result.metadata.author}`);
815
+ if (result.readability?.readingTime)
816
+ parts.push(result.readability.readingTime);
817
+ if (result.tokens)
818
+ parts.push(`${result.tokens.toLocaleString()} tokens`);
819
+ const subtitle = parts.length ? ` · ${parts.join(' · ')}` : '';
820
+ console.error(`\x1b[36m📄 ${pageTitle}${subtitle}\x1b[0m`);
821
+ }
822
+ // Show usage footer for free/anonymous users
823
+ if (usageCheck.usageInfo && !options.silent) {
824
+ showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, useStealth);
825
+ }
826
+ // Handle screenshot saving
827
+ if (options.screenshot && result.screenshot) {
828
+ const screenshotPath = typeof options.screenshot === 'string'
829
+ ? options.screenshot
830
+ : 'screenshot.png';
831
+ const screenshotBuffer = Buffer.from(result.screenshot, 'base64');
832
+ writeFileSync(screenshotPath, screenshotBuffer);
833
+ if (!options.silent) {
834
+ console.error(`Screenshot saved to: ${screenshotPath}`);
835
+ }
836
+ // Remove screenshot from JSON output if saving to file
837
+ if (typeof options.screenshot === 'string') {
838
+ delete result.screenshot;
839
+ }
840
+ }
841
+ // Store full result in cache (before budget distillation so cache is reusable)
842
+ if (cacheTtlMs && !cacheDisabled) {
843
+ setCache(url, result, cacheTtlMs, {
844
+ render: options.render,
845
+ stealth: useStealth,
846
+ selector: options.selector,
847
+ format: peelOptions.format,
848
+ budget: null, // Budget excluded — cache stores full content, budget applied post-cache
849
+ readable: options.readable || false,
850
+ });
851
+ }
852
+ // Apply smart budget distillation AFTER caching (cache always stores full content)
853
+ // When --agent is set, always apply budget even with --extract-all (listings will be budgeted
854
+ // separately, but if no listings are found the content itself still needs trimming).
855
+ const skipBudgetForExtract = (options.extractAll || options.scrollExtract !== undefined) && !options.agent;
856
+ let contentTruncated = false;
857
+ if (options.budget && options.budget > 0 && !skipBudgetForExtract) {
858
+ const budgetFormat = peelOptions.format === 'text' ? 'text' : 'markdown';
859
+ const distilled = distillToBudget(result.content, options.budget, budgetFormat);
860
+ if (distilled !== result.content) {
861
+ contentTruncated = true;
862
+ result.content = distilled;
863
+ result.tokens = estimateTokens(distilled);
864
+ }
865
+ }
866
+ // --- BM25 Query-Focused Filtering ---
867
+ if (options.focus && result.content) {
868
+ const { filterByRelevance } = await import('../../core/bm25-filter.js');
869
+ const focusResult = filterByRelevance(result.content, { query: options.focus });
870
+ result.content = focusResult.content;
871
+ result.tokens = estimateTokens(focusResult.content);
872
+ if (isJson) {
873
+ result.focusQuery = options.focus;
874
+ result.focusReduction = focusResult.reductionPercent;
875
+ }
876
+ }
877
+ // --- LLM-free Quick Answer ---
878
+ if (options.question && result.content) {
879
+ const { quickAnswer } = await import('../../core/quick-answer.js');
880
+ const qa = quickAnswer({
881
+ question: options.question,
882
+ content: result.content,
883
+ url: result.url,
884
+ });
885
+ result.quickAnswer = qa;
886
+ if (!isJson) {
887
+ // Display answer prominently in human-readable mode
888
+ const conf = (qa.confidence * 100).toFixed(0);
889
+ await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
890
+ if (qa.answer) {
891
+ await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
892
+ }
893
+ else {
894
+ await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
895
+ }
896
+ if (qa.passages && qa.passages.length > 1) {
897
+ await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
898
+ for (const p of qa.passages.slice(1, 4)) {
899
+ await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
900
+ }
901
+ }
902
+ await writeStdout('\n');
903
+ await cleanup();
904
+ process.exit(0);
905
+ }
906
+ }
907
+ // --- RAG Chunking output (chunks come from pipeline via peelOptions.chunk) ---
908
+ if (result.chunks && result.chunks.length > 0 && !isJson) {
909
+ console.log(`\n${'─'.repeat(60)}`);
910
+ console.log(`📦 ${result.chunks.length} chunks (${options.chunkStrategy || 'section'} strategy)\n`);
911
+ for (const chunk of result.chunks) {
912
+ const sectionLabel = chunk.section ? ` [${chunk.section}]` : '';
913
+ console.log(`── Chunk ${chunk.index + 1}${sectionLabel} (${chunk.tokenCount} tokens, ${chunk.wordCount} words) ──`);
914
+ console.log(chunk.text.substring(0, 200) + (chunk.text.length > 200 ? '...' : ''));
915
+ console.log('');
916
+ }
917
+ }
918
+ // --- #4: Content quality warning ---
919
+ const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
920
+ const isRedirect = false; // peel() follows redirects — final result is always 200
921
+ if (result.tokens < 20 && !useRender && isHtmlContent && !isRedirect) {
922
+ const warningMsg = `Low content detected (${result.tokens} tokens). Try: webpeel ${url} --render`;
923
+ if (isJson) {
924
+ result.warning = warningMsg;
925
+ }
926
+ else {
927
+ console.error(`⚠ ${warningMsg}`);
928
+ }
929
+ }
930
+ // --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
931
+ if (options._structuredSchema || options.extractPrompt) {
932
+ const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
933
+ const rawSchema = options._structuredSchema;
934
+ const schema = rawSchema
935
+ ? simpleToExtractionSchema(rawSchema)
936
+ : { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
937
+ const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
938
+ options.extractPrompt);
939
+ if (isJson) {
940
+ await writeStdout(JSON.stringify({
941
+ success: true,
942
+ data: strResult.data,
943
+ confidence: strResult.confidence,
944
+ method: 'heuristic',
945
+ }, null, 2) + '\n');
946
+ }
947
+ else {
948
+ await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
949
+ if (!options.silent) {
950
+ console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
951
+ }
952
+ }
953
+ await cleanup();
954
+ process.exit(0);
955
+ }
956
+ // --- LLM-based extraction (post-peel) ---
957
+ if (options.llmExtract || options.extractSchema) {
958
+ const { extractWithLLM } = await import('../../core/llm-extract.js');
959
+ const llmCfg = loadConfig();
960
+ const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
961
+ const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
962
+ const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
963
+ const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
964
+ // Parse --extract-schema if provided
965
+ let llmSchema;
966
+ if (options.extractSchema) {
967
+ let schemaStr = options.extractSchema;
968
+ if (schemaStr.startsWith('@')) {
969
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
970
+ }
971
+ try {
972
+ llmSchema = JSON.parse(schemaStr);
973
+ }
974
+ catch {
975
+ exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
976
+ }
977
+ }
978
+ const llmResult = await extractWithLLM({
979
+ content: result.content,
980
+ instruction: llmInstruction,
981
+ schema: llmSchema,
982
+ apiKey: llmApiKey,
983
+ model: llmModel,
984
+ baseUrl: llmBaseUrl,
985
+ });
986
+ // Output structured items as JSON
987
+ await writeStdout(JSON.stringify(llmResult.items, null, 2) + '\n');
988
+ // Show token usage and estimated cost
989
+ if (!options.silent) {
990
+ const { input, output } = llmResult.tokensUsed;
991
+ const costStr = llmResult.cost !== undefined
992
+ ? ` | Est. cost: $${llmResult.cost.toFixed(6)}`
993
+ : '';
994
+ console.error(`\n🤖 LLM extraction: ${llmResult.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResult.model}`);
995
+ }
996
+ await cleanup();
997
+ process.exit(0);
998
+ }
999
+ // --- Extract-all / pagination / output formatting ---
1000
+ const wantsExtractAll = options.extractAll || options.scrollExtract !== undefined;
1001
+ const pagesCount = Math.min(Math.max(options.pages || 1, 1), 10);
1002
+ if (wantsExtractAll) {
1003
+ const { extractListings } = await import('../../core/extract-listings.js');
1004
+ const { findNextPageUrl } = await import('../../core/paginate.js');
1005
+ const { findSchemaForUrl, extractWithSchema, loadBundledSchemas } = await import('../../core/schema-extraction.js');
1006
+ // Resolve which schema to use (explicit --schema flag or auto-detect)
1007
+ let activeSchema = null;
1008
+ if (options.schema) {
1009
+ // Find schema by name or domain match
1010
+ const schemaQuery = options.schema.toLowerCase();
1011
+ const allSchemas = loadBundledSchemas();
1012
+ activeSchema = allSchemas.find(s => s.name.toLowerCase().includes(schemaQuery) ||
1013
+ s.domains.some(d => d.toLowerCase().includes(schemaQuery))) ?? null;
1014
+ if (!activeSchema && !options.silent) {
1015
+ console.error(`Warning: No schema found for "${options.schema}", falling back to auto-detection`);
1016
+ }
1017
+ }
1018
+ else {
1019
+ // Auto-detect from URL
1020
+ activeSchema = findSchemaForUrl(result.url || url);
1021
+ }
1022
+ // We need the raw HTML for extraction. Re-fetch with format=html if needed.
1023
+ let allListings = [];
1024
+ // Fetch HTML for extraction
1025
+ const htmlResult = peelOptions.format === 'html'
1026
+ ? result
1027
+ : await peel(url, { ...peelOptions, format: 'html', maxTokens: undefined });
1028
+ // Try schema extraction first, fall back to generic
1029
+ if (activeSchema) {
1030
+ const schemaListings = extractWithSchema(htmlResult.content, activeSchema, result.url);
1031
+ if (schemaListings.length > 0) {
1032
+ allListings.push(...schemaListings);
1033
+ }
1034
+ else {
1035
+ // Schema returned nothing — fall back to generic
1036
+ allListings.push(...extractListings(htmlResult.content, result.url));
1037
+ }
1038
+ }
1039
+ else {
1040
+ allListings.push(...extractListings(htmlResult.content, result.url));
1041
+ }
1042
+ // Pagination: follow "Next" links
1043
+ if (pagesCount > 1) {
1044
+ let currentHtml = htmlResult.content;
1045
+ let currentUrl = result.url;
1046
+ for (let page = 1; page < pagesCount; page++) {
1047
+ const nextUrl = findNextPageUrl(currentHtml, currentUrl);
1048
+ if (!nextUrl)
1049
+ break;
1050
+ try {
1051
+ const nextResult = await peel(nextUrl, { ...peelOptions, format: 'html', maxTokens: undefined });
1052
+ let pageListings;
1053
+ if (activeSchema) {
1054
+ const schemaPage = extractWithSchema(nextResult.content, activeSchema, nextResult.url);
1055
+ pageListings = schemaPage.length > 0
1056
+ ? schemaPage
1057
+ : extractListings(nextResult.content, nextResult.url);
1058
+ }
1059
+ else {
1060
+ pageListings = extractListings(nextResult.content, nextResult.url);
1061
+ }
1062
+ allListings.push(...pageListings);
1063
+ currentHtml = nextResult.content;
1064
+ currentUrl = nextResult.url;
1065
+ }
1066
+ catch {
1067
+ break; // Stop paginating on error
1068
+ }
1069
+ }
1070
+ }
1071
+ // Apply budget to listings if requested
1072
+ let listingsTruncated = false;
1073
+ let totalAvailableListings;
1074
+ if (options.budget && options.budget > 0 && allListings.length > 0) {
1075
+ const { maxItems, truncated, totalAvailable } = budgetListings(allListings.length, options.budget);
1076
+ if (truncated) {
1077
+ listingsTruncated = true;
1078
+ totalAvailableListings = totalAvailable;
1079
+ allListings = allListings.slice(0, maxItems);
1080
+ }
1081
+ }
1082
+ // Output based on format flags
1083
+ if (options.csv) {
1084
+ const csvOutput = formatListingsCsv(allListings);
1085
+ await writeStdout(csvOutput);
1086
+ }
1087
+ else if (options.table) {
1088
+ const { formatTable } = await import('../../core/table-format.js');
1089
+ const tableRows = allListings.map(item => {
1090
+ const row = {};
1091
+ for (const [k, v] of Object.entries(item)) {
1092
+ if (v !== undefined)
1093
+ row[k] = v;
1094
+ }
1095
+ return row;
1096
+ });
1097
+ await writeStdout(formatTable(tableRows) + '\n');
1098
+ }
1099
+ else if (isJson) {
1100
+ // Use unified envelope for JSON output
1101
+ const structured = allListings;
1102
+ const envelope = buildEnvelope(result, {
1103
+ cached: false,
1104
+ structured,
1105
+ truncated: listingsTruncated || undefined,
1106
+ totalAvailable: totalAvailableListings,
1107
+ });
1108
+ // Also include legacy fields for backward compat
1109
+ envelope.listings = allListings;
1110
+ envelope.count = allListings.length;
1111
+ await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
1112
+ }
1113
+ else {
1114
+ // Formatted text output
1115
+ if (allListings.length === 0) {
1116
+ await writeStdout('No listings found.\n');
1117
+ }
1118
+ else {
1119
+ const truncNote = listingsTruncated && totalAvailableListings
1120
+ ? ` (${totalAvailableListings} total — budget limited to ${allListings.length})`
1121
+ : '';
1122
+ await writeStdout(`Found ${allListings.length} listings${truncNote}:\n\n`);
1123
+ allListings.forEach((item, i) => {
1124
+ const pricePart = item.price ? ` — ${item.price}` : '';
1125
+ const line = `${i + 1}. ${item.title}${pricePart}\n`;
1126
+ process.stdout.write(line);
1127
+ if (item.link) {
1128
+ process.stdout.write(` ${item.link}\n`);
1129
+ }
1130
+ process.stdout.write('\n');
1131
+ });
1132
+ }
1133
+ }
1134
+ }
1135
+ else if (options.csv || options.table) {
1136
+ // CSV / table output for --extract (CSS selector extraction)
1137
+ if (result.extracted) {
1138
+ const rows = normaliseExtractedToRows(result.extracted);
1139
+ if (options.csv) {
1140
+ await writeStdout(formatListingsCsv(rows));
1141
+ }
1142
+ else {
1143
+ const { formatTable } = await import('../../core/table-format.js');
1144
+ await writeStdout(formatTable(rows) + '\n');
1145
+ }
1146
+ }
1147
+ else {
1148
+ console.error('--csv / --table require --extract-all or --extract to produce structured data.');
1149
+ }
1150
+ }
1151
+ else {
1152
+ // --- BM25 Schema Template Extraction (no LLM needed) ---
1153
+ if (options.schema && result.content) {
1154
+ const { getSchemaTemplate: getSchTmpl } = await import('../../core/schema-templates.js');
1155
+ const schTemplate = getSchTmpl(options.schema);
1156
+ if (schTemplate) {
1157
+ const { quickAnswer: qa } = await import('../../core/quick-answer.js');
1158
+ const { smartExtractSchemaFields } = await import('../../core/schema-postprocess.js');
1159
+ const extracted = smartExtractSchemaFields(result.content, schTemplate.fields, qa, {
1160
+ pageTitle: result.title,
1161
+ pageUrl: result.url,
1162
+ metadata: result.metadata,
1163
+ });
1164
+ result.extracted = extracted;
1165
+ }
1166
+ }
1167
+ // --content-only: output raw content only, no wrapper
1168
+ if (options.contentOnly) {
1169
+ await writeStdout(result.content + '\n');
1170
+ }
1171
+ else {
1172
+ // Output results (default path)
1173
+ await outputResult(result, options, {
1174
+ cached: false,
1175
+ truncated: contentTruncated || undefined,
1176
+ });
1177
+ // Token savings display (our unique selling point)
1178
+ if (!options.json && !options.silent && result.tokenSavingsPercent) {
1179
+ const savings = result.tokenSavingsPercent;
1180
+ const raw = result.rawTokenEstimate;
1181
+ const optimized = result.tokens || 0;
1182
+ if (savings > 0) {
1183
+ const rawStr = raw ? `${raw.toLocaleString()}→${optimized.toLocaleString()} tokens` : `${optimized.toLocaleString()} tokens`;
1184
+ process.stderr.write(`\x1b[32m💰 Token savings: ${savings}% smaller than raw HTML (${rawStr})\x1b[0m\n`);
1185
+ }
1186
+ }
1187
+ }
1188
+ }
1189
+ // Clean up and exit
1190
+ await cleanup();
1191
+ process.exit(0);
1192
+ }
1193
+ catch (error) {
1194
+ if (autoProgressInterval)
1195
+ clearInterval(autoProgressInterval);
1196
+ if (spinner) {
1197
+ spinner.fail('Failed to fetch');
1198
+ }
1199
+ // --- #6: Consistent JSON error output ---
1200
+ if (isJson) {
1201
+ const errMsg = error instanceof Error ? error.message : 'Unknown error';
1202
+ const errCode = classifyErrorCode(error);
1203
+ await writeStdout(JSON.stringify({ success: false, error: { type: errCode.toLowerCase(), message: errMsg } }) + '\n');
1204
+ await cleanup();
1205
+ process.exit(1);
1206
+ }
1207
+ if (error instanceof Error) {
1208
+ console.error('\n' + formatError(error, url || '', options));
1209
+ }
1210
+ else {
1211
+ console.error('\x1b[31m✖ Unknown error occurred\x1b[0m');
1212
+ }
1213
+ await cleanup();
1214
+ process.exit(1);
1215
+ }
1216
+ }
1217
+ // ─── registerFetchCommands ───────────────────────────────────────────────────
1218
/**
 * Register the fetch-related commands on a Commander program:
 *   - the default command (`webpeel [url]`) with the full option surface,
 *   - `read <url>`  — reader mode with a token budget,
 *   - `pipe <url>`  — always-JSON, always-silent variant.
 *
 * @param {object} program - Commander program (or command) to attach to.
 * @returns {void}
 */
export function registerFetchCommands(program) {
    // Commander invokes option parsers as (value, previous). Passing the global
    // parseInt directly makes `previous` the radix, so a repeated flag
    // (e.g. `--wait 1000 --wait 2000`) evaluates parseInt('2000', 1000) → NaN.
    // Always parse through this one-argument base-10 wrapper instead.
    const parseBase10Int = (value) => Number.parseInt(value, 10);
    // Token budget applied by `read` when the user does not supply a usable one.
    const defaultReadBudget = 4000;
    // ── Default command: fetch a URL ─────────────────────────────────────────
    program
        .argument('[url]', 'URL to fetch')
        .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
        .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
        .option('--cloaked', 'Use CloakBrowser stealth (requires: npm install cloakbrowser)')
        .option('--tls', 'Use PeelTLS TLS fingerprint spoofing (built-in, no install needed)')
        .option('--cycle', 'Use PeelTLS TLS fingerprint spoofing (alias for --tls)', false)
        .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
        .option('--proxies <urls>', 'Comma-separated list of proxy URLs for rotation (tried in order on failure)', (val) => val.split(',').map((s) => s.trim()).filter(Boolean))
        .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseBase10Int)
        .option('--html', 'Output raw HTML instead of markdown')
        .option('--text', 'Output plain text instead of markdown')
        .option('--clean', 'Clean output — article content only, no links or metadata (alias for --readable with URL-stripped markdown)')
        .option('--json', 'Output as JSON')
        .option('-t, --timeout <ms>', 'Request timeout (ms)', parseBase10Int, 30000)
        .option('--ua <agent>', 'Custom user agent')
        .option('-s, --silent', 'Silent mode (no spinner)')
        .option('--screenshot [path]', 'Take a screenshot (optionally save to file path)')
        .option('--full-page', 'Full-page screenshot (use with --screenshot)')
        .option('--selector <css>', 'CSS selector to extract (e.g., "article", ".content")')
        .option('--exclude <selectors...>', 'CSS selectors to exclude (e.g., ".sidebar" ".ads")')
        .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
        .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
        .option('--only-main-content', 'Shortcut for --include-tags main,article')
        .option('--full-content', 'Return full page content (disable automatic content density pruning)')
        .option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
        .option('--full-nav', 'Keep full navigation/content (disable auto-readability when piped or in agent mode)')
        .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
        .option('--chunk', 'Split content into RAG-ready chunks')
        .option('--chunk-size <tokens>', 'Max tokens per chunk (default: 512)', parseBase10Int)
        .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 50)', parseBase10Int)
        .option('--chunk-strategy <strategy>', 'Chunking strategy: section (default), paragraph, fixed')
        .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
        .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
        .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
        .option('--no-cache', 'Disable automatic caching for this request')
        .option('--links', 'Output only the links found on the page')
        .option('--images', 'Output image URLs from the page')
        .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
        .option('--raw', 'Return full page without smart content extraction')
        .option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
        .option('--full', 'Alias for --raw — full page content, no budget')
        .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
        .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
        .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
        .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
        .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
        .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
        .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
        .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
        .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
        .option('--summary', 'Generate AI summary of content (requires --llm-key or OPENAI_API_KEY)')
        .option('--location <country>', 'ISO country code for geo-targeting (e.g., "US", "DE", "JP")')
        .option('--language <lang>', 'Language preference (e.g., "en", "de", "ja")')
        .option('--max-tokens <n>', 'Maximum token count for output (truncate if exceeded)', parseBase10Int)
        .option('--budget <n>', 'Smart token budget — distill content to fit within N tokens (heuristic, no LLM key needed)', parseBase10Int)
        .option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
        .option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
        .option('--list-schemas', 'List all available extraction schemas and their supported domains')
        .option('--scroll-extract [count]', 'Scroll page N times to load lazy content (bare flag = smart auto-scroll until stable), then extract (implies --render)', parseBase10Int)
        .option('--scroll-extract-timeout <ms>', 'Total timeout in ms for auto-scroll (default: 30000, only used with bare --scroll-extract)', parseBase10Int)
        .option('--csv', 'Output extraction results as CSV')
        .option('--table', 'Output extraction results as a formatted table')
        .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', parseBase10Int)
        .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
        .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
        .option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
        .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
        .option('--device <type>', 'Device emulation: desktop (default), mobile, tablet (auto-enables --render)')
        .option('--viewport <WxH>', 'Browser viewport size (e.g., "1920x1080") (auto-enables --render)', (val) => {
            // "1920x1080" → { width: 1920, height: 1080 }; malformed parts become NaN.
            const [w, h] = val.split('x').map(Number);
            return { width: w, height: h };
        })
        .option('--scale <factor>', 'Device scale factor (pixel density) for screenshots (default: auto from device profile)', parseFloat)
        .option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
        .option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
        .option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
        .option('--format <type>', 'Output format: markdown (default), text, html, json')
        .option('--content-only', 'Output only the raw content field (no metadata, no JSON wrapper) — ideal for piping to LLMs')
        .option('--progress', 'Show engine escalation steps (simple → browser → stealth) with timing')
        .option('--stdin', 'Read HTML from stdin instead of fetching a URL — converts to markdown')
        .option('--export <format>', 'Export YouTube transcript in the given format: srt, txt, md, json')
        .option('--output <file>', 'Write output to a file instead of stdout')
        .action(async (url, options) => {
            // --stdin converts piped HTML and never touches the network path.
            if (options.stdin) {
                await runStdin(options);
                return;
            }
            await runFetch(url, options);
        });
    // ── read subcommand (explicit readable mode) ─────────────────────────────
    program
        .command('read <url>')
        .description('Read a page in clean reader mode (like browser Reader View)')
        .option('--json', 'Output as JSON')
        .option('-s, --silent', 'Silent mode')
        .option('--budget <n>', 'Token budget (default: 4000)', parseBase10Int)
        .option('--focus <query>', 'Focus on content relevant to this query')
        .option('--highlight-query <query>', 'Extract only passages relevant to this query (BM25-powered)')
        .option('--highlight-max-chars <n>', 'Max characters for highlights (default: 1000)', parseBase10Int)
        .action(async (url, opts) => {
            await runFetch(url, {
                ...opts,
                readable: true,
                // Honour an explicit --budget; fall back to the documented default
                // only when none was given (or it did not parse to a number).
                budget: Number.isFinite(opts.budget) ? opts.budget : defaultReadBudget,
            });
        });
    // ── pipe subcommand — always JSON, no UI (agent-friendly) ────────────────
    program
        .command('pipe <url>')
        .description('Pipe-friendly fetch (always JSON, no UI). Alias for: webpeel <url> --json --silent')
        .option('-r, --render', 'Use headless browser')
        .option('--stealth', 'Stealth mode')
        .option('--budget <n>', 'Token budget', parseBase10Int)
        .option('--clean', 'Clean format for AI')
        .option('-q, --question <q>', 'Quick answer')
        .option('--proxy <url>', 'Proxy URL')
        .option('--timeout <ms>', 'Timeout in ms', parseBase10Int)
        .option('-s, --silent', 'Silent mode (always on for pipe, accepted for compatibility)')
        .action(async (url, opts) => {
            // Force JSON + silent — always, unconditionally. Pass a copy so the
            // options object owned by Commander is left untouched.
            await runFetch(url, { ...opts, json: true, silent: true });
        });
}