webpeel 0.19.4 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (544)
  1. package/README.md +2 -2
  2. package/dist/cache.d.ts +0 -1
  3. package/dist/cache.js +0 -1
  4. package/dist/cli/commands/auth.d.ts +5 -0
  5. package/dist/cli/commands/auth.js +476 -0
  6. package/dist/cli/commands/fetch.d.ts +6 -0
  7. package/dist/cli/commands/fetch.js +1015 -0
  8. package/dist/cli/commands/interact.d.ts +5 -0
  9. package/dist/cli/commands/interact.js +839 -0
  10. package/dist/cli/commands/jobs.d.ts +5 -0
  11. package/dist/cli/commands/jobs.js +997 -0
  12. package/dist/cli/commands/screenshot.d.ts +5 -0
  13. package/dist/cli/commands/screenshot.js +273 -0
  14. package/dist/cli/commands/search.d.ts +5 -0
  15. package/dist/cli/commands/search.js +524 -0
  16. package/dist/cli/utils.d.ts +84 -0
  17. package/dist/cli/utils.js +686 -0
  18. package/dist/cli-auth.d.ts +0 -1
  19. package/dist/cli-auth.js +0 -1
  20. package/dist/cli.d.ts +7 -6
  21. package/dist/cli.js +35 -4698
  22. package/dist/core/actions.d.ts +0 -1
  23. package/dist/core/actions.js +0 -1
  24. package/dist/core/agent.d.ts +0 -1
  25. package/dist/core/agent.js +9 -12
  26. package/dist/core/answer.d.ts +0 -1
  27. package/dist/core/answer.js +0 -1
  28. package/dist/core/application-tracker.d.ts +0 -1
  29. package/dist/core/application-tracker.js +0 -1
  30. package/dist/core/apply.d.ts +0 -1
  31. package/dist/core/apply.js +0 -1
  32. package/dist/core/auto-extract.d.ts +0 -1
  33. package/dist/core/auto-extract.js +0 -1
  34. package/dist/core/auto-interact.d.ts +0 -1
  35. package/dist/core/auto-interact.js +0 -1
  36. package/dist/core/bm25-filter.d.ts +0 -1
  37. package/dist/core/bm25-filter.js +0 -1
  38. package/dist/core/branding.d.ts +0 -1
  39. package/dist/core/branding.js +0 -1
  40. package/dist/core/browser-fetch.d.ts +0 -1
  41. package/dist/core/browser-fetch.js +17 -10
  42. package/dist/core/browser-pool.d.ts +0 -1
  43. package/dist/core/browser-pool.js +0 -1
  44. package/dist/core/budget.d.ts +0 -1
  45. package/dist/core/budget.js +0 -1
  46. package/dist/core/cache.d.ts +0 -1
  47. package/dist/core/cache.js +0 -1
  48. package/dist/core/cf-worker-proxy.d.ts +0 -1
  49. package/dist/core/cf-worker-proxy.js +0 -1
  50. package/dist/core/challenge-detection.d.ts +0 -1
  51. package/dist/core/challenge-detection.js +0 -1
  52. package/dist/core/change-tracking.d.ts +0 -1
  53. package/dist/core/change-tracking.js +0 -1
  54. package/dist/core/chunker.d.ts +0 -1
  55. package/dist/core/chunker.js +0 -1
  56. package/dist/core/chunking.d.ts +0 -1
  57. package/dist/core/chunking.js +0 -1
  58. package/dist/core/cloak-fetch.d.ts +0 -1
  59. package/dist/core/cloak-fetch.js +0 -1
  60. package/dist/core/content-pruner.d.ts +0 -1
  61. package/dist/core/content-pruner.js +0 -1
  62. package/dist/core/crawl-checkpoint.d.ts +0 -1
  63. package/dist/core/crawl-checkpoint.js +0 -1
  64. package/dist/core/crawler.d.ts +0 -1
  65. package/dist/core/crawler.js +6 -5
  66. package/dist/core/cycle-fetch.d.ts +0 -1
  67. package/dist/core/cycle-fetch.js +0 -1
  68. package/dist/core/deep-fetch.d.ts +0 -1
  69. package/dist/core/deep-fetch.js +0 -1
  70. package/dist/core/design-analysis.d.ts +0 -1
  71. package/dist/core/design-analysis.js +0 -1
  72. package/dist/core/design-compare.d.ts +0 -1
  73. package/dist/core/design-compare.js +0 -1
  74. package/dist/core/diff.d.ts +0 -1
  75. package/dist/core/diff.js +0 -1
  76. package/dist/core/dns-cache.d.ts +0 -1
  77. package/dist/core/dns-cache.js +0 -1
  78. package/dist/core/documents.d.ts +0 -1
  79. package/dist/core/documents.js +0 -1
  80. package/dist/core/domain-extractors.d.ts +0 -1
  81. package/dist/core/domain-extractors.js +0 -1
  82. package/dist/core/extract-inline.d.ts +0 -1
  83. package/dist/core/extract-inline.js +0 -1
  84. package/dist/core/extract-listings.d.ts +0 -1
  85. package/dist/core/extract-listings.js +0 -1
  86. package/dist/core/extract.d.ts +0 -1
  87. package/dist/core/extract.js +0 -1
  88. package/dist/core/fetcher.d.ts +0 -1
  89. package/dist/core/fetcher.js +0 -1
  90. package/dist/core/google-cache.d.ts +0 -1
  91. package/dist/core/google-cache.js +0 -1
  92. package/dist/core/hotel-search.d.ts +0 -1
  93. package/dist/core/hotel-search.js +0 -1
  94. package/dist/core/http-fetch.d.ts +0 -1
  95. package/dist/core/http-fetch.js +5 -7
  96. package/dist/core/human.d.ts +0 -1
  97. package/dist/core/human.js +0 -1
  98. package/dist/core/jobs.d.ts +0 -1
  99. package/dist/core/jobs.js +0 -1
  100. package/dist/core/json-ld.d.ts +0 -1
  101. package/dist/core/json-ld.js +0 -1
  102. package/dist/core/llm-extract.d.ts +0 -1
  103. package/dist/core/llm-extract.js +0 -1
  104. package/dist/core/logger.d.ts +17 -0
  105. package/dist/core/logger.js +44 -0
  106. package/dist/core/map.d.ts +0 -1
  107. package/dist/core/map.js +0 -1
  108. package/dist/core/markdown.d.ts +0 -1
  109. package/dist/core/markdown.js +0 -1
  110. package/dist/core/metadata.d.ts +0 -1
  111. package/dist/core/metadata.js +0 -1
  112. package/dist/core/paginate.d.ts +0 -1
  113. package/dist/core/paginate.js +0 -1
  114. package/dist/core/pdf.d.ts +0 -1
  115. package/dist/core/pdf.js +0 -1
  116. package/dist/core/peel-tls.d.ts +0 -1
  117. package/dist/core/peel-tls.js +0 -1
  118. package/dist/core/pipeline.d.ts +0 -1
  119. package/dist/core/pipeline.js +22 -25
  120. package/dist/core/profiles.d.ts +0 -1
  121. package/dist/core/profiles.js +0 -1
  122. package/dist/core/quick-answer.d.ts +0 -1
  123. package/dist/core/quick-answer.js +0 -1
  124. package/dist/core/rate-governor.d.ts +0 -1
  125. package/dist/core/rate-governor.js +0 -1
  126. package/dist/core/readability.d.ts +0 -1
  127. package/dist/core/readability.js +0 -1
  128. package/dist/core/research.d.ts +0 -1
  129. package/dist/core/research.js +0 -1
  130. package/dist/core/schema-extraction.d.ts +0 -1
  131. package/dist/core/schema-extraction.js +0 -1
  132. package/dist/core/schema-postprocess.d.ts +0 -1
  133. package/dist/core/schema-postprocess.js +0 -1
  134. package/dist/core/schema-templates.d.ts +0 -1
  135. package/dist/core/schema-templates.js +0 -1
  136. package/dist/core/screenshot.d.ts +0 -1
  137. package/dist/core/screenshot.js +0 -1
  138. package/dist/core/search-fallback.d.ts +0 -1
  139. package/dist/core/search-fallback.js +0 -1
  140. package/dist/core/search-provider.d.ts +0 -1
  141. package/dist/core/search-provider.js +18 -21
  142. package/dist/core/site-search.d.ts +0 -1
  143. package/dist/core/site-search.js +0 -1
  144. package/dist/core/sitemap.d.ts +0 -1
  145. package/dist/core/sitemap.js +0 -1
  146. package/dist/core/stealth-patches.d.ts +0 -1
  147. package/dist/core/stealth-patches.js +0 -1
  148. package/dist/core/stemmer.d.ts +0 -1
  149. package/dist/core/stemmer.js +0 -1
  150. package/dist/core/strategies.d.ts +6 -1
  151. package/dist/core/strategies.js +29 -41
  152. package/dist/core/strategy-hooks.d.ts +0 -1
  153. package/dist/core/strategy-hooks.js +0 -1
  154. package/dist/core/summarize.d.ts +0 -1
  155. package/dist/core/summarize.js +0 -1
  156. package/dist/core/synonyms.d.ts +0 -1
  157. package/dist/core/synonyms.js +0 -1
  158. package/dist/core/table-format.d.ts +0 -1
  159. package/dist/core/table-format.js +0 -1
  160. package/dist/core/timing.d.ts +0 -1
  161. package/dist/core/timing.js +0 -1
  162. package/dist/core/user-agents.d.ts +0 -1
  163. package/dist/core/user-agents.js +0 -1
  164. package/dist/core/watch-manager.d.ts +0 -1
  165. package/dist/core/watch-manager.js +0 -1
  166. package/dist/core/watch.d.ts +0 -1
  167. package/dist/core/watch.js +0 -1
  168. package/dist/core/youtube.d.ts +0 -1
  169. package/dist/core/youtube.js +0 -1
  170. package/dist/index.d.ts +8 -3
  171. package/dist/index.js +27 -3
  172. package/dist/integrations/index.d.ts +0 -1
  173. package/dist/integrations/index.js +0 -1
  174. package/dist/integrations/langchain.d.ts +0 -1
  175. package/dist/integrations/langchain.js +0 -1
  176. package/dist/integrations/llamaindex.d.ts +0 -1
  177. package/dist/integrations/llamaindex.js +0 -1
  178. package/dist/mcp/handlers/act.d.ts +5 -0
  179. package/dist/mcp/handlers/act.js +34 -0
  180. package/dist/mcp/handlers/definitions.d.ts +6 -0
  181. package/dist/mcp/handlers/definitions.js +266 -0
  182. package/dist/mcp/handlers/extract.d.ts +6 -0
  183. package/dist/mcp/handlers/extract.js +102 -0
  184. package/dist/mcp/handlers/fetch.d.ts +6 -0
  185. package/dist/mcp/handlers/fetch.js +98 -0
  186. package/dist/mcp/handlers/find.d.ts +5 -0
  187. package/dist/mcp/handlers/find.js +137 -0
  188. package/dist/mcp/handlers/index.d.ts +13 -0
  189. package/dist/mcp/handlers/index.js +61 -0
  190. package/dist/mcp/handlers/legacy.d.ts +25 -0
  191. package/dist/mcp/handlers/legacy.js +450 -0
  192. package/dist/mcp/handlers/meta.d.ts +6 -0
  193. package/dist/mcp/handlers/meta.js +31 -0
  194. package/dist/mcp/handlers/monitor.d.ts +5 -0
  195. package/dist/mcp/handlers/monitor.js +41 -0
  196. package/dist/mcp/handlers/read.d.ts +6 -0
  197. package/dist/mcp/handlers/read.js +63 -0
  198. package/dist/mcp/handlers/see.d.ts +5 -0
  199. package/dist/mcp/handlers/see.js +75 -0
  200. package/dist/mcp/handlers/types.d.ts +29 -0
  201. package/dist/mcp/handlers/types.js +28 -0
  202. package/dist/mcp/server.d.ts +3 -4
  203. package/dist/mcp/server.js +35 -1101
  204. package/dist/mcp/smart-router.d.ts +0 -1
  205. package/dist/mcp/smart-router.js +3 -1
  206. package/dist/types.d.ts +6 -1
  207. package/dist/types.js +0 -1
  208. package/package.json +3 -13
  209. package/dist/cache.d.ts.map +0 -1
  210. package/dist/cache.js.map +0 -1
  211. package/dist/cli-auth.d.ts.map +0 -1
  212. package/dist/cli-auth.js.map +0 -1
  213. package/dist/cli.bundle.cjs +0 -159248
  214. package/dist/cli.d.ts.map +0 -1
  215. package/dist/cli.js.map +0 -1
  216. package/dist/core/actions.d.ts.map +0 -1
  217. package/dist/core/actions.js.map +0 -1
  218. package/dist/core/agent.d.ts.map +0 -1
  219. package/dist/core/agent.js.map +0 -1
  220. package/dist/core/answer.d.ts.map +0 -1
  221. package/dist/core/answer.js.map +0 -1
  222. package/dist/core/application-tracker.d.ts.map +0 -1
  223. package/dist/core/application-tracker.js.map +0 -1
  224. package/dist/core/apply.d.ts.map +0 -1
  225. package/dist/core/apply.js.map +0 -1
  226. package/dist/core/auto-extract.d.ts.map +0 -1
  227. package/dist/core/auto-extract.js.map +0 -1
  228. package/dist/core/auto-interact.d.ts.map +0 -1
  229. package/dist/core/auto-interact.js.map +0 -1
  230. package/dist/core/bm25-filter.d.ts.map +0 -1
  231. package/dist/core/bm25-filter.js.map +0 -1
  232. package/dist/core/branding.d.ts.map +0 -1
  233. package/dist/core/branding.js.map +0 -1
  234. package/dist/core/browser-fetch.d.ts.map +0 -1
  235. package/dist/core/browser-fetch.js.map +0 -1
  236. package/dist/core/browser-pool.d.ts.map +0 -1
  237. package/dist/core/browser-pool.js.map +0 -1
  238. package/dist/core/budget.d.ts.map +0 -1
  239. package/dist/core/budget.js.map +0 -1
  240. package/dist/core/cache.d.ts.map +0 -1
  241. package/dist/core/cache.js.map +0 -1
  242. package/dist/core/cf-worker-proxy.d.ts.map +0 -1
  243. package/dist/core/cf-worker-proxy.js.map +0 -1
  244. package/dist/core/challenge-detection.d.ts.map +0 -1
  245. package/dist/core/challenge-detection.js.map +0 -1
  246. package/dist/core/change-tracking.d.ts.map +0 -1
  247. package/dist/core/change-tracking.js.map +0 -1
  248. package/dist/core/chunker.d.ts.map +0 -1
  249. package/dist/core/chunker.js.map +0 -1
  250. package/dist/core/chunking.d.ts.map +0 -1
  251. package/dist/core/chunking.js.map +0 -1
  252. package/dist/core/cloak-fetch.d.ts.map +0 -1
  253. package/dist/core/cloak-fetch.js.map +0 -1
  254. package/dist/core/content-pruner.d.ts.map +0 -1
  255. package/dist/core/content-pruner.js.map +0 -1
  256. package/dist/core/crawl-checkpoint.d.ts.map +0 -1
  257. package/dist/core/crawl-checkpoint.js.map +0 -1
  258. package/dist/core/crawler.d.ts.map +0 -1
  259. package/dist/core/crawler.js.map +0 -1
  260. package/dist/core/cycle-fetch.d.ts.map +0 -1
  261. package/dist/core/cycle-fetch.js.map +0 -1
  262. package/dist/core/deep-fetch.d.ts.map +0 -1
  263. package/dist/core/deep-fetch.js.map +0 -1
  264. package/dist/core/design-analysis.d.ts.map +0 -1
  265. package/dist/core/design-analysis.js.map +0 -1
  266. package/dist/core/design-compare.d.ts.map +0 -1
  267. package/dist/core/design-compare.js.map +0 -1
  268. package/dist/core/diff.d.ts.map +0 -1
  269. package/dist/core/diff.js.map +0 -1
  270. package/dist/core/dns-cache.d.ts.map +0 -1
  271. package/dist/core/dns-cache.js.map +0 -1
  272. package/dist/core/documents.d.ts.map +0 -1
  273. package/dist/core/documents.js.map +0 -1
  274. package/dist/core/domain-extractors.d.ts.map +0 -1
  275. package/dist/core/domain-extractors.js.map +0 -1
  276. package/dist/core/extract-inline.d.ts.map +0 -1
  277. package/dist/core/extract-inline.js.map +0 -1
  278. package/dist/core/extract-listings.d.ts.map +0 -1
  279. package/dist/core/extract-listings.js.map +0 -1
  280. package/dist/core/extract.d.ts.map +0 -1
  281. package/dist/core/extract.js.map +0 -1
  282. package/dist/core/fetcher.d.ts.map +0 -1
  283. package/dist/core/fetcher.js.map +0 -1
  284. package/dist/core/google-cache.d.ts.map +0 -1
  285. package/dist/core/google-cache.js.map +0 -1
  286. package/dist/core/hotel-search.d.ts.map +0 -1
  287. package/dist/core/hotel-search.js.map +0 -1
  288. package/dist/core/http-fetch.d.ts.map +0 -1
  289. package/dist/core/http-fetch.js.map +0 -1
  290. package/dist/core/human.d.ts.map +0 -1
  291. package/dist/core/human.js.map +0 -1
  292. package/dist/core/jobs.d.ts.map +0 -1
  293. package/dist/core/jobs.js.map +0 -1
  294. package/dist/core/json-ld.d.ts.map +0 -1
  295. package/dist/core/json-ld.js.map +0 -1
  296. package/dist/core/llm-extract.d.ts.map +0 -1
  297. package/dist/core/llm-extract.js.map +0 -1
  298. package/dist/core/map.d.ts.map +0 -1
  299. package/dist/core/map.js.map +0 -1
  300. package/dist/core/markdown.d.ts.map +0 -1
  301. package/dist/core/markdown.js.map +0 -1
  302. package/dist/core/metadata.d.ts.map +0 -1
  303. package/dist/core/metadata.js.map +0 -1
  304. package/dist/core/paginate.d.ts.map +0 -1
  305. package/dist/core/paginate.js.map +0 -1
  306. package/dist/core/pdf.d.ts.map +0 -1
  307. package/dist/core/pdf.js.map +0 -1
  308. package/dist/core/peel-tls.d.ts.map +0 -1
  309. package/dist/core/peel-tls.js.map +0 -1
  310. package/dist/core/pipeline.d.ts.map +0 -1
  311. package/dist/core/pipeline.js.map +0 -1
  312. package/dist/core/profiles.d.ts.map +0 -1
  313. package/dist/core/profiles.js.map +0 -1
  314. package/dist/core/quick-answer.d.ts.map +0 -1
  315. package/dist/core/quick-answer.js.map +0 -1
  316. package/dist/core/rate-governor.d.ts.map +0 -1
  317. package/dist/core/rate-governor.js.map +0 -1
  318. package/dist/core/readability.d.ts.map +0 -1
  319. package/dist/core/readability.js.map +0 -1
  320. package/dist/core/research.d.ts.map +0 -1
  321. package/dist/core/research.js.map +0 -1
  322. package/dist/core/schema-extraction.d.ts.map +0 -1
  323. package/dist/core/schema-extraction.js.map +0 -1
  324. package/dist/core/schema-postprocess.d.ts.map +0 -1
  325. package/dist/core/schema-postprocess.js.map +0 -1
  326. package/dist/core/schema-templates.d.ts.map +0 -1
  327. package/dist/core/schema-templates.js.map +0 -1
  328. package/dist/core/screenshot.d.ts.map +0 -1
  329. package/dist/core/screenshot.js.map +0 -1
  330. package/dist/core/search-fallback.d.ts.map +0 -1
  331. package/dist/core/search-fallback.js.map +0 -1
  332. package/dist/core/search-provider.d.ts.map +0 -1
  333. package/dist/core/search-provider.js.map +0 -1
  334. package/dist/core/site-search.d.ts.map +0 -1
  335. package/dist/core/site-search.js.map +0 -1
  336. package/dist/core/sitemap.d.ts.map +0 -1
  337. package/dist/core/sitemap.js.map +0 -1
  338. package/dist/core/stealth-patches.d.ts.map +0 -1
  339. package/dist/core/stealth-patches.js.map +0 -1
  340. package/dist/core/stemmer.d.ts.map +0 -1
  341. package/dist/core/stemmer.js.map +0 -1
  342. package/dist/core/strategies.d.ts.map +0 -1
  343. package/dist/core/strategies.js.map +0 -1
  344. package/dist/core/strategy-hooks.d.ts.map +0 -1
  345. package/dist/core/strategy-hooks.js.map +0 -1
  346. package/dist/core/summarize.d.ts.map +0 -1
  347. package/dist/core/summarize.js.map +0 -1
  348. package/dist/core/synonyms.d.ts.map +0 -1
  349. package/dist/core/synonyms.js.map +0 -1
  350. package/dist/core/table-format.d.ts.map +0 -1
  351. package/dist/core/table-format.js.map +0 -1
  352. package/dist/core/timing.d.ts.map +0 -1
  353. package/dist/core/timing.js.map +0 -1
  354. package/dist/core/user-agents.d.ts.map +0 -1
  355. package/dist/core/user-agents.js.map +0 -1
  356. package/dist/core/watch-manager.d.ts.map +0 -1
  357. package/dist/core/watch-manager.js.map +0 -1
  358. package/dist/core/watch.d.ts.map +0 -1
  359. package/dist/core/watch.js.map +0 -1
  360. package/dist/core/youtube.d.ts.map +0 -1
  361. package/dist/core/youtube.js.map +0 -1
  362. package/dist/index.d.ts.map +0 -1
  363. package/dist/index.js.map +0 -1
  364. package/dist/integrations/index.d.ts.map +0 -1
  365. package/dist/integrations/index.js.map +0 -1
  366. package/dist/integrations/langchain.d.ts.map +0 -1
  367. package/dist/integrations/langchain.js.map +0 -1
  368. package/dist/integrations/llamaindex.d.ts.map +0 -1
  369. package/dist/integrations/llamaindex.js.map +0 -1
  370. package/dist/mcp/server.d.ts.map +0 -1
  371. package/dist/mcp/server.js.map +0 -1
  372. package/dist/mcp/smart-router.d.ts.map +0 -1
  373. package/dist/mcp/smart-router.js.map +0 -1
  374. package/dist/server/app.d.ts +0 -15
  375. package/dist/server/app.d.ts.map +0 -1
  376. package/dist/server/app.js +0 -350
  377. package/dist/server/app.js.map +0 -1
  378. package/dist/server/auth-store.d.ts +0 -28
  379. package/dist/server/auth-store.d.ts.map +0 -1
  380. package/dist/server/auth-store.js +0 -89
  381. package/dist/server/auth-store.js.map +0 -1
  382. package/dist/server/email-service.d.ts +0 -22
  383. package/dist/server/email-service.d.ts.map +0 -1
  384. package/dist/server/email-service.js +0 -80
  385. package/dist/server/email-service.js.map +0 -1
  386. package/dist/server/job-queue.d.ts +0 -93
  387. package/dist/server/job-queue.d.ts.map +0 -1
  388. package/dist/server/job-queue.js +0 -146
  389. package/dist/server/job-queue.js.map +0 -1
  390. package/dist/server/logger.d.ts +0 -11
  391. package/dist/server/logger.d.ts.map +0 -1
  392. package/dist/server/logger.js +0 -38
  393. package/dist/server/logger.js.map +0 -1
  394. package/dist/server/middleware/auth.d.ts +0 -29
  395. package/dist/server/middleware/auth.d.ts.map +0 -1
  396. package/dist/server/middleware/auth.js +0 -222
  397. package/dist/server/middleware/auth.js.map +0 -1
  398. package/dist/server/middleware/rate-limit.d.ts +0 -25
  399. package/dist/server/middleware/rate-limit.d.ts.map +0 -1
  400. package/dist/server/middleware/rate-limit.js +0 -168
  401. package/dist/server/middleware/rate-limit.js.map +0 -1
  402. package/dist/server/middleware/url-validator.d.ts +0 -16
  403. package/dist/server/middleware/url-validator.d.ts.map +0 -1
  404. package/dist/server/middleware/url-validator.js +0 -187
  405. package/dist/server/middleware/url-validator.js.map +0 -1
  406. package/dist/server/openapi.yaml +0 -4944
  407. package/dist/server/pg-auth-store.d.ts +0 -133
  408. package/dist/server/pg-auth-store.d.ts.map +0 -1
  409. package/dist/server/pg-auth-store.js +0 -473
  410. package/dist/server/pg-auth-store.js.map +0 -1
  411. package/dist/server/pg-job-queue.d.ts +0 -60
  412. package/dist/server/pg-job-queue.d.ts.map +0 -1
  413. package/dist/server/pg-job-queue.js +0 -365
  414. package/dist/server/pg-job-queue.js.map +0 -1
  415. package/dist/server/premium/domain-intel.d.ts +0 -17
  416. package/dist/server/premium/domain-intel.d.ts.map +0 -1
  417. package/dist/server/premium/domain-intel.js +0 -134
  418. package/dist/server/premium/domain-intel.js.map +0 -1
  419. package/dist/server/premium/index.d.ts +0 -18
  420. package/dist/server/premium/index.d.ts.map +0 -1
  421. package/dist/server/premium/index.js +0 -36
  422. package/dist/server/premium/index.js.map +0 -1
  423. package/dist/server/premium/swr-cache.d.ts +0 -15
  424. package/dist/server/premium/swr-cache.d.ts.map +0 -1
  425. package/dist/server/premium/swr-cache.js +0 -35
  426. package/dist/server/premium/swr-cache.js.map +0 -1
  427. package/dist/server/routes/activity.d.ts +0 -7
  428. package/dist/server/routes/activity.d.ts.map +0 -1
  429. package/dist/server/routes/activity.js +0 -68
  430. package/dist/server/routes/activity.js.map +0 -1
  431. package/dist/server/routes/agent.d.ts +0 -16
  432. package/dist/server/routes/agent.d.ts.map +0 -1
  433. package/dist/server/routes/agent.js +0 -247
  434. package/dist/server/routes/agent.js.map +0 -1
  435. package/dist/server/routes/answer.d.ts +0 -6
  436. package/dist/server/routes/answer.d.ts.map +0 -1
  437. package/dist/server/routes/answer.js +0 -133
  438. package/dist/server/routes/answer.js.map +0 -1
  439. package/dist/server/routes/ask.d.ts +0 -23
  440. package/dist/server/routes/ask.d.ts.map +0 -1
  441. package/dist/server/routes/ask.js +0 -119
  442. package/dist/server/routes/ask.js.map +0 -1
  443. package/dist/server/routes/batch.d.ts +0 -7
  444. package/dist/server/routes/batch.d.ts.map +0 -1
  445. package/dist/server/routes/batch.js +0 -412
  446. package/dist/server/routes/batch.js.map +0 -1
  447. package/dist/server/routes/cli-usage.d.ts +0 -7
  448. package/dist/server/routes/cli-usage.d.ts.map +0 -1
  449. package/dist/server/routes/cli-usage.js +0 -121
  450. package/dist/server/routes/cli-usage.js.map +0 -1
  451. package/dist/server/routes/compat.d.ts +0 -24
  452. package/dist/server/routes/compat.d.ts.map +0 -1
  453. package/dist/server/routes/compat.js +0 -653
  454. package/dist/server/routes/compat.js.map +0 -1
  455. package/dist/server/routes/deep-fetch.d.ts +0 -9
  456. package/dist/server/routes/deep-fetch.d.ts.map +0 -1
  457. package/dist/server/routes/deep-fetch.js +0 -50
  458. package/dist/server/routes/deep-fetch.js.map +0 -1
  459. package/dist/server/routes/demo.d.ts +0 -25
  460. package/dist/server/routes/demo.d.ts.map +0 -1
  461. package/dist/server/routes/demo.js +0 -434
  462. package/dist/server/routes/demo.js.map +0 -1
  463. package/dist/server/routes/extract.d.ts +0 -9
  464. package/dist/server/routes/extract.d.ts.map +0 -1
  465. package/dist/server/routes/extract.js +0 -150
  466. package/dist/server/routes/extract.js.map +0 -1
  467. package/dist/server/routes/fetch.d.ts +0 -8
  468. package/dist/server/routes/fetch.d.ts.map +0 -1
  469. package/dist/server/routes/fetch.js +0 -988
  470. package/dist/server/routes/fetch.js.map +0 -1
  471. package/dist/server/routes/health.d.ts +0 -8
  472. package/dist/server/routes/health.d.ts.map +0 -1
  473. package/dist/server/routes/health.js +0 -20
  474. package/dist/server/routes/health.js.map +0 -1
  475. package/dist/server/routes/jobs.d.ts +0 -8
  476. package/dist/server/routes/jobs.d.ts.map +0 -1
  477. package/dist/server/routes/jobs.js +0 -487
  478. package/dist/server/routes/jobs.js.map +0 -1
  479. package/dist/server/routes/mcp.d.ts +0 -18
  480. package/dist/server/routes/mcp.d.ts.map +0 -1
  481. package/dist/server/routes/mcp.js +0 -1260
  482. package/dist/server/routes/mcp.js.map +0 -1
  483. package/dist/server/routes/oauth.d.ts +0 -10
  484. package/dist/server/routes/oauth.d.ts.map +0 -1
  485. package/dist/server/routes/oauth.js +0 -334
  486. package/dist/server/routes/oauth.js.map +0 -1
  487. package/dist/server/routes/quick-answer.d.ts +0 -9
  488. package/dist/server/routes/quick-answer.d.ts.map +0 -1
  489. package/dist/server/routes/quick-answer.js +0 -93
  490. package/dist/server/routes/quick-answer.js.map +0 -1
  491. package/dist/server/routes/screenshot.d.ts +0 -23
  492. package/dist/server/routes/screenshot.d.ts.map +0 -1
  493. package/dist/server/routes/screenshot.js +0 -819
  494. package/dist/server/routes/screenshot.js.map +0 -1
  495. package/dist/server/routes/search.d.ts +0 -7
  496. package/dist/server/routes/search.d.ts.map +0 -1
  497. package/dist/server/routes/search.js +0 -312
  498. package/dist/server/routes/search.js.map +0 -1
  499. package/dist/server/routes/session.d.ts +0 -16
  500. package/dist/server/routes/session.d.ts.map +0 -1
  501. package/dist/server/routes/session.js +0 -278
  502. package/dist/server/routes/session.js.map +0 -1
  503. package/dist/server/routes/stats.d.ts +0 -7
  504. package/dist/server/routes/stats.d.ts.map +0 -1
  505. package/dist/server/routes/stats.js +0 -65
  506. package/dist/server/routes/stats.js.map +0 -1
  507. package/dist/server/routes/stripe.d.ts +0 -16
  508. package/dist/server/routes/stripe.d.ts.map +0 -1
  509. package/dist/server/routes/stripe.js +0 -283
  510. package/dist/server/routes/stripe.js.map +0 -1
  511. package/dist/server/routes/users.d.ts +0 -9
  512. package/dist/server/routes/users.d.ts.map +0 -1
  513. package/dist/server/routes/users.js +0 -1211
  514. package/dist/server/routes/users.js.map +0 -1
  515. package/dist/server/routes/watch.d.ts +0 -16
  516. package/dist/server/routes/watch.d.ts.map +0 -1
  517. package/dist/server/routes/watch.js +0 -257
  518. package/dist/server/routes/watch.js.map +0 -1
  519. package/dist/server/routes/webhooks.d.ts +0 -16
  520. package/dist/server/routes/webhooks.d.ts.map +0 -1
  521. package/dist/server/routes/webhooks.js +0 -74
  522. package/dist/server/routes/webhooks.js.map +0 -1
  523. package/dist/server/routes/youtube.d.ts +0 -7
  524. package/dist/server/routes/youtube.d.ts.map +0 -1
  525. package/dist/server/routes/youtube.js +0 -93
  526. package/dist/server/routes/youtube.js.map +0 -1
  527. package/dist/server/sentry.d.ts +0 -14
  528. package/dist/server/sentry.d.ts.map +0 -1
  529. package/dist/server/sentry.js +0 -39
  530. package/dist/server/sentry.js.map +0 -1
  531. package/dist/server/types.d.ts +0 -16
  532. package/dist/server/types.d.ts.map +0 -1
  533. package/dist/server/types.js +0 -8
  534. package/dist/server/types.js.map +0 -1
  535. package/dist/server/utils/response.d.ts +0 -45
  536. package/dist/server/utils/response.d.ts.map +0 -1
  537. package/dist/server/utils/response.js +0 -70
  538. package/dist/server/utils/response.js.map +0 -1
  539. package/dist/server/utils/sse.d.ts +0 -23
  540. package/dist/server/utils/sse.d.ts.map +0 -1
  541. package/dist/server/utils/sse.js +0 -39
  542. package/dist/server/utils/sse.js.map +0 -1
  543. package/dist/types.d.ts.map +0 -1
  544. package/dist/types.js.map +0 -1
@@ -0,0 +1,1015 @@
1
+ /**
2
+ * Fetch commands: default URL handler, read, pipe
3
+ */
4
+ import ora from 'ora';
5
+ import { writeFileSync, readFileSync, existsSync } from 'fs';
6
+ import { getProfilePath, loadStorageState, touchProfile } from '../../core/profiles.js';
7
+ import { peel, cleanup } from '../../index.js';
8
+ import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
9
+ import { getCache, setCache, parseTTL } from '../../cache.js';
10
+ import { estimateTokens } from '../../core/markdown.js';
11
+ import { distillToBudget, budgetListings } from '../../core/budget.js';
12
+ import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buildEnvelope, classifyErrorCode, formatListingsCsv, normaliseExtractedToRows, } from '../utils.js';
13
+ // ─── runFetch ─────────────────────────────────────────────────────────────────
14
+ // Main fetch handler — shared with the `pipe` and `ask` subcommands
15
+ export async function runFetch(url, options) {
16
+ // Handle --format flag: maps to existing boolean flags
17
+ if (options.format) {
18
+ const fmt = options.format.toLowerCase();
19
+ if (fmt === 'text')
20
+ options.text = true;
21
+ else if (fmt === 'html')
22
+ options.html = true;
23
+ else if (fmt === 'json')
24
+ options.json = true;
25
+ else if (fmt === 'markdown' || fmt === 'md') { /* default, do nothing */ }
26
+ else {
27
+ console.error(`Unknown format: ${options.format}. Use: text, markdown, html, or json`);
28
+ process.exit(1);
29
+ }
30
+ }
31
+ // Smart defaults: when piped (not a TTY), default to silent JSON + budget
32
+ // BUT respect explicit --format flag (user chose the output format)
33
+ const isPiped = !process.stdout.isTTY;
34
+ const hasExplicitFormat = options.format && ['text', 'html', 'markdown', 'md'].includes(options.format.toLowerCase());
35
+ if (isPiped && !options.html && !options.text && !hasExplicitFormat) {
36
+ if (!options.json)
37
+ options.json = true;
38
+ if (!options.silent)
39
+ options.silent = true;
40
+ // Auto-enable readability for AI consumers — clean content by default
41
+ if (!options.readable && !options.fullNav) {
42
+ options.readable = true;
43
+ }
44
+ // Auto token budget for piped mode (AI consumers want concise content)
45
+ if (options.budget === undefined && !options.fullContent && !options.raw && !options.full) {
46
+ options.budget = 4000;
47
+ }
48
+ }
49
+ // --full alias: sets raw + fullContent
50
+ if (options.full) {
51
+ options.raw = true;
52
+ options.fullContent = true;
53
+ }
54
+ // Smart defaults for terminal (interactive) mode
55
+ const isTerminal = process.stdout.isTTY && !isPiped;
56
+ if (isTerminal && !options.raw && !options.html && !options.text) {
57
+ // Auto-readable: clean content by default (like browser Reader Mode)
58
+ if (!options.readable && !options.fullNav && !options.selector) {
59
+ options.readable = true;
60
+ }
61
+ // Default token budget: don't flood the terminal with 20K tokens
62
+ if (options.budget === undefined && !options.fullContent && !options.raw) {
63
+ options.budget = 4000;
64
+ }
65
+ }
66
+ // --agent sets sensible defaults for AI agents; explicit flags override
67
+ if (options.agent) {
68
+ if (!options.json)
69
+ options.json = true;
70
+ if (!options.silent)
71
+ options.silent = true;
72
+ if (!options.extractAll)
73
+ options.extractAll = true;
74
+ if (options.budget === undefined)
75
+ options.budget = 4000;
76
+ // Agent mode = clean content by default
77
+ if (!options.readable && !options.fullNav) {
78
+ options.readable = true;
79
+ }
80
+ }
81
+ const isJson = options.json;
82
+ // --- --list-schemas: print all available schemas and exit ---
83
+ if (options.listSchemas) {
84
+ const { loadBundledSchemas } = await import('../../core/schema-extraction.js');
85
+ const schemas = loadBundledSchemas();
86
+ if (isJson) {
87
+ await writeStdout(JSON.stringify(schemas.map(s => ({
88
+ name: s.name,
89
+ version: s.version,
90
+ domains: s.domains,
91
+ urlPatterns: s.urlPatterns,
92
+ })), null, 2) + '\n');
93
+ }
94
+ else {
95
+ console.log(`\nAvailable extraction schemas (${schemas.length}):\n`);
96
+ for (const s of schemas) {
97
+ console.log(` ${s.name} (v${s.version})`);
98
+ console.log(` Domains: ${s.domains.join(', ')}`);
99
+ if (s.urlPatterns && s.urlPatterns.length > 0) {
100
+ console.log(` URL patterns: ${s.urlPatterns.join(', ')}`);
101
+ }
102
+ console.log('');
103
+ }
104
+ }
105
+ process.exit(0);
106
+ }
107
+ // --- #5: Concise error for missing URL (no help dump) ---
108
+ if (!url || url.trim() === '') {
109
+ if (isJson) {
110
+ await writeStdout(JSON.stringify({ success: false, error: { type: 'invalid_request', message: 'URL is required' } }) + '\n');
111
+ }
112
+ else {
113
+ console.error('Error: URL is required');
114
+ console.error('Usage: webpeel <url> [options]');
115
+ console.error('Run "webpeel --help" for full usage.');
116
+ }
117
+ process.exit(1);
118
+ }
119
+ // --- #6: Helper to report a fatal error and exit: a JSON error object on stdout when isJson is set, a plain "Error: ..." line on stderr otherwise ---
120
+ function exitWithJsonError(message, code) {
121
+ if (isJson) { // isJson is the options.json value captured earlier in runFetch
122
+ process.stdout.write(JSON.stringify({
123
+ success: false,
124
+ error: { type: code.toLowerCase(), message }, // code is lower-cased, e.g. 'INVALID_URL' is emitted as 'invalid_url'
125
+ }) + '\n');
126
+ }
127
+ else {
128
+ console.error(`Error: ${message}`);
129
+ }
130
+ process.exit(1); // always terminates the process with exit status 1, in both output modes
131
+ }
132
+ // SECURITY: Enhanced URL validation
133
+ if (url.length > 2048) {
134
+ exitWithJsonError('URL too long (max 2048 characters)', 'INVALID_URL');
135
+ }
136
+ // Check for control characters
137
+ if (/[\x00-\x1F\x7F]/.test(url)) {
138
+ exitWithJsonError('URL contains invalid control characters', 'INVALID_URL');
139
+ }
140
+ // Validate URL format
141
+ try {
142
+ const parsed = new URL(url);
143
+ if (!['http:', 'https:'].includes(parsed.protocol)) {
144
+ exitWithJsonError('Only HTTP and HTTPS protocols are allowed', 'INVALID_URL');
145
+ }
146
+ }
147
+ catch {
148
+ // Check if it looks like a command/verb the user typed by mistake
149
+ const commonVerbs = ['fetch', 'get', 'scrape', 'read', 'download', 'curl', 'wget', 'peel'];
150
+ if (commonVerbs.includes(url.toLowerCase())) {
151
+ exitWithJsonError(`Did you mean: webpeel "${process.argv[3] || '<url>'}"?\nThe URL goes directly after webpeel — no verb needed.\nExample: webpeel "https://example.com" --json`, 'INVALID_URL');
152
+ }
153
+ else {
154
+ exitWithJsonError(`Invalid URL: "${url}"\nMake sure to include the protocol (https://)\nExample: webpeel "https://${url}" --json`, 'INVALID_URL');
155
+ }
156
+ }
157
+ const useStealth = options.stealth || false;
158
+ // Check usage quota
159
+ const usageCheck = await checkUsage();
160
+ if (!usageCheck.allowed) {
161
+ if (isJson) {
162
+ await writeStdout(JSON.stringify({ success: false, error: { type: 'rate_limited', message: usageCheck.message } }) + '\n');
163
+ process.exit(1);
164
+ }
165
+ console.error(usageCheck.message);
166
+ process.exit(1);
167
+ }
168
+ // Check cache first (before spinner/network)
169
+ // Default: 5m TTL for all CLI fetches unless --no-cache is set
170
+ let cacheTtlMs;
171
+ const cacheDisabled = options.cache === false; // --no-cache sets options.cache to false
172
+ const explicitTtl = typeof options.cache === 'string' ? options.cache : undefined;
173
+ if (!cacheDisabled) {
174
+ const ttlStr = explicitTtl || '5m';
175
+ try {
176
+ cacheTtlMs = parseTTL(ttlStr);
177
+ }
178
+ catch (e) {
179
+ exitWithJsonError(e.message, 'FETCH_FAILED');
180
+ }
181
+ const cacheOptions = {
182
+ render: options.render,
183
+ stealth: options.stealth,
184
+ selector: options.selector,
185
+ format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
186
+ budget: null, // Budget excluded from cache key — cache stores full content
187
+ readable: options.readable || false,
188
+ };
189
+ const cachedResult = getCache(url, cacheOptions);
190
+ if (cachedResult) {
191
+ if (!options.silent) {
192
+ console.error(`\x1b[36m⚡ Cache hit\x1b[0m (TTL: ${ttlStr})`);
193
+ }
194
+ // Apply budget to cached content (cache stores full, budget is post-process)
195
+ if (options.budget && options.budget > 0 && cachedResult.content) {
196
+ const fmt = options.text ? 'text' : 'markdown';
197
+ cachedResult.content = distillToBudget(cachedResult.content, options.budget, fmt);
198
+ cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
199
+ }
200
+ // LLM extraction from cached content
201
+ if (options.llmExtract || options.extractSchema) {
202
+ const { extractWithLLM } = await import('../../core/llm-extract.js');
203
+ const llmCfgCached = loadConfig();
204
+ const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
205
+ if (!llmApiKeyCached) {
206
+ console.error('Error: LLM extraction requires an API key.\nSet OPENAI_API_KEY environment variable or use --llm-key <key>');
207
+ process.exit(1);
208
+ }
209
+ const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
210
+ const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
211
+ const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
212
+ // Parse schema if provided
213
+ let llmSchemaCached;
214
+ if (options.extractSchema) {
215
+ let schemaStr = options.extractSchema;
216
+ if (schemaStr.startsWith('@')) {
217
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
218
+ }
219
+ try {
220
+ llmSchemaCached = JSON.parse(schemaStr);
221
+ }
222
+ catch {
223
+ console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
224
+ process.exit(1);
225
+ }
226
+ }
227
+ const llmResultCached = await extractWithLLM({
228
+ content: cachedResult.content,
229
+ instruction: llmInstructionCached,
230
+ schema: llmSchemaCached,
231
+ apiKey: llmApiKeyCached,
232
+ model: llmModelCached,
233
+ baseUrl: llmBaseUrlCached,
234
+ });
235
+ await writeStdout(JSON.stringify(llmResultCached.items, null, 2) + '\n');
236
+ if (!options.silent) {
237
+ const { input, output } = llmResultCached.tokensUsed;
238
+ const costStr = llmResultCached.cost !== undefined ? ` | Est. cost: $${llmResultCached.cost.toFixed(6)}` : '';
239
+ console.error(`\n🤖 LLM extraction: ${llmResultCached.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResultCached.model}`);
240
+ }
241
+ process.exit(0);
242
+ }
243
+ // --- LLM-free Quick Answer (also on cached content) ---
244
+ if (options.question && cachedResult.content) {
245
+ const { quickAnswer } = await import('../../core/quick-answer.js');
246
+ const qa = quickAnswer({
247
+ question: options.question,
248
+ content: cachedResult.content,
249
+ url: cachedResult.url,
250
+ });
251
+ cachedResult.quickAnswer = qa;
252
+ if (!isJson) {
253
+ const conf = (qa.confidence * 100).toFixed(0);
254
+ await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
255
+ if (qa.answer) {
256
+ await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
257
+ }
258
+ else {
259
+ await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
260
+ }
261
+ if (qa.passages && qa.passages.length > 1) {
262
+ await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
263
+ for (const p of qa.passages.slice(1, 4)) {
264
+ await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
265
+ }
266
+ }
267
+ await writeStdout('\n');
268
+ await cleanup();
269
+ process.exit(0);
270
+ }
271
+ }
272
+ // --- BM25 Schema Template Extraction (cached path) ---
273
+ if (options.schema && cachedResult.content) {
274
+ const { getSchemaTemplate: getSchTmplCached } = await import('../../core/schema-templates.js');
275
+ const schTemplateCached = getSchTmplCached(options.schema);
276
+ if (schTemplateCached) {
277
+ const { quickAnswer: qaCached } = await import('../../core/quick-answer.js');
278
+ const { smartExtractSchemaFields: smartExtractCached } = await import('../../core/schema-postprocess.js');
279
+ const extractedCached = smartExtractCached(cachedResult.content, schTemplateCached.fields, qaCached, {
280
+ pageTitle: cachedResult.title,
281
+ pageUrl: cachedResult.url,
282
+ metadata: cachedResult.metadata,
283
+ });
284
+ cachedResult.extracted = extractedCached;
285
+ }
286
+ }
287
+ await outputResult(cachedResult, options, { cached: true });
288
+ process.exit(0);
289
+ }
290
+ }
291
+ const spinner = options.silent ? null : ora('Fetching...').start();
292
+ try {
293
+ // Validate options
294
+ if (options.wait && (options.wait < 0 || options.wait > 60000)) {
295
+ throw Object.assign(new Error('Wait time must be between 0 and 60000ms'), { _code: 'FETCH_FAILED' });
296
+ }
297
+ // Parse custom headers
298
+ let headers;
299
+ if (options.header && options.header.length > 0) {
300
+ headers = {};
301
+ for (const header of options.header) {
302
+ const colonIndex = header.indexOf(':');
303
+ if (colonIndex === -1) {
304
+ throw Object.assign(new Error(`Invalid header format: ${header}. Expected "Key: Value"`), { _code: 'FETCH_FAILED' });
305
+ }
306
+ const key = header.slice(0, colonIndex).trim();
307
+ const value = header.slice(colonIndex + 1).trim();
308
+ headers[key] = value;
309
+ }
310
+ }
311
+ // Parse actions
312
+ let actions;
313
+ if (options.action && options.action.length > 0) {
314
+ try {
315
+ actions = parseActions(options.action);
316
+ }
317
+ catch (e) {
318
+ throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
319
+ }
320
+ }
321
+ // --extract-schema auto-enables JSON output
322
+ if (options.extractSchema) {
323
+ options.json = true;
324
+ }
325
+ // Parse extract
326
+ let extract;
327
+ if (options.llmExtract || options.extractSchema) {
328
+ // LLM-based extraction is handled post-fetch (after peel returns markdown).
329
+ // Early-validate that an API key is available so we fail fast.
330
+ const llmCfg = loadConfig();
331
+ const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
332
+ if (!llmApiKey) {
333
+ throw Object.assign(new Error('LLM extraction requires an API key.\n' +
334
+ 'Set OPENAI_API_KEY environment variable or use --llm-key <key>'), { _code: 'FETCH_FAILED' });
335
+ }
336
+ // Do NOT set extract here — peel runs normally, LLM extraction happens below.
337
+ }
338
+ else if (options.extract) {
339
+ // CSS-based extraction
340
+ try {
341
+ extract = { selectors: JSON.parse(options.extract) };
342
+ }
343
+ catch {
344
+ throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
345
+ }
346
+ }
347
+ // Validate maxTokens
348
+ if (options.maxTokens !== undefined) {
349
+ if (isNaN(options.maxTokens) || options.maxTokens < 100) {
350
+ throw Object.assign(new Error('--max-tokens must be at least 100'), { _code: 'FETCH_FAILED' });
351
+ }
352
+ }
353
+ // Parse include-tags and exclude-tags
354
+ let includeTags;
355
+ let excludeTags;
356
+ if (options.onlyMainContent) {
357
+ includeTags = ['main', 'article'];
358
+ }
359
+ else if (options.includeTags) {
360
+ includeTags = options.includeTags.split(',').map((t) => t.trim());
361
+ }
362
+ if (options.excludeTags) {
363
+ excludeTags = options.excludeTags.split(',').map((t) => t.trim());
364
+ }
365
+ // Build location options
366
+ let locationOptions;
367
+ if (options.location || options.language) {
368
+ locationOptions = {};
369
+ if (options.location) {
370
+ locationOptions.country = options.location;
371
+ }
372
+ if (options.language) {
373
+ locationOptions.languages = [options.language];
374
+ }
375
+ }
376
+ // ── Resolve --profile: name → path + storage state ─────────────────
377
+ let resolvedProfileDir;
378
+ let resolvedStorageState;
379
+ let resolvedProfileName;
380
+ if (options.profile) {
381
+ const profilePath = getProfilePath(options.profile);
382
+ if (profilePath) {
383
+ // It's a named profile in ~/.webpeel/profiles/
384
+ resolvedProfileDir = profilePath;
385
+ resolvedStorageState = loadStorageState(options.profile) ?? undefined;
386
+ resolvedProfileName = options.profile;
387
+ }
388
+ else if (existsSync(options.profile)) {
389
+ // It's a raw directory path (backward compat)
390
+ resolvedProfileDir = options.profile;
391
+ }
392
+ else {
393
+ exitWithJsonError(`Profile "${options.profile}" not found. Run "webpeel profile list" to see available profiles.`, 'PROFILE_NOT_FOUND');
394
+ }
395
+ }
396
+ // Build peel options
397
+ // --stealth auto-enables --render (stealth requires browser)
398
+ // --action auto-enables --render (actions require browser)
399
+ // --scroll-extract implies --render (needs browser)
400
+ //
401
+ // Bare --scroll-extract (no number) → smart autoScroll (detects stable height)
402
+ // --scroll-extract N (with number) → legacy fixed N scrolls via actions
403
+ const scrollExtractRaw = options.scrollExtract;
404
+ const isAutoScroll = scrollExtractRaw !== undefined && typeof scrollExtractRaw !== 'number';
405
+ const scrollExtractCount = isAutoScroll
406
+ ? 0
407
+ : (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
408
+ const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
409
+ || (options.device && options.device !== 'desktop')
410
+ || !!options.viewport
411
+ || !!options.waitUntil
412
+ || !!options.waitSelector
413
+ || !!options.blockResources
414
+ || !!options.screenshot // Auto-enable render for screenshot (needs browser)
415
+ || false;
416
+ // Inject scroll actions when --scroll-extract N (fixed count) is used
417
+ if (scrollExtractCount > 0) {
418
+ const scrollActions = [];
419
+ for (let i = 0; i < scrollExtractCount; i++) {
420
+ scrollActions.push({ type: 'scroll', to: 'bottom' });
421
+ scrollActions.push({ type: 'wait', ms: 1500 });
422
+ }
423
+ actions = actions ? [...actions, ...scrollActions] : scrollActions;
424
+ }
425
+ const peelOptions = {
426
+ render: useRender,
427
+ stealth: options.stealth || false,
428
+ wait: options.wait || 0,
429
+ timeout: options.timeout,
430
+ userAgent: options.ua,
431
+ screenshot: options.screenshot !== undefined,
432
+ screenshotFullPage: options.fullPage || false,
433
+ selector: options.selector,
434
+ exclude: options.exclude,
435
+ includeTags,
436
+ excludeTags,
437
+ headers,
438
+ cookies: options.cookie,
439
+ raw: options.raw || false,
440
+ lite: options.lite || false,
441
+ actions,
442
+ maxTokens: options.maxTokens,
443
+ // Note: budget is applied AFTER caching (so cache stores full content)
444
+ // We pass it to peel() for programmatic API compatibility, but the CLI
445
+ // also applies it post-fetch (see below) to ensure cache stores full result.
446
+ extract,
447
+ images: options.images || false,
448
+ location: locationOptions,
449
+ profileDir: resolvedProfileDir,
450
+ headed: options.headed || false,
451
+ storageState: resolvedStorageState,
452
+ proxy: options.proxy,
453
+ proxies: options.proxies,
454
+ fullPage: options.fullContent || false,
455
+ readable: options.readable || false,
456
+ // Smart auto-scroll (bare --scroll-extract flag)
457
+ autoScroll: isAutoScroll
458
+ ? { timeout: options.scrollExtractTimeout }
459
+ : undefined,
460
+ device: options.device,
461
+ viewportWidth: options.viewport ? options.viewport.width : undefined,
462
+ viewportHeight: options.viewport ? options.viewport.height : undefined,
463
+ waitUntil: options.waitUntil,
464
+ waitSelector: options.waitSelector,
465
+ blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
466
+ cloaked: options.cloaked ? true : undefined,
467
+ cycle: options.cycle ? true : undefined,
468
+ tls: (options.tls || options.cycle) ? true : undefined,
469
+ };
470
+ if (options.cloaked) {
471
+ peelOptions.render = true; // CloakBrowser is a browser
472
+ }
473
+ // Add chunk option if requested
474
+ if (options.chunk) {
475
+ peelOptions.chunk = {
476
+ maxTokens: options.chunkSize || 512,
477
+ overlap: options.chunkOverlap || 50,
478
+ strategy: options.chunkStrategy || 'section',
479
+ };
480
+ }
481
+ // Add summary option if requested
482
+ if (options.summary) {
483
+ const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
484
+ if (!llmApiKey) {
485
+ throw Object.assign(new Error('--summary requires --llm-key or OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
486
+ }
487
+ peelOptions.summary = true;
488
+ peelOptions.llm = {
489
+ apiKey: llmApiKey,
490
+ model: process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini',
491
+ baseUrl: process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1',
492
+ };
493
+ }
494
+ // Determine format
495
+ if (options.html) {
496
+ peelOptions.format = 'html';
497
+ }
498
+ else if (options.text) {
499
+ peelOptions.format = 'text';
500
+ }
501
+ else if (options.clean) {
502
+ peelOptions.format = 'clean';
503
+ // --clean implies readable mode (article content only, no navs/footers)
504
+ peelOptions.readable = true;
505
+ }
506
+ else {
507
+ peelOptions.format = 'markdown';
508
+ }
509
+ // Fetch the page — route through API if key is configured, otherwise require auth
510
+ const fetchCfg = loadConfig();
511
+ const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
512
+ const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
513
+ let result;
514
+ if (fetchApiKey) {
515
+ // Use the WebPeel API — no local Playwright needed
516
+ result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
517
+ }
518
+ else {
519
+ // No API key — show helpful message instead of trying local mode
520
+ if (spinner)
521
+ spinner.fail('Authentication required');
522
+ console.error('No API key configured. Run: webpeel auth <your-key>');
523
+ console.error('Get a free key at: https://app.webpeel.dev/keys');
524
+ await cleanup();
525
+ process.exit(2);
526
+ }
527
+ // Update lastUsed timestamp for named profiles
528
+ if (resolvedProfileName) {
529
+ touchProfile(resolvedProfileName);
530
+ }
531
+ if (spinner) {
532
+ const domainTag = result.domainData
533
+ ? ` [${result.domainData.domain}:${result.domainData.type}]`
534
+ : '';
535
+ spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
536
+ }
537
+ // Show metadata header
538
+ const pageTitle = result.metadata?.title || result.title;
539
+ if (!options.silent && !options.json && pageTitle) {
540
+ const parts = [];
541
+ if (result.metadata?.author)
542
+ parts.push(`by ${result.metadata.author}`);
543
+ if (result.readability?.readingTime)
544
+ parts.push(result.readability.readingTime);
545
+ if (result.tokens)
546
+ parts.push(`${result.tokens.toLocaleString()} tokens`);
547
+ const subtitle = parts.length ? ` · ${parts.join(' · ')}` : '';
548
+ console.error(`\x1b[36m📄 ${pageTitle}${subtitle}\x1b[0m`);
549
+ }
550
+ // Show usage footer for free/anonymous users
551
+ if (usageCheck.usageInfo && !options.silent) {
552
+ showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, useStealth);
553
+ }
554
+ // Handle screenshot saving
555
+ if (options.screenshot && result.screenshot) {
556
+ const screenshotPath = typeof options.screenshot === 'string'
557
+ ? options.screenshot
558
+ : 'screenshot.png';
559
+ const screenshotBuffer = Buffer.from(result.screenshot, 'base64');
560
+ writeFileSync(screenshotPath, screenshotBuffer);
561
+ if (!options.silent) {
562
+ console.error(`Screenshot saved to: ${screenshotPath}`);
563
+ }
564
+ // Remove screenshot from JSON output if saving to file
565
+ if (typeof options.screenshot === 'string') {
566
+ delete result.screenshot;
567
+ }
568
+ }
569
+ // Store full result in cache (before budget distillation so cache is reusable)
570
+ if (cacheTtlMs && !cacheDisabled) {
571
+ setCache(url, result, cacheTtlMs, {
572
+ render: options.render,
573
+ stealth: useStealth,
574
+ selector: options.selector,
575
+ format: peelOptions.format,
576
+ budget: null, // Budget excluded — cache stores full content, budget applied post-cache
577
+ readable: options.readable || false,
578
+ });
579
+ }
580
+ // Apply smart budget distillation AFTER caching (cache always stores full content)
581
+ // When --agent is set, always apply budget even with --extract-all (listings will be budgeted
582
+ // separately, but if no listings are found the content itself still needs trimming).
583
+ const skipBudgetForExtract = (options.extractAll || options.scrollExtract !== undefined) && !options.agent;
584
+ let contentTruncated = false;
585
+ if (options.budget && options.budget > 0 && !skipBudgetForExtract) {
586
+ const budgetFormat = peelOptions.format === 'text' ? 'text' : 'markdown';
587
+ const distilled = distillToBudget(result.content, options.budget, budgetFormat);
588
+ if (distilled !== result.content) {
589
+ contentTruncated = true;
590
+ result.content = distilled;
591
+ result.tokens = estimateTokens(distilled);
592
+ }
593
+ }
594
+ // --- BM25 Query-Focused Filtering ---
595
+ if (options.focus && result.content) {
596
+ const { filterByRelevance } = await import('../../core/bm25-filter.js');
597
+ const focusResult = filterByRelevance(result.content, { query: options.focus });
598
+ result.content = focusResult.content;
599
+ result.tokens = estimateTokens(focusResult.content);
600
+ if (isJson) {
601
+ result.focusQuery = options.focus;
602
+ result.focusReduction = focusResult.reductionPercent;
603
+ }
604
+ }
605
+ // --- LLM-free Quick Answer ---
606
+ if (options.question && result.content) {
607
+ const { quickAnswer } = await import('../../core/quick-answer.js');
608
+ const qa = quickAnswer({
609
+ question: options.question,
610
+ content: result.content,
611
+ url: result.url,
612
+ });
613
+ result.quickAnswer = qa;
614
+ if (!isJson) {
615
+ // Display answer prominently in human-readable mode
616
+ const conf = (qa.confidence * 100).toFixed(0);
617
+ await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
618
+ if (qa.answer) {
619
+ await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
620
+ }
621
+ else {
622
+ await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
623
+ }
624
+ if (qa.passages && qa.passages.length > 1) {
625
+ await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
626
+ for (const p of qa.passages.slice(1, 4)) {
627
+ await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
628
+ }
629
+ }
630
+ await writeStdout('\n');
631
+ await cleanup();
632
+ process.exit(0);
633
+ }
634
+ }
635
+ // --- RAG Chunking output (chunks come from pipeline via peelOptions.chunk) ---
636
+ if (result.chunks && result.chunks.length > 0 && !isJson) {
637
+ console.log(`\n${'─'.repeat(60)}`);
638
+ console.log(`📦 ${result.chunks.length} chunks (${options.chunkStrategy || 'section'} strategy)\n`);
639
+ for (const chunk of result.chunks) {
640
+ const sectionLabel = chunk.section ? ` [${chunk.section}]` : '';
641
+ console.log(`── Chunk ${chunk.index + 1}${sectionLabel} (${chunk.tokenCount} tokens, ${chunk.wordCount} words) ──`);
642
+ console.log(chunk.text.substring(0, 200) + (chunk.text.length > 200 ? '...' : ''));
643
+ console.log('');
644
+ }
645
+ }
646
+ // --- #4: Content quality warning ---
647
+ const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
648
+ const isRedirect = false; // peel() follows redirects — final result is always 200
649
+ if (result.tokens < 20 && !useRender && isHtmlContent && !isRedirect) {
650
+ const warningMsg = `Low content detected (${result.tokens} tokens). Try: webpeel ${url} --render`;
651
+ if (isJson) {
652
+ result.warning = warningMsg;
653
+ }
654
+ else {
655
+ console.error(`⚠ ${warningMsg}`);
656
+ }
657
+ }
658
+ // --- LLM-based extraction (post-peel) ---
659
+ if (options.llmExtract || options.extractSchema) {
660
+ const { extractWithLLM } = await import('../../core/llm-extract.js');
661
+ const llmCfg = loadConfig();
662
+ const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
663
+ const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
664
+ const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
665
+ const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
666
+ // Parse --extract-schema if provided
667
+ let llmSchema;
668
+ if (options.extractSchema) {
669
+ let schemaStr = options.extractSchema;
670
+ if (schemaStr.startsWith('@')) {
671
+ schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
672
+ }
673
+ try {
674
+ llmSchema = JSON.parse(schemaStr);
675
+ }
676
+ catch {
677
+ exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
678
+ }
679
+ }
680
+ const llmResult = await extractWithLLM({
681
+ content: result.content,
682
+ instruction: llmInstruction,
683
+ schema: llmSchema,
684
+ apiKey: llmApiKey,
685
+ model: llmModel,
686
+ baseUrl: llmBaseUrl,
687
+ });
688
+ // Output structured items as JSON
689
+ await writeStdout(JSON.stringify(llmResult.items, null, 2) + '\n');
690
+ // Show token usage and estimated cost
691
+ if (!options.silent) {
692
+ const { input, output } = llmResult.tokensUsed;
693
+ const costStr = llmResult.cost !== undefined
694
+ ? ` | Est. cost: $${llmResult.cost.toFixed(6)}`
695
+ : '';
696
+ console.error(`\n🤖 LLM extraction: ${llmResult.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResult.model}`);
697
+ }
698
+ await cleanup();
699
+ process.exit(0);
700
+ }
701
+ // --- Extract-all / pagination / output formatting ---
702
+ const wantsExtractAll = options.extractAll || options.scrollExtract !== undefined;
703
+ const pagesCount = Math.min(Math.max(options.pages || 1, 1), 10);
704
+ if (wantsExtractAll) {
705
+ const { extractListings } = await import('../../core/extract-listings.js');
706
+ const { findNextPageUrl } = await import('../../core/paginate.js');
707
+ const { findSchemaForUrl, extractWithSchema, loadBundledSchemas } = await import('../../core/schema-extraction.js');
708
+ // Resolve which schema to use (explicit --schema flag or auto-detect)
709
+ let activeSchema = null;
710
+ if (options.schema) {
711
+ // Find schema by name or domain match
712
+ const schemaQuery = options.schema.toLowerCase();
713
+ const allSchemas = loadBundledSchemas();
714
+ activeSchema = allSchemas.find(s => s.name.toLowerCase().includes(schemaQuery) ||
715
+ s.domains.some(d => d.toLowerCase().includes(schemaQuery))) ?? null;
716
+ if (!activeSchema && !options.silent) {
717
+ console.error(`Warning: No schema found for "${options.schema}", falling back to auto-detection`);
718
+ }
719
+ }
720
+ else {
721
+ // Auto-detect from URL
722
+ activeSchema = findSchemaForUrl(result.url || url);
723
+ }
724
+ // We need the raw HTML for extraction. Re-fetch with format=html if needed.
725
+ let allListings = [];
726
+ // Fetch HTML for extraction
727
+ const htmlResult = peelOptions.format === 'html'
728
+ ? result
729
+ : await peel(url, { ...peelOptions, format: 'html', maxTokens: undefined });
730
+ // Try schema extraction first, fall back to generic
731
+ if (activeSchema) {
732
+ const schemaListings = extractWithSchema(htmlResult.content, activeSchema, result.url);
733
+ if (schemaListings.length > 0) {
734
+ allListings.push(...schemaListings);
735
+ }
736
+ else {
737
+ // Schema returned nothing — fall back to generic
738
+ allListings.push(...extractListings(htmlResult.content, result.url));
739
+ }
740
+ }
741
+ else {
742
+ allListings.push(...extractListings(htmlResult.content, result.url));
743
+ }
744
+ // Pagination: follow "Next" links
745
+ if (pagesCount > 1) {
746
+ let currentHtml = htmlResult.content;
747
+ let currentUrl = result.url;
748
+ for (let page = 1; page < pagesCount; page++) {
749
+ const nextUrl = findNextPageUrl(currentHtml, currentUrl);
750
+ if (!nextUrl)
751
+ break;
752
+ try {
753
+ const nextResult = await peel(nextUrl, { ...peelOptions, format: 'html', maxTokens: undefined });
754
+ let pageListings;
755
+ if (activeSchema) {
756
+ const schemaPage = extractWithSchema(nextResult.content, activeSchema, nextResult.url);
757
+ pageListings = schemaPage.length > 0
758
+ ? schemaPage
759
+ : extractListings(nextResult.content, nextResult.url);
760
+ }
761
+ else {
762
+ pageListings = extractListings(nextResult.content, nextResult.url);
763
+ }
764
+ allListings.push(...pageListings);
765
+ currentHtml = nextResult.content;
766
+ currentUrl = nextResult.url;
767
+ }
768
+ catch {
769
+ break; // Stop paginating on error
770
+ }
771
+ }
772
+ }
773
+ // Apply budget to listings if requested
774
+ let listingsTruncated = false;
775
+ let totalAvailableListings;
776
+ if (options.budget && options.budget > 0 && allListings.length > 0) {
777
+ const { maxItems, truncated, totalAvailable } = budgetListings(allListings.length, options.budget);
778
+ if (truncated) {
779
+ listingsTruncated = true;
780
+ totalAvailableListings = totalAvailable;
781
+ allListings = allListings.slice(0, maxItems);
782
+ }
783
+ }
784
+ // Output based on format flags
785
+ if (options.csv) {
786
+ const csvOutput = formatListingsCsv(allListings);
787
+ await writeStdout(csvOutput);
788
+ }
789
+ else if (options.table) {
790
+ const { formatTable } = await import('../../core/table-format.js');
791
+ const tableRows = allListings.map(item => {
792
+ const row = {};
793
+ for (const [k, v] of Object.entries(item)) {
794
+ if (v !== undefined)
795
+ row[k] = v;
796
+ }
797
+ return row;
798
+ });
799
+ await writeStdout(formatTable(tableRows) + '\n');
800
+ }
801
+ else if (isJson) {
802
+ // Use unified envelope for JSON output
803
+ const structured = allListings;
804
+ const envelope = buildEnvelope(result, {
805
+ cached: false,
806
+ structured,
807
+ truncated: listingsTruncated || undefined,
808
+ totalAvailable: totalAvailableListings,
809
+ });
810
+ // Also include legacy fields for backward compat
811
+ envelope.listings = allListings;
812
+ envelope.count = allListings.length;
813
+ await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
814
+ }
815
+ else {
816
+ // Formatted text output
817
+ if (allListings.length === 0) {
818
+ await writeStdout('No listings found.\n');
819
+ }
820
+ else {
821
+ const truncNote = listingsTruncated && totalAvailableListings
822
+ ? ` (${totalAvailableListings} total — budget limited to ${allListings.length})`
823
+ : '';
824
+ await writeStdout(`Found ${allListings.length} listings${truncNote}:\n\n`);
825
+ allListings.forEach((item, i) => {
826
+ const pricePart = item.price ? ` — ${item.price}` : '';
827
+ const line = `${i + 1}. ${item.title}${pricePart}\n`;
828
+ process.stdout.write(line);
829
+ if (item.link) {
830
+ process.stdout.write(` ${item.link}\n`);
831
+ }
832
+ process.stdout.write('\n');
833
+ });
834
+ }
835
+ }
836
+ }
837
+ else if (options.csv || options.table) {
838
+ // CSV / table output for --extract (CSS selector extraction)
839
+ if (result.extracted) {
840
+ const rows = normaliseExtractedToRows(result.extracted);
841
+ if (options.csv) {
842
+ await writeStdout(formatListingsCsv(rows));
843
+ }
844
+ else {
845
+ const { formatTable } = await import('../../core/table-format.js');
846
+ await writeStdout(formatTable(rows) + '\n');
847
+ }
848
+ }
849
+ else {
850
+ console.error('--csv / --table require --extract-all or --extract to produce structured data.');
851
+ }
852
+ }
853
+ else {
854
+ // --- BM25 Schema Template Extraction (no LLM needed) ---
855
+ if (options.schema && result.content) {
856
+ const { getSchemaTemplate: getSchTmpl } = await import('../../core/schema-templates.js');
857
+ const schTemplate = getSchTmpl(options.schema);
858
+ if (schTemplate) {
859
+ const { quickAnswer: qa } = await import('../../core/quick-answer.js');
860
+ const { smartExtractSchemaFields } = await import('../../core/schema-postprocess.js');
861
+ const extracted = smartExtractSchemaFields(result.content, schTemplate.fields, qa, {
862
+ pageTitle: result.title,
863
+ pageUrl: result.url,
864
+ metadata: result.metadata,
865
+ });
866
+ result.extracted = extracted;
867
+ }
868
+ }
869
+ // Output results (default path)
870
+ await outputResult(result, options, {
871
+ cached: false,
872
+ truncated: contentTruncated || undefined,
873
+ });
874
+ }
875
+ // Clean up and exit
876
+ await cleanup();
877
+ process.exit(0);
878
+ }
879
+ catch (error) {
880
+ if (spinner) {
881
+ spinner.fail('Failed to fetch');
882
+ }
883
+ // --- #6: Consistent JSON error output ---
884
+ if (isJson) {
885
+ const errMsg = error instanceof Error ? error.message : 'Unknown error';
886
+ const errCode = classifyErrorCode(error);
887
+ await writeStdout(JSON.stringify({ success: false, error: { type: errCode.toLowerCase(), message: errMsg } }) + '\n');
888
+ await cleanup();
889
+ process.exit(1);
890
+ }
891
+ if (error instanceof Error) {
892
+ console.error('\n' + formatError(error, url || '', options));
893
+ }
894
+ else {
895
+ console.error('\x1b[31m✖ Unknown error occurred\x1b[0m');
896
+ }
897
+ await cleanup();
898
+ process.exit(1);
899
+ }
900
+ }
901
+ // ─── registerFetchCommands ───────────────────────────────────────────────────
902
/**
 * Register the fetch-related commands on a Commander program:
 *   - the default command (`webpeel [url]`) with the full option set,
 *   - `read <url>`  — reader mode with a token budget,
 *   - `pipe <url>`  — always JSON + silent.
 *
 * Every command delegates to `runFetch(url, options)`.
 *
 * @param {object} program - Commander `Command` instance to register on.
 * @returns {void}
 */
export function registerFetchCommands(program) {
    // Commander invokes option parsers as (value, previousValue). Handing it
    // the bare `parseInt` would use previousValue as the radix, so a repeated
    // flag (or one with a default) parses to NaN or a wrong number. Always
    // parse as base 10 and ignore the second argument.
    const parseBase10 = (value) => Number.parseInt(value, 10);
    // Budget used by `read` when the user does not pass a usable --budget.
    const defaultReadBudget = 4000;

    // ── Default command: fetch a URL ─────────────────────────────────────────
    program
        .argument('[url]', 'URL to fetch')
        .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
        .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
        .option('--cloaked', 'Use CloakBrowser stealth (requires: npm install cloakbrowser)')
        .option('--tls', 'Use PeelTLS TLS fingerprint spoofing (built-in, no install needed)')
        .option('--cycle', 'Use PeelTLS TLS fingerprint spoofing (alias for --tls)', false)
        .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
        .option('--proxies <urls>', 'Comma-separated list of proxy URLs for rotation (tried in order on failure)', (val) => val.split(',').map((s) => s.trim()).filter(Boolean))
        .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseBase10)
        .option('--html', 'Output raw HTML instead of markdown')
        .option('--text', 'Output plain text instead of markdown')
        .option('--clean', 'Clean output — article content only, no links or metadata (alias for --readable with URL-stripped markdown)')
        .option('--json', 'Output as JSON')
        .option('-t, --timeout <ms>', 'Request timeout (ms)', parseBase10, 30000)
        .option('--ua <agent>', 'Custom user agent')
        .option('-s, --silent', 'Silent mode (no spinner)')
        .option('--screenshot [path]', 'Take a screenshot (optionally save to file path)')
        .option('--full-page', 'Full-page screenshot (use with --screenshot)')
        .option('--selector <css>', 'CSS selector to extract (e.g., "article", ".content")')
        .option('--exclude <selectors...>', 'CSS selectors to exclude (e.g., ".sidebar" ".ads")')
        .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
        .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
        .option('--only-main-content', 'Shortcut for --include-tags main,article')
        .option('--full-content', 'Return full page content (disable automatic content density pruning)')
        .option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
        .option('--full-nav', 'Keep full navigation/content (disable auto-readability when piped or in agent mode)')
        .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
        .option('--chunk', 'Split content into RAG-ready chunks')
        .option('--chunk-size <tokens>', 'Max tokens per chunk (default: 512)', parseBase10)
        .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 50)', parseBase10)
        .option('--chunk-strategy <strategy>', 'Chunking strategy: section (default), paragraph, fixed')
        .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
        .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
        .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
        .option('--no-cache', 'Disable automatic caching for this request')
        .option('--links', 'Output only the links found on the page')
        .option('--images', 'Output image URLs from the page')
        .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
        .option('--raw', 'Return full page without smart content extraction')
        .option('--full', 'Alias for --raw — full page content, no budget')
        .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
        .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
        .option('--extract <json>', 'Extract structured data using CSS selectors (JSON object of field:selector pairs)')
        .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
        .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
        .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
        .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
        .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
        .option('--summary', 'Generate AI summary of content (requires --llm-key or OPENAI_API_KEY)')
        .option('--location <country>', 'ISO country code for geo-targeting (e.g., "US", "DE", "JP")')
        .option('--language <lang>', 'Language preference (e.g., "en", "de", "ja")')
        .option('--max-tokens <n>', 'Maximum token count for output (truncate if exceeded)', parseBase10)
        .option('--budget <n>', 'Smart token budget — distill content to fit within N tokens (heuristic, no LLM key needed)', parseBase10)
        .option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
        .option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
        .option('--list-schemas', 'List all available extraction schemas and their supported domains')
        .option('--scroll-extract [count]', 'Scroll page N times to load lazy content (bare flag = smart auto-scroll until stable), then extract (implies --render)', parseBase10)
        .option('--scroll-extract-timeout <ms>', 'Total timeout in ms for auto-scroll (default: 30000, only used with bare --scroll-extract)', parseBase10)
        .option('--csv', 'Output extraction results as CSV')
        .option('--table', 'Output extraction results as a formatted table')
        .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', parseBase10)
        .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
        .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
        .option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
        .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
        .option('--device <type>', 'Device emulation: desktop (default), mobile, tablet (auto-enables --render)')
        .option('--viewport <WxH>', 'Browser viewport size (e.g., "1920x1080") (auto-enables --render)', (val) => {
            // "1920x1080" -> { width: 1920, height: 1080 }
            const [w, h] = val.split('x').map(Number);
            return { width: w, height: h };
        })
        .option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
        .option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
        .option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
        .option('--format <type>', 'Output format: markdown (default), text, html, json')
        .action(async (url, options) => {
            await runFetch(url, options);
        });

    // ── read subcommand (explicit readable mode) ─────────────────────────────
    program
        .command('read <url>')
        .description('Read a page in clean reader mode (like browser Reader View)')
        .option('--json', 'Output as JSON')
        .option('-s, --silent', 'Silent mode')
        .option('--budget <n>', 'Token budget (default: 4000)', parseBase10)
        .option('--focus <query>', 'Focus on content relevant to this query')
        .action(async (url, opts) => {
            // Honour an explicit --budget; fall back to the documented default
            // only when it is absent or did not parse to a number.
            const budget = Number.isFinite(opts.budget) ? opts.budget : defaultReadBudget;
            await runFetch(url, {
                ...opts,
                readable: true,
                budget,
            });
        });

    // ── pipe subcommand — always JSON, no UI (agent-friendly) ────────────────
    program
        .command('pipe <url>')
        .description('Pipe-friendly fetch (always JSON, no UI). Alias for: webpeel <url> --json --silent')
        .option('-r, --render', 'Use headless browser')
        .option('--stealth', 'Stealth mode')
        .option('--budget <n>', 'Token budget', parseBase10)
        .option('--clean', 'Clean format for AI')
        .option('-q, --question <q>', 'Quick answer')
        .option('--proxy <url>', 'Proxy URL')
        .option('--timeout <ms>', 'Timeout in ms', parseBase10)
        .option('-s, --silent', 'Silent mode (always on for pipe, accepted for compatibility)')
        .action(async (url, opts) => {
            // Force JSON + silent — always, unconditionally
            opts.json = true;
            opts.silent = true;
            await runFetch(url, opts);
        });
}