@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547)
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,1044 @@
1
+ /**
2
+ * Smart escalation strategy: try simple fetch first, escalate to browser if needed.
3
+ *
4
+ * Premium server-side optimisations (SWR cache, domain intelligence, parallel
5
+ * race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
6
+ * are registered the strategy degrades gracefully to a simple escalation path
7
+ * that works great for CLI / npm library usage.
8
+ */
9
+ import { simpleFetch, browserFetch } from './fetcher.js';
10
+ import { getCached, setCached as setBasicCache } from './cache.js';
11
+ import { resolveAndCache } from './dns-cache.js';
12
+ import { BlockedError, NetworkError } from '../types.js';
13
+ import { WebPeelError } from '../errors.js';
14
+ import { withRetry, domainLimiter } from './retry.js';
15
+ import { getWebshareProxyUrl, canUseProxy, recordProxyBytes } from './proxy-config.js';
16
+ import { detectChallenge } from './challenge-detection.js';
17
+ import { browserCircuitBreaker } from './circuit-breaker.js';
18
+ import { markProxyExhausted } from './proxy-config.js';
19
+ import { getStrategyHooks, } from './strategy-hooks.js';
20
+ import { createLogger } from './logger.js';
21
+ const log = createLogger('fetch');
22
+ /* ---------- hardcoded domain rules -------------------------------------- */
23
/**
 * Domains that require a residential proxy to get past datacenter IP blocks.
 * These sites don't just need stealth — they fingerprint the IP itself and
 * block cloud/datacenter ranges.
 *
 * The list is consulted through requiresResidentialProxy(); the caller decides
 * what to do with the answer (e.g. skip the direct attempt and go straight to
 * the residential proxy when one is configured).
 */
const RESIDENTIAL_PROXY_DOMAINS = [
    'zillow.com',
    'yelp.com',
    'pinterest.com',
    'ticketmaster.com',
    'stubhub.com',
    'cargurus.com',
    'realtor.com',
    'redfin.com',
    'apartments.com',
    'trulia.com',
    'homefinder.com',
];
/**
 * Check whether a URL points at a domain that requires a residential proxy.
 *
 * This is a pure hostname check: it matches the listed domains and any of their
 * subdomains. It does NOT look at whether a proxy is configured or explicitly
 * set — that decision belongs to the caller.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {boolean} true when the host is a listed domain or a subdomain of one;
 *   false otherwise, including when `url` cannot be parsed.
 */
function requiresResidentialProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return false;
    }
    // "zillow.com." (fully-qualified, trailing root dot) is the same site as
    // "zillow.com"; strip the dot so it cannot slip past the rule.
    const host = hostname.replace(/\.$/, '');
    return RESIDENTIAL_PROXY_DOMAINS.some((domain) => host === domain || host.endsWith(`.${domain}`));
}
57
/**
 * Decide from hardcoded rules whether a URL must be fetched with a browser.
 *
 * @param {string} url - URL about to be fetched.
 * @returns {{ mode: 'browser' | 'stealth' } | null}
 *   `{ mode: 'browser' }` for hashbang URLs and known JS-rendered sites,
 *   `{ mode: 'stealth' }` for sites known to block automation aggressively,
 *   `null` when no rule applies or the URL cannot be parsed.
 */
export function shouldForceBrowser(url) {
    // Hashbang URLs (#!) are always JS-routed SPAs — browser rendering required
    if (url.includes('#!')) {
        return { mode: 'browser' };
    }
    try {
        // Strip the trailing root dot of a fully-qualified host ("amazon.com.")
        // so it is matched exactly like the plain hostname.
        const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
        // Exact domain or any subdomain of it (never a mere suffix like "notreddit.com")
        const hostMatches = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);
        // Sites that return HTML shells / need JS rendering (browser mode)
        const browserDomains = [
            'reddit.com', // HTML shell via simple fetch
            'npmjs.com', // 403 on simple fetch
            'x.com', // SPA, login wall
            'twitter.com', // SPA, login wall
            'instagram.com', // SPA, login wall
            'facebook.com', // SPA, heavy JS
            'tiktok.com', // SPA, JS-rendered
            'pinterest.com', // SPA, JS-rendered
            'airbnb.com', // heavy SPA
            'medium.com', // JS-rendered, sometimes login wall
            'substack.com', // JS-rendered
            'notion.so', // SPA
            'figma.com', // SPA
            'canva.com', // SPA
            'vercel.app', // Could be any SPA
        ];
        // Browser rules are checked first, so a domain present in both lists
        // resolves to 'browser'.
        if (browserDomains.some(hostMatches)) {
            return { mode: 'browser' };
        }
        // These are known to aggressively block automation — stealth mode required
        const stealthDomains = [
            'glassdoor.com',
            'bloomberg.com',
            'indeed.com',
            'yelp.com', // aggressive bot detection
            'amazon.com', // captcha wall on simple/browser fetch
            'zillow.com', // aggressive bot detection
            'ticketmaster.com', // Distil Networks / PerimeterX
            'stubhub.com', // PerimeterX / CAPTCHA
            'walmart.com', // Akamai Bot Manager
            'target.com', // Akamai Bot Manager
            'bestbuy.com', // Akamai Bot Manager
            'homedepot.com', // Akamai Bot Manager
            'lowes.com', // Akamai Bot Manager
            'costco.com', // Akamai Bot Manager
            'nike.com', // Akamai / Shape Security
            'footlocker.com', // PerimeterX / DataDome
            'realtor.com', // aggressive bot detection
            'redfin.com', // aggressive bot detection
            'cloudflare.com', // Cloudflare challenge pages
            'ebay.com', // challenge page on simple fetch
            'linkedin.com', // aggressive bot detection + login walls
            'craigslist.org', // occasionally blocks automated access
            'etsy.com', // Akamai protection
            'wayfair.com', // Akamai protection
            'newegg.com', // bot detection
            'zappos.com', // Amazon subsidiary, same protection
            'chewy.com', // Amazon subsidiary
            'aliexpress.com', // anti-bot
            'wish.com', // anti-bot
            'cargurus.com', // aggressive bot detection
        ];
        if (stealthDomains.some(hostMatches)) {
            return { mode: 'stealth' };
        }
    }
    catch (e) {
        // Ignore URL parsing errors; validation happens inside fetchers.
        log.debug('stealth domain URL parse failed:', e instanceof Error ? e.message : e);
    }
    return null;
}
132
+ /* ---------- helpers ------------------------------------------------------ */
133
/**
 * Detect strong SPA indicators in fetched HTML that suggest browser rendering is required.
 *
 * Three independent signals are checked, in order:
 *  1. an empty framework mount point (`<div id="root"></div>` and friends),
 *  2. any `<noscript>` block telling the user to enable JavaScript,
 *  3. five or more `<script` tags combined with fewer than 150 characters of visible text.
 *
 * This complements the hardcoded domain list in shouldForceBrowser(): it catches
 * SPAs that are not in that list.
 *
 * @param {string} html - Raw HTML returned by the simple fetch.
 * @returns {boolean} true when at least one signal fires.
 */
function hasSpaIndicators(html) {
    // Empty SPA root mount points — definitive SPA shell indicators
    const emptyRootPatterns = [
        '<div id="root"></div>',
        '<div id="root"> </div>',
        '<div id="app"></div>',
        '<div id="app"> </div>',
        '<div id="__next"></div>',
        '<div id="__next"> </div>',
        '<div id="___gatsby"></div>',
        '<div id="gatsby-focus-wrapper"></div>',
    ];
    if (emptyRootPatterns.some((pattern) => html.includes(pattern))) {
        return true;
    }
    // <noscript> blocks with "enable JavaScript" messages.
    // Every block is inspected: pages commonly put an analytics/tracking
    // <noscript> first and the actual "enable JavaScript" notice in a later one.
    const enableJsPhrases = [
        'enable javascript',
        'javascript is required',
        'javascript must be enabled',
        'requires javascript',
        'javascript to run this app',
        'you need to enable javascript',
    ];
    for (const noscriptMatch of html.matchAll(/<noscript[^>]*>([\s\S]*?)<\/noscript>/gi)) {
        const noscriptContent = noscriptMatch[1].toLowerCase();
        if (enableJsPhrases.some((phrase) => noscriptContent.includes(phrase))) {
            return true;
        }
    }
    // Many script tags + very little visible text = almost certainly an SPA shell.
    // This catches SPAs not matched by the root-div patterns above, including
    // small pages that shouldEscalateForLowContent() ignores (it requires
    // html.length > 1500).
    const scriptTagCount = (html.match(/<script/gi) || []).length;
    if (scriptTagCount >= 5) {
        // Strip scripts/styles/noscript and all remaining tags, then measure visible text
        const stripped = html
            .replace(/<script[\s\S]*?<\/script>/gi, '')
            .replace(/<style[\s\S]*?<\/style>/gi, '')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
            .replace(/<[^>]*>/g, '')
            .replace(/\s+/g, ' ')
            .trim();
        // Many scripts but almost no readable text → render it
        if (stripped.length < 150) {
            return true;
        }
    }
    return false;
}
194
/**
 * Tell whether a thrown value is an abort signal's error.
 *
 * @param {unknown} error - Any caught value.
 * @returns {boolean} true only for Error instances named 'AbortError'.
 */
function isAbortError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    return error.name === 'AbortError';
}
197
/**
 * Decide whether a failed simple fetch is worth retrying through the browser.
 *
 * @param {unknown} error - Error thrown by the simple fetch.
 * @returns {boolean} true for any BlockedError, and for a NetworkError whose
 *   message mentions 'TLS/SSL'; false for everything else.
 */
function shouldEscalateSimpleError(error) {
    const wasBlocked = error instanceof BlockedError;
    if (wasBlocked) {
        return true;
    }
    const isNetworkFailure = error instanceof NetworkError;
    return isNetworkFailure && error.message.includes('TLS/SSL');
}
202
/**
 * Heuristic for an HTML "shell": a sizeable document with barely any text.
 *
 * Only tags are stripped here (script/style contents still count as text).
 *
 * @param {{ contentType?: string, html: string }} result - Fetch result.
 * @returns {boolean} true when the content type mentions "html", the markup is
 *   longer than 1000 characters and the tag-stripped text is shorter than 500.
 */
function looksLikeShellPage(result) {
    const contentType = (result.contentType || '').toLowerCase();
    const isHtml = contentType.includes('html');
    if (!isHtml) {
        return false;
    }
    const tagless = result.html.replace(/<[^>]*>/g, '').trim();
    const hasLittleText = tagless.length < 500;
    const hasBigMarkup = result.html.length > 1000;
    return hasLittleText && hasBigMarkup;
}
209
/**
 * Flag HTML responses whose visible text is too thin to be the real page.
 *
 * Targets JS-rendered pages that ship a large payload (scripts, styles,
 * framework boilerplate) but almost nothing a reader would see.
 *
 * @param {{ contentType?: string, html: string }} result - Fetch result.
 * @returns {boolean} true when the content type mentions "html", the markup is
 *   longer than 1500 characters, and fewer than 200 characters of text remain
 *   after removing script/style/noscript blocks and all tags.
 */
function shouldEscalateForLowContent(result) {
    const contentType = (result.contentType || '').toLowerCase();
    if (!contentType.includes('html') || result.html.length <= 1500) {
        return false;
    }
    // Drop whole non-visible blocks (tag + contents) before stripping bare tags
    const blockPatterns = [
        /<script[\s\S]*?<\/script>/gi,
        /<style[\s\S]*?<\/style>/gi,
        /<noscript[\s\S]*?<\/noscript>/gi,
    ];
    let remaining = result.html;
    for (const blockPattern of blockPatterns) {
        remaining = remaining.replace(blockPattern, '');
    }
    const visibleText = remaining
        .replace(/<[^>]*>/g, '')
        .replace(/\s+/g, ' ')
        .trim();
    return visibleText.length < 200;
}
228
/**
 * Kick off a DNS lookup for the URL's host without waiting for it.
 *
 * Best-effort: a rejected lookup is ignored, and an unparseable URL is only
 * logged at debug level.
 *
 * @param {string} url - URL whose hostname should be resolved.
 * @returns {void}
 */
function prefetchDns(url) {
    try {
        const { hostname } = new URL(url);
        // Fire-and-forget; a rejected lookup must not surface as an unhandled rejection.
        const lookup = resolveAndCache(hostname);
        void lookup.catch(() => undefined);
    }
    catch (e) {
        // Ignore invalid URL.
        const reason = e instanceof Error ? e.message : e;
        log.debug('DNS prefetch URL parse failed:', reason);
    }
}
238
/**
 * Fetch a URL through the headless browser, with built-in fallback retries.
 *
 * Flow:
 *  1. Refuse immediately if the browser circuit breaker does not allow execution.
 *  2. Run browserFetch once with the caller's options (stealth per `effectiveStealth`).
 *  3. On failure (other than an abort, which is rethrown untouched):
 *     - infrastructure-looking errors either mark the proxy as exhausted
 *       (ERR_TUNNEL) or are recorded as a circuit-breaker failure;
 *     - BlockedError while not in stealth → one retry in stealth mode;
 *     - NetworkError mentioning "cloudflare" → one retry with a wait of at least 5s;
 *     - any other NetworkError while not in stealth → one retry in stealth mode,
 *       rethrowing the ORIGINAL error if that retry fails too.
 *
 * Every retry reuses the full option set of the first attempt (including
 * storageState, waitUntil, waitSelector, blockResources, isSPA and languages)
 * and only overrides what the retry is meant to change.
 *
 * @param {string} url - URL to fetch.
 * @param {object} options - Browser options; `effectiveStealth` selects stealth
 *   mode, the remaining recognised keys are forwarded to browserFetch.
 * @returns {Promise<object>} The browserFetch result plus `method`
 *   ('browser' or 'stealth').
 * @throws {Error} When the circuit breaker is open, or when the fetch and any
 *   applicable retry fail.
 */
async function fetchWithBrowserStrategy(url, options) {
    const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, deviceScaleFactor, waitUntil, waitSelector, blockResources, isSPA, languages, } = options;
    // Check circuit breaker before attempting any browser launch
    if (!browserCircuitBreaker.canExecute()) {
        throw new Error('Browser circuit breaker OPEN — Chromium unavailable, using HTTP fallback');
    }
    // Single source of truth for what is sent to browserFetch, so retries cannot
    // silently lose options the first attempt honoured.
    const baseFetchOptions = {
        userAgent,
        waitMs,
        timeoutMs,
        screenshot,
        screenshotFullPage,
        headers,
        cookies,
        stealth: effectiveStealth,
        actions,
        keepPageOpen,
        signal,
        profileDir,
        headed,
        proxy,
        storageState,
        device,
        viewportWidth,
        viewportHeight,
        deviceScaleFactor,
        waitUntil,
        waitSelector,
        blockResources,
        isSPA,
        languages,
    };
    const defaultMethod = effectiveStealth ? 'stealth' : 'browser';
    try {
        const result = await browserFetch(url, baseFetchOptions);
        browserCircuitBreaker.recordSuccess();
        return {
            ...result,
            method: defaultMethod,
        };
    }
    catch (error) {
        if (isAbortError(error))
            throw error;
        // Trip the circuit breaker on infrastructure errors (not page-level errors).
        // Optional chaining keeps a thrown null/undefined from masking the real failure.
        const errMsg = error?.message || '';
        const infraMarkers = [
            'ERR_TUNNEL',
            'ECONNREFUSED',
            'browser has been closed',
            'Target closed',
            'Protocol error',
            'Session closed',
            'Browser.close',
            'crashed',
        ];
        const isInfraError = infraMarkers.some((marker) => errMsg.includes(marker));
        if (isInfraError) {
            // ERR_TUNNEL specifically means the proxy is dead (402 bandwidth, connection refused).
            // Mark the proxy exhausted instead of tripping the circuit breaker — the browser
            // itself is fine, it just needs to run without that proxy.
            if (errMsg.includes('ERR_TUNNEL')) {
                markProxyExhausted('ERR_TUNNEL_CONNECTION_FAILED — proxy bandwidth likely exhausted');
            }
            else {
                browserCircuitBreaker.recordFailure(error);
            }
        }
        // If browser gets blocked, try stealth as fallback (unless already stealth)
        if (!effectiveStealth && error instanceof BlockedError && browserCircuitBreaker.canExecute()) {
            const result = await browserFetch(url, { ...baseFetchOptions, stealth: true });
            return { ...result, method: 'stealth' };
        }
        // If Cloudflare detected, retry with extra wait time
        if (error instanceof NetworkError &&
            error.message.toLowerCase().includes('cloudflare') &&
            browserCircuitBreaker.canExecute()) {
            const result = await browserFetch(url, {
                ...baseFetchOptions,
                // "Extra" wait: never shorter than what the caller already asked for
                waitMs: Math.max(waitMs ?? 0, 5000),
            });
            return { ...result, method: defaultMethod };
        }
        // If network error (HTTP/2 protocol, connection refused, etc.), try stealth as fallback
        if (!effectiveStealth && error instanceof NetworkError && browserCircuitBreaker.canExecute()) {
            try {
                const result = await browserFetch(url, { ...baseFetchOptions, stealth: true });
                return { ...result, method: 'stealth' };
            }
            catch {
                // Stealth also failed — surface the original error, not the retry's
                throw error;
            }
        }
        throw error;
    }
}
388
+ /* ---------- main entry point -------------------------------------------- */
389
+ /**
390
+ * Smart fetch with automatic escalation.
391
+ *
392
+ * Without hooks: simple fetch → browser → stealth escalation.
393
+ * With premium hooks: SWR cache → domain intel → parallel race → escalation.
394
+ */
395
+ export async function smartFetch(url, options = {}) {
396
+ const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, deviceScaleFactor, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, location, proxyContext, } = options;
397
+ const usePeelTLS = tls || cycle;
398
+ // Build effective proxy list: explicit proxies array, or single proxy, or empty.
399
+ // For domains that require residential proxies (Zillow, Yelp, Pinterest, etc.),
400
+ // skip the direct datacenter connection entirely and go straight to Webshare.
401
+ // For all other domains, try direct first (fast), then Webshare as fallback.
402
+ //
403
+ // Tier enforcement: if proxyContext is set and the user is over their limit (or free tier),
404
+ // skip Webshare entirely so they run direct-only.
405
+ const userCanProxy = !proxyContext?.userId || canUseProxy(proxyContext.userId, proxyContext.tier || 'free');
406
+ const effectiveProxies = proxies?.length ? proxies :
407
+ proxy ? [proxy] :
408
+ (() => {
409
+ if (!userCanProxy)
410
+ return [undefined]; // Tier limit reached — direct only
411
+ const wsUrl = getWebshareProxyUrl();
412
+ if (!wsUrl)
413
+ return [undefined];
414
+ // Skip datacenter IP for known residential-proxy-required domains
415
+ if (requiresResidentialProxy(url)) {
416
+ log.debug('Residential proxy domain detected — skipping datacenter IP, using Webshare directly');
417
+ return [wsUrl];
418
+ }
419
+ return [undefined, wsUrl];
420
+ })();
421
+ const firstProxy = effectiveProxies[0];
422
+ const hooks = getStrategyHooks();
423
+ const fetchStartMs = Date.now();
424
+ const recordMethod = (method) => {
425
+ if (method === 'cached' || method === 'cloaked' || method === 'cycle' || method === 'peeltls' || method === 'cf-worker' || method === 'google-cache')
426
+ return;
427
+ hooks.recordDomainResult?.(url, method, Date.now() - fetchStartMs);
428
+ };
429
+ /* ---- determine effective mode ---------------------------------------- */
430
+ // Hardcoded rules always take priority, then hook-based domain intelligence.
431
+ const forced = shouldForceBrowser(url);
432
+ const recommended = hooks.getDomainRecommendation?.(url) ?? null;
433
+ const selected = forced ?? recommended;
434
+ let effectiveForceBrowser = forceBrowser;
435
+ let effectiveStealth = stealth;
436
+ if (selected) {
437
+ effectiveForceBrowser = true;
438
+ if (selected.mode === 'stealth')
439
+ effectiveStealth = true;
440
+ }
441
+ prefetchDns(url);
442
+ /* ---- cache eligibility ----------------------------------------------- */
443
+ const canUseCache = !noCache &&
444
+ !effectiveForceBrowser &&
445
+ !effectiveStealth &&
446
+ !screenshot &&
447
+ !keepPageOpen &&
448
+ !actions?.length &&
449
+ !headers &&
450
+ !cookies &&
451
+ waitMs === 0 &&
452
+ !userAgent &&
453
+ !proxy &&
454
+ !proxies?.length;
455
+ /* ---- CloakBrowser direct path (if explicitly requested) -------------- */
456
+ if (cloaked) {
457
+ try {
458
+ // @ts-ignore — proprietary module, gitignored
459
+ const { cloakFetch, isCloakBrowserAvailable } = await import('./cloak-fetch.js');
460
+ if (!isCloakBrowserAvailable()) {
461
+ throw new Error('CloakBrowser not installed. Run: npm install cloakbrowser playwright-core');
462
+ }
463
+ log.debug('Using CloakBrowser stealth (explicitly requested)');
464
+ const result = await cloakFetch({
465
+ url,
466
+ proxy: effectiveProxies[0],
467
+ userAgent,
468
+ viewportWidth,
469
+ viewportHeight,
470
+ waitMs,
471
+ waitSelector,
472
+ waitUntil,
473
+ timeoutMs,
474
+ screenshot,
475
+ screenshotFullPage,
476
+ actions,
477
+ headers,
478
+ headed,
479
+ });
480
+ if (canUseCache && !result.challengeDetected) {
481
+ hooks.setCache?.(url, result) ?? setBasicCache(url, result);
482
+ }
483
+ recordMethod(result.method);
484
+ return result;
485
+ }
486
+ catch (e) {
487
+ if (isAbortError(e))
488
+ throw e;
489
+ throw e; // Don't fall back — user explicitly requested cloaked mode
490
+ }
491
+ }
492
+ /* ---- PeelTLS direct path (if explicitly requested via --tls or --cycle) */
493
+ if (usePeelTLS) {
494
+ try {
495
+ const { peelTLSFetch, isPeelTLSAvailable } = await import('./peel-tls.js');
496
+ if (!isPeelTLSAvailable()) {
497
+ throw new Error('PeelTLS binary not found. Build it with: cd peeltls && bash build.sh');
498
+ }
499
+ log.debug('Using PeelTLS fingerprint spoofing (explicitly requested)');
500
+ const result = await peelTLSFetch(url, {
501
+ proxy: firstProxy,
502
+ headers,
503
+ timeout: timeoutMs,
504
+ });
505
+ const peelResult = { ...result, method: 'peeltls' };
506
+ if (canUseCache) {
507
+ hooks.setCache?.(url, peelResult) ?? setBasicCache(url, peelResult);
508
+ }
509
+ recordMethod('peeltls');
510
+ return peelResult;
511
+ }
512
+ catch (e) {
513
+ if (isAbortError(e))
514
+ throw e;
515
+ throw e; // Don't fall back — user explicitly requested tls mode
516
+ }
517
+ }
518
+ /* ---- hook-based cache check (premium) -------------------------------- */
519
+ if (canUseCache && hooks.checkCache) {
520
+ const cached = hooks.checkCache(url);
521
+ if (cached) {
522
+ if (cached.stale && hooks.markRevalidating?.(url)) {
523
+ // Background revalidation — fire-and-forget
524
+ void (async () => {
525
+ try {
526
+ const fresh = await simpleFetch(url, userAgent, timeoutMs, undefined, undefined, firstProxy, proxyContext);
527
+ if (!looksLikeShellPage(fresh)) {
528
+ hooks.setCache?.(url, { ...fresh, method: 'simple' });
529
+ }
530
+ }
531
+ catch (e) {
532
+ // Non-fatal: background revalidation failed, stale entry continues serving.
533
+ log.debug('background cache revalidation failed:', e instanceof Error ? e.message : e);
534
+ }
535
+ })();
536
+ }
537
+ return { ...cached.value, method: 'cached' };
538
+ }
539
+ }
540
+ /* ---- basic cache check (non-premium fallback) ------------------------ */
541
+ if (canUseCache && !hooks.checkCache) {
542
+ const basicCached = getCached(url);
543
+ if (basicCached) {
544
+ return { ...basicCached, method: 'cached' };
545
+ }
546
+ }
547
+ /* ---- browser-level options ------------------------------------------- */
548
+ let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
549
+ // A profileDir always forces browser mode (profile sessions need a real browser)
550
+ if (profileDir) {
551
+ effectiveForceBrowser = true;
552
+ }
553
+ // storageState injection requires a browser context
554
+ if (storageState) {
555
+ effectiveForceBrowser = true;
556
+ }
557
+ // Detect SPA for smarter DOM stability wait
558
+ const SPA_FETCH_DOMAINS = new Set([
559
+ 'www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com',
560
+ 'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com',
561
+ 'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev',
562
+ ]);
563
+ const SPA_FETCH_URL_PATTERNS = [
564
+ /google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/,
565
+ ];
566
+ let isSPAUrl = false;
567
+ try {
568
+ const parsedHostname = new URL(url).hostname;
569
+ isSPAUrl = SPA_FETCH_DOMAINS.has(parsedHostname) || SPA_FETCH_URL_PATTERNS.some(p => p.test(url));
570
+ }
571
+ catch { /* invalid URL — ignore */ }
572
+ const browserOptions = {
573
+ userAgent,
574
+ waitMs,
575
+ timeoutMs,
576
+ screenshot,
577
+ screenshotFullPage,
578
+ headers,
579
+ cookies,
580
+ actions,
581
+ keepPageOpen,
582
+ effectiveStealth,
583
+ profileDir,
584
+ headed,
585
+ storageState,
586
+ proxy: firstProxy,
587
+ device,
588
+ viewportWidth,
589
+ viewportHeight,
590
+ deviceScaleFactor,
591
+ waitUntil,
592
+ waitSelector,
593
+ blockResources,
594
+ isSPA: isSPAUrl,
595
+ languages: location?.languages,
596
+ };
597
+ /* ---- Strategy: simple fetch (with optional race) --------------------- */
598
+ if (!shouldUseBrowser) {
599
+ const simpleAbortController = new AbortController();
600
+ const simplePromise = withRetry(async () => {
601
+ // Throttle per-domain to avoid rate limits on target sites
602
+ await domainLimiter.throttle(url);
603
+ const result = await simpleFetch(url, userAgent, timeoutMs, headers, simpleAbortController.signal, firstProxy, proxyContext);
604
+ // Record success/failure for adaptive rate limiting
605
+ domainLimiter.recordResult(url, result.statusCode ?? 200);
606
+ return result;
607
+ }, {
608
+ maxRetries: 2,
609
+ baseDelayMs: 500,
610
+ maxDelayMs: 2000,
611
+ label: `simple-fetch:${url}`,
612
+ // Don't retry on blocked errors — escalate to browser instead
613
+ retryOn: (err) => {
614
+ if (err instanceof BlockedError)
615
+ return false;
616
+ if (err instanceof WebPeelError && !err.retryable)
617
+ return false;
618
+ // Retry transient errors (network, timeout, connection reset)
619
+ const msg = err.message?.toLowerCase() || '';
620
+ return (msg.includes('timeout') ||
621
+ msg.includes('econnreset') ||
622
+ msg.includes('econnrefused') ||
623
+ msg.includes('socket hang up') ||
624
+ msg.includes('getaddrinfo') ||
625
+ msg.includes('network'));
626
+ },
627
+ }).then((result) => {
628
+ if (looksLikeShellPage(result) || hasSpaIndicators(result.html)) {
629
+ throw new BlockedError('Shell page detected. Browser rendering required.');
630
+ }
631
+ return result;
632
+ });
633
+ // Determine race timeout — hooks can override
634
+ const useRace = hooks.shouldRace?.() ?? false;
635
+ const effectiveRaceTimeout = useRace
636
+ ? (hooks.getRaceTimeoutMs?.() ?? raceTimeoutMs)
637
+ : raceTimeoutMs;
638
+ let raceTimer;
639
+ const simpleOrTimeout = await Promise.race([
640
+ simplePromise
641
+ .then((result) => ({ type: 'simple-success', result }))
642
+ .catch((error) => ({ type: 'simple-error', error })),
643
+ new Promise((resolve) => {
644
+ raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(effectiveRaceTimeout, 0));
645
+ }),
646
+ ]);
647
+ if (raceTimer)
648
+ clearTimeout(raceTimer);
649
+ if (simpleOrTimeout.type === 'simple-success') {
650
+ // Skip escalation when noEscalate=true (Q&A workloads that prefer speed over JS rendering)
651
+ if (!noEscalate && (shouldEscalateForLowContent(simpleOrTimeout.result) || hasSpaIndicators(simpleOrTimeout.result.html))) {
652
+ shouldUseBrowser = true;
653
+ }
654
+ else {
655
+ // Check whether the response is a bot-challenge page (e.g. Cloudflare, PerimeterX)
656
+ // Skip challenge detection when noEscalate=true (can't fix it with browser anyway)
657
+ const challengeCheck = noEscalate ? null : detectChallenge(simpleOrTimeout.result.html, simpleOrTimeout.result.statusCode);
658
+ if (challengeCheck && challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
659
+ // Escalate — the browser/stealth path will handle it below
660
+ shouldUseBrowser = true;
661
+ }
662
+ else {
663
+ const strategyResult = {
664
+ ...simpleOrTimeout.result,
665
+ method: 'simple',
666
+ };
667
+ if (canUseCache) {
668
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
669
+ }
670
+ recordMethod('simple');
671
+ return strategyResult;
672
+ }
673
+ }
674
+ }
675
+ if (simpleOrTimeout.type === 'simple-error') {
676
+ // When noEscalate=true, don't try browser on simple fetch error — just throw
677
+ if (noEscalate || !shouldEscalateSimpleError(simpleOrTimeout.error)) {
678
+ throw simpleOrTimeout.error;
679
+ }
680
+ shouldUseBrowser = true;
681
+ }
682
+ else {
683
+ // Race timeout — only start parallel browser if hooks say to race
684
+ if (useRace) {
685
+ // Parallel race: simple still running, start browser too
686
+ const browserAbortController = new AbortController();
687
+ let simpleError;
688
+ let browserError;
689
+ const simpleCandidate = simplePromise
690
+ .then((result) => ({ source: 'simple', result }))
691
+ .catch((error) => {
692
+ simpleError = error;
693
+ throw error;
694
+ });
695
+ const browserCandidate = fetchWithBrowserStrategy(url, {
696
+ ...browserOptions,
697
+ signal: browserAbortController.signal,
698
+ })
699
+ .then((result) => ({ source: 'browser', result }))
700
+ .catch((error) => {
701
+ browserError = error;
702
+ throw error;
703
+ });
704
+ try {
705
+ const winner = await Promise.any([
706
+ simpleCandidate,
707
+ browserCandidate,
708
+ ]);
709
+ if (winner.source === 'simple') {
710
+ browserAbortController.abort();
711
+ const strategyResult = {
712
+ ...winner.result,
713
+ method: 'simple',
714
+ };
715
+ if (canUseCache) {
716
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
717
+ }
718
+ recordMethod('simple');
719
+ return strategyResult;
720
+ }
721
+ simpleAbortController.abort();
722
+ if (canUseCache) {
723
+ hooks.setCache?.(url, winner.result) ?? setBasicCache(url, winner.result);
724
+ }
725
+ recordMethod(winner.result.method);
726
+ return winner.result;
727
+ }
728
+ catch (e) {
729
+ // Race resolution failed — determine which error to propagate
730
+ log.debug('fetch race resolution failed:', e instanceof Error ? e.message : e);
731
+ if (simpleError &&
732
+ !shouldEscalateSimpleError(simpleError) &&
733
+ !isAbortError(simpleError)) {
734
+ throw simpleError;
735
+ }
736
+ if (browserError)
737
+ throw browserError;
738
+ if (simpleError)
739
+ throw simpleError;
740
+ throw new Error('Both simple and browser fetch attempts failed');
741
+ }
742
+ }
743
+ else {
744
+ // No race — just wait for the simple fetch to finish
745
+ const simpleResult = await simplePromise
746
+ .then((result) => ({ type: 'simple-success', result }))
747
+ .catch((error) => ({ type: 'simple-error', error }));
748
+ if (simpleResult.type === 'simple-success') {
749
+ // Check if the content is suspiciously thin, looks like an SPA shell, or is a shell page
750
+ // (looksLikeShellPage catches partial renders with 200-500 visible chars that
751
+ // shouldEscalateForLowContent misses — improves consistency on sites like China Daily)
752
+ if (shouldEscalateForLowContent(simpleResult.result) ||
753
+ hasSpaIndicators(simpleResult.result.html) ||
754
+ looksLikeShellPage(simpleResult.result)) {
755
+ shouldUseBrowser = true;
756
+ }
757
+ else {
758
+ // Check whether the response is a bot-challenge page
759
+ const challengeCheck = detectChallenge(simpleResult.result.html, simpleResult.result.statusCode);
760
+ if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
761
+ shouldUseBrowser = true;
762
+ }
763
+ else {
764
+ const strategyResult = {
765
+ ...simpleResult.result,
766
+ method: 'simple',
767
+ };
768
+ if (canUseCache) {
769
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
770
+ }
771
+ recordMethod('simple');
772
+ return strategyResult;
773
+ }
774
+ }
775
+ }
776
+ else {
777
+ if (!shouldEscalateSimpleError(simpleResult.error)) {
778
+ throw simpleResult.error;
779
+ }
780
+ shouldUseBrowser = true;
781
+ }
782
+ }
783
+ }
784
+ }
785
+ /* ---- simple-with-headers: intermediate step before browser ----------- */
786
+ // Before escalating to the headless browser, retry simple fetch with Googlebot UA
787
+ // and a Google Referer. This catches sites that block generic UAs but return full
788
+ // content to search-engine crawlers without needing JS rendering.
789
+ // Only fires when: we escalated from simple (not forced by domain rules), noEscalate=false.
790
+ if (shouldUseBrowser && !noEscalate && !effectiveForceBrowser && !effectiveStealth && !screenshot) {
791
+ const t0Headers = Date.now();
792
+ log.debug('Escalating: simple → simple-with-headers (Googlebot UA + Google Referer)');
793
+ try {
794
+ const headersResult = await simpleFetch(url, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', timeoutMs, {
795
+ 'Accept-Language': 'en-US,en;q=0.5',
796
+ 'Referer': 'https://www.google.com/',
797
+ }, undefined, firstProxy, proxyContext);
798
+ const headersChallengeCheck = detectChallenge(headersResult.html, headersResult.statusCode);
799
+ const headersOk = !looksLikeShellPage(headersResult) &&
800
+ !hasSpaIndicators(headersResult.html) &&
801
+ !shouldEscalateForLowContent(headersResult) &&
802
+ (!headersChallengeCheck.isChallenge || headersChallengeCheck.confidence < 0.7);
803
+ if (headersOk) {
804
+ log.debug(`simple-with-headers succeeded in ${Date.now() - t0Headers}ms`);
805
+ const strategyResult = { ...headersResult, method: 'simple' };
806
+ if (canUseCache) {
807
+ hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
808
+ }
809
+ recordMethod('simple');
810
+ return strategyResult;
811
+ }
812
+ log.debug(`simple-with-headers produced thin/blocked content in ${Date.now() - t0Headers}ms, continuing to browser`);
813
+ }
814
+ catch (e) {
815
+ if (isAbortError(e))
816
+ throw e;
817
+ log.debug('simple-with-headers failed:', e instanceof Error ? e.message : e);
818
+ }
819
+ }
820
+ /* ---- browser / stealth fallback with challenge-detection cascade ----- */
821
+ // Try each proxy in sequence until one succeeds
822
+ let lastError;
823
+ for (let proxyIdx = 0; proxyIdx < effectiveProxies.length; proxyIdx++) {
824
+ const currentProxy = effectiveProxies[proxyIdx];
825
+ const isLastProxy = proxyIdx === effectiveProxies.length - 1;
826
+ try {
827
+ const currentBrowserOptions = { ...browserOptions, proxy: currentProxy };
828
+ // Attempt 1: browser (or stealth, if already forced)
829
+ let finalResult = await fetchWithBrowserStrategy(url, currentBrowserOptions);
830
+ // browser-with-wait: if browser returned thin content (SPA may not have fully loaded),
831
+ // retry with a 3-second networkidle wait before escalating to stealth mode.
832
+ // This handles dynamic SPAs where the initial browser fetch catches a partial render.
833
+ if (!currentBrowserOptions.effectiveStealth && shouldEscalateForLowContent(finalResult)) {
834
+ const t0Wait = Date.now();
835
+ log.debug('browser returned thin content, escalating to browser-with-wait (3s networkidle)');
836
+ try {
837
+ const browserWaitResult = await fetchWithBrowserStrategy(url, {
838
+ ...currentBrowserOptions,
839
+ waitMs: Math.max(currentBrowserOptions.waitMs, 3000),
840
+ waitUntil: 'networkidle',
841
+ });
842
+ log.debug(`browser-with-wait done in ${Date.now() - t0Wait}ms`);
843
+ // Accept the wait result if it has more content (even if still thin — it's better than nothing)
844
+ if (!shouldEscalateForLowContent(browserWaitResult) ||
845
+ browserWaitResult.html.length > finalResult.html.length) {
846
+ finalResult = browserWaitResult;
847
+ }
848
+ }
849
+ catch (e) {
850
+ log.debug('browser-with-wait failed:', e instanceof Error ? e.message : e);
851
+ }
852
+ }
853
+ // Check if the browser result is itself a bot-challenge page
854
+ const browserChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
855
+ if (browserChallengeCheck.isChallenge && browserChallengeCheck.confidence >= 0.7) {
856
+ if (!currentBrowserOptions.effectiveStealth) {
857
+ // Attempt 2: escalate to stealth
858
+ const stealthOptions = {
859
+ ...currentBrowserOptions,
860
+ effectiveStealth: true,
861
+ };
862
+ finalResult = await fetchWithBrowserStrategy(url, stealthOptions);
863
+ const stealthChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
864
+ if (stealthChallengeCheck.isChallenge && stealthChallengeCheck.confidence >= 0.7) {
865
+ // Attempt 3: stealth + 5s extra wait
866
+ const stealthExtraOptions = {
867
+ ...stealthOptions,
868
+ waitMs: stealthOptions.waitMs + 5000,
869
+ };
870
+ finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
871
+ const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
872
+ if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
873
+ if (!isLastProxy) {
874
+ // More proxies to try — move on to the next one
875
+ lastError = new BlockedError(`Challenge detected with proxy ${currentProxy || 'direct'}`);
876
+ continue;
877
+ }
878
+ // Last proxy: give up and return with warning flag (preserve original behaviour)
879
+ finalResult = { ...finalResult, challengeDetected: true };
880
+ }
881
+ }
882
+ }
883
+ else {
884
+ // Already in stealth mode; retry with 5s extra wait
885
+ const stealthExtraOptions = {
886
+ ...currentBrowserOptions,
887
+ waitMs: currentBrowserOptions.waitMs + 5000,
888
+ };
889
+ finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
890
+ const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
891
+ if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
892
+ if (!isLastProxy) {
893
+ // More proxies to try — move on to the next one
894
+ lastError = new BlockedError(`Challenge detected with proxy ${currentProxy || 'direct'}`);
895
+ continue;
896
+ }
897
+ // Last proxy: give up and return with warning flag (preserve original behaviour)
898
+ finalResult = { ...finalResult, challengeDetected: true };
899
+ }
900
+ }
901
+ }
902
+ // If still challenged after stealth+wait, try PeelTLS (TLS fingerprint spoofing)
903
+ if (finalResult.challengeDetected) {
904
+ try {
905
+ const { peelTLSFetch, isPeelTLSAvailable } = await import('./peel-tls.js');
906
+ if (isPeelTLSAvailable()) {
907
+ log.debug('Escalating to PeelTLS fingerprint spoofing');
908
+ const peelResult = await peelTLSFetch(url, {
909
+ proxy: currentProxy,
910
+ headers,
911
+ timeout: timeoutMs,
912
+ });
913
+ const peelStrategyResult = { ...peelResult, method: 'peeltls' };
914
+ const peelChallengeCheck = detectChallenge(peelResult.html, peelResult.statusCode);
915
+ if (!peelChallengeCheck.isChallenge || peelChallengeCheck.confidence < 0.7) {
916
+ // PeelTLS succeeded
917
+ if (canUseCache) {
918
+ hooks.setCache?.(url, peelStrategyResult) ?? setBasicCache(url, peelStrategyResult);
919
+ }
920
+ recordMethod('peeltls');
921
+ return peelStrategyResult;
922
+ }
923
+ // PeelTLS still challenged — fall through to the CF Worker proxy, then CloakBrowser
924
+ log.debug('PeelTLS still challenged, escalating to CloakBrowser');
925
+ }
926
+ }
927
+ catch (peelError) {
928
+ log.debug('PeelTLS failed:', peelError instanceof Error ? peelError.message : peelError);
929
+ // Fall through to the CF Worker proxy, then CloakBrowser
930
+ }
931
+ }
932
+ // If still challenged after PeelTLS, try Cloudflare Worker proxy (clean edge IPs)
933
+ if (finalResult.challengeDetected) {
934
+ try {
935
+ const { cfWorkerFetch, isCfWorkerAvailable } = await import('./cf-worker-proxy.js');
936
+ if (isCfWorkerAvailable()) {
937
+ log.debug('Escalating to CF Worker proxy');
938
+ const cfResult = await cfWorkerFetch(url, {
939
+ headers,
940
+ timeout: timeoutMs,
941
+ });
942
+ const cfStrategyResult = { ...cfResult, method: 'cf-worker' };
943
+ const cfChallengeCheck = detectChallenge(cfResult.html, cfResult.statusCode);
944
+ if (!cfChallengeCheck.isChallenge || cfChallengeCheck.confidence < 0.7) {
945
+ // CF Worker succeeded
946
+ if (canUseCache) {
947
+ hooks.setCache?.(url, cfStrategyResult) ?? setBasicCache(url, cfStrategyResult);
948
+ }
949
+ recordMethod('cf-worker');
950
+ return cfStrategyResult;
951
+ }
952
+ log.debug('CF Worker still challenged, escalating to CloakBrowser');
953
+ }
954
+ }
955
+ catch (cfError) {
956
+ log.debug('CF Worker proxy failed:', cfError instanceof Error ? cfError.message : cfError);
957
+ }
958
+ }
959
+ // If still challenged after CF Worker, try CloakBrowser
960
+ if (finalResult.challengeDetected) {
961
+ try {
962
+ // @ts-ignore — proprietary module, gitignored
963
+ const { cloakFetch, isCloakBrowserAvailable } = await import('./cloak-fetch.js');
964
+ if (isCloakBrowserAvailable()) {
965
+ log.debug('Escalating to CloakBrowser stealth');
966
+ const cloakResult = await cloakFetch({
967
+ url,
968
+ proxy: currentProxy,
969
+ userAgent,
970
+ viewportWidth,
971
+ viewportHeight,
972
+ waitMs,
973
+ waitSelector,
974
+ waitUntil,
975
+ timeoutMs,
976
+ screenshot,
977
+ screenshotFullPage,
978
+ actions,
979
+ headers,
980
+ headed,
981
+ });
982
+ if (canUseCache && !cloakResult.challengeDetected) {
983
+ hooks.setCache?.(url, cloakResult) ?? setBasicCache(url, cloakResult);
984
+ }
985
+ recordMethod(cloakResult.method);
986
+ return cloakResult;
987
+ }
988
+ }
989
+ catch (cloakError) {
990
+ log.debug('CloakBrowser failed:', cloakError instanceof Error ? cloakError.message : cloakError);
991
+ // Fall through to Google Cache fallback
992
+ }
993
+ }
994
+ // If still challenged after PeelTLS/CF Worker/CloakBrowser, try Google Cache
995
+ if (finalResult.challengeDetected) {
996
+ try {
997
+ const { fetchGoogleCache } = await import('./google-cache.js');
998
+ const cacheResult = await fetchGoogleCache(url, { timeout: timeoutMs });
999
+ if (cacheResult && cacheResult.html.length > 200) {
1000
+ log.debug('Using Google Cache fallback');
1001
+ const cacheStrategyResult = {
1002
+ html: cacheResult.html,
1003
+ url: cacheResult.url,
1004
+ statusCode: cacheResult.statusCode,
1005
+ contentType: 'text/html',
1006
+ method: 'google-cache',
1007
+ };
1008
+ return cacheStrategyResult;
1009
+ }
1010
+ }
1011
+ catch (cacheError) {
1012
+ log.debug('Google Cache failed:', cacheError);
1013
+ }
1014
+ }
1015
+ // Success (or gave up with challengeDetected=true on the last proxy)
1016
+ if (canUseCache && !finalResult.challengeDetected) {
1017
+ hooks.setCache?.(url, finalResult) ?? setBasicCache(url, finalResult);
1018
+ }
1019
+ recordMethod(finalResult.method);
1020
+ // Record estimated proxy bandwidth for browser fetches that used a proxy
1021
+ if (currentProxy && proxyContext?.userId && !finalResult.challengeDetected) {
1022
+ // Estimate bytes: use HTML length as proxy for page size (rough but fast)
1023
+ const estimatedBytes = finalResult.html?.length ?? (2 * 1024 * 1024); // fallback 2MB
1024
+ recordProxyBytes(proxyContext.userId, estimatedBytes);
1025
+ }
1026
+ return finalResult;
1027
+ }
1028
+ catch (e) {
1029
+ lastError = e;
1030
+ if (isAbortError(e))
1031
+ throw e; // Don't retry on abort
1032
+ // Log and try next proxy
1033
+ log.debug(`proxy ${currentProxy || 'direct'} failed:`, e instanceof Error ? e.message : e);
1034
+ // If last proxy, throw below; otherwise continue loop
1035
+ }
1036
+ }
1037
+ // All proxies exhausted — throw the last error
1038
+ throw lastError;
1039
+ }
1040
+ /* ---------- legacy export for tests ------------------------------------- */
1041
+ /**
+ * Legacy alias kept for tests: re-exports `clearStrategyHooks` from './strategy-hooks.js' under the name `clearDomainIntel`.
1042
+ * @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
1043
+ */
1044
+ export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';