@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,1044 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart escalation strategy: try simple fetch first, escalate to browser if needed.
|
|
3
|
+
*
|
|
4
|
+
* Premium server-side optimisations (SWR cache, domain intelligence, parallel
|
|
5
|
+
* race) are injected via the hook system in `strategy-hooks.ts`. When no hooks
|
|
6
|
+
* are registered the strategy degrades gracefully to a simple escalation path
|
|
7
|
+
* that works great for CLI / npm library usage.
|
|
8
|
+
*/
|
|
9
|
+
import { simpleFetch, browserFetch } from './fetcher.js';
|
|
10
|
+
import { getCached, setCached as setBasicCache } from './cache.js';
|
|
11
|
+
import { resolveAndCache } from './dns-cache.js';
|
|
12
|
+
import { BlockedError, NetworkError } from '../types.js';
|
|
13
|
+
import { WebPeelError } from '../errors.js';
|
|
14
|
+
import { withRetry, domainLimiter } from './retry.js';
|
|
15
|
+
import { getWebshareProxyUrl, canUseProxy, recordProxyBytes } from './proxy-config.js';
|
|
16
|
+
import { detectChallenge } from './challenge-detection.js';
|
|
17
|
+
import { browserCircuitBreaker } from './circuit-breaker.js';
|
|
18
|
+
import { markProxyExhausted } from './proxy-config.js';
|
|
19
|
+
import { getStrategyHooks, } from './strategy-hooks.js';
|
|
20
|
+
import { createLogger } from './logger.js';
|
|
21
|
+
// Module-level logger for this file, created under the name 'fetch'.
const log = createLogger('fetch');
|
|
22
|
+
/* ---------- hardcoded domain rules -------------------------------------- */
|
|
23
|
+
/**
|
|
24
|
+
* Domains that require a residential proxy to bypass datacenter IP blocks.
|
|
25
|
+
* These sites don't just need stealth — they fingerprint the IP itself and
|
|
26
|
+
* block all cloud/datacenter ranges. Webshare residential proxy bypasses this.
|
|
27
|
+
*
|
|
28
|
+
* When no explicit proxy is set and Webshare is configured, requests to these
|
|
29
|
+
* domains skip the direct (datacenter) attempt and go straight to residential proxy.
|
|
30
|
+
*/
|
|
31
|
+
const RESIDENTIAL_PROXY_DOMAINS = [
    'zillow.com',
    'yelp.com',
    'pinterest.com',
    'ticketmaster.com',
    'stubhub.com',
    'cargurus.com',
    'realtor.com',
    'redfin.com',
    'apartments.com',
    'trulia.com',
    'homefinder.com',
];
/**
 * Check whether a URL's host is on the residential-proxy domain list.
 *
 * Only the hostname is inspected: it matches when it equals a listed domain
 * or is a subdomain of one. This function does not look at proxy settings or
 * environment variables.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} `true` when the host matches a listed domain; `false`
 *   otherwise, including when `url` cannot be parsed.
 */
function requiresResidentialProxy(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return false;
    }
    // The URL parser keeps the trailing root dot of a fully-qualified host
    // ("zillow.com."). Strip it so that spelling cannot slip past the list.
    if (hostname.endsWith('.')) {
        hostname = hostname.slice(0, -1);
    }
    return RESIDENTIAL_PROXY_DOMAINS.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`));
}
|
|
57
|
+
/**
 * Decide, from the URL alone, whether a fetch should start in a browser mode.
 *
 * Checks run in this order and the first hit wins:
 *   1. a hashbang (`#!`) anywhere in the URL        -> `{ mode: 'browser' }`
 *   2. host is on (or under) the browser-domain list -> `{ mode: 'browser' }`
 *   3. host is on (or under) the stealth-domain list -> `{ mode: 'stealth' }`
 *
 * @param {string|URL} url - Target URL. Non-string values are stringified.
 * @returns {{mode: 'browser'}|{mode: 'stealth'}|null} The forced mode, or
 *   `null` when no rule applies or the URL cannot be parsed.
 */
export function shouldForceBrowser(url) {
    // Stringify first so a URL object (or other non-string) cannot throw on
    // `.includes` before URL parsing gets a chance to reject it gracefully.
    const href = typeof url === 'string' ? url : String(url);
    // Hashbang URLs (#!) are always JS-routed SPAs — browser rendering required
    if (href.includes('#!')) {
        return { mode: 'browser' };
    }
    let hostname;
    try {
        hostname = new URL(href).hostname.toLowerCase();
    }
    catch (e) {
        // Ignore URL parsing errors; validation happens inside fetchers.
        log.debug('stealth domain URL parse failed:', e instanceof Error ? e.message : e);
        return null;
    }
    // The URL parser keeps the trailing root dot of a fully-qualified host
    // ("reddit.com."). Strip it so that spelling still matches the lists.
    if (hostname.endsWith('.')) {
        hostname = hostname.slice(0, -1);
    }
    const matchesHost = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);
    // Sites that return HTML shells / need JS rendering (browser mode)
    const browserDomains = [
        'reddit.com', // HTML shell via simple fetch
        'npmjs.com', // 403 on simple fetch
        'x.com', // SPA, login wall
        'twitter.com', // SPA, login wall
        'instagram.com', // SPA, login wall
        'facebook.com', // SPA, heavy JS
        'tiktok.com', // SPA, JS-rendered
        'pinterest.com', // SPA, JS-rendered
        'airbnb.com', // heavy SPA
        'medium.com', // JS-rendered, sometimes login wall
        'substack.com', // JS-rendered
        'notion.so', // SPA
        'figma.com', // SPA
        'canva.com', // SPA
        'vercel.app', // Could be any SPA
    ];
    if (browserDomains.some(matchesHost)) {
        return { mode: 'browser' };
    }
    // These are known to aggressively block automation — stealth mode required
    const stealthDomains = [
        'glassdoor.com',
        'bloomberg.com',
        'indeed.com',
        'yelp.com', // aggressive bot detection
        'amazon.com', // captcha wall on simple/browser fetch
        'zillow.com', // aggressive bot detection
        'ticketmaster.com', // Distil Networks / PerimeterX
        'stubhub.com', // PerimeterX / CAPTCHA
        'walmart.com', // Akamai Bot Manager
        'target.com', // Akamai Bot Manager
        'bestbuy.com', // Akamai Bot Manager
        'homedepot.com', // Akamai Bot Manager
        'lowes.com', // Akamai Bot Manager
        'costco.com', // Akamai Bot Manager
        'nike.com', // Akamai / Shape Security
        'footlocker.com', // PerimeterX / DataDome
        'realtor.com', // aggressive bot detection
        'redfin.com', // aggressive bot detection
        'cloudflare.com', // Cloudflare challenge pages
        'ebay.com', // challenge page on simple fetch
        'linkedin.com', // aggressive bot detection + login walls
        'craigslist.org', // occasionally blocks automated access
        'etsy.com', // Akamai protection
        'wayfair.com', // Akamai protection
        'newegg.com', // bot detection
        'zappos.com', // Amazon subsidiary, same protection
        'chewy.com', // Amazon subsidiary
        'aliexpress.com', // anti-bot
        'wish.com', // anti-bot
        'cargurus.com', // aggressive bot detection
    ];
    if (stealthDomains.some(matchesHost)) {
        return { mode: 'stealth' };
    }
    return null;
}
|
|
132
|
+
/* ---------- helpers ------------------------------------------------------ */
|
|
133
|
+
/**
 * Detect strong SPA indicators in fetched HTML that suggest browser rendering is required.
 *
 * These patterns indicate a JS-rendered SPA shell page: the server returns a
 * barebones HTML document with an empty root mount point that only gets
 * populated after JavaScript runs in the browser.
 *
 * Three independent signals are checked, in order:
 *   1. an empty, well-known framework mount point (exact-string match);
 *   2. ANY <noscript> block containing an "enable JavaScript"-style message;
 *   3. five or more <script> tags combined with fewer than 150 characters of
 *      visible text once scripts, styles, noscript blocks and tags are removed.
 *
 * Auto-render detection complements the domain-list approach in shouldForceBrowser():
 * it catches unknown SPAs that aren't in the hardcoded list.
 *
 * @param {string} html - Raw HTML document to inspect.
 * @returns {boolean} true when at least one SPA-shell signal is present.
 */
function hasSpaIndicators(html) {
    // Empty SPA root mount points — definitive SPA shell indicators
    const emptyRootPatterns = [
        '<div id="root"></div>',
        '<div id="root"> </div>',
        '<div id="app"></div>',
        '<div id="app"> </div>',
        '<div id="__next"></div>',
        '<div id="__next"> </div>',
        '<div id="___gatsby"></div>',
        '<div id="gatsby-focus-wrapper"></div>',
    ];
    if (emptyRootPatterns.some((pattern) => html.includes(pattern))) {
        return true;
    }
    // <noscript> blocks with "enable JavaScript" messages.
    // Every block is inspected: pages commonly ship an unrelated <noscript>
    // (e.g. a tag-manager iframe) before the one carrying the actual message,
    // so looking only at the first block would miss the signal.
    const noscriptPhrases = [
        'enable javascript',
        'javascript is required',
        'javascript must be enabled',
        'requires javascript',
        'javascript to run this app',
        'you need to enable javascript',
    ];
    for (const [, inner] of html.matchAll(/<noscript[^>]*>([\s\S]*?)<\/noscript>/gi)) {
        const noscriptContent = inner.toLowerCase();
        if (noscriptPhrases.some((phrase) => noscriptContent.includes(phrase))) {
            return true;
        }
    }
    // Many script tags + very little visible text = almost certainly an SPA shell.
    // This catches SPAs not matched by the root-div patterns above.
    // Note: shouldEscalateForLowContent() guards html.length > 1500; this fills the gap
    // for smaller pages (e.g. minimal webpack bundles with few/no meta tags).
    const scriptTagCount = (html.match(/<script/gi) || []).length;
    if (scriptTagCount >= 5) {
        // Strip scripts/styles then measure visible text
        const stripped = html
            .replace(/<script[\s\S]*?<\/script>/gi, '')
            .replace(/<style[\s\S]*?<\/style>/gi, '')
            .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
            .replace(/<[^>]*>/g, '')
            .replace(/\s+/g, ' ')
            .trim();
        // Many scripts but almost no readable text → render it
        if (stripped.length < 150) {
            return true;
        }
    }
    return false;
}
|
|
194
|
+
/**
 * Tell whether a thrown value is an abort signal.
 *
 * @param {unknown} error - Any caught value.
 * @returns {boolean} true only for Error instances whose `name` is 'AbortError'.
 */
function isAbortError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    return error.name === 'AbortError';
}
|
|
197
|
+
/**
 * Decide whether a failed simple fetch is worth retrying through the browser.
 *
 * @param {unknown} error - Error thrown by the simple fetch path.
 * @returns {boolean} true for any BlockedError, or for a NetworkError whose
 *   message contains 'TLS/SSL'; false for everything else.
 */
function shouldEscalateSimpleError(error) {
    if (error instanceof BlockedError) {
        return true;
    }
    if (!(error instanceof NetworkError)) {
        return false;
    }
    return error.message.includes('TLS/SSL');
}
|
|
202
|
+
/**
 * Heuristic for "big HTML payload, hardly any text".
 *
 * Only tags are removed before measuring; the contents of inline <script> and
 * <style> elements therefore still count as text here.
 *
 * @param {{ contentType?: string, html: string }} result - Fetch result to inspect.
 * @returns {boolean} true when the content type mentions "html", the raw HTML
 *   is longer than 1000 characters and the tag-stripped, trimmed text is
 *   shorter than 500 characters.
 */
function looksLikeShellPage(result) {
    const contentType = (result.contentType || '').toLowerCase();
    if (!contentType.includes('html')) {
        return false;
    }
    const tagless = result.html.replace(/<[^>]*>/g, '').trim();
    const hasLargePayload = result.html.length > 1000;
    const hasLittleText = tagless.length < 500;
    return hasLittleText && hasLargePayload;
}
|
|
209
|
+
/**
 * Flag HTML responses that carry a sizeable payload but almost no visible text.
 *
 * Typical for JS-rendered SPAs: the document is mostly scripts, styles and
 * framework boilerplate. Script, style and noscript elements are removed
 * together with their contents, then the remaining tags are stripped and
 * whitespace is collapsed before the text is measured.
 *
 * @param {{ contentType?: string, html: string }} result - Fetch result to inspect.
 * @returns {boolean} true when the content type mentions "html", the raw HTML
 *   is longer than 1500 characters and fewer than 200 characters of visible
 *   text remain.
 */
function shouldEscalateForLowContent(result) {
    const contentType = (result.contentType || '').toLowerCase();
    const isHtml = contentType.includes('html');
    if (!isHtml || result.html.length <= 1500) {
        return false;
    }
    // Elements whose contents never count as visible text
    const hiddenBlockPatterns = [
        /<script[\s\S]*?<\/script>/gi,
        /<style[\s\S]*?<\/style>/gi,
        /<noscript[\s\S]*?<\/noscript>/gi,
    ];
    let markup = result.html;
    for (const blockPattern of hiddenBlockPatterns) {
        markup = markup.replace(blockPattern, '');
    }
    const tagless = markup.replace(/<[^>]*>/g, '');
    const visibleText = tagless.replace(/\s+/g, ' ').trim();
    return visibleText.length < 200;
}
|
|
228
|
+
/**
 * Kick off a best-effort DNS lookup for the URL's host without waiting for it.
 *
 * The lookup promise is deliberately left floating and its rejection is
 * discarded. Anything thrown synchronously (e.g. an unparsable URL) is only
 * logged at debug level and never propagates.
 *
 * @param {string} url - Absolute URL whose hostname should be resolved.
 * @returns {void}
 */
function prefetchDns(url) {
    try {
        const { hostname } = new URL(url);
        // Fire-and-forget: lookup failures must never affect the fetch itself.
        void resolveAndCache(hostname).catch(() => { });
    }
    catch (err) {
        // Ignore invalid URL.
        const reason = err instanceof Error ? err.message : err;
        log.debug('DNS prefetch URL parse failed:', reason);
    }
}
|
|
238
|
+
/**
 * Fetch `url` through the headless browser, with built-in stealth and
 * extra-wait fallbacks.
 *
 * Flow:
 *   1. Throw immediately when the browser circuit breaker does not allow execution.
 *   2. Make one browserFetch() attempt with the caller's options and record
 *      a circuit-breaker success when it resolves.
 *   3. When that attempt throws (abort errors are rethrown untouched):
 *      - messages that look like infrastructure failures are recorded on the
 *        circuit breaker, except ERR_TUNNEL, which marks the proxy as
 *        exhausted instead;
 *      - BlockedError while not in stealth mode → one retry with stealth on;
 *      - NetworkError mentioning "cloudflare" → one retry waiting at least 5s;
 *      - any other NetworkError while not in stealth mode → one retry with
 *        stealth on; if that retry fails the ORIGINAL error is rethrown
 *        (an abort of the retry is propagated as-is).
 *      Retries are skipped when the circuit breaker no longer allows execution.
 *
 * Every attempt is built from the same option set, so retries honour the
 * caller's storageState, waitUntil, waitSelector, blockResources, isSPA and
 * languages exactly like the first attempt does.
 *
 * @param {string} url - Page to load.
 * @param {object} options - Browser options; `effectiveStealth` selects
 *   stealth mode for the first attempt and `signal` allows cancellation.
 * @returns {Promise<object>} The browserFetch result plus a `method` field
 *   ('browser' or 'stealth') naming the mode that produced it.
 * @throws {Error} When the circuit breaker is open, on abort, or when the
 *   first attempt fails and no fallback applies or succeeds.
 */
async function fetchWithBrowserStrategy(url, options) {
    const { userAgent, waitMs, timeoutMs, screenshot, screenshotFullPage, headers, cookies, actions, keepPageOpen, effectiveStealth, signal, profileDir, headed, storageState, proxy, device, viewportWidth, viewportHeight, deviceScaleFactor, waitUntil, waitSelector, blockResources, isSPA, languages, } = options;
    // Check circuit breaker before attempting any browser launch
    if (!browserCircuitBreaker.canExecute()) {
        throw new Error('Browser circuit breaker OPEN — Chromium unavailable, using HTTP fallback');
    }
    // Single source of truth for every attempt; retries only override what
    // they need so no caller option is silently lost on a fallback.
    const baseFetchOptions = {
        userAgent,
        waitMs,
        timeoutMs,
        screenshot,
        screenshotFullPage,
        headers,
        cookies,
        stealth: effectiveStealth,
        actions,
        keepPageOpen,
        signal,
        profileDir,
        headed,
        proxy,
        storageState,
        device,
        viewportWidth,
        viewportHeight,
        deviceScaleFactor,
        waitUntil,
        waitSelector,
        blockResources,
        isSPA,
        languages,
    };
    const requestedMethod = effectiveStealth ? 'stealth' : 'browser';
    try {
        const result = await browserFetch(url, { ...baseFetchOptions });
        browserCircuitBreaker.recordSuccess();
        return {
            ...result,
            method: requestedMethod,
        };
    }
    catch (error) {
        if (isAbortError(error))
            throw error;
        // Trip the circuit breaker on infrastructure errors (not page-level errors).
        // Guard against non-Error throwables that carry no usable message.
        const errMsg = typeof error?.message === 'string' ? error.message : '';
        const infraMarkers = [
            'ECONNREFUSED',
            'browser has been closed',
            'Target closed',
            'Protocol error',
            'Session closed',
            'Browser.close',
            'crashed',
        ];
        if (errMsg.includes('ERR_TUNNEL')) {
            // ERR_TUNNEL specifically means proxy is dead (402 bandwidth, connection refused)
            // Disable proxy for 5 minutes so subsequent requests go direct instead of failing.
            // Don't trip the circuit breaker for proxy-only failures — the browser itself is fine,
            // it just needs to run without a proxy.
            markProxyExhausted('ERR_TUNNEL_CONNECTION_FAILED — proxy bandwidth likely exhausted');
        }
        else if (infraMarkers.some((marker) => errMsg.includes(marker))) {
            browserCircuitBreaker.recordFailure(error);
        }
        // If browser gets blocked, try stealth as fallback (unless already stealth)
        if (!effectiveStealth && error instanceof BlockedError && browserCircuitBreaker.canExecute()) {
            const result = await browserFetch(url, { ...baseFetchOptions, stealth: true });
            return { ...result, method: 'stealth' };
        }
        // If Cloudflare detected, retry with extra wait time.
        // Never wait LESS than the caller already asked for.
        if (error instanceof NetworkError &&
            error.message.toLowerCase().includes('cloudflare') &&
            browserCircuitBreaker.canExecute()) {
            const result = await browserFetch(url, {
                ...baseFetchOptions,
                waitMs: Math.max(waitMs ?? 0, 5000),
            });
            return { ...result, method: requestedMethod };
        }
        // If network error (HTTP/2 protocol, connection refused, etc.), try stealth as fallback
        if (!effectiveStealth && error instanceof NetworkError && browserCircuitBreaker.canExecute()) {
            try {
                const result = await browserFetch(url, { ...baseFetchOptions, stealth: true });
                return { ...result, method: 'stealth' };
            }
            catch (stealthError) {
                // A cancellation must surface as a cancellation, same as in the
                // BlockedError retry above.
                if (isAbortError(stealthError))
                    throw stealthError;
                // Stealth also failed — throw original error with helpful message
                log.debug('stealth fallback failed:', stealthError instanceof Error ? stealthError.message : stealthError);
                throw error;
            }
        }
        throw error;
    }
}
|
|
388
|
+
/* ---------- main entry point -------------------------------------------- */
|
|
389
|
+
/**
|
|
390
|
+
* Smart fetch with automatic escalation.
|
|
391
|
+
*
|
|
392
|
+
* Without hooks: simple fetch → browser → stealth escalation.
|
|
393
|
+
* With premium hooks: SWR cache → domain intel → parallel race → escalation.
|
|
394
|
+
*/
|
|
395
|
+
export async function smartFetch(url, options = {}) {
|
|
396
|
+
const { forceBrowser = false, stealth = false, waitMs = 0, userAgent, timeoutMs = 30000, screenshot = false, screenshotFullPage = false, headers, cookies, actions, keepPageOpen = false, noCache = false, raceTimeoutMs = 2000, profileDir, headed = false, storageState, proxy, proxies, device, viewportWidth, viewportHeight, deviceScaleFactor, waitUntil, waitSelector, blockResources, cloaked = false, cycle = false, tls = false, noEscalate = false, location, proxyContext, } = options;
|
|
397
|
+
const usePeelTLS = tls || cycle;
|
|
398
|
+
// Build effective proxy list: explicit proxies array, or single proxy, or empty.
|
|
399
|
+
// For domains that require residential proxies (Zillow, Yelp, Pinterest, etc.),
|
|
400
|
+
// skip the direct datacenter connection entirely and go straight to Webshare.
|
|
401
|
+
// For all other domains, try direct first (fast), then Webshare as fallback.
|
|
402
|
+
//
|
|
403
|
+
// Tier enforcement: if proxyContext is set and the user is over their limit (or free tier),
|
|
404
|
+
// skip Webshare entirely so they run direct-only.
|
|
405
|
+
const userCanProxy = !proxyContext?.userId || canUseProxy(proxyContext.userId, proxyContext.tier || 'free');
|
|
406
|
+
const effectiveProxies = proxies?.length ? proxies :
|
|
407
|
+
proxy ? [proxy] :
|
|
408
|
+
(() => {
|
|
409
|
+
if (!userCanProxy)
|
|
410
|
+
return [undefined]; // Tier limit reached — direct only
|
|
411
|
+
const wsUrl = getWebshareProxyUrl();
|
|
412
|
+
if (!wsUrl)
|
|
413
|
+
return [undefined];
|
|
414
|
+
// Skip datacenter IP for known residential-proxy-required domains
|
|
415
|
+
if (requiresResidentialProxy(url)) {
|
|
416
|
+
log.debug('Residential proxy domain detected — skipping datacenter IP, using Webshare directly');
|
|
417
|
+
return [wsUrl];
|
|
418
|
+
}
|
|
419
|
+
return [undefined, wsUrl];
|
|
420
|
+
})();
|
|
421
|
+
const firstProxy = effectiveProxies[0];
|
|
422
|
+
const hooks = getStrategyHooks();
|
|
423
|
+
const fetchStartMs = Date.now();
|
|
424
|
+
const recordMethod = (method) => {
|
|
425
|
+
if (method === 'cached' || method === 'cloaked' || method === 'cycle' || method === 'peeltls' || method === 'cf-worker' || method === 'google-cache')
|
|
426
|
+
return;
|
|
427
|
+
hooks.recordDomainResult?.(url, method, Date.now() - fetchStartMs);
|
|
428
|
+
};
|
|
429
|
+
/* ---- determine effective mode ---------------------------------------- */
|
|
430
|
+
// Hardcoded rules always take priority, then hook-based domain intelligence.
|
|
431
|
+
const forced = shouldForceBrowser(url);
|
|
432
|
+
const recommended = hooks.getDomainRecommendation?.(url) ?? null;
|
|
433
|
+
const selected = forced ?? recommended;
|
|
434
|
+
let effectiveForceBrowser = forceBrowser;
|
|
435
|
+
let effectiveStealth = stealth;
|
|
436
|
+
if (selected) {
|
|
437
|
+
effectiveForceBrowser = true;
|
|
438
|
+
if (selected.mode === 'stealth')
|
|
439
|
+
effectiveStealth = true;
|
|
440
|
+
}
|
|
441
|
+
prefetchDns(url);
|
|
442
|
+
/* ---- cache eligibility ----------------------------------------------- */
|
|
443
|
+
const canUseCache = !noCache &&
|
|
444
|
+
!effectiveForceBrowser &&
|
|
445
|
+
!effectiveStealth &&
|
|
446
|
+
!screenshot &&
|
|
447
|
+
!keepPageOpen &&
|
|
448
|
+
!actions?.length &&
|
|
449
|
+
!headers &&
|
|
450
|
+
!cookies &&
|
|
451
|
+
waitMs === 0 &&
|
|
452
|
+
!userAgent &&
|
|
453
|
+
!proxy &&
|
|
454
|
+
!proxies?.length;
|
|
455
|
+
/* ---- CloakBrowser direct path (if explicitly requested) -------------- */
|
|
456
|
+
if (cloaked) {
|
|
457
|
+
try {
|
|
458
|
+
// @ts-ignore — proprietary module, gitignored
|
|
459
|
+
const { cloakFetch, isCloakBrowserAvailable } = await import('./cloak-fetch.js');
|
|
460
|
+
if (!isCloakBrowserAvailable()) {
|
|
461
|
+
throw new Error('CloakBrowser not installed. Run: npm install cloakbrowser playwright-core');
|
|
462
|
+
}
|
|
463
|
+
log.debug('Using CloakBrowser stealth (explicitly requested)');
|
|
464
|
+
const result = await cloakFetch({
|
|
465
|
+
url,
|
|
466
|
+
proxy: effectiveProxies[0],
|
|
467
|
+
userAgent,
|
|
468
|
+
viewportWidth,
|
|
469
|
+
viewportHeight,
|
|
470
|
+
waitMs,
|
|
471
|
+
waitSelector,
|
|
472
|
+
waitUntil,
|
|
473
|
+
timeoutMs,
|
|
474
|
+
screenshot,
|
|
475
|
+
screenshotFullPage,
|
|
476
|
+
actions,
|
|
477
|
+
headers,
|
|
478
|
+
headed,
|
|
479
|
+
});
|
|
480
|
+
if (canUseCache && !result.challengeDetected) {
|
|
481
|
+
hooks.setCache?.(url, result) ?? setBasicCache(url, result);
|
|
482
|
+
}
|
|
483
|
+
recordMethod(result.method);
|
|
484
|
+
return result;
|
|
485
|
+
}
|
|
486
|
+
catch (e) {
|
|
487
|
+
if (isAbortError(e))
|
|
488
|
+
throw e;
|
|
489
|
+
throw e; // Don't fall back — user explicitly requested cloaked mode
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
/* ---- PeelTLS direct path (if explicitly requested via --tls or --cycle) */
|
|
493
|
+
if (usePeelTLS) {
|
|
494
|
+
try {
|
|
495
|
+
const { peelTLSFetch, isPeelTLSAvailable } = await import('./peel-tls.js');
|
|
496
|
+
if (!isPeelTLSAvailable()) {
|
|
497
|
+
throw new Error('PeelTLS binary not found. Build it with: cd peeltls && bash build.sh');
|
|
498
|
+
}
|
|
499
|
+
log.debug('Using PeelTLS fingerprint spoofing (explicitly requested)');
|
|
500
|
+
const result = await peelTLSFetch(url, {
|
|
501
|
+
proxy: firstProxy,
|
|
502
|
+
headers,
|
|
503
|
+
timeout: timeoutMs,
|
|
504
|
+
});
|
|
505
|
+
const peelResult = { ...result, method: 'peeltls' };
|
|
506
|
+
if (canUseCache) {
|
|
507
|
+
hooks.setCache?.(url, peelResult) ?? setBasicCache(url, peelResult);
|
|
508
|
+
}
|
|
509
|
+
recordMethod('peeltls');
|
|
510
|
+
return peelResult;
|
|
511
|
+
}
|
|
512
|
+
catch (e) {
|
|
513
|
+
if (isAbortError(e))
|
|
514
|
+
throw e;
|
|
515
|
+
throw e; // Don't fall back — user explicitly requested tls mode
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
/* ---- hook-based cache check (premium) -------------------------------- */
|
|
519
|
+
if (canUseCache && hooks.checkCache) {
|
|
520
|
+
const cached = hooks.checkCache(url);
|
|
521
|
+
if (cached) {
|
|
522
|
+
if (cached.stale && hooks.markRevalidating?.(url)) {
|
|
523
|
+
// Background revalidation — fire-and-forget
|
|
524
|
+
void (async () => {
|
|
525
|
+
try {
|
|
526
|
+
const fresh = await simpleFetch(url, userAgent, timeoutMs, undefined, undefined, firstProxy, proxyContext);
|
|
527
|
+
if (!looksLikeShellPage(fresh)) {
|
|
528
|
+
hooks.setCache?.(url, { ...fresh, method: 'simple' });
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
catch (e) {
|
|
532
|
+
// Non-fatal: background revalidation failed, stale entry continues serving.
|
|
533
|
+
log.debug('background cache revalidation failed:', e instanceof Error ? e.message : e);
|
|
534
|
+
}
|
|
535
|
+
})();
|
|
536
|
+
}
|
|
537
|
+
return { ...cached.value, method: 'cached' };
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
/* ---- basic cache check (non-premium fallback) ------------------------ */
|
|
541
|
+
if (canUseCache && !hooks.checkCache) {
|
|
542
|
+
const basicCached = getCached(url);
|
|
543
|
+
if (basicCached) {
|
|
544
|
+
return { ...basicCached, method: 'cached' };
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
/* ---- browser-level options ------------------------------------------- */
|
|
548
|
+
let shouldUseBrowser = effectiveForceBrowser || screenshot || effectiveStealth;
|
|
549
|
+
// A profileDir always forces browser mode (profile sessions need a real browser)
|
|
550
|
+
if (profileDir) {
|
|
551
|
+
effectiveForceBrowser = true;
|
|
552
|
+
}
|
|
553
|
+
// storageState injection requires a browser context
|
|
554
|
+
if (storageState) {
|
|
555
|
+
effectiveForceBrowser = true;
|
|
556
|
+
}
|
|
557
|
+
// Detect SPA for smarter DOM stability wait
|
|
558
|
+
const SPA_FETCH_DOMAINS = new Set([
|
|
559
|
+
'www.google.com', 'flights.google.com', 'www.airbnb.com', 'www.booking.com',
|
|
560
|
+
'www.expedia.com', 'www.kayak.com', 'www.skyscanner.com', 'www.tripadvisor.com',
|
|
561
|
+
'www.indeed.com', 'www.glassdoor.com', 'www.zillow.com', 'app.webpeel.dev',
|
|
562
|
+
]);
|
|
563
|
+
const SPA_FETCH_URL_PATTERNS = [
|
|
564
|
+
/google\.com\/travel/, /google\.com\/maps/, /google\.com\/shopping/,
|
|
565
|
+
];
|
|
566
|
+
let isSPAUrl = false;
|
|
567
|
+
try {
|
|
568
|
+
const parsedHostname = new URL(url).hostname;
|
|
569
|
+
isSPAUrl = SPA_FETCH_DOMAINS.has(parsedHostname) || SPA_FETCH_URL_PATTERNS.some(p => p.test(url));
|
|
570
|
+
}
|
|
571
|
+
catch { /* invalid URL — ignore */ }
|
|
572
|
+
const browserOptions = {
|
|
573
|
+
userAgent,
|
|
574
|
+
waitMs,
|
|
575
|
+
timeoutMs,
|
|
576
|
+
screenshot,
|
|
577
|
+
screenshotFullPage,
|
|
578
|
+
headers,
|
|
579
|
+
cookies,
|
|
580
|
+
actions,
|
|
581
|
+
keepPageOpen,
|
|
582
|
+
effectiveStealth,
|
|
583
|
+
profileDir,
|
|
584
|
+
headed,
|
|
585
|
+
storageState,
|
|
586
|
+
proxy: firstProxy,
|
|
587
|
+
device,
|
|
588
|
+
viewportWidth,
|
|
589
|
+
viewportHeight,
|
|
590
|
+
deviceScaleFactor,
|
|
591
|
+
waitUntil,
|
|
592
|
+
waitSelector,
|
|
593
|
+
blockResources,
|
|
594
|
+
isSPA: isSPAUrl,
|
|
595
|
+
languages: location?.languages,
|
|
596
|
+
};
|
|
597
|
+
/* ---- Strategy: simple fetch (with optional race) --------------------- */
|
|
598
|
+
if (!shouldUseBrowser) {
|
|
599
|
+
const simpleAbortController = new AbortController();
|
|
600
|
+
const simplePromise = withRetry(async () => {
|
|
601
|
+
// Throttle per-domain to avoid rate limits on target sites
|
|
602
|
+
await domainLimiter.throttle(url);
|
|
603
|
+
const result = await simpleFetch(url, userAgent, timeoutMs, headers, simpleAbortController.signal, firstProxy, proxyContext);
|
|
604
|
+
// Record success/failure for adaptive rate limiting
|
|
605
|
+
domainLimiter.recordResult(url, result.statusCode ?? 200);
|
|
606
|
+
return result;
|
|
607
|
+
}, {
|
|
608
|
+
maxRetries: 2,
|
|
609
|
+
baseDelayMs: 500,
|
|
610
|
+
maxDelayMs: 2000,
|
|
611
|
+
label: `simple-fetch:${url}`,
|
|
612
|
+
// Don't retry on blocked errors — escalate to browser instead
|
|
613
|
+
retryOn: (err) => {
|
|
614
|
+
if (err instanceof BlockedError)
|
|
615
|
+
return false;
|
|
616
|
+
if (err instanceof WebPeelError && !err.retryable)
|
|
617
|
+
return false;
|
|
618
|
+
// Retry transient errors (network, timeout, connection reset)
|
|
619
|
+
const msg = err.message?.toLowerCase() || '';
|
|
620
|
+
return (msg.includes('timeout') ||
|
|
621
|
+
msg.includes('econnreset') ||
|
|
622
|
+
msg.includes('econnrefused') ||
|
|
623
|
+
msg.includes('socket hang up') ||
|
|
624
|
+
msg.includes('getaddrinfo') ||
|
|
625
|
+
msg.includes('network'));
|
|
626
|
+
},
|
|
627
|
+
}).then((result) => {
|
|
628
|
+
if (looksLikeShellPage(result) || hasSpaIndicators(result.html)) {
|
|
629
|
+
throw new BlockedError('Shell page detected. Browser rendering required.');
|
|
630
|
+
}
|
|
631
|
+
return result;
|
|
632
|
+
});
|
|
633
|
+
// Determine race timeout — hooks can override
|
|
634
|
+
const useRace = hooks.shouldRace?.() ?? false;
|
|
635
|
+
const effectiveRaceTimeout = useRace
|
|
636
|
+
? (hooks.getRaceTimeoutMs?.() ?? raceTimeoutMs)
|
|
637
|
+
: raceTimeoutMs;
|
|
638
|
+
let raceTimer;
|
|
639
|
+
const simpleOrTimeout = await Promise.race([
|
|
640
|
+
simplePromise
|
|
641
|
+
.then((result) => ({ type: 'simple-success', result }))
|
|
642
|
+
.catch((error) => ({ type: 'simple-error', error })),
|
|
643
|
+
new Promise((resolve) => {
|
|
644
|
+
raceTimer = setTimeout(() => resolve({ type: 'race-timeout' }), Math.max(effectiveRaceTimeout, 0));
|
|
645
|
+
}),
|
|
646
|
+
]);
|
|
647
|
+
if (raceTimer)
|
|
648
|
+
clearTimeout(raceTimer);
|
|
649
|
+
if (simpleOrTimeout.type === 'simple-success') {
|
|
650
|
+
// Skip escalation when noEscalate=true (Q&A workloads that prefer speed over JS rendering)
|
|
651
|
+
if (!noEscalate && (shouldEscalateForLowContent(simpleOrTimeout.result) || hasSpaIndicators(simpleOrTimeout.result.html))) {
|
|
652
|
+
shouldUseBrowser = true;
|
|
653
|
+
}
|
|
654
|
+
else {
|
|
655
|
+
// Check whether the response is a bot-challenge page (e.g. Cloudflare, PerimeterX)
|
|
656
|
+
// Skip challenge detection when noEscalate=true (can't fix it with browser anyway)
|
|
657
|
+
const challengeCheck = noEscalate ? null : detectChallenge(simpleOrTimeout.result.html, simpleOrTimeout.result.statusCode);
|
|
658
|
+
if (challengeCheck && challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
|
|
659
|
+
// Escalate — the browser/stealth path will handle it below
|
|
660
|
+
shouldUseBrowser = true;
|
|
661
|
+
}
|
|
662
|
+
else {
|
|
663
|
+
const strategyResult = {
|
|
664
|
+
...simpleOrTimeout.result,
|
|
665
|
+
method: 'simple',
|
|
666
|
+
};
|
|
667
|
+
if (canUseCache) {
|
|
668
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
669
|
+
}
|
|
670
|
+
recordMethod('simple');
|
|
671
|
+
return strategyResult;
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
if (simpleOrTimeout.type === 'simple-error') {
|
|
676
|
+
// When noEscalate=true, don't try browser on simple fetch error — just throw
|
|
677
|
+
if (noEscalate || !shouldEscalateSimpleError(simpleOrTimeout.error)) {
|
|
678
|
+
throw simpleOrTimeout.error;
|
|
679
|
+
}
|
|
680
|
+
shouldUseBrowser = true;
|
|
681
|
+
}
|
|
682
|
+
else {
|
|
683
|
+
// Race timeout — only start parallel browser if hooks say to race
|
|
684
|
+
if (useRace) {
|
|
685
|
+
// Parallel race: simple still running, start browser too
|
|
686
|
+
const browserAbortController = new AbortController();
|
|
687
|
+
let simpleError;
|
|
688
|
+
let browserError;
|
|
689
|
+
const simpleCandidate = simplePromise
|
|
690
|
+
.then((result) => ({ source: 'simple', result }))
|
|
691
|
+
.catch((error) => {
|
|
692
|
+
simpleError = error;
|
|
693
|
+
throw error;
|
|
694
|
+
});
|
|
695
|
+
const browserCandidate = fetchWithBrowserStrategy(url, {
|
|
696
|
+
...browserOptions,
|
|
697
|
+
signal: browserAbortController.signal,
|
|
698
|
+
})
|
|
699
|
+
.then((result) => ({ source: 'browser', result }))
|
|
700
|
+
.catch((error) => {
|
|
701
|
+
browserError = error;
|
|
702
|
+
throw error;
|
|
703
|
+
});
|
|
704
|
+
try {
|
|
705
|
+
const winner = await Promise.any([
|
|
706
|
+
simpleCandidate,
|
|
707
|
+
browserCandidate,
|
|
708
|
+
]);
|
|
709
|
+
if (winner.source === 'simple') {
|
|
710
|
+
browserAbortController.abort();
|
|
711
|
+
const strategyResult = {
|
|
712
|
+
...winner.result,
|
|
713
|
+
method: 'simple',
|
|
714
|
+
};
|
|
715
|
+
if (canUseCache) {
|
|
716
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
717
|
+
}
|
|
718
|
+
recordMethod('simple');
|
|
719
|
+
return strategyResult;
|
|
720
|
+
}
|
|
721
|
+
simpleAbortController.abort();
|
|
722
|
+
if (canUseCache) {
|
|
723
|
+
hooks.setCache?.(url, winner.result) ?? setBasicCache(url, winner.result);
|
|
724
|
+
}
|
|
725
|
+
recordMethod(winner.result.method);
|
|
726
|
+
return winner.result;
|
|
727
|
+
}
|
|
728
|
+
catch (e) {
|
|
729
|
+
// Race resolution failed — determine which error to propagate
|
|
730
|
+
log.debug('fetch race resolution failed:', e instanceof Error ? e.message : e);
|
|
731
|
+
if (simpleError &&
|
|
732
|
+
!shouldEscalateSimpleError(simpleError) &&
|
|
733
|
+
!isAbortError(simpleError)) {
|
|
734
|
+
throw simpleError;
|
|
735
|
+
}
|
|
736
|
+
if (browserError)
|
|
737
|
+
throw browserError;
|
|
738
|
+
if (simpleError)
|
|
739
|
+
throw simpleError;
|
|
740
|
+
throw new Error('Both simple and browser fetch attempts failed');
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
else {
|
|
744
|
+
// No race — just wait for the simple fetch to finish
|
|
745
|
+
const simpleResult = await simplePromise
|
|
746
|
+
.then((result) => ({ type: 'simple-success', result }))
|
|
747
|
+
.catch((error) => ({ type: 'simple-error', error }));
|
|
748
|
+
if (simpleResult.type === 'simple-success') {
|
|
749
|
+
// Check if the content is suspiciously thin, looks like an SPA shell, or is a shell page
|
|
750
|
+
// (looksLikeShellPage catches partial renders with 200-500 visible chars that
|
|
751
|
+
// shouldEscalateForLowContent misses — improves consistency on sites like China Daily)
|
|
752
|
+
if (shouldEscalateForLowContent(simpleResult.result) ||
|
|
753
|
+
hasSpaIndicators(simpleResult.result.html) ||
|
|
754
|
+
looksLikeShellPage(simpleResult.result)) {
|
|
755
|
+
shouldUseBrowser = true;
|
|
756
|
+
}
|
|
757
|
+
else {
|
|
758
|
+
// Check whether the response is a bot-challenge page
|
|
759
|
+
const challengeCheck = detectChallenge(simpleResult.result.html, simpleResult.result.statusCode);
|
|
760
|
+
if (challengeCheck.isChallenge && challengeCheck.confidence >= 0.7) {
|
|
761
|
+
shouldUseBrowser = true;
|
|
762
|
+
}
|
|
763
|
+
else {
|
|
764
|
+
const strategyResult = {
|
|
765
|
+
...simpleResult.result,
|
|
766
|
+
method: 'simple',
|
|
767
|
+
};
|
|
768
|
+
if (canUseCache) {
|
|
769
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
770
|
+
}
|
|
771
|
+
recordMethod('simple');
|
|
772
|
+
return strategyResult;
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
}
|
|
776
|
+
else {
|
|
777
|
+
if (!shouldEscalateSimpleError(simpleResult.error)) {
|
|
778
|
+
throw simpleResult.error;
|
|
779
|
+
}
|
|
780
|
+
shouldUseBrowser = true;
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
/* ---- simple-with-headers: intermediate step before browser ----------- */
|
|
786
|
+
// Before escalating to the headless browser, retry simple fetch with Googlebot UA
|
|
787
|
+
// and a Google Referer. This catches sites that block generic UAs but return full
|
|
788
|
+
// content to search-engine crawlers without needing JS rendering.
|
|
789
|
+
// Only fires when: we escalated from simple (not forced by domain rules), noEscalate=false.
|
|
790
|
+
if (shouldUseBrowser && !noEscalate && !effectiveForceBrowser && !effectiveStealth && !screenshot) {
|
|
791
|
+
const t0Headers = Date.now();
|
|
792
|
+
log.debug('Escalating: simple → simple-with-headers (Googlebot UA + Google Referer)');
|
|
793
|
+
try {
|
|
794
|
+
const headersResult = await simpleFetch(url, 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', timeoutMs, {
|
|
795
|
+
'Accept-Language': 'en-US,en;q=0.5',
|
|
796
|
+
'Referer': 'https://www.google.com/',
|
|
797
|
+
}, undefined, firstProxy, proxyContext);
|
|
798
|
+
const headersChallengeCheck = detectChallenge(headersResult.html, headersResult.statusCode);
|
|
799
|
+
const headersOk = !looksLikeShellPage(headersResult) &&
|
|
800
|
+
!hasSpaIndicators(headersResult.html) &&
|
|
801
|
+
!shouldEscalateForLowContent(headersResult) &&
|
|
802
|
+
(!headersChallengeCheck.isChallenge || headersChallengeCheck.confidence < 0.7);
|
|
803
|
+
if (headersOk) {
|
|
804
|
+
log.debug(`simple-with-headers succeeded in ${Date.now() - t0Headers}ms`);
|
|
805
|
+
const strategyResult = { ...headersResult, method: 'simple' };
|
|
806
|
+
if (canUseCache) {
|
|
807
|
+
hooks.setCache?.(url, strategyResult) ?? setBasicCache(url, strategyResult);
|
|
808
|
+
}
|
|
809
|
+
recordMethod('simple');
|
|
810
|
+
return strategyResult;
|
|
811
|
+
}
|
|
812
|
+
log.debug(`simple-with-headers produced thin/blocked content in ${Date.now() - t0Headers}ms, continuing to browser`);
|
|
813
|
+
}
|
|
814
|
+
catch (e) {
|
|
815
|
+
if (isAbortError(e))
|
|
816
|
+
throw e;
|
|
817
|
+
log.debug('simple-with-headers failed:', e instanceof Error ? e.message : e);
|
|
818
|
+
}
|
|
819
|
+
}
|
|
820
|
+
/* ---- browser / stealth fallback with challenge-detection cascade ----- */
|
|
821
|
+
// Try each proxy in sequence until one succeeds
|
|
822
|
+
let lastError;
|
|
823
|
+
for (let proxyIdx = 0; proxyIdx < effectiveProxies.length; proxyIdx++) {
|
|
824
|
+
const currentProxy = effectiveProxies[proxyIdx];
|
|
825
|
+
const isLastProxy = proxyIdx === effectiveProxies.length - 1;
|
|
826
|
+
try {
|
|
827
|
+
const currentBrowserOptions = { ...browserOptions, proxy: currentProxy };
|
|
828
|
+
// Attempt 1: browser (or stealth, if already forced)
|
|
829
|
+
let finalResult = await fetchWithBrowserStrategy(url, currentBrowserOptions);
|
|
830
|
+
// browser-with-wait: if browser returned thin content (SPA may not have fully loaded),
|
|
831
|
+
// retry with a 3-second networkidle wait before escalating to stealth mode.
|
|
832
|
+
// This handles dynamic SPAs where the initial browser fetch catches a partial render.
|
|
833
|
+
if (!currentBrowserOptions.effectiveStealth && shouldEscalateForLowContent(finalResult)) {
|
|
834
|
+
const t0Wait = Date.now();
|
|
835
|
+
log.debug('browser returned thin content, escalating to browser-with-wait (3s networkidle)');
|
|
836
|
+
try {
|
|
837
|
+
const browserWaitResult = await fetchWithBrowserStrategy(url, {
|
|
838
|
+
...currentBrowserOptions,
|
|
839
|
+
waitMs: Math.max(currentBrowserOptions.waitMs, 3000),
|
|
840
|
+
waitUntil: 'networkidle',
|
|
841
|
+
});
|
|
842
|
+
log.debug(`browser-with-wait done in ${Date.now() - t0Wait}ms`);
|
|
843
|
+
// Accept the wait result if it has more content (even if still thin — it's better than nothing)
|
|
844
|
+
if (!shouldEscalateForLowContent(browserWaitResult) ||
|
|
845
|
+
browserWaitResult.html.length > finalResult.html.length) {
|
|
846
|
+
finalResult = browserWaitResult;
|
|
847
|
+
}
|
|
848
|
+
}
|
|
849
|
+
catch (e) {
|
|
850
|
+
log.debug('browser-with-wait failed:', e instanceof Error ? e.message : e);
|
|
851
|
+
}
|
|
852
|
+
}
|
|
853
|
+
// Check if the browser result is itself a bot-challenge page
|
|
854
|
+
const browserChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
855
|
+
if (browserChallengeCheck.isChallenge && browserChallengeCheck.confidence >= 0.7) {
|
|
856
|
+
if (!currentBrowserOptions.effectiveStealth) {
|
|
857
|
+
// Attempt 2: escalate to stealth
|
|
858
|
+
const stealthOptions = {
|
|
859
|
+
...currentBrowserOptions,
|
|
860
|
+
effectiveStealth: true,
|
|
861
|
+
};
|
|
862
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthOptions);
|
|
863
|
+
const stealthChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
864
|
+
if (stealthChallengeCheck.isChallenge && stealthChallengeCheck.confidence >= 0.7) {
|
|
865
|
+
// Attempt 3: stealth + 5s extra wait
|
|
866
|
+
const stealthExtraOptions = {
|
|
867
|
+
...stealthOptions,
|
|
868
|
+
waitMs: stealthOptions.waitMs + 5000,
|
|
869
|
+
};
|
|
870
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
|
|
871
|
+
const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
872
|
+
if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
|
|
873
|
+
if (!isLastProxy) {
|
|
874
|
+
// More proxies to try — move on to the next one
|
|
875
|
+
lastError = new BlockedError(`Challenge detected with proxy ${currentProxy || 'direct'}`);
|
|
876
|
+
continue;
|
|
877
|
+
}
|
|
878
|
+
// Last proxy: give up and return with warning flag (preserve original behaviour)
|
|
879
|
+
finalResult = { ...finalResult, challengeDetected: true };
|
|
880
|
+
}
|
|
881
|
+
}
|
|
882
|
+
}
|
|
883
|
+
else {
|
|
884
|
+
// Already in stealth mode; retry with 5s extra wait
|
|
885
|
+
const stealthExtraOptions = {
|
|
886
|
+
...currentBrowserOptions,
|
|
887
|
+
waitMs: currentBrowserOptions.waitMs + 5000,
|
|
888
|
+
};
|
|
889
|
+
finalResult = await fetchWithBrowserStrategy(url, stealthExtraOptions);
|
|
890
|
+
const finalChallengeCheck = detectChallenge(finalResult.html, finalResult.statusCode);
|
|
891
|
+
if (finalChallengeCheck.isChallenge && finalChallengeCheck.confidence >= 0.7) {
|
|
892
|
+
if (!isLastProxy) {
|
|
893
|
+
// More proxies to try — move on to the next one
|
|
894
|
+
lastError = new BlockedError(`Challenge detected with proxy ${currentProxy || 'direct'}`);
|
|
895
|
+
continue;
|
|
896
|
+
}
|
|
897
|
+
// Last proxy: give up and return with warning flag (preserve original behaviour)
|
|
898
|
+
finalResult = { ...finalResult, challengeDetected: true };
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
// If still challenged after stealth+wait, try PeelTLS (TLS fingerprint spoofing)
|
|
903
|
+
if (finalResult.challengeDetected) {
|
|
904
|
+
try {
|
|
905
|
+
const { peelTLSFetch, isPeelTLSAvailable } = await import('./peel-tls.js');
|
|
906
|
+
if (isPeelTLSAvailable()) {
|
|
907
|
+
log.debug('Escalating to PeelTLS fingerprint spoofing');
|
|
908
|
+
const peelResult = await peelTLSFetch(url, {
|
|
909
|
+
proxy: currentProxy,
|
|
910
|
+
headers,
|
|
911
|
+
timeout: timeoutMs,
|
|
912
|
+
});
|
|
913
|
+
const peelStrategyResult = { ...peelResult, method: 'peeltls' };
|
|
914
|
+
const peelChallengeCheck = detectChallenge(peelResult.html, peelResult.statusCode);
|
|
915
|
+
if (!peelChallengeCheck.isChallenge || peelChallengeCheck.confidence < 0.7) {
|
|
916
|
+
// PeelTLS succeeded
|
|
917
|
+
if (canUseCache) {
|
|
918
|
+
hooks.setCache?.(url, peelStrategyResult) ?? setBasicCache(url, peelStrategyResult);
|
|
919
|
+
}
|
|
920
|
+
recordMethod('peeltls');
|
|
921
|
+
return peelStrategyResult;
|
|
922
|
+
}
|
|
923
|
+
// PeelTLS still challenged — fall through to the next fallbacks (CF Worker proxy, then CloakBrowser)
|
|
924
|
+
log.debug('PeelTLS still challenged, escalating to CloakBrowser');
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
catch (peelError) {
|
|
928
|
+
log.debug('PeelTLS failed:', peelError instanceof Error ? peelError.message : peelError);
|
|
929
|
+
// Fall through to the next fallbacks (CF Worker proxy, then CloakBrowser)
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
// If still challenged after PeelTLS, try Cloudflare Worker proxy (clean edge IPs)
|
|
933
|
+
if (finalResult.challengeDetected) {
|
|
934
|
+
try {
|
|
935
|
+
const { cfWorkerFetch, isCfWorkerAvailable } = await import('./cf-worker-proxy.js');
|
|
936
|
+
if (isCfWorkerAvailable()) {
|
|
937
|
+
log.debug('Escalating to CF Worker proxy');
|
|
938
|
+
const cfResult = await cfWorkerFetch(url, {
|
|
939
|
+
headers,
|
|
940
|
+
timeout: timeoutMs,
|
|
941
|
+
});
|
|
942
|
+
const cfStrategyResult = { ...cfResult, method: 'cf-worker' };
|
|
943
|
+
const cfChallengeCheck = detectChallenge(cfResult.html, cfResult.statusCode);
|
|
944
|
+
if (!cfChallengeCheck.isChallenge || cfChallengeCheck.confidence < 0.7) {
|
|
945
|
+
// CF Worker succeeded
|
|
946
|
+
if (canUseCache) {
|
|
947
|
+
hooks.setCache?.(url, cfStrategyResult) ?? setBasicCache(url, cfStrategyResult);
|
|
948
|
+
}
|
|
949
|
+
recordMethod('cf-worker');
|
|
950
|
+
return cfStrategyResult;
|
|
951
|
+
}
|
|
952
|
+
log.debug('CF Worker still challenged, escalating to CloakBrowser');
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
catch (cfError) {
|
|
956
|
+
log.debug('CF Worker proxy failed:', cfError instanceof Error ? cfError.message : cfError);
|
|
957
|
+
}
|
|
958
|
+
}
|
|
959
|
+
// If still challenged after CF Worker, try CloakBrowser
|
|
960
|
+
if (finalResult.challengeDetected) {
|
|
961
|
+
try {
|
|
962
|
+
// @ts-ignore — proprietary module, gitignored
|
|
963
|
+
const { cloakFetch, isCloakBrowserAvailable } = await import('./cloak-fetch.js');
|
|
964
|
+
if (isCloakBrowserAvailable()) {
|
|
965
|
+
log.debug('Escalating to CloakBrowser stealth');
|
|
966
|
+
const cloakResult = await cloakFetch({
|
|
967
|
+
url,
|
|
968
|
+
proxy: currentProxy,
|
|
969
|
+
userAgent,
|
|
970
|
+
viewportWidth,
|
|
971
|
+
viewportHeight,
|
|
972
|
+
waitMs,
|
|
973
|
+
waitSelector,
|
|
974
|
+
waitUntil,
|
|
975
|
+
timeoutMs,
|
|
976
|
+
screenshot,
|
|
977
|
+
screenshotFullPage,
|
|
978
|
+
actions,
|
|
979
|
+
headers,
|
|
980
|
+
headed,
|
|
981
|
+
});
|
|
982
|
+
if (canUseCache && !cloakResult.challengeDetected) {
|
|
983
|
+
hooks.setCache?.(url, cloakResult) ?? setBasicCache(url, cloakResult);
|
|
984
|
+
}
|
|
985
|
+
recordMethod(cloakResult.method);
|
|
986
|
+
return cloakResult;
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
catch (cloakError) {
|
|
990
|
+
log.debug('CloakBrowser failed:', cloakError instanceof Error ? cloakError.message : cloakError);
|
|
991
|
+
// Fall through to Google Cache fallback
|
|
992
|
+
}
|
|
993
|
+
}
|
|
994
|
+
// If still challenged after PeelTLS/CloakBrowser, try Google Cache
|
|
995
|
+
if (finalResult.challengeDetected) {
|
|
996
|
+
try {
|
|
997
|
+
const { fetchGoogleCache } = await import('./google-cache.js');
|
|
998
|
+
const cacheResult = await fetchGoogleCache(url, { timeout: timeoutMs });
|
|
999
|
+
if (cacheResult && cacheResult.html.length > 200) {
|
|
1000
|
+
log.debug('Using Google Cache fallback');
|
|
1001
|
+
const cacheStrategyResult = {
|
|
1002
|
+
html: cacheResult.html,
|
|
1003
|
+
url: cacheResult.url,
|
|
1004
|
+
statusCode: cacheResult.statusCode,
|
|
1005
|
+
contentType: 'text/html',
|
|
1006
|
+
method: 'google-cache',
|
|
1007
|
+
};
|
|
1008
|
+
return cacheStrategyResult;
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
catch (cacheError) {
|
|
1012
|
+
log.debug('Google Cache failed:', cacheError);
|
|
1013
|
+
}
|
|
1014
|
+
}
|
|
1015
|
+
// Success (or gave up with challengeDetected=true on the last proxy)
|
|
1016
|
+
if (canUseCache && !finalResult.challengeDetected) {
|
|
1017
|
+
hooks.setCache?.(url, finalResult) ?? setBasicCache(url, finalResult);
|
|
1018
|
+
}
|
|
1019
|
+
recordMethod(finalResult.method);
|
|
1020
|
+
// Record estimated proxy bandwidth for browser fetches that used a proxy
|
|
1021
|
+
if (currentProxy && proxyContext?.userId && !finalResult.challengeDetected) {
|
|
1022
|
+
// Estimate bytes: use HTML length as proxy for page size (rough but fast)
|
|
1023
|
+
const estimatedBytes = finalResult.html?.length ?? (2 * 1024 * 1024); // fallback 2MB
|
|
1024
|
+
recordProxyBytes(proxyContext.userId, estimatedBytes);
|
|
1025
|
+
}
|
|
1026
|
+
return finalResult;
|
|
1027
|
+
}
|
|
1028
|
+
catch (e) {
|
|
1029
|
+
lastError = e;
|
|
1030
|
+
if (isAbortError(e))
|
|
1031
|
+
throw e; // Don't retry on abort
|
|
1032
|
+
// Log and try next proxy
|
|
1033
|
+
log.debug(`proxy ${currentProxy || 'direct'} failed:`, e instanceof Error ? e.message : e);
|
|
1034
|
+
// If last proxy, throw below; otherwise continue loop
|
|
1035
|
+
}
|
|
1036
|
+
}
|
|
1037
|
+
// All proxies exhausted — throw the last error
|
|
1038
|
+
throw lastError;
|
|
1039
|
+
}
|
|
1040
|
+
/* ---------- legacy export for tests ------------------------------------- */
|
|
1041
|
+
/**
|
|
1042
|
+
* @deprecated Use `clearStrategyHooks()` from strategy-hooks.ts instead.
|
|
1043
|
+
*/
|
|
1044
|
+
export { clearStrategyHooks as clearDomainIntel } from './strategy-hooks.js';
|