@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,820 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure HTTP fetching — no browser dependencies.
|
|
3
|
+
* Handles connection pooling, conditional caching, SSRF validation, and simpleFetch.
|
|
4
|
+
*/
|
|
5
|
+
// Force IPv4-first DNS resolution globally.
|
|
6
|
+
// Prevents IPv6 connection failures (TLS errors, timeouts) on hosts that
|
|
7
|
+
// advertise AAAA records but can't actually route IPv6 (e.g. Render containers).
|
|
8
|
+
// Must run before any network library is used.
|
|
9
|
+
import dns from 'dns';
|
|
10
|
+
dns.setDefaultResultOrder('ipv4first');
|
|
11
|
+
import { getHttpUA, getSecCHUA, getSecCHUAPlatform } from './user-agents.js';
|
|
12
|
+
import { getWebshareProxyUrl, canUseProxy, recordProxyBytes } from './proxy-config.js';
|
|
13
|
+
import { fetch as undiciFetch, Agent, ProxyAgent } from 'undici';
|
|
14
|
+
import { TimeoutError, BlockedError, NetworkError, WebPeelError } from '../types.js';
|
|
15
|
+
import { getCached } from './cache.js';
|
|
16
|
+
import { cachedLookup, resolveAndCache, startDnsWarmup } from './dns-cache.js';
|
|
17
|
+
import { detectChallenge } from './challenge-detection.js';
|
|
18
|
+
import { getCookieHeader } from './cookie-cache.js';
|
|
19
|
+
import { createLogger } from './logger.js';
|
|
20
|
+
const log = createLogger('http');
|
|
21
|
+
// ── HTTP status text fallbacks (HTTP/2 omits reason phrases) ──────────────────
|
|
22
|
+
// Reason phrases keyed by numeric status code, used as a fallback when the
// response carries no status text (HTTP/2 omits reason phrases).
const HTTP_STATUS_TEXT = {
    // 4xx — client errors
    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    408: 'Request Timeout',
    410: 'Gone',
    429: 'Too Many Requests',
    451: 'Unavailable For Legal Reasons',

    // 5xx — standard server errors
    500: 'Internal Server Error',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',

    // 52x — Cloudflare-specific edge errors
    520: 'Unknown Error (Cloudflare)',
    521: 'Web Server Is Down (Cloudflare)',
    522: 'Connection Timed Out (Cloudflare)',
    523: 'Origin Is Unreachable (Cloudflare)',
    524: 'A Timeout Occurred (Cloudflare)',
    525: 'SSL Handshake Failed (Cloudflare)',
};
|
|
44
|
+
// ── HTTP connection pool ──────────────────────────────────────────────────────
|
|
45
|
+
/**
 * Builds a fresh undici `Agent` used as the shared HTTP connection pool.
 *
 * The option values are fixed here; see the undici `Agent`/`Pool`/`Client`
 * documentation for their exact semantics (timeouts are in milliseconds).
 * DNS resolution for new connections goes through `cachedLookup`.
 *
 * @returns {Agent} A newly constructed agent; callers own its lifecycle.
 */
function createHttpPool() {
    const poolOptions = {
        connections: 50,
        pipelining: 10,
        keepAliveTimeout: 60000,
        keepAliveMaxTimeout: 60000,
        allowH2: true,
        headersTimeout: 10000,
        bodyTimeout: 30000,
        // Route socket DNS lookups through the module's DNS cache.
        connect: { lookup: cachedLookup },
    };
    return new Agent(poolOptions);
}
|
|
59
|
+
let httpPool = createHttpPool();
|
|
60
|
+
startDnsWarmup();
|
|
61
|
+
/**
 * Replaces the shared HTTP pool with a new one and closes the previous pool.
 *
 * The replacement is installed before the old pool is awaited, so code that
 * reads `httpPool` while the close is in flight sees the new pool. Errors from
 * closing the old pool are deliberately ignored (best-effort cleanup).
 *
 * @returns {Promise<void>} Resolves once the old pool's close has settled.
 */
export async function closePool() {
    const retiredPool = httpPool;
    httpPool = createHttpPool();
    await retiredPool.close().catch(() => undefined);
}
|
|
66
|
+
const CONDITIONAL_CACHE_MAX_ENTRIES = 2000;
|
|
67
|
+
const conditionalValidatorsByUrl = new Map();
|
|
68
|
+
/**
 * Produces the canonical cache key for a URL in the conditional-request cache.
 *
 * Normalization steps: drop the fragment, lowercase the hostname, drop an
 * explicit default port (80 for http, 443 for https), ensure a non-empty
 * path, and rewrite the query string with parameters sorted by name.
 *
 * @param {string} url - The URL to normalize.
 * @returns {string} The normalized URL, or the trimmed input when it cannot
 *   be parsed as a URL.
 */
function normalizeUrlForConditionalCache(url) {
    try {
        const parsed = new URL(url);
        parsed.hash = '';
        parsed.hostname = parsed.hostname.toLowerCase();

        const usesDefaultPort =
            (parsed.protocol === 'http:' && parsed.port === '80') ||
            (parsed.protocol === 'https:' && parsed.port === '443');
        if (usesDefaultPort) {
            parsed.port = '';
        }
        if (parsed.pathname === '') {
            parsed.pathname = '/';
        }

        // Snapshot the pairs before clearing the query, then re-append them
        // ordered by parameter name so equivalent URLs share one key.
        const orderedPairs = Array.from(parsed.searchParams.entries());
        orderedPairs.sort((left, right) => left[0].localeCompare(right[0]));
        parsed.search = '';
        orderedPairs.forEach(([name, value]) => {
            parsed.searchParams.append(name, value);
        });

        return parsed.href;
    }
    catch (e) {
        // Non-fatal: fall back to the raw trimmed input when parsing fails.
        log.debug('URL normalization:', e instanceof Error ? e.message : e);
        return url.trim();
    }
}
|
|
94
|
+
/**
 * Looks up the stored conditional-request validators for a URL.
 *
 * A hit is re-inserted into the map so it becomes the most recently used
 * entry (the map's insertion order doubles as LRU order).
 *
 * @param {string} url - URL whose validators are wanted.
 * @returns {object|null} The stored validators, or null when none exist.
 */
function getConditionalValidators(url) {
    const cacheKey = normalizeUrlForConditionalCache(url);
    const entry = conditionalValidatorsByUrl.get(cacheKey);
    if (entry) {
        // LRU touch: move the entry to the end of the insertion order.
        conditionalValidatorsByUrl.delete(cacheKey);
        conditionalValidatorsByUrl.set(cacheKey, entry);
        return entry;
    }
    return null;
}
|
|
105
|
+
/**
 * Stores (or refreshes) the conditional-request validators for a URL and
 * evicts the least recently used entries once the cache exceeds
 * CONDITIONAL_CACHE_MAX_ENTRIES.
 *
 * @param {string} url - URL the validators belong to; normalized into the key.
 * @param {object} validators - Value to store (as written by the caller).
 * @returns {void}
 */
function setConditionalValidators(url, validators) {
    const key = normalizeUrlForConditionalCache(url);
    // Delete-then-set moves the key to the end of the Map's insertion order,
    // marking it most recently used. Deleting a missing key is a no-op.
    conditionalValidatorsByUrl.delete(key);
    conditionalValidatorsByUrl.set(key, validators);
    while (conditionalValidatorsByUrl.size > CONDITIONAL_CACHE_MAX_ENTRIES) {
        // Use the iterator's `done` flag rather than the key's truthiness:
        // the normalizer can yield '' as a key (empty input), and a falsy key
        // must still be evictable or the cache would stop shrinking.
        const { value: oldestKey, done } = conditionalValidatorsByUrl.keys().next();
        if (done) {
            break;
        }
        conditionalValidatorsByUrl.delete(oldestKey);
    }
}
|
|
119
|
+
/**
 * Records the `etag` / `last-modified` response headers for a URL so they can
 * be used as validators later. Does nothing when the response has neither.
 *
 * @param {string} url - URL the response was fetched from.
 * @param {{ headers: { get(name: string): (string|null|undefined) } }} response
 *   Response-like object exposing `headers.get`.
 * @returns {void}
 */
function rememberConditionalValidators(url, response) {
    const { headers } = response;
    // Empty or missing header values are collapsed to undefined.
    const etag = headers.get('etag') || undefined;
    const lastModified = headers.get('last-modified') || undefined;
    const hasValidator = Boolean(etag || lastModified);
    if (hasValidator) {
        setConditionalValidators(url, { etag, lastModified });
    }
}
|
|
127
|
+
/**
 * Case-insensitively checks whether a plain headers object has a given key.
 *
 * @param {Object<string, *>} headers - Plain object keyed by header name.
 * @param {string} name - Header name to look for (any casing).
 * @returns {boolean} True when some own enumerable key matches `name`.
 */
function hasHeader(headers, name) {
    const wanted = name.toLowerCase();
    for (const headerName of Object.keys(headers)) {
        if (headerName.toLowerCase() === wanted) {
            return true;
        }
    }
    return false;
}
|
|
131
|
+
/**
 * Builds a fetch result for a 304 response from a previously cached entry.
 *
 * The cache is consulted for `url` first and, if that misses and a
 * `fallbackUrl` is given, for `fallbackUrl`.
 *
 * @param {string} url - Primary cache key.
 * @param {string} [fallbackUrl] - Alternate key tried when `url` misses.
 * @returns {object|null} The cached fields with `statusCode` set to 304, or
 *   null when neither lookup finds an entry.
 */
function getCachedResultFor304(url, fallbackUrl) {
    let hit = getCached(url);
    if (!hit && fallbackUrl) {
        hit = getCached(fallbackUrl);
    }
    if (!hit) {
        return null;
    }
    const { html, buffer, contentType, screenshot } = hit;
    return {
        html,
        buffer,
        // Prefer the URL stored with the cached entry; otherwise the requested one.
        url: hit.url || url,
        statusCode: 304,
        contentType,
        screenshot,
    };
}
|
|
145
|
+
/**
 * Create the error thrown when an operation is cancelled by the caller.
 *
 * @returns A fresh Error with message 'Operation aborted' and name 'AbortError'
 */
export function createAbortError() {
    return Object.assign(new Error('Operation aborted'), { name: 'AbortError' });
}
|
|
150
|
+
// ── Stealth headers & proxy routing ──────────────────────────────────────────
|
|
151
|
+
/**
 * Domains known to aggressively block datacenter IPs.
 *
 * `shouldUseProxy()` matches request hostnames against this list. In
 * `simpleFetch()` a matching request is still attempted directly first; the
 * Webshare proxy is only tried after a 403/503 that survives the User-Agent
 * retry, and only when a proxy URL is available.
 */
export const PROXY_PREFERRED_DOMAINS = [
    // Social / content
    'reddit.com', 'old.reddit.com', 'forbes.com', 'fortune.com',
    // Auto / cars
    'cargurus.com', 'edmunds.com', 'cars.com', 'truecar.com', 'autotrader.com',
    'carfax.com', 'tesla.com', 'motortrend.com', 'jdpower.com',
    // Finance / home
    'nerdwallet.com', 'bankrate.com', 'homeadvisor.com', 'angi.com',
    // EV / auto news
    'insideevs.com', 'electrek.co',
    // Restaurants / food
    'yelp.com',
    // Travel
    'kayak.com', 'booking.com', 'expedia.com', 'tripadvisor.com', 'hotels.com',
    // Shopping / products
    'amazon.com', 'bestbuy.com', 'walmart.com', 'target.com',
];
|
|
194
|
+
/**
 * Tell whether a URL's host is on the proxy-preferred list.
 *
 * A leading `www.` is ignored; a host matches when it equals a listed domain
 * or is a subdomain of one. Anything that cannot be parsed as a URL yields false.
 *
 * @example
 * shouldUseProxy('https://www.reddit.com/r/news') // true
 * shouldUseProxy('https://example.com')           // false
 *
 * @param url - URL to test
 * @returns true when the host matches an entry of PROXY_PREFERRED_DOMAINS
 */
export function shouldUseProxy(url) {
    try {
        const bareHost = new URL(url).hostname.replace(/^www\./, '');
        for (const domain of PROXY_PREFERRED_DOMAINS) {
            if (bareHost === domain || bareHost.endsWith('.' + domain)) {
                return true;
            }
        }
        return false;
    }
    catch {
        return false;
    }
}
|
|
211
|
+
/**
 * Generate browser-like request headers tailored to the User-Agent type.
 *
 * - Chrome/Edge: full Sec-CH-UA + Sec-Fetch-* header set
 * - Firefox: adjusted Accept, TE header, partial Sec-Fetch-* (no Sec-CH-UA)
 * - Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
 * - Other: basic headers only
 *
 * A Google `Referer` is added when the URL's host is one of a fixed set of
 * domains, or a subdomain of one. The match is on whole domain labels, so
 * look-alike hosts such as `notreddit.com` or `cars.com.example.org` do not
 * qualify.
 *
 * @param url - Target URL (used for domain-specific header additions)
 * @param userAgent - User-Agent string (determines which header set is applied)
 * @returns A new plain object mapping header names to values
 */
export function getStealthHeaders(url, userAgent) {
    const isFirefox = userAgent.includes('Firefox');
    // Chrome UAs also contain "Safari", so Safari means "Safari without Chrome".
    const isSafari = userAgent.includes('Safari') && !userAgent.includes('Chrome');
    const isChrome = !isFirefox && !isSafari && (userAgent.includes('Chrome') || userAgent.includes('Chromium'));
    const isMobile = userAgent.includes('Mobile') || userAgent.includes('Android');
    // Base headers all browsers send
    const headers = {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
    };
    if (isFirefox) {
        // Firefox: different Accept, TE, and partial Sec-Fetch (no Sec-CH-UA)
        headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8';
        headers['Accept-Language'] = 'en-US,en;q=0.5';
        headers['TE'] = 'trailers';
        headers['Sec-Fetch-Dest'] = 'document';
        headers['Sec-Fetch-Mode'] = 'navigate';
        headers['Sec-Fetch-Site'] = 'none';
        // Sec-Fetch-User is intentionally not set for Firefox
    }
    else if (isSafari) {
        // Safari: minimal headers, no Sec-Fetch-* or Sec-CH-UA
        headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    else if (isChrome) {
        // Chrome/Edge: full set of Sec-Fetch-* and Sec-CH-UA headers
        headers['Sec-Fetch-Dest'] = 'document';
        headers['Sec-Fetch-Mode'] = 'navigate';
        headers['Sec-Fetch-Site'] = 'none';
        headers['Sec-Fetch-User'] = '?1';
        headers['Sec-CH-UA'] = getSecCHUA(userAgent);
        headers['Sec-CH-UA-Mobile'] = isMobile ? '?1' : '?0';
        headers['Sec-CH-UA-Platform'] = getSecCHUAPlatform(userAgent);
        headers['Connection'] = 'keep-alive';
        headers['Priority'] = 'u=0, i';
    }
    // else: custom/API UAs (e.g. "WebPeel/1.0") — basic headers only, no browser fingerprints
    // Add Google Referer for domains where it's known to help bypass blocks
    try {
        const hostname = new URL(url).hostname;
        const referrerDomains = [
            'reddit.com', 'forbes.com', 'cargurus.com', 'edmunds.com',
            'cars.com', 'truecar.com', 'nerdwallet.com', 'homeadvisor.com',
            'angi.com', 'motortrend.com', 'jdpower.com', 'electrek.co', 'insideevs.com',
        ];
        // Exact host or true subdomain only; a plain substring test would also
        // match unrelated hosts that merely contain the domain text.
        const isReferrerDomain = referrerDomains.some((d) => hostname === d || hostname.endsWith(`.${d}`));
        if (isReferrerDomain) {
            headers['Referer'] = 'https://www.google.com/';
        }
    }
    catch {
        // Non-fatal: URL parsing failed, skip Referer
    }
    return headers;
}
|
|
284
|
+
/**
 * Pick a User-Agent other than the one currently in use (for 403/503 retries).
 *
 * Up to ten candidates are drawn from getHttpUA(); the first one that differs
 * from `current` is returned. If all ten match, one more draw is returned as-is.
 *
 * @param current - User-Agent string to avoid
 * @returns A User-Agent string from getHttpUA()
 */
function getDifferentUA(current) {
    let attemptsLeft = 10;
    while (attemptsLeft > 0) {
        attemptsLeft -= 1;
        const candidate = getHttpUA();
        if (candidate !== current) {
            return candidate;
        }
    }
    return getHttpUA();
}
|
|
293
|
+
/**
 * Build the merged request headers: stealth defaults + caller custom headers.
 *
 * Custom headers win over the defaults. Header names are compared
 * case-insensitively, so a custom `user-agent` replaces the default
 * `User-Agent` instead of being sent alongside it as a second header.
 *
 * @param url - Target URL, forwarded to getStealthHeaders()
 * @param userAgent - User-Agent string, forwarded to getStealthHeaders()
 * @param customHeaders - Optional plain object of caller-supplied headers
 * @returns A new plain object with the merged headers
 * @throws WebPeelError if customHeaders attempts to override the Host header
 */
function buildMergedHeaders(url, userAgent, customHeaders) {
    const merged = { ...getStealthHeaders(url, userAgent) };
    if (!customHeaders) {
        return merged;
    }
    for (const [key, value] of Object.entries(customHeaders)) {
        const lowered = key.toLowerCase();
        // SECURITY: Block Host header override
        if (lowered === 'host') {
            throw new WebPeelError('Custom Host header is not allowed');
        }
        // Drop any same-named header that differs only in casing; otherwise
        // both spellings would be sent and the override would not take effect.
        for (const existing of Object.keys(merged)) {
            if (existing !== key && existing.toLowerCase() === lowered) {
                delete merged[existing];
            }
        }
        merged[key] = value;
    }
    return merged;
}
|
|
310
|
+
// ── SSRF / URL validation ─────────────────────────────────────────────────────
|
|
311
|
+
/**
 * SECURITY: Validate URL to prevent SSRF attacks.
 *
 * Rejects over-long URLs, control characters, non-HTTP(S) schemes, localhost
 * names, and IP literals that the IPv4/IPv6 validators refuse. Trailing dots
 * on the host (the fully-qualified form, e.g. `localhost.`) are ignored for
 * these checks so that they cannot be used to sidestep the host rules.
 *
 * @param urlString - URL to validate
 * @returns undefined when the URL passes all checks
 * @throws WebPeelError describing the first rule the URL violates
 */
export function validateUrl(urlString) {
    // Length check
    if (urlString.length > 2048) {
        throw new WebPeelError('URL too long (max 2048 characters)');
    }
    // Check for control characters and suspicious encoding
    if (/[\x00-\x1F\x7F]/.test(urlString)) {
        throw new WebPeelError('URL contains invalid control characters');
    }
    let url;
    try {
        url = new URL(urlString);
    }
    catch {
        throw new WebPeelError('Invalid URL format');
    }
    // Only allow HTTP(S)
    if (!['http:', 'https:'].includes(url.protocol)) {
        throw new WebPeelError('Only HTTP and HTTPS protocols are allowed');
    }
    // Validate hostname is not empty
    if (!url.hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // "localhost." names the same host as "localhost"; compare without the
    // root-label dot(s) so the checks below see the canonical name.
    const hostname = url.hostname.toLowerCase().replace(/\.+$/, '');
    if (!hostname) {
        throw new WebPeelError('Invalid hostname');
    }
    // Block localhost patterns
    const localhostPatterns = ['localhost', '0.0.0.0'];
    if (localhostPatterns.some((pattern) => hostname === pattern || hostname.endsWith('.' + pattern))) {
        throw new WebPeelError('Access to localhost is not allowed');
    }
    // ENHANCED: Parse and validate IP addresses (handles hex, octal, decimal, mixed)
    const ipv4Info = parseAndValidateIPv4(hostname);
    if (ipv4Info) {
        validateIPv4Address(ipv4Info);
    }
    // ENHANCED: Comprehensive IPv6 validation (IPv6 literals are the only hosts containing ':')
    if (hostname.includes(':')) {
        validateIPv6Address(hostname);
    }
}
|
|
355
|
+
/**
 * Parse an IPv4 address written in any common notation.
 *
 * Supported forms: dotted (192.168.1.1), a single hex number (0x7f000001),
 * a single octal number (017700000001), dotted with octal parts (0177.0.0.1)
 * and a single decimal number (2130706433). In dotted notation, a part with a
 * leading zero made only of octal digits is read as octal, so `012.0.0.1`
 * is 10.0.0.1 — the same interpretation URL parsers and inet_aton apply.
 *
 * @param hostname - Host string, optionally wrapped in square brackets
 * @returns The four octets as numbers, or null if the input is not an IPv4 address
 * @throws WebPeelError when a 1-3 digit dotted form has an octet above 255
 */
function parseAndValidateIPv4(hostname) {
    // Remove brackets if present
    const cleaned = hostname.replace(/^\[|\]$/g, '');
    // Split a number into four octets; only the low 32 bits are used.
    const toOctets = (num) => [
        (num >>> 24) & 0xff,
        (num >>> 16) & 0xff,
        (num >>> 8) & 0xff,
        num & 0xff,
    ];
    const isOctet = (o) => o >= 0 && o <= 255;
    // Standard dotted notation: 192.168.1.1
    const dottedMatch = cleaned.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
    if (dottedMatch) {
        const octets = dottedMatch
            .slice(1)
            .map((part) => (/^0[0-7]+$/.test(part) ? parseInt(part, 8) : Number(part)));
        if (octets.every(isOctet)) {
            return octets;
        }
        throw new WebPeelError('Invalid IPv4 address');
    }
    // Hex notation: 0x7f000001 (values wider than 32 bits wrap to their low 32 bits)
    if (/^0x[0-9a-fA-F]+$/.test(cleaned)) {
        return toOctets(parseInt(cleaned, 16));
    }
    // Octal notation: 0177.0.0.1 or full octal 017700000001
    if (/^0[0-7]/.test(cleaned)) {
        // Full octal (all digits)
        if (/^0[0-7]+$/.test(cleaned)) {
            const num = parseInt(cleaned, 8);
            if (num <= 0xffffffff) {
                return toOctets(num);
            }
        }
        // Mixed octal-decimal: 0177.0.0.1
        const parts = cleaned.split('.');
        if (parts.length === 4) {
            const octets = parts.map((p) => parseInt(p, /^0[0-7]/.test(p) ? 8 : 10));
            // NaN fails both comparisons, so non-numeric parts fall through.
            if (octets.every(isOctet)) {
                return octets;
            }
        }
    }
    // Decimal notation: 2130706433
    if (/^\d+$/.test(cleaned)) {
        const num = parseInt(cleaned, 10);
        if (num <= 0xffffffff) {
            return toOctets(num);
        }
    }
    return null;
}
|
|
419
|
+
/**
 * Validate an IPv4 address against private/reserved ranges.
 *
 * Rejects loopback, RFC 1918 private ranges, shared address space
 * (100.64.0.0/10), link-local, "this network", broadcast, multicast and
 * the reserved 240.0.0.0/4 block — none of which are public unicast targets.
 *
 * @param octets - The four address octets as numbers, most significant first
 * @returns undefined when the address is acceptable
 * @throws WebPeelError naming the category of the rejected address
 */
function validateIPv4Address(octets) {
    const [a, b, c, d] = octets;
    // Loopback: 127.0.0.0/8
    if (a === 127) {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // Private: 10.0.0.0/8
    if (a === 10) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 172.16.0.0/12
    if (a === 172 && b >= 16 && b <= 31) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Private: 192.168.0.0/16
    if (a === 192 && b === 168) {
        throw new WebPeelError('Access to private IP addresses is not allowed');
    }
    // Shared address space (carrier-grade NAT): 100.64.0.0/10
    if (a === 100 && b >= 64 && b <= 127) {
        throw new WebPeelError('Access to shared address space (CGNAT) is not allowed');
    }
    // Link-local: 169.254.0.0/16
    if (a === 169 && b === 254) {
        throw new WebPeelError('Access to link-local addresses is not allowed');
    }
    // Broadcast: 255.255.255.255 (checked before the reserved block to keep its message)
    if (a === 255 && b === 255 && c === 255 && d === 255) {
        throw new WebPeelError('Access to broadcast address is not allowed');
    }
    // Multicast: 224.0.0.0/4
    if (a >= 224 && a <= 239) {
        throw new WebPeelError('Access to multicast addresses is not allowed');
    }
    // Reserved for future use: 240.0.0.0/4
    if (a >= 240) {
        throw new WebPeelError('Access to reserved addresses is not allowed');
    }
    // This network: 0.0.0.0/8
    if (a === 0) {
        throw new WebPeelError('Access to "this network" addresses is not allowed');
    }
}
|
|
453
|
+
/**
 * Validate an IPv6 address against private/reserved ranges.
 *
 * Rejects the unspecified address (`::`), loopback, every IPv4-mapped address
 * (`::ffff:…`), unique local addresses and link-local addresses. For
 * IPv4-mapped input the embedded IPv4 address is decoded and passed to
 * validateIPv4Address() first, so that a more specific error is raised when
 * it falls in a blocked IPv4 range.
 *
 * @param hostname - IPv6 literal, optionally wrapped in square brackets
 * @returns undefined when the address is acceptable
 * @throws WebPeelError naming the category of the rejected address
 */
function validateIPv6Address(hostname) {
    // Remove brackets
    const addr = hostname.replace(/^\[|\]$/g, '').toLowerCase();
    // Unspecified: :: (all-zero address, in any spelling made only of zeros and colons)
    if (addr.includes(':') && /^[0:]+$/.test(addr)) {
        throw new WebPeelError('Access to unspecified addresses is not allowed');
    }
    // Loopback: ::1
    if (addr === '::1' || addr === '0:0:0:0:0:0:0:1') {
        throw new WebPeelError('Access to loopback addresses is not allowed');
    }
    // IPv6 mapped IPv4: ::ffff:192.168.1.1 or ::ffff:c0a8:0101
    if (addr.startsWith('::ffff:')) {
        // Extract the IPv4 part
        const ipv4Part = addr.substring(7);
        // Could be dotted (::ffff:192.168.1.1) or hex (::ffff:c0a8:0101)
        if (ipv4Part.includes('.')) {
            // Parse dotted IPv4
            const parts = ipv4Part.split('.');
            if (parts.length === 4) {
                const octets = parts.map((p) => parseInt(p, 10));
                if (octets.every((o) => !isNaN(o) && o >= 0 && o <= 255)) {
                    validateIPv4Address(octets);
                }
            }
        }
        else {
            // Parse hex IPv4 (e.g., c0a8:101 = 192.168.1.1). Groups may be written
            // without leading zeros, so each is padded to 4 digits before joining.
            const groups = ipv4Part.split(':');
            if (groups.length <= 2 && groups.every((g) => /^[0-9a-f]{1,4}$/.test(g))) {
                const num = parseInt(groups.map((g) => g.padStart(4, '0')).join(''), 16);
                const octets = [
                    (num >>> 24) & 0xff,
                    (num >>> 16) & 0xff,
                    (num >>> 8) & 0xff,
                    num & 0xff,
                ];
                validateIPv4Address(octets);
            }
        }
        // Mapped addresses are refused even when the embedded IPv4 address is public.
        throw new WebPeelError('Access to IPv6-mapped IPv4 addresses is not allowed');
    }
    // Unique local addresses: fc00::/7 (fc00:: to fdff::)
    if (addr.startsWith('fc') || addr.startsWith('fd')) {
        throw new WebPeelError('Access to unique local IPv6 addresses is not allowed');
    }
    // Link-local: fe80::/10
    if (addr.startsWith('fe8') || addr.startsWith('fe9') ||
        addr.startsWith('fea') || addr.startsWith('feb')) {
        throw new WebPeelError('Access to link-local IPv6 addresses is not allowed');
    }
}
|
|
504
|
+
/**
 * Check that a User-Agent string is safe to send as a header value.
 *
 * @param userAgent - Candidate User-Agent string
 * @returns The same string, unchanged, when it passes both checks
 * @throws WebPeelError when it exceeds 500 characters or contains anything
 *         other than printable ASCII (0x20-0x7E)
 */
export function validateUserAgent(userAgent) {
    const tooLong = userAgent.length > 500;
    if (tooLong) {
        throw new WebPeelError('User agent too long (max 500 characters)');
    }
    // Any single character outside the printable ASCII range disqualifies the value.
    const hasNonPrintable = /[^\x20-\x7E]/.test(userAgent);
    if (hasNonPrintable) {
        throw new WebPeelError('User agent contains invalid characters');
    }
    return userAgent;
}
|
|
517
|
+
// ── simpleFetch ───────────────────────────────────────────────────────────────
|
|
518
|
+
/**
 * Simple HTTP fetch without a browser.
 * Fast and lightweight, but can be blocked by Cloudflare/bot detection.
 * SECURITY: Manual redirect handling with SSRF re-validation on every hop.
 *
 * Flow: validate the URL, build stealth headers (plus cached challenge cookies
 * and caller headers), then request with manual redirect following (max 10).
 * A 403/503 is retried once with a different User-Agent when the caller did
 * not supply one; for proxy-preferred domains a final attempt goes through the
 * Webshare proxy when one is available and permitted. The body is streamed
 * with a 10MB cap and checked for bot-challenge pages.
 *
 * @param url - URL to fetch
 * @param userAgent - Optional User-Agent; when omitted one is chosen via getHttpUA()
 * @param timeoutMs - Abort the request after this many milliseconds (default 30000)
 * @param customHeaders - Optional extra request headers (override the defaults)
 * @param abortSignal - Optional AbortSignal for caller-driven cancellation
 * @param proxy - Optional explicit proxy URL; always used when given
 * @param proxyContext - Optional `{ userId, tier }` used for proxy quota checks and accounting
 * @returns `{ html, buffer, url, statusCode, contentType, responseHeaders }`, or the
 *          cached result (statusCode 304) when the server answers 304
 * @throws WebPeelError, BlockedError, NetworkError, TimeoutError, or an AbortError
 */
export async function simpleFetch(url, userAgent, timeoutMs = 30000, customHeaders, abortSignal, proxy, proxyContext) {
    // SECURITY: Validate URL to prevent SSRF
    validateUrl(url);
    if (abortSignal?.aborted) {
        throw createAbortError();
    }
    // Validate user agent if provided
    // SEC.gov requires a User-Agent with contact info (their documented automated access policy)
    const hostname = new URL(url).hostname.toLowerCase();
    const isSecGov = hostname === 'sec.gov' || hostname.endsWith('.sec.gov');
    let activeUserAgent = isSecGov
        ? 'WebPeel/1.0 (support@webpeel.dev)'
        : (userAgent ? validateUserAgent(userAgent) : getHttpUA());
    // Inject cached challenge-solve cookies (e.g. cf_clearance) if available.
    // These are merged into customHeaders so they ride along on every request
    // to this domain, skipping repeated challenge pages.
    const cachedCookieHeader = getCookieHeader(url);
    const effectiveCustomHeaders = cachedCookieHeader
        ? { Cookie: cachedCookieHeader, ...(customHeaders || {}) }
        : customHeaders;
    // Build stealth headers merged with any caller-supplied custom headers
    let mergedHeaders = buildMergedHeaders(url, activeUserAgent, effectiveCustomHeaders);
    // Proxy routing: explicit proxy param always wins.
    // For proxy-preferred domains, we now try DIRECT first to save bandwidth.
    // Only fall back to proxy on a block response (handled in retry logic below).
    const effectiveProxy = proxy ?? undefined;
    const MAX_REDIRECTS = 10;
    let redirectCount = 0;
    let currentUrl = url;
    const seenUrls = new Set();
    let retried = false; // track whether we've already retried with a different UA
    // Created lazily on first use and then shared by every hop of this call,
    // instead of allocating a new proxy agent per redirect/retry.
    let proxyDispatcher;
    // Release the connection of a response whose body will not be read.
    const discardBody = (response) => {
        try {
            void response.body?.cancel().catch(() => {
                // Best-effort cleanup only.
            });
        }
        catch {
            // Best-effort cleanup only.
        }
    };
    try {
        const hostname = new URL(url).hostname;
        void resolveAndCache(hostname).catch(() => {
            // Best-effort optimization only.
        });
    }
    catch (e) {
        // Ignore URL parsing errors here; validation handles invalid input below.
        log.debug('DNS prefetch (initial URL):', e instanceof Error ? e.message : e);
    }
    while (redirectCount <= MAX_REDIRECTS) {
        // Detect redirect loops
        if (seenUrls.has(currentUrl)) {
            throw new WebPeelError('Redirect loop detected');
        }
        seenUrls.add(currentUrl);
        // Re-validate on each redirect
        validateUrl(currentUrl);
        const timeoutController = new AbortController();
        // NOTE(review): the timer is cleared once response headers arrive, so
        // streaming the body below is not bounded by timeoutMs — confirm intended.
        const timer = setTimeout(() => timeoutController.abort(), timeoutMs);
        const signal = abortSignal
            ? AbortSignal.any([timeoutController.signal, abortSignal])
            : timeoutController.signal;
        try {
            const requestHeaders = { ...mergedHeaders };
            const validators = getConditionalValidators(currentUrl);
            // Only send conditional headers if we actually have the cached body
            // In server/worker mode, the in-memory cache may have been cleared (pod restart)
            // and sending If-None-Match without a cached body would cause a 304 crash
            const cachedBody = getCachedResultFor304(currentUrl, url);
            if (validators?.etag && cachedBody && !hasHeader(requestHeaders, 'if-none-match')) {
                requestHeaders['If-None-Match'] = validators.etag;
            }
            if (validators?.lastModified && cachedBody && !hasHeader(requestHeaders, 'if-modified-since')) {
                requestHeaders['If-Modified-Since'] = validators.lastModified;
            }
            // Use proxy if provided, otherwise use shared connection pool
            let dispatcher = httpPool;
            if (effectiveProxy) {
                if (!proxyDispatcher) {
                    proxyDispatcher = new ProxyAgent(effectiveProxy);
                }
                dispatcher = proxyDispatcher;
            }
            const response = await undiciFetch(currentUrl, {
                headers: requestHeaders,
                signal,
                dispatcher,
                redirect: 'manual', // SECURITY: Manual redirect handling
            });
            clearTimeout(timer);
            if (response.status === 304) {
                discardBody(response);
                const cachedResult = getCachedResultFor304(currentUrl, url);
                if (cachedResult) {
                    return cachedResult;
                }
                throw new NetworkError('HTTP 304 received but no cached response is available');
            }
            // Handle redirects manually
            if (response.status >= 300 && response.status < 400) {
                discardBody(response);
                const location = response.headers.get('location');
                if (!location) {
                    throw new NetworkError('Redirect response missing Location header');
                }
                // Resolve relative URLs
                currentUrl = new URL(location, currentUrl).href;
                try {
                    const hostname = new URL(currentUrl).hostname;
                    void resolveAndCache(hostname).catch(() => {
                        // Best-effort optimization only.
                    });
                }
                catch (e) {
                    // Ignore URL parsing errors here; validation handles invalid input below.
                    log.debug('DNS prefetch (redirect URL):', e instanceof Error ? e.message : e);
                }
                redirectCount++;
                continue;
            }
            if (!response.ok) {
                // The error body is never read on any of the paths below.
                discardBody(response);
                if (response.status === 403 || response.status === 503) {
                    // Retry once with a different UA — cheap and catches UA-based blocks
                    if (!retried && !userAgent) {
                        retried = true;
                        activeUserAgent = getDifferentUA(activeUserAgent);
                        // Rebuild from effectiveCustomHeaders so the cached challenge
                        // cookies are still sent on the retry.
                        mergedHeaders = buildMergedHeaders(currentUrl, activeUserAgent, effectiveCustomHeaders);
                        // Allow the retry to re-visit the same URL (not a redirect loop)
                        seenUrls.delete(currentUrl);
                        log.debug(`HTTP ${response.status} on first attempt; retrying with different UA`);
                        continue;
                    }
                    // Try proxy as last resort before giving up (only for proxy-preferred domains)
                    if (retried && !proxy && shouldUseProxy(url)) {
                        // Check if the user's tier allows proxy usage
                        if (proxyContext?.userId && !canUseProxy(proxyContext.userId, proxyContext.tier || 'free')) {
                            throw new BlockedError(`HTTP ${response.status}: Site blocks direct access. Proxy bandwidth limit reached for your plan. Upgrade for more proxy bandwidth.`);
                        }
                        const proxyUrl = getWebshareProxyUrl();
                        if (proxyUrl) {
                            log.debug(`HTTP ${response.status} after UA retry; retrying via proxy`);
                            // Recursive call with proxy — with `proxy` set, the callee cannot fall back to a proxy again
                            return simpleFetch(url, userAgent, timeoutMs, customHeaders, abortSignal, proxyUrl, proxyContext);
                        }
                    }
                    throw new BlockedError(`HTTP ${response.status}: Site may be blocking requests. Try --render for browser mode.`);
                }
                const statusText = response.statusText || HTTP_STATUS_TEXT[response.status] || 'Unknown Error';
                throw new NetworkError(`HTTP ${response.status}: ${statusText}`);
            }
            rememberConditionalValidators(currentUrl, response);
            // Content-Type detection
            const contentType = response.headers.get('content-type') || '';
            const contentTypeLower = contentType.toLowerCase();
            const urlLower = currentUrl.toLowerCase();
            // Support binary documents (PDF/DOCX) in the simple HTTP path.
            const isPdf = contentTypeLower.includes('application/pdf') || urlLower.endsWith('.pdf');
            const isDocx = contentTypeLower.includes('application/vnd.openxmlformats-officedocument.wordprocessingml.document') || urlLower.endsWith('.docx');
            const isBinaryDoc = isPdf || isDocx;
            // Support image types for OCR text extraction.
            const IMAGE_URL_EXTS = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.tiff', '.tif', '.bmp'];
            const isImage = contentTypeLower.startsWith('image/') ||
                IMAGE_URL_EXTS.some((ext) => urlLower.endsWith(ext));
            // Accept a wide range of text-based content, plus supported binary documents.
            const ALLOWED_TYPES = [
                'text/html', 'application/xhtml+xml',
                'text/plain', 'text/markdown', 'text/csv',
                'application/json', 'text/json',
                'text/xml', 'application/xml', 'application/rss+xml', 'application/atom+xml',
                'application/javascript', 'text/javascript', 'text/css',
                // Documents
                'application/pdf',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            ];
            const isAllowed = !contentTypeLower ||
                isImage ||
                ALLOWED_TYPES.some((t) => contentTypeLower.includes(t)) ||
                // Many servers mislabel docs as octet-stream; allow when URL implies a supported document.
                (contentTypeLower.includes('application/octet-stream') && isBinaryDoc);
            if (!isAllowed) {
                // Check if it's at least text-based
                const isTexty = contentTypeLower.startsWith('text/') ||
                    contentTypeLower.includes('json') ||
                    contentTypeLower.includes('xml');
                if (!isTexty) {
                    throw new WebPeelError(`Binary content type: ${contentType}. WebPeel handles text-based content and PDF/DOCX documents only.`);
                }
            }
            // SECURITY: Stream response with size limit (prevent memory exhaustion)
            const chunks = [];
            let totalSize = 0;
            const MAX_SIZE = 10 * 1024 * 1024; // 10MB
            const reader = response.body?.getReader();
            if (!reader) {
                throw new NetworkError('Response body is not readable');
            }
            try {
                while (true) {
                    const { done, value } = await reader.read();
                    if (done)
                        break;
                    totalSize += value.length;
                    if (totalSize > MAX_SIZE) {
                        reader.cancel();
                        throw new WebPeelError('Response too large (max 10MB)');
                    }
                    chunks.push(value);
                }
            }
            finally {
                reader.releaseLock();
            }
            // Combine chunks
            const combined = new Uint8Array(totalSize);
            let offset = 0;
            for (const chunk of chunks) {
                combined.set(chunk, offset);
                offset += chunk.length;
            }
            const buffer = Buffer.from(combined);
            const isBinaryOrImage = isBinaryDoc || isImage;
            const html = isBinaryOrImage ? '' : new TextDecoder().decode(combined);
            // For HTML content, check for suspiciously small responses (bot blocks)
            // Non-HTML content (JSON, text, XML) can legitimately be short
            const isHtmlContent = !isBinaryOrImage && (contentTypeLower.includes('html') || contentTypeLower.includes('xhtml'));
            if (isHtmlContent && (!html || html.length < 100)) {
                throw new BlockedError('Empty or suspiciously small response. Site may require JavaScript.');
            }
            if (!isBinaryOrImage && !html) {
                throw new NetworkError('Empty response body');
            }
            if (isBinaryOrImage && buffer.length === 0) {
                throw new NetworkError('Empty response body');
            }
            // Check for Cloudflare challenge (only relevant for HTML)
            if (isHtmlContent && (html.includes('cf-browser-verification') || html.includes('Just a moment...'))) {
                throw new BlockedError('Cloudflare challenge detected. Try --render for browser mode.');
            }
            // Run full challenge detection for HTML content
            // Note: skip empty-shell type — in simple HTTP mode, SPA shells are expected and
            // the caller's escalation logic upgrades to browser/stealth rendering.
            if (isHtmlContent) {
                const challengeResult = detectChallenge(html, response.status);
                if (challengeResult.isChallenge && challengeResult.type !== 'empty-shell') {
                    throw new BlockedError(`Challenge page detected (${challengeResult.type || 'unknown'}, confidence: ${challengeResult.confidence.toFixed(2)}). ` +
                        `Site requires human verification. Try a different approach or use a CAPTCHA solving service.`);
                }
            }
            // Capture selected response headers for freshness metadata
            const responseHeaders = {};
            const lastModified = response.headers.get('last-modified');
            if (lastModified)
                responseHeaders['last-modified'] = lastModified;
            const etag = response.headers.get('etag');
            if (etag)
                responseHeaders['etag'] = etag;
            const cacheControl = response.headers.get('cache-control');
            if (cacheControl)
                responseHeaders['cache-control'] = cacheControl;
            // Record proxy bandwidth consumed
            if (effectiveProxy && proxyContext?.userId) {
                // Use Content-Length if present and numeric; fall back to the
                // number of bytes actually read so usage is never silently dropped.
                const clHeader = response.headers.get('content-length');
                const declaredBytes = clHeader ? parseInt(clHeader, 10) : NaN;
                const bodyBytes = Number.isNaN(declaredBytes) ? totalSize : declaredBytes;
                if (bodyBytes > 0) {
                    recordProxyBytes(proxyContext.userId, bodyBytes);
                }
            }
            return {
                html,
                buffer: isBinaryOrImage ? buffer : undefined,
                url: currentUrl,
                statusCode: response.status,
                contentType,
                responseHeaders: Object.keys(responseHeaders).length > 0 ? responseHeaders : undefined,
            };
        }
        catch (error) {
            clearTimeout(timer);
            if (error instanceof BlockedError || error instanceof NetworkError || error instanceof WebPeelError) {
                throw error;
            }
            if (error instanceof Error && error.name === 'AbortError') {
                // Distinguish caller cancellation from our own timeout.
                if (abortSignal?.aborted && !timeoutController.signal.aborted) {
                    throw createAbortError();
                }
                throw new TimeoutError(`Request timed out after ${timeoutMs}ms`);
            }
            // Provide specific error messages based on the actual cause
            const cause = error instanceof Error && error.cause;
            const causeMsg = cause?.message || cause?.code || '';
            if (causeMsg.includes('certificate') || causeMsg.includes('CERT') || causeMsg.includes('SSL') || causeMsg.includes('TLS')) {
                throw new NetworkError(`TLS/SSL certificate error for ${new URL(currentUrl).hostname}. The site's certificate may be expired, self-signed, or untrusted.`);
            }
            if (causeMsg.includes('ENOTFOUND') || causeMsg.includes('getaddrinfo')) {
                throw new NetworkError(`DNS resolution failed: ${new URL(currentUrl).hostname} not found. Check the URL or your network connection.`);
            }
            if (causeMsg.includes('ECONNREFUSED')) {
                throw new NetworkError(`Connection refused by ${new URL(currentUrl).hostname}. The server may be down.`);
            }
            if (causeMsg.includes('ECONNRESET') || causeMsg.includes('EPIPE')) {
                throw new NetworkError(`Connection reset by ${new URL(currentUrl).hostname}. Try again or use --render.`);
            }
            if (causeMsg.includes('ETIMEDOUT') || causeMsg.includes('ENETUNREACH')) {
                throw new TimeoutError(`Network unreachable or connection timed out for ${new URL(currentUrl).hostname}.`);
            }
            const msg = error instanceof Error ? error.message : 'Unknown error';
            const causeDetail = causeMsg ? ` (${causeMsg})` : '';
            throw new NetworkError(`Failed to fetch: ${msg}${causeDetail}`);
        }
    }
    throw new WebPeelError(`Too many redirects (max ${MAX_REDIRECTS})`);
}
|