@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Challenge / bot-protection page detection.
|
|
3
|
+
*
|
|
4
|
+
* Analyzes raw HTML (and optional HTTP status code) to determine whether the
|
|
5
|
+
* response is a bot-challenge or block page rather than real content.
|
|
6
|
+
*
|
|
7
|
+
* Design goals:
|
|
8
|
+
* - Fast: pure string/regex matching, no DOM parsing required
|
|
9
|
+
* - Low false-positive rate: uses confidence scoring, only flags at >= 0.7
|
|
10
|
+
* - No external dependencies
|
|
11
|
+
*/
|
|
12
|
+
/* ---------- helpers ------------------------------------------------------ */
|
|
13
|
+
/**
 * Case-insensitive substring presence test.
 *
 * The previous implementation was a plain `includes`, which is case-sensitive
 * and therefore contradicted this contract (callers pass lowercase needles
 * such as 'requires javascript' against un-normalised HTML).
 *
 * @param {string} html - Text to search in.
 * @param {string} needle - Substring to look for.
 * @returns {boolean} True when `needle` occurs in `html`, ignoring case.
 */
function has(html, needle) {
    // Fast path: an exact-case hit avoids lowercasing the whole document.
    if (html.includes(needle)) {
        return true;
    }
    return html.toLowerCase().includes(needle.toLowerCase());
}
|
|
17
|
+
/**
 * Count how many of the supplied needles occur in `html`.
 *
 * Each needle is checked with an exact (case-sensitive) substring test and
 * contributes at most one to the total, however often it appears.
 *
 * @param {string} html - Text to search in.
 * @param {Iterable<string>} needles - Substrings to look for.
 * @returns {number} Number of needles found.
 */
function countMatches(html, needles) {
    return Array.from(needles).filter((candidate) => html.includes(candidate)).length;
}
|
|
26
|
+
/**
 * Return the text of the first `<title>` element, lowercased and trimmed.
 *
 * @param {string} html - Raw HTML document.
 * @returns {string} The normalised title, or '' when no title element matches.
 */
function extractTitle(html) {
    const found = html.match(/<title[^>]*>([^<]*)<\/title>/i);
    if (!found) {
        return '';
    }
    const rawTitle = found[1];
    return rawTitle.toLowerCase().trim();
}
|
|
31
|
+
/**
 * Estimate visible text length after stripping scripts/styles/comments/tags.
 *
 * Steps, in order: drop `<script>`, `<style>` and `<noscript>` elements with
 * their contents, drop HTML comments, drop every remaining tag, collapse
 * whitespace runs to a single space, trim, and measure.
 *
 * Comments are removed as whole units before the generic tag pass. Otherwise
 * commented-out markup such as `<!-- <p>old</p> -->` leaks "old -->" into the
 * count, because the tag regex stops at the first `>` inside the comment.
 *
 * @param {string} html - Raw HTML document.
 * @returns {number} Character count of the remaining text.
 */
function estimateVisibleTextLength(html) {
    const stripped = html
        .replace(/<script[\s\S]*?<\/script>/gi, '')
        .replace(/<style[\s\S]*?<\/style>/gi, '')
        .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
        .replace(/<!--[\s\S]*?-->/g, '')
        .replace(/<[^>]*>/g, '')
        .replace(/\s+/g, ' ')
        .trim();
    return stripped.length;
}
|
|
42
|
+
/* ---------- vendor-specific detectors ------------------------------------ */
|
|
43
|
+
/**
 * Score how likely `html` is a Cloudflare challenge or block page.
 *
 * The score is additive and capped at 1:
 *   - up to 0.75 for Cloudflare challenge markers (0.25 each)
 *   - 0.35 for a known interstitial title
 *   - 0.2 for a "Ray ID" reference
 *   - 0.15 for a `cdn-cgi/` path
 *   - 0.2 when the status is 403/503 and some other signal already fired
 *
 * @param {string} html - Raw HTML of the response.
 * @param {number} [statusCode] - HTTP status code, if known.
 * @returns {number} Confidence in the range [0, 1].
 */
function detectCloudflare(html, statusCode) {
    let score = 0;
    // Strong signals — each adds a lot of weight
    const strongSignals = [
        'cf-browser-verification',
        'cf-turnstile',
        'cf-challenge',
        'cf-chl-widget',
        'challenge-running',
        'challenge-form',
        'window._cf_chl_opt',
        '__cf_chl_f_tk',
        'cf_chl_prog',
        'cf-spinner',
        'cf-error-overview',
    ];
    const strongCount = countMatches(html, strongSignals);
    score += Math.min(strongCount * 0.25, 0.75);
    // Title check (extractTitle returns the title lowercased)
    const title = extractTitle(html);
    if (title.includes('just a moment') ||
        title.includes('attention required') ||
        title.includes('checking your browser') ||
        title.includes('one more step')) {
        score += 0.35;
    }
    // Ray ID is a Cloudflare-specific identifier. Word boundaries keep the
    // pattern from firing inside unrelated text such as "array id".
    if (/\bray\s+id\b/i.test(html)) {
        score += 0.2;
    }
    // Cloudflare's cdn-cgi path
    if (has(html, 'cdn-cgi/')) {
        score += 0.15;
    }
    // 403/503 only corroborates; it never scores on its own.
    if ((statusCode === 403 || statusCode === 503) && score > 0) {
        score += 0.2;
    }
    return Math.min(score, 1);
}
|
|
83
|
+
/**
 * Score how likely `html` is a PerimeterX (HUMAN Security) challenge or block page.
 *
 * The score is additive and capped at 1:
 *   - up to 0.8 for PerimeterX markers (0.3 each, matched case-sensitively)
 *   - up to 0.6 for "human or bot" wording (0.25 each, matched on lowercased HTML)
 *   - 0.15 for a block-style title
 *   - 0.5 when "Press & Hold" wording and human-check wording both appear,
 *     0.2 when only one of them does
 *   - 0.2 for a "Reference ID" followed by a long hex/dash identifier
 *   - 0.1 when the status is 403 and some other signal already fired
 *
 * @param {string} html - Raw HTML of the response.
 * @param {number} [statusCode] - HTTP status code, if known.
 * @returns {number} Confidence in the range [0, 1].
 */
function detectPerimeterX(html, statusCode) {
    let score = 0;
    // Use lowercase for case-insensitive matching of new Expedia-style signals
    const htmlLower = html.toLowerCase();
    const signals = [
        'perimeterx',
        '_pxhd',
        'px-captcha',
        '_pxCaptcha',
        'window._pxAppId',
        'window._pxUuid',
        'pxCaptcha',
        '_px3',
        '_pxvid',
        'human.security',
        'px-block',
    ];
    const count = countMatches(html, signals);
    score += Math.min(count * 0.3, 0.8);
    // Case-insensitive Expedia/PerimeterX-specific signals
    const expediaSignals = [
        'human or a bot',
        'show us your human side',
        'human-side',
        'bot or not',
    ];
    const expediaCount = countMatches(htmlLower, expediaSignals);
    score += Math.min(expediaCount * 0.25, 0.6);
    const title = extractTitle(htmlLower);
    if (title.includes('access denied') ||
        title.includes('has been denied') ||
        title.includes('access to this page') ||
        title.includes('please verify') ||
        title.includes('bot detection') ||
        title.includes('pardon our interruption') ||
        title.includes('bot or not')) {
        score += 0.15;
    }
    // PerimeterX "Press & Hold" challenge page (used by Zillow, etc.).
    // Raw HTML normally carries the ampersand escaped, so check both the
    // literal and the `&amp;` form (the old code tested the literal twice).
    const hasPressHold = has(html, 'Press & Hold') ||
        has(html, 'Press &amp; Hold') ||
        has(htmlLower, 'press and hold');
    // 'confirm you are a human' replaces the old typo 'confirm you area human'.
    const hasHumanCheck = has(htmlLower, 'confirm you are human') ||
        has(htmlLower, 'confirm you are a human') ||
        has(htmlLower, 'not a bot') ||
        has(htmlLower, 'human or a bot') ||
        has(htmlLower, 'show us your human side') ||
        has(htmlLower, 'bot or not');
    if (hasPressHold && hasHumanCheck) {
        score += 0.5;
    }
    else if (hasPressHold || hasHumanCheck) {
        score += 0.2;
    }
    // Reference ID pattern is common in PerimeterX block pages (supports "Reference ID:" and "Reference ID " formats)
    if (/reference\s+id[:\s]+[0-9a-f-]{20,}/i.test(html)) {
        score += 0.2;
    }
    // 403 only corroborates; it never scores on its own.
    if (statusCode === 403 && score > 0) {
        score += 0.1;
    }
    return Math.min(score, 1);
}
|
|
139
|
+
/**
 * Score how likely `html` is an Akamai Bot Manager challenge or block page.
 *
 * The score is additive and capped at 1:
 *   - up to 0.6 for Akamai markers (0.2 each)
 *   - 0.2 for an "access denied" / "forbidden" title
 *   - 0.15 when the document is under 2000 characters and a signal already fired
 *   - 0.1 when the status is 403/503 and a signal already fired
 *
 * `__utmz` was removed from the marker list: it is a Google Analytics
 * campaign cookie, not an Akamai artefact, so it added score to any
 * GA-instrumented page.
 *
 * @param {string} html - Raw HTML of the response.
 * @param {number} [statusCode] - HTTP status code, if known.
 * @returns {number} Confidence in the range [0, 1].
 */
function detectAkamai(html, statusCode) {
    let score = 0;
    const signals = [
        'ak_bmsc',
        '_abck',
        'bm_sz',
        'akamaized.net',
        'akamai',
        'bmak.',
        'akam/',
        'BotManagerSettings',
    ];
    const count = countMatches(html, signals);
    score += Math.min(count * 0.2, 0.6);
    // Akamai often shows a short "Access Denied" page
    const title = extractTitle(html);
    if (title.includes('access denied') || title.includes('forbidden')) {
        score += 0.2;
    }
    // Akamai block pages tend to be small
    if (html.length < 2000 && score > 0) {
        score += 0.15;
    }
    // 403/503 only corroborates; it never scores on its own.
    if ((statusCode === 403 || statusCode === 503) && score > 0) {
        score += 0.1;
    }
    return Math.min(score, 1);
}
|
|
168
|
+
/**
 * Score how strongly an HTML response resembles a DataDome challenge page.
 *
 * @param html - HTML to scan; the markers are lowercase, and the visible
 *   caller passes a lowercased copy of the response body.
 * @param _statusCode - Unused; kept so every detector shares one signature.
 * @returns Confidence score clamped to the range [0, 1].
 */
function detectDataDome(html, _statusCode) {
    const markers = [
        'datadome',
        'dd.js',
        'datadome.co',
        'window.ddjskey',
        'ddjskey',
        'dd_referrer',
        'dd_cookie_test',
        'datadome/captcha',
        // DataDome's CAPTCHA delivery infrastructure (used by Etsy, FootLocker, etc.)
        'captcha-delivery.com',
        'geo.captcha-delivery.com',
    ];
    // Each marker contributes 0.3, capped at 0.9.
    const markerScore = Math.min(countMatches(html, markers) * 0.3, 0.9);
    // An inline `var dd={...}` config alongside the captcha-delivery host adds a further 0.4.
    const hasInlineConfig = /\bvar\s+dd\s*=\s*\{/.test(html) && html.includes('captcha-delivery');
    const configBonus = hasInlineConfig ? 0.4 : 0;
    return Math.min(markerScore + configBonus, 1);
}
|
|
191
|
+
/**
 * Score how strongly an HTML response resembles an Imperva Incapsula
 * challenge page.
 *
 * @param html - HTML to scan; the markers are lowercase, and the visible
 *   caller passes a lowercased copy of the response body.
 * @param _statusCode - Unused; kept so every detector shares one signature.
 * @returns Confidence score clamped to the range [0, 1].
 */
function detectIncapsula(html, _statusCode) {
    const markers = [
        'incap_ses_',
        'visid_incap_',
        '_incap_',
        'imperva',
        'incapsula',
        'incapsula.com',
        'incapcookies',
        'reese84',
    ];
    // Each marker contributes 0.3, capped at 0.8.
    const markerScore = Math.min(countMatches(html, markers) * 0.3, 0.8);
    // Incapsula "requires JavaScript" interstitials add a small bonus.
    const needsJavaScript = has(html, 'This site requires JavaScript') ||
        has(html, 'requires javascript');
    const jsBonus = needsJavaScript ? 0.15 : 0;
    return Math.min(markerScore + jsBonus, 1);
}
|
|
212
|
+
/**
 * Detect generic block/challenge pages that don't belong to a specific vendor.
 *
 * We use multiple weak signals and require several of them to fire before
 * flagging — this avoids false positives from pages that merely mention
 * these terms in article content.
 *
 * @param html - HTML to scan; the phrase lists are lowercase, and the visible
 *   caller passes a lowercased copy of the response body.
 * @param statusCode - HTTP status code of the response, if known.
 * @returns Confidence score clamped to the range [0, 1].
 */
function detectGenericBlock(html, statusCode) {
    let score = 0;
    // Title signals (strong)
    const title = extractTitle(html);
    const blockTitles = [
        'access denied',
        'has been denied',
        'has been blocked',
        'access to this page',
        '403 forbidden',
        'bot detected',
        'verify you are human',
        'security check',
        'ddos protection',
        'rate limit exceeded',
        'too many requests',
        'captcha required',
        'robot check',
        'unusual traffic',
        'automated access',
        'browser check',
        'human verification',
        'blocked by',
        'pardon our interruption',
        'bot or not',
        'blocked',
        'verification required',
        'are you a robot',
    ];
    // Only count once from title, no matter how many phrases match.
    if (blockTitles.some((t) => title.includes(t))) {
        score += 0.35;
    }
    // Body signals — but require multiple (to avoid false positives from blog posts)
    const bodySignals = [
        'automated access',
        'suspicious activity',
        'rate limit',
        'bot detected',
        'verify you are human',
        'verify that you are human',
        'confirm you are human',
        'confirm you area human', // known PerimeterX typo in the wild
        'are you a robot',
        'are you human',
        'not a bot',
        'and not a bot',
        'press & hold',
        'press and hold',
        'ddos protection by',
        'please complete the security check',
        'this page checks to see if it',
        'prove you are human',
        'security challenge',
        'enable javascript and cookies',
        'javascript and cookies to continue',
        'enable cookies',
        'reference id', // PerimeterX block pages include a Reference ID
        'why have i been blocked',
        'your access has been blocked',
        'detected unusual activity',
        // New patterns for additional challenge pages
        'human or a bot',
        'show us your human side',
        'bot or not',
        'complete a captcha',
        'solve this puzzle',
        'verify your identity',
        'unusual traffic',
        'too many requests',
        'access denied',
        'automated traffic',
        'we need to verify',
        'human verification',
        'browser verification',
        'checking your browser',
        'please wait while we verify',
        'blocked by',
    ];
    const bodyCount = countMatches(html, bodySignals);
    // Require at least 2 body signals to avoid flagging a blog post mentioning one
    if (bodyCount >= 2) {
        score += Math.min((bodyCount - 1) * 0.15, 0.4);
    }
    else if (bodyCount === 1 && title.length === 0) {
        // Single body signal + no title = weak signal only
        score += 0.05;
    }
    // Very short response with an error status
    if (html.length < 1000 && (statusCode === 403 || statusCode === 503 || statusCode === 429)) {
        score += 0.25;
        // Tiny pages (< 500 chars) with a block status are almost certainly block pages
        if (html.length < 500) {
            score += 0.15;
        }
    }
    // Meta refresh to a captcha/challenge URL — this ONLY happens on challenge interstitials;
    // real content pages never redirect to a captcha URL via meta-refresh.
    // The keyword must appear inside the same <meta ...> tag as "refresh" (in either
    // attribute order). Testing for the keyword anywhere in the document would let an
    // unrelated meta refresh plus the ordinary word "challenge" add 0.75 on its own,
    // which is enough to cross the caller's detection threshold.
    const metaRefreshToChallenge = /<meta\b[^>]*\brefresh\b[^>]*(?:captcha|challenge)|<meta\b[^>]*(?:captcha|challenge)[^>]*\brefresh\b/i;
    if (metaRefreshToChallenge.test(html)) {
        score += 0.75;
    }
    // Page is almost entirely a form with nothing else (login-wall-adjacent)
    // We want to avoid flagging actual login pages here, so only trigger if
    // combined with other signals.
    if (score > 0.2) {
        const formOnly = html.length < 3000 &&
            (html.match(/<form/gi) || []).length > 0 &&
            estimateVisibleTextLength(html) < 150;
        if (formOnly) {
            score += 0.15;
        }
    }
    // HTTP 429 on its own is a strong rate-limit signal
    if (statusCode === 429) {
        score += 0.25;
    }
    // A page that is mostly/entirely an iframe to a captcha service
    // (short HTML + iframe with captcha in src/title)
    if (html.length < 2000 &&
        /iframe[^>]*captcha/i.test(html) &&
        (statusCode === 403 || statusCode === 503 || statusCode === 429)) {
        score += 0.5;
    }
    return Math.min(score, 1);
}
|
|
346
|
+
/**
 * Detect SPA shells: a sizeable HTML payload that carries almost no visible
 * text, as returned by JS-rendered sites when no JavaScript has run.
 *
 * @param html - Raw HTML response body.
 * @param _statusCode - Unused; kept so every detector shares one signature.
 * @returns 0 when the page is small (< 2000 chars) or has at least 200 chars
 *   of visible text; otherwise a confidence starting at 0.65, raised by
 *   known root elements and script-tag count, clamped to 1.
 */
function detectEmptyShell(html, _statusCode) {
    // Small payloads are just small pages; pages with enough visible text are not shells.
    if (html.length < 2000 || estimateVisibleTextLength(html) >= 200) {
        return 0;
    }
    // Base confidence for a large document with next to no text.
    let confidence = 0.65;
    // Known SPA mount points, both as empty elements and as bare ids (weaker).
    const rootMarkers = [
        '<div id="root"></div>',
        '<div id="root"> </div>',
        '<div id="app"></div>',
        '<div id="app"> </div>',
        '<div id="__next"></div>',
        '<div id="__next"> </div>',
        '<div id="gatsby-focus-wrapper"></div>',
        '<div id="___gatsby"></div>',
        'id="root"', // weaker — just presence of root
        'id="__next"', // Next.js
    ];
    const rootHits = countMatches(html, rootMarkers);
    if (rootHits > 0) {
        confidence += Math.min(rootHits * 0.1, 0.2);
    }
    // Three or more script tags on a near-textless page point to an app shell.
    const scriptTags = html.match(/<script/gi);
    const scriptTagCount = scriptTags ? scriptTags.length : 0;
    if (scriptTagCount >= 3) {
        confidence += 0.1;
    }
    return Math.min(confidence, 1);
}
|
|
383
|
+
/* ---------- false-positive guards --------------------------------------- */
|
|
384
|
+
/**
 * False-positive guard: reports whether the page carries enough visible text
 * to be treated as genuine content that merely mentions security/captcha
 * terms (e.g. a blog post ABOUT CAPTCHAs).
 *
 * @param html - HTML response body.
 * @returns true when visible text exceeds 1500 chars, or exceeds 600 chars
 *   in a document longer than 5000 chars.
 */
function looksLikeRealContent(html) {
    const visibleChars = estimateVisibleTextLength(html);
    // Plenty of text is decisive on its own; a moderate amount only counts on a non-tiny page.
    return visibleChars > 1500 || (visibleChars > 600 && html.length > 5000);
}
|
|
398
|
+
/**
 * False-positive guard: reports whether the response is an ordinary 404 page
 * rather than a block page. Short 404 pages can otherwise be mistaken for blocks.
 *
 * @param html - HTML response body.
 * @param statusCode - HTTP status code of the response, if known.
 * @returns true only when the status is 404 and the title contains a
 *   "not found" style phrase.
 */
function looksLike404(html, statusCode) {
    if (statusCode !== 404) {
        return false;
    }
    const pageTitle = extractTitle(html);
    const notFoundPhrases = ['not found', '404', 'page not found', 'error 404'];
    return notFoundPhrases.some((phrase) => pageTitle.includes(phrase));
}
|
|
411
|
+
/* ---------- main export -------------------------------------------------- */
|
|
412
|
+
/**
 * Decide whether an HTML response is a bot-challenge or block page.
 *
 * Every detector is run and the highest score wins. The result is reported
 * as a challenge only when that score reaches 0.7 and the page was not
 * recognized as a plain 404 or as substantial real content.
 *
 * @param html - Raw HTML response body.
 * @param statusCode - HTTP status code (optional but improves accuracy).
 * @returns An object with `isChallenge` and `confidence`, plus `type` and/or
 *   `details` on some paths.
 */
export function detectChallenge(html, statusCode) {
    const THRESHOLD = 0.7;
    // Nothing to inspect for empty or missing input.
    if (!html || html.length === 0) {
        return { isChallenge: false, confidence: 0 };
    }
    // Evaluate the false-positive guards up front. The real-content guard is only
    // applied after scoring, because empty-shell detection must still be allowed to win.
    const hasRealContent = looksLikeRealContent(html);
    if (looksLike404(html, statusCode)) {
        return { isChallenge: false, confidence: 0, details: '404 page' };
    }
    // Lowercase copy for the detectors whose patterns are case-insensitive.
    const htmlLower = html.toLowerCase();
    const candidates = [
        { type: 'cloudflare', score: detectCloudflare(html, statusCode) },
        { type: 'perimeterx', score: detectPerimeterX(html, statusCode) },
        { type: 'akamai', score: detectAkamai(html, statusCode) },
        { type: 'datadome', score: detectDataDome(htmlLower, statusCode) },
        { type: 'incapsula', score: detectIncapsula(htmlLower, statusCode) },
        { type: 'generic-block', score: detectGenericBlock(htmlLower, statusCode) },
        { type: 'empty-shell', score: detectEmptyShell(html, statusCode) },
    ];
    // Highest score wins; on a tie the earlier entry is kept.
    const winner = candidates.reduce((top, candidate) => (candidate.score > top.score ? candidate : top));
    // Real content suppresses every verdict except empty-shell
    // (a blog post about Cloudflare can mention cf patterns in quoted code blocks).
    if (hasRealContent && winner.type !== 'empty-shell') {
        return {
            isChallenge: false,
            confidence: winner.score * 0.4,
            details: 'Suppressed: page has substantial real content',
        };
    }
    if (winner.score >= THRESHOLD) {
        return {
            isChallenge: true,
            type: winner.type,
            confidence: winner.score,
            details: `Detected as ${winner.type} (confidence ${winner.score.toFixed(2)})`,
        };
    }
    return { isChallenge: false, confidence: winner.score };
}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local-first content change tracking
|
|
3
|
+
* Stores snapshots in ~/.webpeel/snapshots/ and provides diffing
|
|
4
|
+
*/
|
|
5
|
+
/** A stored capture of one URL's content, used as the baseline for change tracking. */
export interface Snapshot {
    /** URL the snapshot belongs to. */
    url: string;
    /** Content fingerprint (a SHA256 hash, per the `trackChange` documentation). */
    fingerprint: string;
    /** Content captured for the URL. */
    content: string;
    /** Capture time as a number accepted by `new Date(...)` — presumably epoch milliseconds; confirm against the writer. */
    timestamp: number;
    /** Optional free-form extra data stored with the snapshot. */
    metadata?: {
        [key: string]: any;
    };
}
|
|
12
|
+
/** Outcome of comparing current content against the previously stored snapshot. */
export interface ChangeResult {
    /** Classification of the comparison. */
    changeStatus: 'new' | 'same' | 'changed' | 'removed';
    /** Time of the previous scrape as a string, or null — presumably null when no earlier snapshot exists; confirm against the implementation. */
    previousScrapeAt: string | null;
    /** Diff details; optional, so callers must check for presence before reading it. */
    diff?: {
        /** Diff rendered as text. */
        text: string;
        /** Count of additions. */
        additions: number;
        /** Count of deletions. */
        deletions: number;
        /** Per-line entries, each tagged as added, deleted, or unchanged. */
        changes: {
            type: 'add' | 'del' | 'normal';
            line: number;
            content: string;
        }[];
    };
}
|
|
26
|
+
/**
 * Look up the stored snapshot for a URL.
 *
 * @param url - URL whose snapshot should be retrieved
 * @returns A promise resolving to the snapshot, or to null when none exists
 *
 * @example
 * ```typescript
 * const snapshot = await getSnapshot('https://example.com');
 * if (snapshot !== null) {
 *   console.log('Last scraped:', new Date(snapshot.timestamp));
 * }
 * ```
 */
export declare function getSnapshot(url: string): Promise<Snapshot | null>;
|
|
41
|
+
/**
 * Track content changes for a URL: compares the supplied content with the
 * previous snapshot and saves a new one.
 *
 * @param url - URL being tracked
 * @param content - Current content
 * @param fingerprint - Content fingerprint (SHA256 hash)
 * @returns A promise resolving to the change detection result
 *
 * @example
 * ```typescript
 * const result = await trackChange('https://example.com', content, fingerprint);
 * // `diff` is optional on ChangeResult, so check it before reading the counts.
 * if (result.changeStatus === 'changed' && result.diff) {
 *   console.log('Content changed!');
 *   console.log(`+${result.diff.additions} -${result.diff.deletions}`);
 * }
 * ```
 */
export declare function trackChange(url: string, content: string, fingerprint: string): Promise<ChangeResult>;
|
|
60
|
+
/**
 * Clear stored snapshots, optionally limited to URLs matching a pattern.
 *
 * @param urlPattern - Optional regex pattern to match URLs (if not provided, clears all)
 * @returns A promise resolving to the number of snapshots cleared
 *
 * @example
 * ```typescript
 * // Clear all snapshots
 * const clearedAll = await clearSnapshots();
 *
 * // Clear a specific domain (the pattern is a regex source, so escape the dot)
 * const clearedForDomain = await clearSnapshots('example\\.com');
 * ```
 */
export declare function clearSnapshots(urlPattern?: string): Promise<number>;
|