@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Authentication wall detection.
|
|
3
|
+
*
|
|
4
|
+
* Analyzes raw HTML (and optional HTTP status code) to determine whether the
|
|
5
|
+
* response is a login/auth wall rather than real content.
|
|
6
|
+
*
|
|
7
|
+
* Design goals:
|
|
8
|
+
* - Fast: pure string/regex matching, no DOM parsing required
|
|
9
|
+
* - Low false-positive rate: uses confidence scoring, only flags at >= 0.5
|
|
10
|
+
* - Ignores ACTUAL login pages (user navigated there intentionally)
|
|
11
|
+
* - No external dependencies
|
|
12
|
+
*/
|
|
13
|
+
/** Category of authentication barrier that a detection result can report. */
export type AuthWallType =
    | 'login-form'
    | 'oauth-redirect'
    | 'paywall'
    | 'signup-required'
    | 'generic';
|
|
14
|
+
/** Outcome of analysing a response for an authentication wall. */
export interface AuthDetectionResult {
    /** Whether the response was classified as an auth wall. */
    isAuthWall: boolean;
    /** Confidence score in the range 0-1. */
    confidence: number;
    /** Which kind of authentication is being demanded, when known. */
    type?: AuthWallType;
    /** Human-readable explanation of the verdict. */
    details?: string;
}
|
|
23
|
+
/**
|
|
24
|
+
* Detect whether an HTML response is an authentication/login wall.
|
|
25
|
+
*
|
|
26
|
+
* Returns `isAuthWall: true` only when confidence >= 0.5.
|
|
27
|
+
*
|
|
28
|
+
* **Important:** If the URL itself is a login/auth path (e.g. `/login`),
|
|
29
|
+
* returns `{ isAuthWall: false }` — the user navigated there intentionally.
|
|
30
|
+
*
|
|
31
|
+
* @param html - Raw HTML response body.
|
|
32
|
+
* @param url - Final URL (after redirects).
|
|
33
|
+
* @param statusCode - HTTP status code (optional but improves accuracy).
|
|
34
|
+
*/
|
|
35
|
+
export declare function detectAuthWall(html: string, url: string, statusCode?: number): AuthDetectionResult;
|
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Authentication wall detection.
|
|
3
|
+
*
|
|
4
|
+
* Analyzes raw HTML (and optional HTTP status code) to determine whether the
|
|
5
|
+
* response is a login/auth wall rather than real content.
|
|
6
|
+
*
|
|
7
|
+
* Design goals:
|
|
8
|
+
* - Fast: pure string/regex matching, no DOM parsing required
|
|
9
|
+
* - Low false-positive rate: uses confidence scoring, only flags at >= 0.5
|
|
10
|
+
* - Ignores ACTUAL login pages (user navigated there intentionally)
|
|
11
|
+
* - No external dependencies
|
|
12
|
+
*/
|
|
13
|
+
/* ---------- helpers ------------------------------------------------------ */
|
|
14
|
+
/**
 * Count how many of the given needles occur somewhere in `html`.
 *
 * Each needle contributes at most one to the total, no matter how many times
 * it appears. Matching is a plain, case-sensitive substring test, so callers
 * that want case-insensitive matching must lowercase the haystack first.
 */
function countMatches(html, needles) {
    return Array.from(needles).filter((candidate) => html.includes(candidate)).length;
}
|
|
23
|
+
/**
 * Pull the text of the first `<title>` element out of raw HTML.
 *
 * The text is lowercased and trimmed. An empty string is returned when no
 * `<title>…</title>` pair is found, or when the title contains nested markup
 * (the capture stops at the first `<`).
 */
function extractTitle(html) {
    const found = /<title[^>]*>([^<]*)<\/title>/i.exec(html);
    if (found === null) {
        return '';
    }
    return found[1].toLowerCase().trim();
}
|
|
28
|
+
/**
 * Approximate how many characters of human-visible text the HTML contains.
 *
 * Whole `<script>`, `<style>` and `<noscript>` elements are dropped first,
 * then every remaining tag is removed, whitespace runs are collapsed to a
 * single space and the result is trimmed. Only the resulting length is
 * returned. This is a regex-based estimate, not a DOM parse.
 */
function estimateVisibleTextLength(html) {
    const invisibleBlocks = [
        /<script[\s\S]*?<\/script>/gi,
        /<style[\s\S]*?<\/style>/gi,
        /<noscript[\s\S]*?<\/noscript>/gi,
    ];
    let text = html;
    for (const pattern of invisibleBlocks) {
        text = text.replace(pattern, '');
    }
    // Strip whatever tags remain, keeping only their text content.
    text = text.replace(/<[^>]*>/g, '');
    return text.replace(/\s+/g, ' ').trim().length;
}
|
|
39
|
+
/**
 * Returns true when the URL path itself IS a login/auth page.
 * In that case the user navigated there intentionally — don't flag as auth wall.
 *
 * A path qualifies when any of its segments is exactly one of: login, log-in,
 * signin, sign-in, auth, authenticate, signup, sign-up, register or
 * registration (case-insensitive). Examples: `/login`, `/account/login`,
 * `/auth/callback`. Paths such as `/author` or `/login-tips` do not qualify.
 *
 * @param url - Absolute URL to inspect.
 * @returns `true` for auth-page paths; `false` otherwise, including when
 *          `url` cannot be parsed.
 */
function urlIsAuthPage(url) {
    let path;
    try {
        path = new URL(url).pathname.toLowerCase();
    }
    catch {
        // Unparseable URL: we cannot tell, so do not treat it as an auth page.
        return false;
    }
    // `pathname` never contains the query or fragment, so a segment only needs
    // to be followed by "/" or the end of the path. The end-of-path case also
    // covers exact matches such as "/login", so no separate equality checks
    // are required.
    return /\/(login|log-in|signin|sign-in|auth|authenticate|signup|sign-up|register|registration)(\/|$)/.test(path);
}
|
|
64
|
+
/* ---------- signal detectors --------------------------------------------- */
|
|
65
|
+
/**
 * HIGH confidence: Login form with password field and sparse content.
 *
 * Returns 0 unless the HTML contains both a password `<input>` and a `<form>`
 * tag. Otherwise the score depends on how much visible text surrounds the
 * form: the less text, the more likely the page is nothing but a login wall.
 *
 * @param html - Raw HTML response body.
 * @param _htmlLower - Unused; kept so all detectors share a call shape.
 * @returns 0, 0.10, 0.25 or 0.40.
 */
function scoreLoginForm(html, _htmlLower) {
    // HTML permits unquoted attribute values (`type=password`), which minified
    // pages use routinely, so accept the quoted and the unquoted form. The
    // lookahead keeps unquoted values such as `type=passwordless` from matching.
    const hasPasswordInput = /<input[^>]*type\s*=\s*(?:["']password["']|password(?=[\s\/>]))[^>]*>/i.test(html);
    if (!hasPasswordInput)
        return 0;
    const hasForm = /<form[^>]*>/i.test(html);
    if (!hasForm)
        return 0;
    // High confidence: form + password + sparse content
    const visibleLen = estimateVisibleTextLength(html);
    if (visibleLen < 300) {
        return 0.40; // Very sparse — strong signal
    }
    if (visibleLen < 800) {
        return 0.25; // Somewhat sparse
    }
    // Password form on a content page with reasonable text — weak signal
    return 0.10;
}
|
|
84
|
+
/**
 * HIGH confidence: HTTP 401/403 accompanied by auth-related wording.
 *
 * Any status other than 401 or 403 scores 0. A 401 scores 0.35 on its own and
 * 0.45 when at least one auth keyword is present. A 403 scores only through
 * keywords: 0.15 for exactly one, 0.30 for two or more.
 *
 * @param _html - Unused; kept so all detectors share a call shape.
 * @param htmlLower - Lowercased HTML to search for keywords.
 * @param statusCode - HTTP status code, if known.
 */
function scoreStatusCode(_html, htmlLower, statusCode) {
    const isUnauthorized = statusCode === 401;
    const isForbidden = statusCode === 403;
    if (!isUnauthorized && !isForbidden) {
        return 0;
    }
    const authKeywords = [
        'log in', 'login', 'sign in', 'signin', 'authenticate',
        'unauthorized', 'forbidden', 'access denied',
        'please log', 'please sign',
    ];
    // Number of distinct keywords present (each counts once).
    const hits = authKeywords.filter((keyword) => htmlLower.includes(keyword)).length;
    if (isUnauthorized) {
        // 401 Unauthorized almost always means auth required
        return hits > 0 ? 0.45 : 0.35;
    }
    // 403 is ambiguous on its own, so it needs keyword support.
    if (hits >= 2) {
        return 0.30;
    }
    return hits === 1 ? 0.15 : 0;
}
|
|
101
|
+
/**
 * MEDIUM confidence: the page title mentions authentication.
 *
 * Scores 0.20 when the lowercased `<title>` text contains any of the known
 * auth phrases, otherwise 0. Multiple phrases do not add up.
 */
function scoreTitleSignals(html) {
    const pageTitle = extractTitle(html);
    const authTitles = [
        'log in', 'login', 'sign in', 'signin', 'sign up', 'signup',
        'register', 'authenticate', 'authentication',
        'create account', 'create an account',
        'access denied', 'unauthorized',
    ];
    const mentionsAuth = authTitles.some((phrase) => pageTitle.includes(phrase));
    return mentionsAuth ? 0.20 : 0;
}
|
|
116
|
+
/**
 * MEDIUM confidence: auth-wall style class names appear in the page.
 *
 * The lower-cased markup and the marker list are handed to `countMatches`;
 * two or more hits score slightly higher than a single hit.
 *
 * @param html - Raw HTML of the page.
 * @returns 0, 0.20 or 0.25.
 */
function scoreCssClasses(html) {
    const gateMarkers = [
        'login-wall', 'auth-wall', 'signin-gate', 'login-gate',
        'access-gate', 'content-gate', 'paywall', 'sign-in-gate',
        'registration-wall', 'auth-gate', 'login-modal', 'signin-modal',
        'auth-modal', 'auth-overlay', 'login-overlay',
    ];
    const markerHits = countMatches(html.toLowerCase(), gateMarkers);
    if (markerHits >= 2) {
        return 0.25;
    }
    return markerHits === 1 ? 0.20 : 0;
}
|
|
131
|
+
/**
 * MEDIUM confidence: OAuth / social sign-in buttons or endpoints are present.
 *
 * @param htmlLower - Lower-cased HTML handed to `countMatches` together with
 *   the button labels and OAuth path fragments below.
 * @returns 0.15 for one hit, 0.20 for two, 0.25 for three or more, else 0.
 */
function scoreOAuthButtons(htmlLower) {
    const buttonLabels = [
        'sign in with google',
        'login with google',
        'continue with google',
        'sign in with github',
        'login with github',
        'sign in with facebook',
        'login with facebook',
        'sign in with apple',
        'continue with apple',
        'sign in with twitter',
        'sign in with microsoft',
    ];
    const endpointPaths = [
        '/auth/google',
        '/auth/github',
        '/auth/facebook',
        '/oauth/google',
        '/oauth/github',
    ];
    const hits = countMatches(htmlLower, [...buttonLabels, ...endpointPaths]);
    if (hits >= 3) {
        return 0.25;
    }
    if (hits >= 2) {
        return 0.20;
    }
    return hits === 1 ? 0.15 : 0;
}
|
|
160
|
+
/**
 * MEDIUM confidence: short page carrying a complete password form.
 *
 * Only documents shorter than 5000 characters of raw HTML are considered.
 * The page must contain a `<form>` tag, a password `<input>`, and either a
 * `<button>` or a submit `<input>`.
 *
 * @param html - Raw HTML of the page.
 * @returns 0.20 when all three elements are present on a short page, else 0.
 */
function scoreShortPageWithForm(html) {
    if (html.length >= 5000)
        return 0;
    // HTML permits unquoted attribute values (`type=password`), so accept
    // those in addition to quoted ones. The lookahead requires the bare value
    // to end at whitespace, `/` or `>` so `type=passwordless` is not matched.
    const hasPasswordInput = /<input[^>]*type\s*=\s*(?:["']password["']|password(?=[\s\/>]))[^>]*>/i.test(html);
    const hasForm = /<form[^>]*>/i.test(html);
    const hasSubmit = /<button[^>]*>|<input[^>]*type\s*=\s*(?:["']submit["']|submit(?=[\s\/>]))[^>]*>/i.test(html);
    if (hasPasswordInput && hasForm && hasSubmit)
        return 0.20;
    return 0;
}
|
|
171
|
+
/**
 * MEDIUM confidence: script-driven redirect to an auth URL.
 *
 * Looks for a `location` assignment, `location.href = …`,
 * `location.replace(…)` or `location.assign(…)` whose string argument
 * contains a path segment starting with login, signin, auth, signup or
 * register. Pages without any `<script` tag score 0 immediately.
 *
 * @param html - Raw HTML of the page.
 * @returns 0.20 when a matching redirect is found, else 0.
 */
function scoreJsRedirect(html) {
    if (!/<script/i.test(html))
        return 0;
    const redirectPatterns = [
        /window\.location\s*[=.]\s*["'][^"']*\/(login|signin|auth|signup|register)/i,
        /location\.href\s*=\s*["'][^"']*\/(login|signin|auth|signup|register)/i,
        // `assign` navigates just like `replace`, so both calls are treated alike.
        /location\.(?:replace|assign)\s*\(\s*["'][^"']*\/(login|signin|auth|signup|register)/i,
    ];
    return redirectPatterns.some((pattern) => pattern.test(html)) ? 0.20 : 0;
}
|
|
186
|
+
/**
 * MEDIUM confidence: a `<meta>` tag's `content` attribute mentions auth.
 *
 * Every `<meta … content="…">` value is lower-cased and checked for the
 * substrings "log in", "sign in", "login", "signin" and "authenticate".
 *
 * @param html - Raw HTML of the page.
 * @returns 0.15 as soon as one matching meta tag is found, else 0.
 */
function scoreMetaTags(html) {
    // The value must be closed by the same quote character that opened it
    // (back-reference \1). Otherwise an apostrophe inside a double-quoted
    // value, e.g. content="Don't … sign in", would cut the capture short.
    const metaRegex = /<meta[^>]*content\s*=\s*(["'])([\s\S]*?)\1[^>]*>/gi;
    const authTerms = ['log in', 'sign in', 'login', 'signin', 'authenticate'];
    let match;
    while ((match = metaRegex.exec(html)) !== null) {
        const content = match[2].toLowerCase();
        if (authTerms.some((term) => content.includes(term))) {
            return 0.15;
        }
    }
    return 0;
}
|
|
202
|
+
/**
 * LOW confidence: wording that tells the visitor authentication is required.
 *
 * @param htmlLower - Lower-cased HTML handed to `countMatches` together with
 *   the phrase list below.
 * @returns 0.08 for one hit, 0.12 for two, 0.15 for three or more, else 0.
 */
function scoreAuthPhrases(htmlLower) {
    const gatePhrases = [
        'sign in to continue',
        'log in to continue',
        'login to continue',
        'sign in to view',
        'log in to view',
        'please sign in',
        'please log in',
        'please login',
        'create an account to',
        'create account to',
        'you must be logged in',
        'you need to log in',
        'you need to sign in',
        'members only',
        'subscribers only',
        'login required',
        'sign in required',
        'authentication required',
    ];
    const hits = countMatches(htmlLower, gatePhrases);
    if (hits >= 3) {
        return 0.15;
    }
    if (hits >= 2) {
        return 0.12;
    }
    return hits === 1 ? 0.08 : 0;
}
|
|
233
|
+
/**
 * LOW confidence: a `<noscript>` block mentions authentication.
 *
 * All `<noscript>…</noscript>` blocks are joined, lower-cased and searched
 * for auth wording.
 *
 * @param html - Raw HTML of the page.
 * @returns 0.08 when auth wording is found inside a noscript block, else 0.
 */
function scoreNoscriptAuth(html) {
    const noscriptBlocks = html.match(/<noscript[^>]*>([\s\S]*?)<\/noscript>/gi);
    if (!noscriptBlocks)
        return 0;
    const noscriptText = noscriptBlocks.join(' ').toLowerCase();
    if (noscriptText.includes('login') ||
        noscriptText.includes('sign in') ||
        noscriptText.includes('authenticate')) {
        return 0.08;
    }
    // "log in" and "signin" are checked by the meta-tag detector but were
    // missing here. They are matched on word boundaries so that text such as
    // "blog in" or "designing" does not count as auth wording.
    if (/\blog in\b|\bsignin\b/.test(noscriptText)) {
        return 0.08;
    }
    return 0;
}
|
|
246
|
+
/**
 * LOW confidence: several social-provider names on a page with almost no text.
 *
 * @param htmlLower - Lower-cased HTML handed to `countMatches` with the
 *   provider names.
 * @param html - Raw HTML, used to estimate the visible text length.
 * @returns 0.10 when at least two providers are counted and the estimated
 *   visible text is shorter than 200 characters, else 0.
 */
function scoreSocialLoginSparse(htmlLower, html) {
    const providers = [
        'google', 'github', 'facebook', 'apple', 'microsoft', 'twitter',
    ];
    const providerHits = countMatches(htmlLower, providers);
    if (providerHits < 2) {
        return 0;
    }
    // Provider names alone are common on ordinary pages; they only count
    // when there is next to no other content.
    return estimateVisibleTextLength(html) < 200 ? 0.10 : 0;
}
|
|
259
|
+
/**
 * Pick the auth-wall type label from the individual signal scores.
 *
 * Checks run in priority order: HTTP status, login form, OAuth buttons,
 * CSS gate classes, then auth phrases. Anything else is 'generic'.
 *
 * @param scores - Object with numeric `status`, `loginForm`, `oauth`,
 *   `cssClasses` and `authPhrases` fields.
 * @returns 'generic', 'login-form', 'oauth-redirect' or 'signup-required'.
 */
function detectType(scores) {
    const { status, loginForm, oauth, cssClasses, authPhrases } = scores;
    if (status > 0.25) {
        return 'generic';
    }
    if (loginForm >= 0.25) {
        return 'login-form';
    }
    if (oauth >= 0.20) {
        return 'oauth-redirect';
    }
    // Gate classes decide alone when strong, or when backed by any phrase hit.
    const strongCssSignal = cssClasses >= 0.25;
    const cssBackedByPhrases = cssClasses >= 0.20 && authPhrases > 0;
    if (strongCssSignal || cssBackedByPhrases) {
        return 'generic';
    }
    return authPhrases >= 0.12 ? 'signup-required' : 'generic';
}
|
|
274
|
+
/* ---------- main export -------------------------------------------------- */
|
|
275
|
+
/**
 * Detect whether an HTML response is an authentication/login wall.
 *
 * Each signal detector contributes a partial score; the sum, capped at 1.0,
 * is the confidence. Returns `isAuthWall: true` only when confidence >= 0.5.
 *
 * **Important:** If the URL itself is a login/auth path (e.g. `/login`),
 * returns `{ isAuthWall: false }` — the user navigated there intentionally.
 * Pages whose estimated visible text exceeds 2000 characters are also never
 * flagged.
 *
 * @param html - Raw HTML response body.
 * @param url - Final URL (after redirects).
 * @param statusCode - HTTP status code (optional but improves accuracy).
 * @returns An object with `isAuthWall` and `confidence`; positive results
 *   also carry `type` and a human-readable `details` string listing every
 *   signal that contributed to the confidence.
 */
export function detectAuthWall(html, url, statusCode) {
    const THRESHOLD = 0.5;
    // Sanity — empty input
    if (!html || html.length === 0) {
        return { isAuthWall: false, confidence: 0 };
    }
    // If the URL itself IS a login/auth page, don't flag — user navigated there intentionally
    if (urlIsAuthPage(url)) {
        return { isAuthWall: false, confidence: 0, details: 'URL is a login/auth page — user navigated there intentionally' };
    }
    // Real content pages (lots of visible text) are almost never auth walls
    const visibleLen = estimateVisibleTextLength(html);
    if (visibleLen > 2000) {
        return { isAuthWall: false, confidence: 0, details: 'Page has substantial real content' };
    }
    const htmlLower = html.toLowerCase();
    // --- Score each signal ---
    const loginFormScore = scoreLoginForm(html, htmlLower);
    const statusScore = scoreStatusCode(html, htmlLower, statusCode);
    const titleScore = scoreTitleSignals(html);
    const cssClassScore = scoreCssClasses(html);
    const oauthScore = scoreOAuthButtons(htmlLower);
    const shortPageScore = scoreShortPageWithForm(html);
    const jsRedirectScore = scoreJsRedirect(html);
    const metaScore = scoreMetaTags(html);
    const phraseScore = scoreAuthPhrases(htmlLower);
    const noscriptScore = scoreNoscriptAuth(html);
    const socialSparseScore = scoreSocialLoginSparse(htmlLower, html);
    const totalScore = loginFormScore +
        statusScore +
        titleScore +
        cssClassScore +
        oauthScore +
        shortPageScore +
        jsRedirectScore +
        metaScore +
        phraseScore +
        noscriptScore +
        socialSparseScore;
    // Cap at 1.0
    const confidence = Math.min(1.0, totalScore);
    if (confidence < THRESHOLD) {
        return { isAuthWall: false, confidence };
    }
    const type = detectType({
        loginForm: loginFormScore,
        oauth: oauthScore,
        cssClasses: cssClassScore,
        authPhrases: phraseScore,
        status: statusScore,
    });
    // Build a human-readable detail string. Every signal that is part of the
    // sum is listed, in summation order, so the reported contributions
    // account for the reported confidence.
    const signals = [];
    if (loginFormScore > 0)
        signals.push(`login form (${loginFormScore.toFixed(2)})`);
    if (statusScore > 0)
        signals.push(`HTTP ${statusCode} (${statusScore.toFixed(2)})`);
    if (titleScore > 0)
        signals.push(`auth title (${titleScore.toFixed(2)})`);
    if (cssClassScore > 0)
        signals.push(`auth CSS class (${cssClassScore.toFixed(2)})`);
    if (oauthScore > 0)
        signals.push(`OAuth buttons (${oauthScore.toFixed(2)})`);
    if (shortPageScore > 0)
        signals.push(`short page with form (${shortPageScore.toFixed(2)})`);
    if (jsRedirectScore > 0)
        signals.push(`JS redirect (${jsRedirectScore.toFixed(2)})`);
    if (metaScore > 0)
        signals.push(`auth meta tag (${metaScore.toFixed(2)})`);
    if (phraseScore > 0)
        signals.push(`auth phrases (${phraseScore.toFixed(2)})`);
    if (noscriptScore > 0)
        signals.push(`noscript auth (${noscriptScore.toFixed(2)})`);
    if (socialSparseScore > 0)
        signals.push(`sparse social login (${socialSparseScore.toFixed(2)})`);
    return {
        isAuthWall: true,
        confidence,
        type,
        details: `Auth wall detected (${type}): ${signals.join(', ')} → confidence ${confidence.toFixed(2)}`,
    };
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auto-extraction module — heuristic + CSS selector based structured data extraction.
|
|
3
|
+
* No LLM API key required.
|
|
4
|
+
*
|
|
5
|
+
* Supports:
|
|
6
|
+
* - pricing : pricing tables / plan cards
|
|
7
|
+
* - products : product grids / listings
|
|
8
|
+
* - contact : emails, phones, addresses, social links
|
|
9
|
+
* - article : blog posts / news articles
|
|
10
|
+
* - api_docs : REST API endpoint documentation
|
|
11
|
+
* - unknown : fallback when no type is detected
|
|
12
|
+
*/
|
|
13
|
+
/** One plan entry inside a pricing extraction result. */
export interface PricingPlan {
    /** Name of the plan. */
    name: string;
    /** Price, carried as a string (exact format is not fixed by this declaration). */
    price: string;
    /** Optional period string; presumably the billing period — confirm against the extractor. */
    period?: string;
    /** Feature strings collected for the plan. */
    features: string[];
    /** Optional call-to-action text (assumed from the field name). */
    cta?: string;
}
|
|
20
|
+
/** Extraction result for pricing pages (pricing tables / plan cards). */
export interface PricingResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'pricing';
    /** The plans that were extracted. */
    plans: PricingPlan[];
}
|
|
24
|
+
/** One product entry inside a products extraction result. */
export interface ProductItem {
    /** Name of the product. */
    name: string;
    /** Optional price, carried as a string. */
    price?: string;
    /** Optional image reference (presumably a URL — confirm against the extractor). */
    image?: string;
    /** Optional link for the product. */
    url?: string;
    /** Optional rating, carried as a string. */
    rating?: string;
}
|
|
31
|
+
/** Extraction result for product pages (product grids / listings). */
export interface ProductsResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'products';
    /** The product entries that were extracted. */
    items: ProductItem[];
}
|
|
35
|
+
/** Extraction result for contact pages (emails, phones, addresses, social links). */
export interface ContactResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'contact';
    /** Email addresses found. */
    emails: string[];
    /** Phone numbers found, as strings. */
    phones: string[];
    /** Postal addresses found, as strings. */
    addresses: string[];
    /** Social links as a string-to-string map (presumably keyed by network name — confirm). */
    social: Record<string, string>;
}
|
|
42
|
+
/** One heading/content pair inside an article extraction result. */
export interface ArticleSection {
    /** Heading text of the section. */
    heading: string;
    /** Body text belonging to the section. */
    content: string;
}
|
|
46
|
+
/** Extraction result for article pages (blog posts / news articles). */
export interface ArticleResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'article';
    /** Optional article title. */
    title?: string;
    /** Optional author, as a string. */
    author?: string;
    /** Optional date, as a string (format is not fixed by this declaration). */
    date?: string;
    /** Optional reading time, as a string. */
    readingTime?: string;
    /** Optional summary text. */
    summary?: string;
    /** Sections extracted from the article. */
    sections: ArticleSection[];
}
|
|
55
|
+
/** One endpoint entry inside an API-docs extraction result. */
export interface ApiEndpoint {
    /** Method of the endpoint, as a string (presumably the HTTP verb — confirm). */
    method: string;
    /** Path of the endpoint. */
    path: string;
    /** Optional description text. */
    description?: string;
    /** Optional list of parameter strings. */
    params?: string[];
}
|
|
61
|
+
/** Extraction result for REST API endpoint documentation pages. */
export interface ApiDocsResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'api_docs';
    /** Optional base URL string for the documented API. */
    baseUrl?: string;
    /** The endpoints that were extracted. */
    endpoints: ApiEndpoint[];
}
|
|
66
|
+
/** Fallback result used when no page type is detected; carries no data. */
export interface UnknownResult {
    /** Discriminant identifying this variant of the result union. */
    type: 'unknown';
}
|
|
69
|
+
/** Union of every result shape; discriminate on the `type` field. */
export type AutoExtractResult =
    | PricingResult
    | ProductsResult
    | ContactResult
    | ArticleResult
    | ApiDocsResult
    | UnknownResult;
|
|
70
|
+
/**
 * Classify a web page from its HTML and URL.
 *
 * @param html - HTML content of the page.
 * @param url - URL of the page.
 * @returns One of 'pricing' | 'products' | 'contact' | 'article' | 'api_docs' | 'unknown'
 *   (declared as a plain `string`).
 */
export declare function detectPageType(html: string, url: string): string;
/**
 * `detectPageType` is additionally exposed as the module's default export.
 */
export { detectPageType as default };
|
|
79
|
+
/**
 * Extract structured data from a web page without needing an LLM API key.
 *
 * @param html - HTML content of the page.
 * @param url - URL of the page.
 * @returns One member of the {@link AutoExtractResult} union, identified by its `type` field.
 */
export declare function autoExtract(html: string, url: string): AutoExtractResult;
|