@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auto-extraction module — heuristic + CSS selector based structured data extraction.
|
|
3
|
+
* No LLM API key required.
|
|
4
|
+
*
|
|
5
|
+
* Supports:
|
|
6
|
+
* - pricing : pricing tables / plan cards
|
|
7
|
+
* - products : product grids / listings
|
|
8
|
+
* - contact : emails, phones, addresses, social links
|
|
9
|
+
* - article : blog posts / news articles
|
|
10
|
+
* - api_docs : REST API endpoint documentation
|
|
11
|
+
* - unknown : fallback when no type is detected
|
|
12
|
+
*/
|
|
13
|
+
import { load } from 'cheerio';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Page type detection
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
// A currency symbol ($, € or £), optional whitespace, then at least one digit.
// Non-global: intended for presence checks via .test().
const PRICE_INLINE = /(\$|€|£)\s*\d+/;
// The standalone word "free" in any letter case.
const FREE_PLAN = /\bfree\b/i;
// An upper-case HTTP verb as a whole word.
// NOTE: non-global with one capture group — String.prototype.match() returns
// [match, group] for the first hit only, so the result length is NOT a hit count.
const HTTP_METHOD_PATTERN = /\b(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS)\b/;
// A lower-case URL path, optionally prefixed with a version segment ("/v1/") and
// allowing "{param}" placeholders. Non-global with two capture groups (same
// caveat as above when used with match()).
const URL_PATH_PATTERN = /\/(v\d+\/)?[a-z_-]+(\/{[^}]+}|\/?[a-z_-]*)*\b/;
// Every email-address-like token (global).
const EMAIL_PATTERN = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
// Every phone-number-like token (global): either a 3-3-4 digit grouping with an
// optional leading "1"/"+1" and optional parentheses, or a "+"-prefixed
// country code followed by two further digit groups.
const PHONE_PATTERN = /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\+\d{1,3}[-.\s]?\d{2,4}[-.\s]?\d{4,}/g;
|
|
23
|
+
/** Extract body text with spaces between elements (prevents regex over-matching adjacent tokens). */
|
|
24
|
+
function getBodyText($) {
|
|
25
|
+
const html = $('body').html() || '';
|
|
26
|
+
return html.replace(/<[^>]+>/g, ' ').replace(/&[a-z#\d]+;/gi, ' ').replace(/\s+/g, ' ').trim();
|
|
27
|
+
}
|
|
28
|
+
/**
 * Check whether a URL's path contains any of the given keywords.
 *
 * For a parseable absolute URL only the pathname is searched (host and query
 * string are ignored). If the value cannot be parsed as a URL, the whole string
 * is searched instead. The haystack is lower-cased; keywords are used as given,
 * so callers should pass them in lower case.
 *
 * @param url      URL (or URL-like string) to inspect. Non-string values
 *                 (e.g. undefined when the caller has no URL) yield false.
 * @param keywords Substrings to look for.
 * @returns true when at least one keyword occurs in the searched text.
 */
function urlHas(url, ...keywords) {
    // Without this guard a missing URL would throw from the fallback branch
    // (undefined.toLowerCase()) instead of simply not matching.
    if (typeof url !== 'string') {
        return false;
    }
    let haystack;
    try {
        haystack = new URL(url).pathname;
    }
    catch {
        // Not an absolute URL: fall back to searching the raw string.
        haystack = url;
    }
    haystack = haystack.toLowerCase();
    return keywords.some((kw) => haystack.includes(kw));
}
|
|
38
|
+
/**
 * Detect the page type from HTML + URL.
 *
 * Heuristics are evaluated in a fixed order and the first one that fires wins:
 *   1. pricing  — URL path keyword, or >= 2 prices plus a "/month"-style period
 *   2. contact  — an email plus a contact-like URL, a phone number or a social link
 *   3. article  — <article>, or <time> with author markup, or one <h1> + >= 3 <p> + <time>
 *   4. api_docs — >= 2 HTTP verbs and >= 2 URL paths inside <code>/<pre>, or an
 *                 API-ish heading plus at least one HTTP verb
 *   5. products — >= 3 card-like elements of which >= 2 show a price (or "free"),
 *                 or >= 4 images with >= 3 prices on the page
 *
 * @param html Raw HTML of the page.
 * @param url  Page URL; may be omitted, in which case URL-based hints are skipped.
 * @returns One of: 'pricing' | 'products' | 'contact' | 'article' | 'api_docs' | 'unknown'
 */
export function detectPageType(html, url) {
    const $ = load(html);
    // A missing URL must not break detection; an empty string matches no keyword.
    const pageUrl = typeof url === 'string' ? url : '';
    // Count every occurrence of `pattern` in `text`. match() with a non-global
    // regex returns [match, ...captureGroups] for the first hit only, so its
    // length is not a hit count — always count with a global copy.
    const countHits = (text, pattern) => {
        const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
        const hits = text.match(new RegExp(pattern.source, flags));
        return hits ? hits.length : 0;
    };
    // --- Pricing ---
    if (urlHas(pageUrl, '/pricing', '/plans', '/packages', '/tiers', '/billing')) {
        return 'pricing';
    }
    const bodyText = getBodyText($);
    const priceCount = countHits(bodyText, PRICE_INLINE);
    const perPeriodCount = (bodyText.match(/\/(mo|month|year|yr|annual|week)/gi) || []).length;
    if (priceCount >= 2 && perPeriodCount >= 1) {
        return 'pricing';
    }
    // --- Contact ---
    // Both contact rules require at least one email address on the page.
    const emailCount = countHits(bodyText, EMAIL_PATTERN);
    if (emailCount > 0) {
        if (urlHas(pageUrl, '/contact', '/about', '/reach', '/connect', '/support')) {
            return 'contact';
        }
        const phoneCount = countHits(bodyText, PHONE_PATTERN);
        const socialLinks = $('a[href*="twitter.com"], a[href*="linkedin.com"], a[href*="github.com"]').length;
        if (phoneCount > 0 || socialLinks > 0) {
            return 'contact';
        }
    }
    // --- Article ---
    const hasArticleTag = $('article').length > 0;
    const hasTimeTag = $('time[datetime], time[pubdate]').length > 0;
    const hasAuthorMeta = $('meta[name="author"]').length > 0 ||
        $('[class*="author"], [itemprop="author"]').length > 0;
    if (hasArticleTag || (hasTimeTag && hasAuthorMeta)) {
        return 'article';
    }
    // Single <h1> + multiple paragraphs and a date-ish element
    const h1Count = $('h1').length;
    const paraCount = $('p').length;
    if (h1Count === 1 && paraCount >= 3 && hasTimeTag) {
        return 'article';
    }
    // --- API docs ---
    const codeText = $('code, pre').text();
    const httpMethodHits = countHits(codeText, HTTP_METHOD_PATTERN);
    const urlPathHits = countHits(codeText, URL_PATH_PATTERN);
    if (httpMethodHits >= 2 && urlPathHits >= 2) {
        return 'api_docs';
    }
    // Also check for common API doc patterns in normal text
    const headingText = $('h1, h2, h3').text();
    if (/endpoint|api reference|rest api|http method/i.test(headingText) &&
        httpMethodHits >= 1) {
        return 'api_docs';
    }
    // --- Products ---
    // Look for repeating card-like structures with prices + images
    const potentialProductContainers = [
        '.product', '.item', '.card', '[class*="product"]', '[class*="item"]', '[class*="card"]',
    ];
    for (const sel of potentialProductContainers) {
        const cards = $(sel);
        if (cards.length >= 3) {
            let withPrice = 0;
            cards.each((_, el) => {
                const text = $(el).text();
                if (PRICE_INLINE.test(text) || FREE_PLAN.test(text))
                    withPrice++;
            });
            if (withPrice >= 2)
                return 'products';
        }
    }
    // Fallback: many <img> elements with adjacent prices
    const imgs = $('img').length;
    if (imgs >= 4 && priceCount >= 3) {
        return 'products';
    }
    return 'unknown';
}
|
|
118
|
+
// ---------------------------------------------------------------------------
|
|
119
|
+
// Pricing extractor
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
/**
 * Extract pricing plans from a page by locating repeated "pricing card" elements.
 *
 * Container selectors are tried from most specific to broadest; the first
 * selector that yields at least two elements whose text mentions a price (or a
 * free plan) wins. When no selector qualifies, the whole-page text fallback
 * `parsePricingFromText` is used instead.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @returns Plans shaped as `{ name, price, period, features, cta }`, passed
 *          through `deduplicatePlans` (or the fallback's result).
 */
function extractPricingPlans($) {
    // Common pricing card selectors (ordered from specific to broad).
    const containerSelectors = [
        '[class*="pricing-card"]',
        '[class*="price-card"]',
        '[class*="plan-card"]',
        '[class*="tier-card"]',
        '[class*="pricing__plan"]',
        '[class*="plan"]',
        '[class*="pricing-tier"]',
        '[class*="pricing-table"] td',
        '[class*="pricing-table"] th',
        '.card',
        '[class*="col-"]',
    ];
    // Plan-name selectors: specific hooks first, then headings, then generic classes.
    // Built once per call instead of once per card.
    const nameSelectors = [
        '[data-plan-name]',
        '.plan-name',
        '[class*="plan-name"]',
        '[class*="plan__name"]',
        '[class*="tier-name"]',
        '[class*="pricing-header"] h2',
        '[class*="pricing-header"] h3',
        '[class*="pricing__title"]',
        '[class*="price__title"]',
        '[class*="card__title"]',
        '[class*="card-title"]',
        'h2',
        'h3',
        'h4',
        '[class*="name"]',
        '[class*="title"]',
        'h1',
        'h5',
        'h6',
    ];
    const ctaPattern = /get started|sign up|buy|subscribe|choose|select|try|start|upgrade/i;
    const mentionsPrice = (text) => PRICE_INLINE.test(text) || FREE_PLAN.test(text);

    let containers = null;
    for (const sel of containerSelectors) {
        const found = $(sel).filter((_, el) => mentionsPrice($(el).text()));
        if (found.length >= 2) {
            containers = found;
            break;
        }
    }
    if (!containers) {
        // Last resort: parse entire page for price-like text blocks
        return parsePricingFromText($);
    }

    const plans = [];
    containers.each((_, el) => {
        try {
            const $el = $(el);
            const text = $el.text().trim();

            // Plan name: first non-empty candidate that is not the literal word "plan".
            let name = 'Plan';
            for (const sel of nameSelectors) {
                const candidate = $el.find(sel).first().text().trim();
                if (candidate && candidate.toLowerCase() !== 'plan') {
                    name = candidate;
                    break;
                }
            }

            // Price: an explicit amount wins; otherwise a free-plan mention maps to 'Free'.
            // FREE_PLAN is evaluated at most once per card so the result cannot
            // differ between the guard and the assignment.
            const priceMatch = text.match(/(\$|€|£|free)\s*[\d,]+(\.\d+)?/i);
            let price;
            if (priceMatch) {
                price = priceMatch[0];
            }
            else if (FREE_PLAN.test(text)) {
                price = 'Free';
            }
            else {
                return; // Skip non-price containers
            }

            // Billing period such as "/mo" or "/year", when present.
            const periodMatch = text.match(/\/(mo(nth)?|yr|year|week|day|annual)/i);
            const period = periodMatch ? periodMatch[0] : undefined;

            // Features: list items with short text.
            const features = [];
            $el.find('li').each((_, li) => {
                const featureText = $(li).text().trim();
                if (featureText && featureText.length < 200) {
                    features.push(featureText);
                }
            });

            // Call-to-action: first link/button whose label looks like a signup action.
            const cta = $el
                .find('a, button')
                .filter((_, btn) => ctaPattern.test($(btn).text()))
                .first()
                .text()
                .trim() || undefined;

            // `name` always has a value here ('Plan' at minimum), so the plan is always recorded.
            plans.push({ name, price, period, features, cta });
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'pricing plan parse failed:', e instanceof Error ? e.message : e);
        }
    });
    return deduplicatePlans(plans);
}
|
|
221
|
+
/**
 * Fallback pricing extractor: scan the page's body text for price-like tokens
 * and pair each unique price with a heading by position.
 *
 * Heuristic: each unique price string counts as one plan; the i-th unique price
 * is named after the i-th short heading (h1-h4), or 'Plan' when none is left.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @returns Array of `{ name, price, features: [] }` objects.
 */
function parsePricingFromText($) {
    const bodyText = getBodyText($);
    // Currency symbol, amount (plain digits or comma-grouped thousands, optional
    // decimals) and an optional "/period" suffix. Whitespace after the amount is
    // only consumed when a period actually follows, so "$10" and "$10 " produce
    // the same match and dedupe correctly.
    const priceRegex = /(?:\$|€|£)\s*(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:\s*\/(?:mo(?:nth)?|yr|year|week|annual))?/gi;
    const uniquePrices = [...new Set(Array.from(bodyText.matchAll(priceRegex), (m) => m[0]))];

    // Candidate plan names: short headings, ignoring the literal word "plan".
    const headings = [];
    $('h1, h2, h3, h4').each((_, el) => {
        const text = $(el).text().trim();
        if (text && text.toLowerCase() !== 'plan' && text.length < 60)
            headings.push(text);
    });

    return uniquePrices.map((price, i) => ({
        name: headings[i] || 'Plan',
        price,
        features: [],
    }));
}
|
|
247
|
+
/**
 * Remove plans that repeat an earlier plan's name and price.
 *
 * The first occurrence of each (name, price) pair is kept and input order is
 * preserved. The input array is not modified.
 *
 * @param plans - Array of plan objects with `name` and `price` properties.
 * @returns A new array without duplicate (name, price) pairs.
 */
function deduplicatePlans(plans) {
    const seen = new Set();
    return plans.filter((plan) => {
        // Serialise the pair as a JSON tuple so a delimiter inside a field
        // cannot make two different pairs collide (e.g. "A|B"+"C" vs "A"+"B|C").
        const key = JSON.stringify([plan.name, plan.price]);
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
}
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
// Products extractor
|
|
259
|
+
// ---------------------------------------------------------------------------
|
|
260
|
+
/**
 * Extract product listings (name, price, image, link, rating) from a page.
 *
 * Container selectors are tried in order; the first selector yielding at least
 * two elements that contain both a price (or free-plan mention) and an `<img>`
 * is used. Elements without a detectable name are skipped.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @param baseUrl - URL of the page, used to resolve relative image/link URLs.
 * @returns Up to 100 `{ name, price, image, url, rating }` objects.
 */
function extractProducts($, baseUrl) {
    /**
     * Resolve a possibly-relative URL against the page URL.
     * Absolute http(s) values are returned untouched. Everything else goes
     * through the URL parser so protocol-relative ("//cdn..."),
     * document-relative ("../img.png") and non-http schemes ("data:...") are
     * handled correctly instead of being glued onto the origin.
     */
    const resolveUrl = (raw) => {
        if (!raw)
            return undefined;
        if (/^https?:\/\//i.test(raw))
            return raw;
        try {
            return new URL(raw, baseUrl).href;
        }
        catch {
            // Page URL is unusable: fall back to a root-relative form.
            return `${raw.startsWith('/') ? '' : '/'}${raw}`;
        }
    };

    const containerSelectors = [
        '[class*="product"]',
        '[class*="item"]',
        '[class*="card"]',
        'li',
        'article',
    ];
    let containers = null;
    for (const sel of containerSelectors) {
        const found = $(sel).filter((_, el) => {
            const text = $(el).text();
            return (PRICE_INLINE.test(text) || FREE_PLAN.test(text)) && $(el).find('img').length > 0;
        });
        if (found.length >= 2) {
            containers = found;
            break;
        }
    }

    const items = [];
    if (!containers)
        return items;

    containers.each((_, el) => {
        try {
            const $el = $(el);
            const cardText = $el.text();

            // Name: first heading or name/title-classed element; required.
            const name = $el.find('h1,h2,h3,h4,h5,h6,[class*="name"],[class*="title"]').first().text().trim();
            if (!name)
                return;

            // Price: first currency amount in the card text.
            const priceMatch = cardText.match(/(\$|€|£)\s*[\d,]+(\.\d+)?/);
            const price = priceMatch ? priceMatch[0].trim() : undefined;

            // Image: first <img>, honouring common lazy-load attributes.
            const imgEl = $el.find('img').first();
            const image = resolveUrl(imgEl.attr('src') || imgEl.attr('data-src') || imgEl.attr('data-lazy'));

            // Link: first anchor inside the card.
            const url = resolveUrl($el.find('a').first().attr('href'));

            // Rating: a single digit (optionally with one decimal) followed by "/5", "star(s)" or "★".
            const ratingMatch = cardText.match(/(\d(\.\d)?)\s*(\/\s*5|stars?|★)/i);
            const rating = ratingMatch ? `${ratingMatch[1]}/5` : undefined;

            items.push({ name, price, image, url, rating });
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'product item parse failed:', e instanceof Error ? e.message : e);
        }
    });
    return items.slice(0, 100); // cap at 100
}
|
|
329
|
+
// ---------------------------------------------------------------------------
|
|
330
|
+
// Contact extractor
|
|
331
|
+
// ---------------------------------------------------------------------------
|
|
332
|
+
/**
 * Social-network domains mapped to the key under which a matching link is
 * reported in the contact result's `social` object. Several domains may share
 * one key (e.g. both Twitter domains report as `twitter`).
 */
const SOCIAL_DOMAINS = {
    // Twitter / X
    'twitter.com': 'twitter',
    'x.com': 'twitter',
    // Professional / code hosting
    'linkedin.com': 'linkedin',
    'github.com': 'github',
    // Consumer networks
    'facebook.com': 'facebook',
    'instagram.com': 'instagram',
    'youtube.com': 'youtube',
    'tiktok.com': 'tiktok',
    // Discord invite and main domains
    'discord.gg': 'discord',
    'discord.com': 'discord',
};

/**
 * Heuristic street-address matcher: a 1-5 digit number, free text, a street
 * suffix word (street, ave, rd, ...) and up to 80 further characters on the
 * same line. Case-insensitive.
 */
const ADDRESS_PATTERN = /\d{1,5}\s+[A-Za-z0-9\s,\.]+(?:street|st|avenue|ave|road|rd|blvd|boulevard|lane|ln|drive|dr|court|ct|way|wy|place|pl)\b[^<\n]{0,80}/i;
|
|
345
|
+
/**
 * Extract contact details (emails, phones, postal addresses, social links).
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @returns `{ type: 'contact', emails, phones, addresses, social }` where
 *          `social` maps a network key (see SOCIAL_DOMAINS) to the first
 *          matching link's href.
 */
function extractContact($) {
    const bodyText = getBodyText($);

    // Emails: lower-cased and de-duplicated.
    const emails = [...new Set((bodyText.match(EMAIL_PATTERN) || []).map((e) => e.toLowerCase()))];

    // Phones: trimmed and de-duplicated.
    const phones = [...new Set((bodyText.match(PHONE_PATTERN) || []).map((p) => p.trim()))];

    // Addresses: semantic markup first...
    const addresses = [];
    $('[class*="address"], [itemprop="address"], address').each((_, el) => {
        const addr = $(el).text().replace(/\s+/g, ' ').trim();
        if (addr.length > 10)
            addresses.push(addr);
    });
    // ...then one regex hit from the body text, unless an address that shares
    // its first 10 characters was already collected.
    const addrMatch = bodyText.match(ADDRESS_PATTERN);
    if (addrMatch) {
        const addr = addrMatch[0].trim();
        if (!addresses.some((a) => a.includes(addr.substring(0, 10)))) {
            addresses.push(addr);
        }
    }

    // Social links: match on the link's hostname rather than on a substring of
    // the whole href, so "netflix.com" or "dropbox.com" are not mistaken for
    // "x.com", and social URLs embedded in a query string are ignored.
    const social = {};
    $('a[href]').each((_, el) => {
        const href = $(el).attr('href') || '';
        let host;
        try {
            // The placeholder base lets protocol-relative hrefs ("//twitter.com/...")
            // parse; purely relative hrefs resolve to the placeholder host and
            // therefore never match a social domain.
            host = new URL(href, 'https://relative.invalid').hostname.toLowerCase();
        }
        catch {
            return; // Unparsable href: not a usable link
        }
        for (const [domain, key] of Object.entries(SOCIAL_DOMAINS)) {
            if (social[key])
                continue;
            if (host === domain || host.endsWith(`.${domain}`)) {
                social[key] = href;
            }
        }
    });

    return { type: 'contact', emails, phones, addresses, social };
}
|
|
382
|
+
// ---------------------------------------------------------------------------
|
|
383
|
+
// Article extractor
|
|
384
|
+
// ---------------------------------------------------------------------------
|
|
385
|
+
/**
 * Extract article metadata and section structure.
 *
 * Each field is resolved through a chain of fallbacks (visible markup first,
 * then meta tags). Sections are built from the h2/h3 headings inside the first
 * `<article>` (or, failing that, the first `<main>`), each paired with the text
 * of the sibling elements that follow it up to the next h2/h3.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @returns `{ type: 'article', title, author, date, readingTime, summary, sections }`;
 *          scalar fields are `undefined` when nothing was found.
 */
function extractArticle($) {
    // Title. `.first()` on <title> matters: inline SVGs may carry their own
    // <title> elements, and reading all of them would concatenate their text.
    const title = $('h1').first().text().trim() ||
        $('meta[property="og:title"]').attr('content') ||
        $('title').first().text().trim() ||
        undefined;

    // Author
    const author = $('meta[name="author"]').attr('content') ||
        $('[itemprop="author"]').first().text().trim() ||
        $('[class*="author"]').first().text().trim() ||
        $('[rel="author"]').first().text().trim() ||
        undefined;

    // Date: machine-readable sources first, visible <time> text last.
    const date = $('time[datetime]').first().attr('datetime') ||
        $('time[pubdate]').first().attr('datetime') ||
        $('meta[name="date"]').attr('content') ||
        $('meta[property="article:published_time"]').attr('content') ||
        $('time').first().text().trim() ||
        undefined;

    // Reading time: explicit label if the page has one, otherwise an estimate.
    const readingTimeEl = $('[class*="reading-time"], [class*="read-time"], [class*="readtime"]').first();
    const readingTime = readingTimeEl.length ? readingTimeEl.text().trim() : estimateReadingTime($);

    // Content root: first <article>, else first <main>.
    const articleEl = $('article').first();
    const contentEl = articleEl.length ? articleEl : $('main').first();

    // Summary: first two sentences of the first paragraph, else a meta description.
    const firstPara = contentEl.find('p').first().text().trim() ||
        $('meta[name="description"]').attr('content') ||
        $('meta[property="og:description"]').attr('content') ||
        '';
    const summary = firstPara ? extractFirstSentences(firstPara, 2) : undefined;

    // Sections: h2/h3 + following sibling content
    const sections = [];
    contentEl.find('h2, h3').each((_, el) => {
        const heading = $(el).text().trim();
        if (!heading)
            return;
        const contentParts = [];
        for (let sibling = $(el).next(); sibling.length && !sibling.is('h2, h3'); sibling = sibling.next()) {
            const text = sibling.text().trim();
            if (text)
                contentParts.push(text);
        }
        // Headings with no following content are omitted.
        if (contentParts.length > 0) {
            sections.push({ heading, content: contentParts.join(' ') });
        }
    });

    return { type: 'article', title, author, date, readingTime, summary, sections };
}
|
|
437
|
+
/**
 * Return the first `count` sentences of `text`.
 *
 * A sentence ends at a run of `.`, `!` or `?` followed by whitespace or by the
 * end of the text. Accepting end-of-text is what lets a final sentence without
 * trailing whitespace be counted. If no sentence end is found at all, the first
 * 300 characters are returned instead.
 *
 * @param text - Source text.
 * @param count - Number of sentences wanted.
 * @returns The trimmed leading sentences, or a trimmed 300-character prefix.
 */
function extractFirstSentences(text, count) {
    const sentenceEnd = /[.!?]+(?:\s+|$)/g;
    let cutoff = 0;
    let found = 0;
    for (const match of text.matchAll(sentenceEnd)) {
        cutoff = (match.index ?? 0) + match[0].length;
        found++;
        if (found >= count)
            break;
    }
    return found > 0 ? text.slice(0, cutoff).trim() : text.slice(0, 300).trim();
}
|
|
450
|
+
function estimateReadingTime($) {
|
|
451
|
+
const wordsPerMinute = 200;
|
|
452
|
+
const text = $('article, main, [class*="content"], body').first().text();
|
|
453
|
+
const wordCount = text.split(/\s+/).filter(Boolean).length;
|
|
454
|
+
const minutes = Math.max(1, Math.ceil(wordCount / wordsPerMinute));
|
|
455
|
+
return `${minutes} min`;
|
|
456
|
+
}
|
|
457
|
+
// ---------------------------------------------------------------------------
|
|
458
|
+
// API docs extractor
|
|
459
|
+
// ---------------------------------------------------------------------------
|
|
460
|
+
/** HTTP verbs recognised when scanning for endpoint definitions. */
const HTTP_METHODS = ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'];

/**
 * Extract API endpoints (method + path) from a documentation page.
 *
 * Two strategies are combined and the results de-duplicated by method+path:
 *   1. lines inside `<code>`/`<pre>` that start with an HTTP method followed by
 *      a path or absolute URL;
 *   2. "method badge" elements whose text is exactly an HTTP method, paired with
 *      an adjacent path-like element.
 *
 * @param $ - Cheerio-style selector function for the loaded document.
 * @param url - URL of the page, used to guess a base URL when none is in the text.
 * @returns `{ type: 'api_docs', baseUrl, endpoints }`; `baseUrl` is `undefined`
 *          when it can neither be found in the page nor derived from `url`.
 */
function extractApiDocs($, url) {
    // Base URL: prefer an "https://api.…" URL mentioned in the page text.
    let baseUrl;
    const baseUrlMatch = getBodyText($).match(/https?:\/\/api\.[a-zA-Z0-9.-]+/);
    if (baseUrlMatch) {
        // The host character class also admits '.' and '-', so sentence
        // punctuation right after the URL would be captured; strip it.
        baseUrl = baseUrlMatch[0].replace(/[.-]+$/, '');
    }
    else {
        try {
            const parsed = new URL(url);
            // Guess "api.<host>", but do not double the prefix when the page is
            // already served from an api.* host.
            const host = parsed.hostname.startsWith('api.') ? parsed.hostname : `api.${parsed.hostname}`;
            baseUrl = `${parsed.protocol}//${host}`;
        }
        catch {
            baseUrl = undefined;
        }
    }

    const endpoints = [];

    // Strategy 1: Parse code blocks for HTTP method + path patterns
    $('code, pre').each((_, el) => {
        for (const rawLine of $(el).text().trim().split(/\n/)) {
            const line = rawLine.trim();
            const method = HTTP_METHODS.find((m) => line.startsWith(m + ' ') || line.startsWith(m + '\t'));
            if (!method)
                continue;
            // Path = first URL-like token after the method.
            const pathMatch = line.slice(method.length).trim().match(/^(https?:\/\/[^\s]+|\/[^\s]*)/);
            if (!pathMatch)
                continue;
            let path = pathMatch[0];
            // Normalize: strip the base URL prefix, but only at a real boundary
            // so "https://api.example.com.other/x" is left untouched.
            if (baseUrl && path.startsWith(baseUrl)) {
                const remainder = path.slice(baseUrl.length);
                if (remainder === '' || remainder.startsWith('/') || remainder.startsWith('?')) {
                    path = remainder;
                }
            }
            // Strip query string; a bare base URL denotes the root path.
            path = path.split('?')[0] || '/';
            // Description: nearest heading above this code block, if any.
            const description = findNearestHeading($(el)) || undefined;
            endpoints.push({ method, path, description });
        }
    });

    // Strategy 2: Scan for method badges + inline paths in regular text
    $('[class*="method"], [class*="http-method"], .badge, .label').each((_, el) => {
        const methodText = $(el).text().trim().toUpperCase();
        if (!HTTP_METHODS.includes(methodText))
            return;
        // Candidate path elements: the immediate next sibling if it looks like
        // a path, otherwise the first <code> under the same parent.
        const candidates = [
            $(el).next('[class*="path"], [class*="endpoint"], [class*="route"], code'),
            $(el).parent().find('code').first(),
        ];
        for (const candidate of candidates) {
            if (!candidate.length)
                continue;
            const path = candidate.text().trim();
            // `search` always scans from index 0 and leaves `lastIndex` alone, so
            // the result is reliable even if URL_PATH_PATTERN carries the `g`
            // flag (NOTE(review): it appears to be used for hit counting
            // elsewhere in this file — confirm).
            if (path.search(URL_PATH_PATTERN) !== -1) {
                endpoints.push({ method: methodText, path });
                break;
            }
        }
    });

    // Deduplicate by method+path, keeping the first occurrence.
    const seen = new Set();
    const unique = endpoints.filter((ep) => {
        const key = `${ep.method}:${ep.path}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
    return { type: 'api_docs', baseUrl, endpoints: unique };
}
|
|
537
|
+
function findNearestHeading($el) {
|
|
538
|
+
// Walk backwards through siblings/parents to find closest heading
|
|
539
|
+
let current = $el.prev();
|
|
540
|
+
let depth = 0;
|
|
541
|
+
while (depth < 5) {
|
|
542
|
+
if (current.length === 0) {
|
|
543
|
+
const parent = $el.parent();
|
|
544
|
+
if (!parent.length)
|
|
545
|
+
break;
|
|
546
|
+
current = parent.prev();
|
|
547
|
+
}
|
|
548
|
+
else if (current.is('h1,h2,h3,h4,h5,h6')) {
|
|
549
|
+
return current.text().trim();
|
|
550
|
+
}
|
|
551
|
+
else {
|
|
552
|
+
current = current.prev();
|
|
553
|
+
}
|
|
554
|
+
depth++;
|
|
555
|
+
}
|
|
556
|
+
return null;
|
|
557
|
+
}
|
|
558
|
+
// ---------------------------------------------------------------------------
|
|
559
|
+
// Main entry points
|
|
560
|
+
// ---------------------------------------------------------------------------
|
|
561
|
+
/**
|
|
562
|
+
* Detect the type of a web page based on HTML content and URL.
|
|
563
|
+
*/
|
|
564
|
+
export { detectPageType as default };
|
|
565
|
+
/**
 * Auto-extract structured data from a web page without an LLM API key.
 *
 * The page type is detected first, then the matching extractor runs. If
 * parsing or extraction throws, an empty result of the detected type is
 * returned instead of propagating the error (the failure is logged when the
 * DEBUG environment variable is set).
 *
 * @param html - Raw HTML of the page.
 * @param url - URL the HTML was fetched from.
 * @returns An object whose `type` is one of 'pricing', 'products', 'contact',
 *          'article', 'api_docs' or 'unknown', plus that type's fields.
 */
export function autoExtract(html, url) {
    const type = detectPageType(html, url);
    try {
        // Parsing happens inside the try so a parser failure also degrades to
        // the empty result below.
        const $ = load(html);
        switch (type) {
            case 'pricing':
                return { type: 'pricing', plans: extractPricingPlans($) };
            case 'products':
                return { type: 'products', items: extractProducts($, url) };
            case 'contact':
                return extractContact($);
            case 'article':
                return extractArticle($);
            case 'api_docs':
                return extractApiDocs($, url);
            default:
                return { type: 'unknown' };
        }
    }
    catch (e) {
        if (process.env.DEBUG)
            console.debug('[webpeel]', 'auto-extract failed:', e instanceof Error ? e.message : e);
        // Return partial/empty result rather than crashing
        switch (type) {
            case 'pricing':
                return { type: 'pricing', plans: [] };
            case 'products':
                return { type: 'products', items: [] };
            case 'contact':
                return { type: 'contact', emails: [], phones: [], addresses: [], social: {} };
            case 'article':
                return { type: 'article', sections: [] };
            case 'api_docs':
                return { type: 'api_docs', endpoints: [] };
            default:
                return { type: 'unknown' };
        }
    }
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auto-interact: automatically dismiss cookie banners, consent popups,
|
|
3
|
+
* overlay modals, and optionally click "load more" / "show all" buttons.
|
|
4
|
+
*
|
|
5
|
+
* Runs after page.goto() and before content extraction.
|
|
6
|
+
* Never blocks extraction — each interaction has a tight timeout.
|
|
7
|
+
* Total budget: 3s max.
|
|
8
|
+
*/
|
|
9
|
+
import type { Page } from 'playwright';
|
|
10
|
+
/** Summary of what `autoInteract` did on a page. */
export interface AutoInteractResult {
    /** Presumably true when a cookie banner was dismissed — confirm against the implementation. */
    cookieBannerDismissed: boolean;
    /** Presumably true when a consent popup was handled — confirm against the implementation. */
    consentHandled: boolean;
    /** Presumably the number of "load more" / "show all" clicks performed — confirm. */
    loadMoreClicked: number;
    /** Presumably the number of overlay modals dismissed — confirm. */
    overlaysDismissed: number;
}
|
|
16
|
+
/**
 * Dismiss common UI overlays on an already-navigated page so that content
 * extraction can proceed. Errors are swallowed; this function never throws.
 *
 * @param page - Playwright page (already navigated)
 * @returns Summary of what was dismissed
 */
export declare function autoInteract(page: Page): Promise<AutoInteractResult>;
|