@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract structured metadata from HTML
|
|
3
|
+
*/
|
|
4
|
+
import type { PageMetadata } from '../types.js';
|
|
5
|
+
/**
|
|
6
|
+
* Detect and fix concatenated titles where two titles are smashed together
|
|
7
|
+
* without a separator (e.g. "The Performance of Open Source SoftwareHigh Performance Networking in Chrome").
|
|
8
|
+
* Heuristic: split at lowercase→uppercase boundary if it looks like two distinct titles.
|
|
9
|
+
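* Returns the second segment (typically the page-specific title) when both segments are longer than 10 characters; otherwise returns the title unchanged.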
|
|
10
|
+
*/
|
|
11
|
+
export declare function cleanConcatenatedTitle(title: string): string;
|
|
12
|
+
/**
|
|
13
|
+
* Extract all links from page
|
|
14
|
+
* Returns absolute URLs, deduplicated
|
|
15
|
+
*/
|
|
16
|
+
export declare function extractLinks(html: string, baseUrl: string): string[];
|
|
17
|
+
/**
|
|
18
|
+
* Extract all images from HTML
|
|
19
|
+
* Resolves relative URLs to absolute and extracts metadata
|
|
20
|
+
*
|
|
21
|
+
* @param html - HTML to extract images from
|
|
22
|
+
* @param baseUrl - Base URL for resolving relative paths
|
|
23
|
+
* @returns Array of image information, deduplicated by src
|
|
24
|
+
*/
|
|
25
|
+
export declare function extractImages(html: string, baseUrl: string): import('../types.js').ImageInfo[];
|
|
26
|
+
/**
|
|
27
|
+
* Extract all metadata from HTML.
|
|
28
|
+
* Optimization: only parse the <head> section with cheerio (avoids full DOM parse).
|
|
29
|
+
* Falls back to full HTML if head section is not found or produces no title.
|
|
30
|
+
*/
|
|
31
|
+
export declare function extractMetadata(html: string, _url: string): {
|
|
32
|
+
title: string;
|
|
33
|
+
metadata: PageMetadata;
|
|
34
|
+
};
|
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract structured metadata from HTML
|
|
3
|
+
*/
|
|
4
|
+
import * as cheerio from 'cheerio';
|
|
5
|
+
/**
 * Mixed-case words whose internal lowercase→uppercase transition is part of
 * the word itself and must never be treated as a boundary between two titles.
 * The list is a best-effort allowlist, not an exhaustive dictionary.
 */
const KNOWN_MIXED_CASE_TERMS = [
    'JavaScript', 'TypeScript', 'CoffeeScript', 'PostScript', 'GitHub', 'GitLab',
    'YouTube', 'LinkedIn', 'PayPal', 'WordPress', 'PowerShell', 'PowerPoint',
    'SharePoint', 'OneDrive', 'PlayStation', 'SoundCloud', 'TikTok', 'WhatsApp',
    'DevOps', 'OpenStreetMap', 'DuckDuckGo', 'BitTorrent', 'LaTeX', 'MacBook',
    'AirPods', 'iPhone', 'iPad', 'iPod', 'iMac', 'iTunes', 'iCloud', 'eBay',
];
/**
 * Detect and fix concatenated titles where two titles are smashed together
 * without a separator (e.g. "The Performance of Open Source SoftwareHigh Performance Networking in Chrome").
 *
 * Heuristic: find the right-most lowercase→Uppercase+lowercase boundary that is
 * not inside a known mixed-case word (e.g. "JavaScript", "iPhone") and split there.
 *
 * @param title - Raw title text.
 * @returns The second segment (typically the page-specific title) when both
 *          segments are longer than 10 characters; otherwise the title unchanged.
 *          Falsy input is returned as-is.
 */
export function cleanConcatenatedTitle(title) {
    if (!title) {
        return title;
    }
    // Titles containing line terminators are left untouched (the split is only
    // meaningful for single-line titles).
    if (/[\n\r\u2028\u2029]/.test(title)) {
        return title;
    }
    const isLower = (ch) => ch >= 'a' && ch <= 'z';
    const isUpper = (ch) => ch >= 'A' && ch <= 'Z';
    // True when `boundary` (index of the uppercase letter) falls strictly inside
    // an occurrence of one of the known mixed-case terms.
    const insideKnownTerm = (boundary) => KNOWN_MIXED_CASE_TERMS.some((term) => {
        const searchFrom = Math.max(0, boundary - term.length + 1);
        const at = title.indexOf(term, searchFrom);
        return at !== -1 && at < boundary;
    });
    // Walk candidate boundaries from right to left. `i` is the index of the
    // uppercase letter; at least one character must follow the lowercase letter
    // after it, and at least two characters must precede it.
    for (let i = title.length - 3; i >= 2; i--) {
        if (!isLower(title[i - 1]) || !isUpper(title[i]) || !isLower(title[i + 1])) {
            continue;
        }
        if (insideKnownTerm(i)) {
            continue;
        }
        const leading = title.slice(0, i);
        const trailing = title.slice(i);
        // Both parts should be reasonably long to count as separate titles.
        // Prefer the second part — it is typically the page-specific title,
        // while the first is the site/book title.
        return leading.length > 10 && trailing.length > 10 ? trailing : title;
    }
    return title;
}
|
|
31
|
+
/**
 * Extract page title using fallback chain:
 * og:title → twitter:title → title tag → h1
 *
 * Each candidate is trimmed before it is tested, so a blank or
 * whitespace-only value falls through to the next source instead of
 * being returned as an empty title.
 *
 * @param $ - Loaded cheerio document.
 * @returns The first non-blank title found, or '' when none exists.
 */
function extractTitle($) {
    const metaContent = (selector) => ($(selector).attr('content') || '').trim();
    // Try Open Graph title
    const ogTitle = metaContent('meta[property="og:title"]');
    if (ogTitle) {
        return ogTitle;
    }
    // Try Twitter title
    const twitterTitle = metaContent('meta[name="twitter:title"]');
    if (twitterTitle) {
        return twitterTitle;
    }
    // Try title tag; only this source is passed through the concatenation clean-up
    const tagTitle = ($('title').text() || '').trim();
    if (tagTitle) {
        return cleanConcatenatedTitle(tagTitle);
    }
    // Fallback to first h1 (trims to '' when there is none)
    return ($('h1').first().text() || '').trim();
}
|
|
54
|
+
/**
|
|
55
|
+
* Extract page description using fallback chain:
|
|
56
|
+
* og:description → twitter:description → meta description
|
|
57
|
+
*/
|
|
58
|
+
function extractDescription($) {
|
|
59
|
+
// Try Open Graph description
|
|
60
|
+
let desc = $('meta[property="og:description"]').attr('content');
|
|
61
|
+
if (desc)
|
|
62
|
+
return desc.trim();
|
|
63
|
+
// Try Twitter description
|
|
64
|
+
desc = $('meta[name="twitter:description"]').attr('content');
|
|
65
|
+
if (desc)
|
|
66
|
+
return desc.trim();
|
|
67
|
+
// Try standard meta description
|
|
68
|
+
desc = $('meta[name="description"]').attr('content');
|
|
69
|
+
if (desc)
|
|
70
|
+
return desc.trim();
|
|
71
|
+
return undefined;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Extract author from meta tags
|
|
75
|
+
*/
|
|
76
|
+
function extractAuthor($) {
|
|
77
|
+
// Try article:author
|
|
78
|
+
let author = $('meta[property="article:author"]').attr('content');
|
|
79
|
+
if (author)
|
|
80
|
+
return author.trim();
|
|
81
|
+
// Try og:article:author
|
|
82
|
+
author = $('meta[property="og:article:author"]').attr('content');
|
|
83
|
+
if (author)
|
|
84
|
+
return author.trim();
|
|
85
|
+
// Try author meta tag
|
|
86
|
+
author = $('meta[name="author"]').attr('content');
|
|
87
|
+
if (author)
|
|
88
|
+
return author.trim();
|
|
89
|
+
// Try twitter:creator
|
|
90
|
+
author = $('meta[name="twitter:creator"]').attr('content');
|
|
91
|
+
if (author)
|
|
92
|
+
return author.trim();
|
|
93
|
+
return undefined;
|
|
94
|
+
}
|
|
95
|
+
/**
 * Extract the publish date from rich metadata sources.
 *
 * Sources are tried in order; the first one holding a parseable date wins:
 *   1. meta[property="article:published_time"]
 *   2. meta[name="date"]
 *   3. meta[property="og:updated_time"]
 *   4. a <time pubdate> element (its `datetime` attribute, then `content`)
 *   5. JSON-LD `datePublished` / `publishDate`, including nodes inside
 *      `@graph` and top-level JSON-LD arrays
 *
 * A source whose value cannot be parsed is skipped. It never leaks into the
 * result and never prevents later sources from being consulted.
 *
 * @param $ - Cheerio-style query function for the document
 * @param _html - Unused
 * @returns ISO 8601 date string, or undefined when no source yields a valid date
 */
function extractPublishDate($, _html) {
    // Normalize a raw value to ISO 8601; undefined when missing or unparseable.
    const toIso = (raw) => {
        if (!raw)
            return undefined;
        const parsed = new Date(raw);
        return Number.isNaN(parsed.getTime()) ? undefined : parsed.toISOString();
    };
    const metaSelectors = [
        'meta[property="article:published_time"]',
        'meta[name="date"]',
        'meta[property="og:updated_time"]',
    ];
    for (const selector of metaSelectors) {
        const iso = toIso($(selector).attr('content'));
        if (iso)
            return iso;
    }
    // Try <time pubdate> or <time datetime> with pubdate attribute
    const timeEl = $('time[pubdate], time[datetime][pubdate]').first();
    const fromTime = toIso(timeEl.attr('datetime') || timeEl.attr('content'));
    if (fromTime)
        return fromTime;
    // Try JSON-LD. The result lives in its own variable so that an invalid
    // value left over from an earlier source can neither short-circuit this
    // scan nor be returned as if it were a date.
    let fromJsonLd;
    $('script[type="application/ld+json"]').each((_, el) => {
        if (fromJsonLd)
            return;
        try {
            const json = JSON.parse($(el).html() || '{}');
            const nodes = Array.isArray(json) ? json : [json];
            for (const node of nodes) {
                if (!node || typeof node !== 'object')
                    continue;
                const graph = Array.isArray(node['@graph']) ? node['@graph'] : [];
                const raw = node.datePublished
                    || node.publishDate
                    || graph.find((n) => n && n.datePublished)?.datePublished;
                const iso = toIso(raw);
                if (iso) {
                    fromJsonLd = iso;
                    break;
                }
            }
        }
        catch { /* malformed JSON-LD: move on to the next script block */ }
    });
    return fromJsonLd;
}
|
|
150
|
+
/**
 * Extract the page language.
 *
 * Checked in order: the `lang` attribute of <html>, the
 * Content-Language http-equiv meta tag, then og:locale.
 *
 * @param $ - Cheerio-style query function for the document
 * @returns The trimmed language tag, or undefined when none is present
 */
function extractLanguage($) {
    // Try html lang attribute
    const htmlLang = $('html').attr('lang');
    if (htmlLang)
        return htmlLang.trim();
    // Try Content-Language meta
    const contentLang = $('meta[http-equiv="Content-Language"]').attr('content');
    if (contentLang)
        return contentLang.trim();
    // Try og:locale, converting underscores to hyphens ("en_US" → "en-US").
    // The global regex converts every underscore; a string pattern would
    // only convert the first one.
    const ogLocale = $('meta[property="og:locale"]').attr('content');
    if (ogLocale)
        return ogLocale.trim().replace(/_/g, '-');
    return undefined;
}
|
|
168
|
+
/**
 * Count words in the visible text of an HTML string.
 *
 * Script and style elements are dropped together with their content, the
 * remaining tags are stripped, a handful of common entities are decoded,
 * and the result is split on whitespace.
 *
 * @param html - Raw HTML
 * @returns Number of whitespace-separated words (0 for empty or markup-only input)
 */
function extractWordCount(html) {
    // Common named entities and the characters they stand for.
    const entities = { amp: '&', lt: '<', gt: '>', nbsp: ' ', quot: '"' };
    const text = html
        // Remove script and style content
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ' ')
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, ' ')
        // Remove all HTML tags
        .replace(/<[^>]+>/g, ' ')
        // Decode common entities in a single pass so nothing is decoded twice.
        // A non-breaking-space entity becomes a real space and therefore
        // separates words.
        .replace(/&(amp|lt|gt|nbsp|quot);/g, (_, name) => entities[name])
        // Collapse whitespace
        .replace(/\s+/g, ' ')
        .trim();
    if (!text)
        return 0;
    // After collapsing and trimming, every single space separates two words.
    return text.split(' ').length;
}
|
|
191
|
+
/**
 * Read the published date from meta tags.
 *
 * Looks at article:published_time first, then the schema.org
 * itemprop="datePublished" meta tag. A value that fails to parse is
 * skipped (and reported via console.debug when DEBUG is set).
 *
 * @returns ISO 8601 date string, or undefined when nothing parses
 */
function extractPublished($) {
    const sources = [
        'meta[property="article:published_time"]',
        'meta[itemprop="datePublished"]',
    ];
    for (const source of sources) {
        const raw = $(source).attr('content');
        if (!raw) {
            continue;
        }
        try {
            // toISOString() throws on an invalid date, which moves us to the next source.
            return new Date(raw).toISOString();
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'date parse failed:', e instanceof Error ? e.message : e);
        }
    }
    return undefined;
}
|
|
220
|
+
/**
 * Find the page's social preview image URL.
 *
 * Prefers og:image and falls back to twitter:image.
 *
 * @returns The trimmed URL string, or undefined when neither tag has content
 */
function extractImage($) {
    for (const selector of ['meta[property="og:image"]', 'meta[name="twitter:image"]']) {
        const content = $(selector).attr('content');
        if (content) {
            return content.trim();
        }
    }
    return undefined;
}
|
|
234
|
+
/**
 * Determine the canonical URL of the page.
 *
 * Uses the href of link[rel="canonical"] when present and otherwise
 * falls back to the og:url meta tag.
 *
 * @returns The trimmed URL string, or undefined when neither is present
 */
function extractCanonical($) {
    // [selector, attribute holding the URL], highest priority first.
    const candidates = [
        ['link[rel="canonical"]', 'href'],
        ['meta[property="og:url"]', 'content'],
    ];
    for (const [selector, attribute] of candidates) {
        const value = $(selector).attr(attribute);
        if (value) {
            return value.trim();
        }
    }
    return undefined;
}
|
|
247
|
+
/**
 * Collect every hyperlink target on the page.
 *
 * Each href is resolved against baseUrl. Non-HTTP(S) targets are dropped,
 * and so are fragment links pointing back at the base page itself
 * (same origin, path and query). Hrefs that fail to parse are skipped
 * (and reported via console.debug when DEBUG is set).
 *
 * @returns Sorted array of unique absolute URLs
 */
export function extractLinks(html, baseUrl) {
    const $ = cheerio.load(html);
    const unique = new Set();
    $('a[href]').each((_, anchor) => {
        const rawHref = $(anchor).attr('href');
        if (!rawHref) {
            return;
        }
        try {
            const resolved = new URL(rawHref, baseUrl);
            // SECURITY: Only allow HTTP and HTTPS protocols
            const isWebProtocol = ['http:', 'https:'].includes(resolved.protocol);
            if (!isWebProtocol) {
                return;
            }
            // Skip anchor-only links (e.g., href="#section")
            const page = new URL(baseUrl);
            const pointsAtBasePage = resolved.origin === page.origin &&
                resolved.pathname === page.pathname &&
                resolved.search === page.search;
            if (resolved.hash && pointsAtBasePage) {
                return;
            }
            unique.add(resolved.href);
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'url parse failed:', e instanceof Error ? e.message : e);
        }
    });
    return [...unique].sort();
}
|
|
281
|
+
/**
 * Extract all images from HTML.
 *
 * Three sources are scanned, in this order:
 *   1. <img src> tags (with alt, title, width and height)
 *   2. <picture><source srcset> candidates (alt taken from the picture's <img>)
 *   3. url(...) references in inline styles that mention "background"
 *
 * URLs are resolved against baseUrl; anything that is not http(s), or that
 * fails to parse, is skipped. Results are deduplicated by absolute URL.
 * An entry from source 2 or 3 never replaces an existing entry for the same
 * URL, so the richer <img> metadata is preserved.
 *
 * @param html - HTML to extract images from
 * @param baseUrl - Base URL for resolving relative paths
 * @returns Array of image information, deduplicated by src
 */
export function extractImages(html, baseUrl) {
    const $ = cheerio.load(html);
    const images = new Map();
    // Resolve a raw reference to an absolute http(s) URL; undefined when it
    // cannot be parsed or uses another protocol.
    const resolveHttpUrl = (raw) => {
        try {
            const absoluteUrl = new URL(raw, baseUrl);
            // SECURITY: Only allow HTTP and HTTPS protocols
            return ['http:', 'https:'].includes(absoluteUrl.protocol) ? absoluteUrl.href : undefined;
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'url parse failed:', e instanceof Error ? e.message : e);
            return undefined;
        }
    };
    // Register a metadata-poor entry only if the URL is not already known.
    const addIfAbsent = (src, alt) => {
        if (!images.has(src))
            images.set(src, { src, alt });
    };
    // Positive integer from a width/height attribute, otherwise undefined.
    const parseDimension = (value) => {
        const parsed = value ? parseInt(value, 10) : undefined;
        return parsed && !isNaN(parsed) ? parsed : undefined;
    };
    // Extract <img> tags
    $('img[src]').each((_, elem) => {
        const $img = $(elem);
        const rawSrc = $img.attr('src');
        if (!rawSrc)
            return;
        const src = resolveHttpUrl(rawSrc);
        if (!src)
            return;
        // Among duplicate <img> tags the last one wins.
        images.set(src, {
            src,
            alt: $img.attr('alt') || '',
            title: $img.attr('title'),
            width: parseDimension($img.attr('width')),
            height: parseDimension($img.attr('height')),
        });
    });
    // Extract <picture><source> tags
    $('picture source[srcset]').each((_, elem) => {
        const $source = $(elem);
        const srcset = $source.attr('srcset');
        if (!srcset)
            return;
        // Parse srcset (format: "url 1x, url 2x" or "url 100w, url 200w")
        for (const part of srcset.split(',')) {
            const candidate = part.trim().split(/\s+/)[0];
            if (!candidate)
                continue;
            const src = resolveHttpUrl(candidate);
            if (!src)
                continue;
            // Try to get alt from parent picture's img
            addIfAbsent(src, $source.closest('picture').find('img').attr('alt') || '');
        }
    });
    // Extract CSS background images
    $('[style*="background"]').each((_, elem) => {
        const style = $(elem).attr('style');
        if (!style)
            return;
        // Match url() in CSS; capture group 1 is the URL without quotes.
        for (const match of style.matchAll(/url\(['"]?([^'")\s]+)['"]?\)/g)) {
            const src = resolveHttpUrl(match[1]);
            if (!src)
                continue;
            // Background images don't have alt text
            addIfAbsent(src, '');
        }
    });
    return Array.from(images.values());
}
|
|
390
|
+
/**
 * Extract all metadata from HTML.
 *
 * Optimization: when a <head> section is found, only that section plus the
 * first <h1> of the document is parsed with cheerio, which avoids a full DOM
 * parse. The original opening <html> tag is kept so attributes such as
 * `lang` stay visible to the extractors. When no <head> section is found,
 * the full HTML is parsed instead.
 *
 * Note: in the head-only case, extractors that query the DOM cannot see
 * elements located in the body other than that first <h1>. The word count
 * is always computed from the full HTML string.
 *
 * @param html - Raw HTML of the page
 * @param _url - Unused
 * @returns Object with the page `title` and a `metadata` record
 */
export function extractMetadata(html, _url) {
    // Extract only the <head> section for faster cheerio parsing
    // This avoids parsing the entire body DOM just for meta tags
    let headHtml = html;
    const headMatch = html.match(/<head[\s>][\s\S]*?<\/head>/i);
    if (headMatch) {
        // Reuse the document's own <html ...> opening tag; a bare <html> would
        // drop its attributes (notably lang) from the parsed shell.
        const htmlOpenMatch = html.match(/<html(?:\s[^>]*)?>/i);
        const htmlOpen = htmlOpenMatch ? htmlOpenMatch[0] : '<html>';
        // Include a minimal body shell so cheerio parses it correctly,
        // and append the first <h1> from body for the title fallback
        const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
        headHtml = `${htmlOpen}${headMatch[0]}<body>${h1Match ? h1Match[0] : ''}</body></html>`;
    }
    const $ = cheerio.load(headHtml);
    const title = extractTitle($);
    const publishDate = extractPublishDate($, html);
    const language = extractLanguage($);
    const wordCount = extractWordCount(html);
    const metadata = {
        description: extractDescription($),
        author: extractAuthor($),
        published: extractPublished($),
        image: extractImage($),
        canonical: extractCanonical($),
        // publishDate and language are only included when a value was found.
        ...(publishDate ? { publishDate } : {}),
        ...(language ? { language } : {}),
        wordCount,
    };
    return { title, metadata };
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel Observe — Give agents structured "eyes" on a web page.
|
|
3
|
+
*
|
|
4
|
+
* Returns a machine-readable map of interactive elements (links, buttons,
|
|
5
|
+
* inputs, forms, selects, media) so agents can decide what to do next
|
|
6
|
+
* without needing a vision model.
|
|
7
|
+
*
|
|
8
|
+
* This bridges the gap between:
|
|
9
|
+
* - `peel()` / `webpeel_read` → markdown content (strips interaction cues)
|
|
10
|
+
* - `webpeel_act` → requires knowing selectors already
|
|
11
|
+
*
|
|
12
|
+
* With `observe()`, the loop becomes:
|
|
13
|
+
* 1. observe(url) → see what's on the page
|
|
14
|
+
* 2. decide which element to interact with
|
|
15
|
+
* 3. act(url, actions) → do it
|
|
16
|
+
* 4. observe again → see the result
|
|
17
|
+
*/
|
|
18
|
+
export interface ObserveOptions {
    /**
     * URL to observe.
     * Required unless passing an existing Page.
     */
    url?: string;
    /**
     * Whether to use browser rendering.
     * Default: true — observation inherently needs the rendered DOM.
     */
    render?: boolean;
    /**
     * CSS selector that scopes the observation, e.g. 'main' or '#content'.
     */
    selector?: string;
    /**
     * Viewport to use: one of the presets 'desktop', 'mobile' or 'tablet',
     * or explicit dimensions as {width, height}.
     */
    viewport?: 'desktop' | 'mobile' | 'tablet' | { width: number; height: number };
    /**
     * Include a screenshot alongside the structured data.
     * Default: false.
     */
    screenshot?: boolean;
    /**
     * Capture the full page in the screenshot.
     * Default: false.
     */
    screenshotFullPage?: boolean;
    /**
     * Maximum number of elements to return per category.
     * Default: 50.
     */
    maxElements?: number;
    /**
     * Timeout in milliseconds.
     * Default: 30000.
     */
    timeout?: number;
    /**
     * Use stealth mode.
     * Default: false.
     */
    stealth?: boolean;
}
|
|
41
|
+
export interface ObservedElement {
    /**
     * Auto-generated index for easy reference,
     * e.g. "link-0", "button-3", "input-2".
     */
    ref: string;
    /**
     * Tag name of the element (a, button, input, select, textarea, etc.).
     */
    tag: string;
    /**
     * Best CSS selector for targeting this element.
     */
    selector: string;
    /**
     * Visible text content, truncated to 120 characters.
     */
    text: string;
    /**
     * Semantic role or purpose of the element.
     */
    role: string;
    /**
     * Extra attributes that help identify the element's purpose.
     */
    attributes: Record<string, string>;
    /**
     * True when the element is visible in the current viewport.
     */
    inViewport: boolean;
    /**
     * Bounding box { x, y, width, height }, relative to the viewport.
     */
    bbox?: { x: number; y: number; width: number; height: number };
}
|
|
64
|
+
export interface ObserveResult {
    /**
     * Final URL after redirects.
     */
    url: string;
    /**
     * Title of the page.
     */
    title: string;
    /**
     * Dimensions of the current viewport.
     */
    viewport: { width: number; height: number };
    /**
     * Scroll dimensions of the page.
     */
    scroll: { width: number; height: number };
    /**
     * Interactive elements, grouped by type.
     */
    elements: {
        links: ObservedElement[];
        buttons: ObservedElement[];
        inputs: ObservedElement[];
        selects: ObservedElement[];
        forms: ObservedElement[];
        media: ObservedElement[];
    };
    /**
     * Total count of discovered elements.
     */
    totalElements: number;
    /**
     * Plain-text summary for quick consumption by an agent.
     */
    summary: string;
    /**
     * Optional screenshot, as a base64-encoded PNG.
     */
    screenshot?: string;
    /**
     * Elapsed time in milliseconds.
     */
    elapsed: number;
}
|
|
97
|
+
/**
 * Observe a web page and resolve with a structured map of its
 * interactive elements.
 *
 * @param options - What to observe and how (see ObserveOptions)
 * @returns Promise resolving to the ObserveResult for the page
 *
 * @example
 * ```typescript
 * import { observe } from 'webpeel';
 *
 * const result = await observe({ url: 'https://news.ycombinator.com' });
 * console.log(result.elements.links.length); // e.g. 30
 * console.log(result.elements.links[0].ref); // "link-0"
 * console.log(result.elements.links[0].text); // "Show HN: ..."
 * console.log(result.elements.links[0].selector); // "a[href='item?id=12345']"
 * console.log(result.summary);
 * // "30 links, 2 buttons, 1 input, 1 form. Key actions: ..."
 * ```
 */
export declare function observe(options: ObserveOptions): Promise<ObserveResult>;
|