@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Readability Engine
|
|
3
|
+
*
|
|
4
|
+
* Extracts the core article content from a web page — like Pocket, Instapaper,
|
|
5
|
+
* or Safari Reader Mode but deterministic, fast, and purpose-built for AI agents.
|
|
6
|
+
*
|
|
7
|
+
* Algorithm:
|
|
8
|
+
* 1. Noise removal — strip nav, footer, aside, ads, cookie banners, etc.
|
|
9
|
+
* 2. Candidate scoring — score block elements by text density, link density,
|
|
10
|
+
* paragraph count, and structural signals.
|
|
11
|
+
* 3. Best candidate selection — prefer <article> > <main> > highest-scoring div.
|
|
12
|
+
* 4. Post-selection cleaning — remove inline noise (share buttons, etc.).
|
|
13
|
+
* 5. Metadata extraction — title, author, date, site name from meta tags / bylines.
|
|
14
|
+
 *   6. Markdown output — via the existing rawHtmlToMarkdown() imported from ./markdown.js.
|
|
15
|
+
*/
|
|
16
|
+
import * as cheerio from 'cheerio';
|
|
17
|
+
import { rawHtmlToMarkdown } from './markdown.js';
|
|
18
|
+
import { cleanConcatenatedTitle } from './metadata.js';
|
|
19
|
+
// ─── Noise patterns ───────────────────────────────────────────────────────────
|
|
20
|
+
/**
 * Element tag names treated as page chrome rather than article content.
 * `isNoise` rejects any element whose lower-cased tag name is in this set
 * before looking at its class, id, or role.
 */
const NOISE_TAGS = new Set([
    // Structural page chrome
    'nav',
    'footer',
    'aside',
    'header',
    // Non-content / embedded / interactive elements
    'script',
    'style',
    'noscript',
    'iframe',
    'form',
]);
|
|
25
|
+
/**
|
|
26
|
+
* Class/id patterns that indicate page chrome (case-insensitive).
|
|
27
|
+
* Applied to combined class+id strings.
|
|
28
|
+
*/
|
|
29
|
+
const NOISE_CLASS_PATTERNS = [
|
|
30
|
+
/\bsidebar\b/,
|
|
31
|
+
/\bmenu\b/,
|
|
32
|
+
/\bnav(bar|igation)?\b/,
|
|
33
|
+
/\bfooter\b/,
|
|
34
|
+
/\bcomment/,
|
|
35
|
+
/\bshare\b/,
|
|
36
|
+
/\bsocial/,
|
|
37
|
+
/\bwidget\b/,
|
|
38
|
+
/\bad(s|vert(isement)?|-unit)?\b/,
|
|
39
|
+
/\bpromo\b/,
|
|
40
|
+
/\bbanner(?!-content)/,
|
|
41
|
+
/\bcookie\b/,
|
|
42
|
+
/\bconsent\b/,
|
|
43
|
+
/\bnewsletter\b/,
|
|
44
|
+
/\bsignup\b/,
|
|
45
|
+
/\bsign-up\b/,
|
|
46
|
+
/\bsubscri/,
|
|
47
|
+
/\brelated\b/,
|
|
48
|
+
/\brecommended\b/,
|
|
49
|
+
/\bpopular\b/,
|
|
50
|
+
/\btrending\b/,
|
|
51
|
+
/\bbreadcrumb/,
|
|
52
|
+
/\bpagination\b/,
|
|
53
|
+
/\btoolbar\b/,
|
|
54
|
+
/\bmodal\b/,
|
|
55
|
+
/\bpopup\b/,
|
|
56
|
+
/\boverlay\b/,
|
|
57
|
+
/\btoast\b/,
|
|
58
|
+
/\bnotification\b/,
|
|
59
|
+
/\bskip-?link\b/,
|
|
60
|
+
];
|
|
61
|
+
/**
 * Values of the `role` attribute that mark an element as page chrome.
 * `isNoise` compares the lower-cased `role` attribute against this set.
 */
const NOISE_ROLES = new Set([
    'navigation',
    'banner',
    'contentinfo',
    'complementary',
    'search',
]);
|
|
65
|
+
/** Class/id patterns that indicate content (protect from removal) */
|
|
66
|
+
const CONTENT_PATTERNS = [
|
|
67
|
+
/\barticle/,
|
|
68
|
+
/\bpost-?content/,
|
|
69
|
+
/\bentry-?content/,
|
|
70
|
+
/\bmain-?content/,
|
|
71
|
+
/\bstory\b/,
|
|
72
|
+
/\bpage-?content/,
|
|
73
|
+
/\bcontent-?area\b/,
|
|
74
|
+
/\bprose\b/,
|
|
75
|
+
/\bmarkdown-?body\b/,
|
|
76
|
+
];
|
|
77
|
+
/** Inline noise patterns for post-selection cleanup */
|
|
78
|
+
const INLINE_NOISE_PATTERNS = [
|
|
79
|
+
/\bshare\b/,
|
|
80
|
+
/\bsocial\b/,
|
|
81
|
+
/\bfollow\b/,
|
|
82
|
+
/\btwitter\b/,
|
|
83
|
+
/\bfacebook\b/,
|
|
84
|
+
/\blinkedin\b/,
|
|
85
|
+
/\binstagram\b/,
|
|
86
|
+
/\bpinterest\b/,
|
|
87
|
+
/\bprint\b/,
|
|
88
|
+
/\bsave\b/,
|
|
89
|
+
/\bbookmark\b/,
|
|
90
|
+
];
|
|
91
|
+
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
92
|
+
function getClassAndId($el, _$) {
|
|
93
|
+
const cls = ($el.attr('class') ?? '').toLowerCase();
|
|
94
|
+
const id = ($el.attr('id') ?? '').toLowerCase();
|
|
95
|
+
return cls + ' ' + id;
|
|
96
|
+
}
|
|
97
|
+
/**
 * Decides whether an element is page chrome rather than article content.
 *
 * Checks run in this order, and the first decisive one wins:
 *   1. Tag name in NOISE_TAGS                      -> noise.
 *   2. class/id matches any CONTENT_PATTERNS       -> not noise (protected).
 *   3. class/id matches any NOISE_CLASS_PATTERNS   -> noise.
 *   4. `role` attribute in NOISE_ROLES             -> noise.
 * Anything else is not noise.
 *
 * @param el Raw element node; its `tagName` is read directly.
 * @param $  Function that wraps `el` so attributes can be read via `.attr()`.
 * @returns `true` when the element should be treated as noise.
 */
function isNoise(el, $) {
    if (NOISE_TAGS.has((el.tagName ?? '').toLowerCase())) {
        return true;
    }
    const $node = $(el);
    const signature = getClassAndId($node, $);
    const matchesSignature = (pattern) => pattern.test(signature);
    // Content patterns take precedence over noise patterns, so a class/id
    // matching both (e.g. something containing "article") is protected.
    if (CONTENT_PATTERNS.some(matchesSignature)) {
        return false;
    }
    if (NOISE_CLASS_PATTERNS.some(matchesSignature)) {
        return true;
    }
    return NOISE_ROLES.has(($node.attr('role') ?? '').toLowerCase());
}
|
|
117
|
+
function isHidden($el) {
|
|
118
|
+
const style = ($el.attr('style') ?? '').toLowerCase();
|
|
119
|
+
if (style.includes('display:none') || style.includes('display: none'))
|
|
120
|
+
return true;
|
|
121
|
+
if ($el.attr('hidden') !== undefined)
|
|
122
|
+
return true;
|
|
123
|
+
if ($el.attr('aria-hidden') === 'true')
|
|
124
|
+
return true;
|
|
125
|
+
return false;
|
|
126
|
+
}
|
|
127
|
+
/**
 * Collect page-level metadata from the parsed document.
 *
 * Sources, in priority order:
 *   - title:    og:title, twitter:title, <title>, first <h1>
 *   - author:   author / article:author meta tags, then rel/itemprop author
 *               elements, then common byline class names
 *   - date:     publish-date meta tags, first <time datetime>, JSON-LD
 *               `datePublished` (top-level objects, then `@graph` members)
 *   - siteName: og:site_name
 *   - language: primary subtag of <html lang>, else the Content-Language meta
 *
 * @param $ - Cheerio root for the page.
 * @returns `{ title, author, date, siteName, language }`. `author`, `date`,
 *          `siteName` and `language` are a non-empty string or `null`; `title`
 *          is whatever `cleanConcatenatedTitle` (defined elsewhere in this
 *          file) returns for the normalised title text.
 */
function extractMeta($) {
    // Title — prefer og:title, then twitter:title, then <title>, then the first <h1>
    let title = $('meta[property="og:title"]').attr('content') ||
        $('meta[name="twitter:title"]').attr('content') ||
        $('title').text() ||
        $('h1').first().text() ||
        '';
    title = cleanConcatenatedTitle(title.trim().replace(/\s+/g, ' '));
    // Author
    let author = $('meta[name="author"]').attr('content') ||
        $('meta[property="article:author"]').attr('content') ||
        null;
    // Structured author data (rel="author", itemprop="author").
    // Only accept short text; whitespace (including newlines) is collapsed
    // first, so a length check is all that is needed.
    if (!author) {
        for (const sel of ['[rel="author"]', '[itemprop="author"]']) {
            const text = $(sel).first().text().trim().replace(/\s+/g, ' ');
            if (text.length > 1 && text.length < 60) {
                author = text;
                break;
            }
        }
    }
    // Byline patterns — look for common class names
    if (!author) {
        const bylineSelectors = [
            '.byline', '.author:not([class*="authority"])', '.post-author',
            '.article-author', '.entry-author', '[class*="byline"]',
        ];
        for (const sel of bylineSelectors) {
            const text = $(sel).first().text().trim().replace(/\s+/g, ' ');
            if (text.length > 1 && text.length < 80) {
                // Strip "By " prefix common in bylines
                author = text.replace(/^by\s+/i, '').trim();
                break;
            }
        }
    }
    // Sanity check: author shouldn't look like junk (too many words, has "database", etc.)
    if (author && (author.split(/\s+/).length > 8 || /database|control|footer|sidebar/i.test(author))) {
        author = null;
    }
    if (author)
        author = author.trim().replace(/\s+/g, ' ') || null;
    // Date
    let date = $('meta[property="article:published_time"]').attr('content') ||
        $('meta[name="publishdate"]').attr('content') ||
        $('meta[name="publish_date"]').attr('content') ||
        $('meta[itemprop="datePublished"]').attr('content') ||
        null;
    if (!date) {
        // Look for <time> elements
        const timeEl = $('time[datetime]').first();
        if (timeEl.length) {
            date = timeEl.attr('datetime') || timeEl.text().trim() || null;
        }
    }
    if (!date) {
        // Look for JSON-LD datePublished
        $('script[type="application/ld+json"]').each((_, el) => {
            if (date)
                return false; // already found — stop iterating
            try {
                const parsed = JSON.parse($(el).html() ?? '{}');
                const roots = Array.isArray(parsed) ? parsed : [parsed];
                // Top-level objects first, then members of any "@graph" array.
                const graphNodes = roots.flatMap((root) => Array.isArray(root?.['@graph']) ? root['@graph'] : []);
                for (const node of [...roots, ...graphNodes]) {
                    const published = node?.datePublished;
                    // Accept strings only: a number/object here would make the
                    // later date.trim() call throw and abort the extraction.
                    if (typeof published === 'string' && published.trim()) {
                        date = published;
                        break;
                    }
                }
            }
            catch { /* ignore parse errors */ }
            return undefined;
        });
    }
    if (date)
        date = date.trim() || null;
    // Site name
    const siteName = $('meta[property="og:site_name"]').attr('content')?.trim() ||
        null;
    // Language
    const language = $('html').attr('lang')?.trim().split('-')[0] ||
        $('meta[http-equiv="Content-Language"]').attr('content')?.trim() ||
        null;
    return { title, author, date, siteName, language };
}
|
|
213
|
+
// ─── Noise removal ────────────────────────────────────────────────────────────
|
|
214
|
+
/**
 * Strip hidden and boilerplate elements from the document in place.
 *
 * First drops nodes matched by the hidden-element selectors, then walks the
 * <body> subtree top-down collecting every element flagged by `isNoise` or
 * `isHidden`, and finally removes the collected elements.
 *
 * @param $ - Cheerio root; mutated.
 */
function removeNoise($) {
    // Explicitly hidden nodes go first
    $('[aria-hidden="true"], [hidden]').remove();
    $('[style*="display:none"], [style*="display: none"]').remove();
    // Tags that are never inspected (nor descended into) by the walk
    const skippedTags = ['script', 'style', 'meta', 'link'];
    const doomed = [];
    const visit = (node) => {
        if (node.type !== 'tag')
            return;
        const tag = (node.tagName ?? '').toLowerCase();
        if (skippedTags.includes(tag))
            return;
        if (isNoise(node, $) || isHidden($(node))) {
            // Record and stop: descendants disappear together with this node
            doomed.push(node);
            return;
        }
        for (const child of node.children ?? []) {
            visit(child);
        }
    };
    const bodyNode = $('body').get(0);
    if (bodyNode)
        visit(bodyNode);
    // Removal is deferred so the tree is not mutated while it is being walked
    doomed.forEach((node) => {
        $(node).remove();
    });
}
|
|
243
|
+
/**
 * Compute a heuristic "main content" score for an element.
 *
 * The score rewards paragraph count, visible text length and text density,
 * penalises link-heavy elements and noise-like class/id names, and adds a
 * bonus when the element has a <main> or <article> ancestor.
 *
 * @param $el - Cheerio selection for the element being scored.
 * @param $ - Cheerio root that owns `$el`.
 * @returns The numeric score; `0` when the element serialises to empty HTML.
 */
function scoreCandidate($el, $) {
    const markupLength = ($.html($el) ?? '').length;
    if (markupLength === 0)
        return 0;
    // Measure visible text on a copy with script/style/noscript stripped
    const stripped = $el.clone();
    stripped.find('script, style, noscript').remove();
    const textLength = (stripped.text() ?? '').trim().length;
    const textDensity = textLength / Math.max(markupLength, 1);
    // Share of the text that sits inside anchors
    let anchorChars = 0;
    $el.find('a').each((_, anchor) => {
        anchorChars += ($(anchor).text() ?? '').trim().length;
    });
    const linkDensity = textLength > 0 ? anchorChars / textLength : 1;
    const paragraphCount = $el.find('p').length;
    // Base: paragraphs × 3 + text length bonus − link density penalty
    let score = paragraphCount * 3 + textLength / 100 - linkDensity * 100;
    // Dense text earns extra
    score += textDensity * 20;
    // A single penalty if any noise pattern matches the class/id string
    const classAndId = getClassAndId($el, $);
    for (const pattern of NOISE_CLASS_PATTERNS) {
        if (!pattern.test(classAndId))
            continue;
        score -= 30;
        break;
    }
    // Bonus for living inside <main> or <article>
    if ($el.parents('main, article').length > 0) {
        score += 20;
    }
    return score;
}
|
|
281
|
+
/**
 * Pick the DOM element most likely to hold the page's main content.
 *
 * Priority order:
 *   1. the highest-scoring <article>,
 *   2. the first <main>,
 *   3. the first element with role="main",
 *   4. the highest-scoring <div>/<section> that has at least 200 characters
 *      of HTML, 100 characters of visible text and one <p>.
 *
 * Ties keep the earliest element in document order.
 *
 * @param $ - Cheerio root for the (already noise-stripped) page.
 * @returns The winning DOM node, or `null` when nothing qualifies.
 */
function findBestCandidate($) {
    // Priority 1: <article>
    const articles = $('article');
    if (articles.length > 0) {
        // If multiple articles, pick the one with the best score
        let best = null;
        let bestScore = -Infinity;
        articles.each((_, el) => {
            const s = scoreCandidate($(el), $);
            if (s > bestScore) {
                bestScore = s;
                best = el;
            }
        });
        if (best)
            return best;
    }
    // Priority 2: <main>
    const main = $('main').first();
    if (main.length > 0) {
        return main.get(0);
    }
    // Priority 3: [role="main"]
    const roleMain = $('[role="main"]').first();
    if (roleMain.length > 0) {
        return roleMain.get(0);
    }
    // Priority 4: Highest-scoring div/section.
    // Tracked in a single pass; no candidate list or sort is needed because
    // only the top element is ever returned.
    let winner = null;
    let winnerScore = -Infinity;
    $('div, section').each((_, el) => {
        const $el = $(el);
        // Only consider elements with meaningful content (skip tiny wrappers)
        if (($.html($el) ?? '').length < 200)
            return;
        const clone = $el.clone();
        clone.find('script, style, noscript').remove();
        if (clone.text().trim().length < 100)
            return;
        if ($el.find('p').length < 1)
            return; // Require at least one <p>
        const score = scoreCandidate($el, $);
        // Strict ">" keeps the first element on ties, matching a stable
        // descending sort.
        if (winner === null || score > winnerScore) {
            winner = el;
            winnerScore = score;
        }
    });
    return winner;
}
|
|
339
|
+
// ─── Post-selection cleaning ──────────────────────────────────────────────────
|
|
340
|
+
/**
 * Clean the selected content root in place before markdown conversion.
 *
 * Removes small inline widgets whose class/id matches INLINE_NOISE_PATTERNS
 * and then applies the `include*` switches. A switch only has an effect when
 * it is exactly `false`.
 *
 * @param $candidate - Cheerio selection for the content root; mutated.
 * @param $ - Cheerio root that owns `$candidate`.
 * @param options - `{ includeImages, includeLinks, includeCode, includeTables }`.
 */
function cleanCandidate($candidate, $, options) {
    // Remove remaining inline noise (share buttons, social icons)
    $candidate.find('*').each((_, el) => {
        const $el = $(el);
        const combined = getClassAndId($el, $);
        for (const p of INLINE_NOISE_PATTERNS) {
            if (p.test(combined)) {
                // Only remove if it's clearly a widget, not article text
                const text = $el.text().trim();
                const tagName = el.tagName?.toLowerCase() ?? '';
                const isInlineNoise = tagName === 'div' || tagName === 'span' || tagName === 'ul' || tagName === 'button';
                if (isInlineNoise && text.length < 200) {
                    $el.remove();
                    return;
                }
            }
        }
    });
    // Strip images if not wanted
    if (options.includeImages === false) {
        $candidate.find('img, picture, figure, [class*="image"]').remove();
    }
    // Strip links (keep text) if not wanted
    if (options.includeLinks === false) {
        $candidate.find('a').each((_, el) => {
            const $link = $(el);
            // replaceWith() parses string arguments as HTML, and text() returns
            // entity-decoded text. Re-escape it so link text such as "a < b" or
            // "&lt;b&gt;" stays literal text instead of becoming markup.
            const escaped = $link.text()
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;');
            $link.replaceWith(escaped);
        });
    }
    // Strip code blocks if not wanted
    if (options.includeCode === false) {
        $candidate.find('pre, code').remove();
    }
    // Strip tables if not wanted
    if (options.includeTables === false) {
        $candidate.find('table').remove();
    }
}
|
|
377
|
+
// ─── Excerpt generation ───────────────────────────────────────────────────────
|
|
378
|
+
/**
 * Build a short excerpt from plain text: the first two complete sentences,
 * or the first 200 characters when no complete sentence is found.
 *
 * A sentence ends at `.`, `!` or `?` followed by whitespace or the end of the
 * text, so decimals such as "3.14" are not split. Sentences are joined with a
 * single space.
 *
 * @param text - Plain text (no markdown formatting expected).
 * @returns The excerpt, trimmed.
 */
function extractExcerpt(text) {
    const sentences = text
        // Split only where a terminator is followed by whitespace
        .split(/(?<=[.!?])\s+/)
        .map((piece) => piece.trim())
        // Keep complete sentences only: some real content plus a terminator
        .filter((piece) => /[^.!?]/.test(piece) && /[.!?]$/.test(piece));
    if (sentences.length === 0) {
        // Fallback: first 200 chars
        return text.slice(0, 200).trim();
    }
    return sentences.slice(0, 2).join(' ');
}
|
|
387
|
+
// ─── Reading time ─────────────────────────────────────────────────────────────
|
|
388
|
+
/**
 * Format an estimated reading time for a word count.
 *
 * Uses 200 words per minute, rounds to the nearest minute and never reports
 * less than one minute.
 *
 * @param wordCount - Number of words in the content.
 * @returns A label of the form "<n> min read".
 */
function calcReadingTime(wordCount) {
    const rounded = Math.round(wordCount / 200);
    const minutes = rounded < 1 ? 1 : rounded;
    return `${minutes} min read`;
}
|
|
392
|
+
// ─── Output post-processing ───────────────────────────────────────────────────
|
|
393
|
+
/**
 * Post-process markdown output to remove residual noise that survives
 * extraction: skip-to-content links, breadcrumb lines, cookie consent text,
 * orphaned link references, repeated dividers and excessive blank lines.
 *
 * @param content - Markdown text.
 * @returns The cleaned markdown, trimmed.
 */
function cleanReadabilityOutput(content) {
    return content
        // Remove skip-to-content links
        .replace(/\[skip to (?:main )?content\]\([^)]*\)/gi, '')
        // Remove standalone breadcrumb lines (e.g. "Home > Category > Page").
        // Only horizontal whitespace may sit between the first word and the
        // separator: with \s the match could jump to the next line and delete
        // e.g. a "> quote" line that merely follows a line reading "Home".
        .replace(/^(?:Home|Main)[^\S\r\n]*[>›»].*/gm, '')
        // Remove cookie consent patterns (from the phrase to the end of its line)
        .replace(/(?:we use cookies|cookie (?:policy|settings|preferences)).*$/gim, '')
        // Remove orphaned link references like [something]: #
        .replace(/^\[.*?\]:\s*#?\s*$/gm, '')
        // Clean up leftover consecutive divider noise (e.g. "--- --- ---" → single "---")
        .replace(/(?:---\s*){2,}/g, '---\n')
        // Collapse runs of 4+ newlines down to 3 (i.e. at most two blank lines)
        .replace(/\n{4,}/g, '\n\n\n')
        .trim();
}
|
|
414
|
+
// ─── Main export ──────────────────────────────────────────────────────────────
|
|
415
|
+
/**
 * Extract clean, readable article content from raw HTML.
 *
 * Pipeline: read metadata, strip noise, pick the best content element, apply
 * the include* switches, convert to markdown, prepend a title and meta line
 * ("By author · date · reading time"), tidy whitespace and residual noise,
 * then optionally truncate.
 *
 * @param html - Raw HTML of the page. Only the first 10 * 1024 * 1024
 *               characters are processed.
 * @param _url - Source URL. Accepted for interface compatibility; this
 *               function does not read it.
 * @param options - Extraction options: `includeImages`, `includeLinks`,
 *                  `includeCode`, `includeTables` (each defaults to true) and
 *                  `maxLength` (truncates `content` and appends a
 *                  "[Content truncated]" marker, so the result may exceed
 *                  `maxLength` by the marker's length).
 * @returns `{ title, author, date, siteName, content, excerpt, wordCount,
 *          readingTime, language }`. `wordCount` and `excerpt` are derived
 *          from the converted body before the title/meta header is added.
 */
export function extractReadableContent(html, _url, options = {}) {
    const { includeImages = true, includeLinks = true, includeCode = true, includeTables = true, maxLength, } = options;
    // Security: cap HTML size (measured in string length, not bytes)
    if (html.length > 10 * 1024 * 1024) {
        html = html.slice(0, 10 * 1024 * 1024);
    }
    // Handle empty input gracefully
    if (!html.trim()) {
        return {
            title: '',
            author: null,
            date: null,
            siteName: null,
            content: '',
            excerpt: '',
            wordCount: 0,
            readingTime: '1 min read',
            language: null,
        };
    }
    const $ = cheerio.load(html);
    // ── Step 1: Extract metadata BEFORE noise removal (meta tags in <head> must survive) ──
    const meta = extractMeta($);
    // ── Step 2: Noise removal ──────────────────────────────────────────────────
    removeNoise($);
    // ── Step 3: Find best candidate ────────────────────────────────────────────
    const bestEl = findBestCandidate($);
    let candidateHtml;
    if (bestEl) {
        candidateHtml = $.html($(bestEl)) ?? '';
    }
    else {
        // Fallback: use cleaned body content
        candidateHtml = $('body').html() ?? $.html();
    }
    // ── Step 4: Post-selection cleaning ────────────────────────────────────────
    // The candidate is re-parsed into its own document so cleaning works on
    // that fragment only.
    const $candidate = cheerio.load(candidateHtml);
    const $root = $candidate('body');
    cleanCandidate($root, $candidate, { includeImages, includeLinks, includeCode, includeTables });
    const cleanedHtml = $candidate('body').html() ?? candidateHtml;
    // ── Step 5: Convert to markdown ────────────────────────────────────────────
    let content = rawHtmlToMarkdown(cleanedHtml);
    // ── Step 6: Build metadata header ──────────────────────────────────────────
    // Use H1 from content as title if meta title is missing or very short
    if (!meta.title || meta.title.length < 3) {
        const h1Match = content.match(/^#\s+(.+)$/m);
        if (h1Match)
            meta.title = h1Match[1].trim();
    }
    // Extract word count from plain content text
    const plainText = content.replace(/[#*_`\[\]\(\)>|-]/g, ' ').replace(/\s+/g, ' ').trim();
    const wordCount = plainText.split(/\s+/).filter(w => w.length > 0).length;
    const readingTime = calcReadingTime(wordCount);
    // Build metadata line
    const metaParts = [];
    if (meta.author)
        metaParts.push(`By ${meta.author}`);
    if (meta.date) {
        // Try to format the date nicely
        try {
            const d = new Date(meta.date);
            if (!isNaN(d.getTime())) {
                // Format in UTC: date-only ISO strings are parsed as UTC
                // midnight, so formatting in the host timezone can show the
                // previous day and makes the output machine-dependent.
                metaParts.push(d.toLocaleDateString('en-US', { year: 'numeric', month: 'long', day: 'numeric', timeZone: 'UTC' }));
            }
            else {
                metaParts.push(meta.date);
            }
        }
        catch {
            metaParts.push(meta.date);
        }
    }
    metaParts.push(readingTime);
    const metaLine = metaParts.length > 0 ? `*${metaParts.join(' · ')}*\n\n` : '';
    const titleLine = meta.title ? `# ${meta.title}\n${metaLine}` : metaLine;
    // Don't duplicate title if it's already the first heading in content
    const contentStartsWithTitle = meta.title &&
        content.trimStart().startsWith(`# ${meta.title}`);
    if (!contentStartsWithTitle && titleLine) {
        content = titleLine + content;
    }
    else if (contentStartsWithTitle && metaLine) {
        // Inject meta line right after the title heading.
        // - trimStart() keeps the anchored regex consistent with the check
        //   above, which also ignores leading whitespace.
        // - A replacer function is used so "$&", "$1" etc. inside the author
        //   or date are inserted literally instead of being treated as
        //   replacement patterns.
        content = content.trimStart().replace(/^(#\s+.+\n)/, (heading) => heading + metaLine);
    }
    // ── Step 7: Clean up whitespace ─────────────────────────────────────────────
    content = content.replace(/\n{3,}/g, '\n\n').trim();
    // ── Step 7b: Remove residual noise (skip-links, breadcrumbs, cookie text) ──
    content = cleanReadabilityOutput(content);
    // ── Step 8: Apply maxLength ──────────────────────────────────────────────────
    if (maxLength && maxLength > 0 && content.length > maxLength) {
        content = content.slice(0, maxLength).trim() + '\n\n[Content truncated]';
    }
    // ── Step 9: Generate excerpt ─────────────────────────────────────────────────
    // Extract from the plain article text (no markdown formatting)
    const excerpt = extractExcerpt(plainText);
    return {
        title: meta.title,
        author: meta.author,
        date: meta.date,
        siteName: meta.siteName,
        content,
        excerpt,
        wordCount,
        readingTime,
        language: meta.language,
    };
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel Deep Research Agent
|
|
3
|
+
*
|
|
4
|
+
* Autonomously searches the web, fetches top sources, filters content with
|
|
5
|
+
* BM25, optionally follows promising links, and synthesizes a comprehensive
|
|
6
|
+
* report using an LLM.
|
|
7
|
+
*
|
|
8
|
+
* Design principle: orchestrate existing modules (peel, bm25-filter,
|
|
9
|
+
* llm-extract) — don't reinvent anything.
|
|
10
|
+
*/
|
|
11
|
+
/** Input accepted by `research()`. Only `query` is required. */
export interface ResearchOptions {
    /** The question or topic to research. */
    query: string;
    /** Upper bound on the number of sources consulted. Default: 5 */
    maxSources?: number;
    /**
     * How deep link-following may go. Default: 1, which uses search results
     * only; a value of 2 or more also follows links.
     */
    maxDepth?: number;
    /** API key for the LLM that performs the synthesis. */
    apiKey?: string;
    /** Name of the LLM model used for synthesis. Default: gpt-4o-mini */
    model?: string;
    /** Base URL of the LLM API. Default: https://api.openai.com/v1 */
    baseUrl?: string;
    /** Overall time budget in milliseconds. Default: 60000 (1 minute) */
    timeout?: number;
    /**
     * Shape of the output: 'report' for a markdown synthesis, 'sources' for
     * the raw extracted data. Default: 'report'
     */
    outputFormat?: 'report' | 'sources';
    /** Optional listener that receives progress updates. */
    onProgress?: (step: ResearchStep) => void;
}
|
|
31
|
+
/** Progress update passed to the `onProgress` callback of `ResearchOptions`. */
export interface ResearchStep {
    /** Stage of the research run that this update belongs to. */
    phase:
        | 'searching'
        | 'fetching'
        | 'extracting'
        | 'following'
        | 'synthesizing';
    /** Text describing the update. */
    message: string;
    /** Presumably the number of sources discovered so far — confirm against the implementation. */
    sourcesFound?: number;
    /** Presumably the number of sources fetched so far — confirm against the implementation. */
    sourcesFetched?: number;
}
|
|
37
|
+
/** One source listed in a `ResearchResult`. */
export interface ResearchSource {
    /** Address of the source. */
    url: string;
    /** Title of the source. */
    title: string;
    /** The key findings taken from this source. */
    findings: string;
    /** Relevance score in the range 0-1. */
    relevance: number;
}
|
|
45
|
+
/** Value the promise returned by `research()` resolves to. */
export interface ResearchResult {
    /** The synthesized research report, as markdown. */
    report: string;
    /** The sources that were consulted. */
    sources: ResearchSource[];
    /** Total number of sources found (compare with `sourcesConsulted`). */
    totalSourcesFound: number;
    /** Number of sources consulted (compare with `totalSourcesFound`). */
    sourcesConsulted: number;
    /** Time taken, in milliseconds. */
    elapsed: number;
    /** Token usage of the synthesis step. */
    tokensUsed?: { input: number; output: number };
    /** Estimated cost in USD. */
    cost?: number;
}
|
|
63
|
+
/**
 * Run autonomous, multi-step web research on a topic.
 *
 * @param options - Research settings; see `ResearchOptions`.
 * @returns A promise that resolves to the `ResearchResult`.
 */
export declare function research(
    options: ResearchOptions,
): Promise<ResearchResult>;
|