@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content Density Pruner
|
|
3
|
+
*
|
|
4
|
+
* Two-pass pruning to reduce HTML before markdown conversion:
|
|
5
|
+
*
|
|
6
|
+
* Pass 1 — Semantic removal: strip elements whose tag or class/id clearly
|
|
7
|
+
* mark them as page chrome (nav, footer, sidebar, cookie banners, ads).
|
|
8
|
+
*
|
|
9
|
+
* Pass 2 — Density scoring: score remaining block elements by text density,
|
|
10
|
+
* link density, tag importance, and word count. Remove low-scorers.
|
|
11
|
+
*
|
|
12
|
+
* Inspired by Crawl4AI's PruningContentFilter — targets 40-60% token savings.
|
|
13
|
+
*/
|
|
14
|
+
import * as cheerio from 'cheerio';
|
|
15
|
+
// -----------------------------------------------------------------------
|
|
16
|
+
// Pass 1 — Semantic removal: tags and class/id patterns
|
|
17
|
+
// -----------------------------------------------------------------------
|
|
18
|
+
/**
 * Element names that are almost always page chrome rather than article
 * content. Membership is tested against the lowercased tag name.
 */
const CHROME_TAGS = new Set([
    'nav',
    'footer',
    'aside',
    'noscript',
]);
|
|
22
|
+
/**
 * Regular expressions that flag an element as page chrome when they match
 * its class/id text. The patterns contain lowercase literals only and carry
 * no `i` flag, so the text under test must already be lowercased.
 */
const CHROME_PATTERNS = [
    // --- Layout furniture, ads, overlays, sharing, sign-in prompts ---
    /\bsidebar\b/,
    /\bcookie/,
    /\bbanner\b/,
    /\b(ad|ads|advert)\b/,
    /\bpopup\b/,
    /\bmodal\b/,
    /\boverlay\b/,
    /\bsocial/,
    /\bshare\b/,
    /\bbreadcrumb/,
    /\bskip-?link/,
    /\bfootnote/,
    /\brelated-?(post|article)/,
    /\bnewsletter/,
    /\bsubscri/,
    /\bcomment/,
    /\b(sign-?up|sign-?in|log-?in)\b/,
    /\btoc\b/,
    /\btable-?of-?contents\b/,
    /\bgdpr\b/,
    /\bconsent\b/,

    // --- Q&A sites (Stack Overflow, StackExchange) ---
    /\bvote\b/,
    /\bpost-?menu/,
    /\bjs-vote/,
    /\buser-?card/,
    /\buser-?info/,
    /\bpost-?tag/,
    /\bquestion-?stats/,

    // --- Social/sharing UI ---
    /\bshare-?(button|link|panel|menu|bar)/,
    /\bfollow-?button/,
    /\breaction/,
    /\blike-?button/,
    /\bupvote/,
    /\bdownvote/,

    // --- Edit/action UI ---
    /\bedit-?(link|button|post)/,
    /\breport-?(link|button)/,
    /\bflag-?(link|button)/,

    // --- Generic site chrome ---
    /\btop-?bar/,
    /\bsite-?header/,
    /\bpage-?header/,
    /\bsticky-?header/,
    /\bnotice\b/,
    /\balert\b/,
    /\btoast\b/,
    /\bsnackbar/,
    /\bbottom-?bar/,
    /\bfloating/,
    /\bfixed-?bottom/,
    /\bback-?to-?top/,

    // --- Interactive UI elements (non-content) ---
    // The negative lookaheads spare e.g. "toggle-content" / "accordion-body",
    // which name the content pane rather than the control.
    /\bquiz\b/,
    /\bquestionnaire\b/,
    /\btoggle(?!-content|-body|-text)\b/,
    /\bcarousel\b/,
    /\baccordion(?!-content|-body|-text)\b/,
    /\bstepper\b/,
    /\bpagination\b/,
    /\btabs-?(?:list|nav|bar)\b/,
    /\bcookie-?(?:banner|bar|notice|consent|popup)\b/,
];
|
|
91
|
+
/**
 * Container tags that are never stripped themselves because they most
 * likely wrap the main content; their children are still visited.
 */
const PROTECTED_TAGS = new Set([
    'main',
    'article',
    'body',
]);
|
|
96
|
+
/**
 * Tags exempt from removal during density scoring (Pass 2).
 * Headings, paragraphs and other semantic content elements carry essential
 * meaning even when they are small, so they must survive scoring.
 */
const DENSITY_SAFE_TAGS = new Set([
    // Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    // Text-bearing content elements
    'p', 'pre', 'code', 'blockquote', 'figcaption',
    // Top-level content containers
    'main', 'article', 'body',
    // Table structural elements — the pruner must not remove these, or
    // Turndown GFM can't convert tables and falls back to raw HTML output.
    'table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td',
]);
|
|
109
|
+
/**
 * Regular expressions that mark an element's class/id text as real content,
 * shielding it from removal. Like the chrome patterns, they are written in
 * lowercase and expect lowercased input.
 */
const CONTENT_PATTERNS = [
    // --- Common article / CMS content containers ---
    /\barticle/,
    /\bpost-?content/,
    /\bentry-?content/,
    /\bmain-?content/,
    /\bstory/,
    /\bblog/,
    /\bpage-?content/,
    /\bcontent-?area/,

    // --- Wikipedia/MediaWiki data tables — always content, never chrome ---
    /\bwikitable\b/,
    /\bmw-parser-output\b/,
];
|
|
125
|
+
/**
 * Decide whether an element is page chrome from its tag name, class/id
 * text and `role` attribute.
 *
 * The checks run in a fixed order and the first one that fires wins:
 *   1. tag name is in CHROME_TAGS              → chrome
 *   2. class/id matches any CONTENT_PATTERNS   → not chrome
 *   3. class/id matches any CHROME_PATTERNS    → chrome
 *   4. role is one of the landmark roles below → chrome
 *   5. otherwise                               → not chrome
 *
 * NOTE(review): because step 2 precedes step 3, a class such as
 * "related-articles" is kept (it matches /\barticle/) even though a chrome
 * pattern also matches it — confirm this precedence is intended.
 *
 * @param el - Node to classify; only its `tagName` is read directly.
 * @param $ - Cheerio-style function; `$(el).attr(name)` supplies attributes.
 * @returns {boolean} true when the element should be treated as chrome.
 */
function isChromeBySemantic(el, $) {
    const tag = el.tagName?.toLowerCase() ?? '';
    if (CHROME_TAGS.has(tag)) {
        return true;
    }

    // Missing attributes read as the empty string; everything is lowercased
    // because the pattern lists are written in lowercase.
    const readAttr = (name) => ($(el).attr(name) ?? '').toLowerCase();

    const classText = readAttr('class');
    const idText = readAttr('id');
    const haystack = `${classText} ${idText}`;

    // Content markers take precedence over chrome markers.
    if (CONTENT_PATTERNS.some((pattern) => pattern.test(haystack))) {
        return false;
    }
    if (CHROME_PATTERNS.some((pattern) => pattern.test(haystack))) {
        return true;
    }

    // Fall back to the role attribute.
    const chromeRoles = ['navigation', 'banner', 'complementary', 'contentinfo', 'search'];
    return chromeRoles.includes(readAttr('role'));
}
|
|
148
|
+
// -----------------------------------------------------------------------
|
|
149
|
+
// Pass 2 — Density scoring
|
|
150
|
+
// -----------------------------------------------------------------------
|
|
151
|
+
/**
 * Tag importance scores for density scoring, ranging from -2 to +3.
 *
 * The table is built on a null-prototype object so that only the tags
 * listed here resolve to a value. With a plain object literal, a lookup
 * for a key such as "constructor" would return an inherited
 * `Object.prototype` member instead of `undefined`, so a `?? 0` style
 * fallback would not apply and the score arithmetic would yield NaN.
 * (Presumably indexed by tag names taken from parsed markup — verify
 * against the call site.)
 */
const TAG_IMPORTANCE = Object.assign(Object.create(null), {
    // Primary content containers
    article: 3, main: 3,
    // Text-bearing content elements
    p: 2, h1: 2, h2: 2, h3: 2, h4: 2, h5: 2, h6: 2,
    blockquote: 2, pre: 2, code: 2, figure: 2, figcaption: 2,
    // Structural elements that usually hold content
    section: 1, td: 1, th: 1, li: 1, dd: 1, dt: 1,
    // Neutral wrappers
    div: 0, span: 0, table: 0, ul: 0, ol: 0, dl: 0,
    // Likely chrome
    aside: -1, header: -1, form: -1,
    // Almost certainly chrome
    nav: -2, footer: -2,
});
|
|
161
|
+
function normalizeTagScore(rawScore) {
|
|
162
|
+
return (rawScore + 2) / 5; // -2..+3 → 0..1
|
|
163
|
+
}
|
|
164
|
+
/** Tags that never carry visible content and are ignored by collectBlocks. */
const COLLECT_SKIPPED_TAGS = new Set(['script', 'style', 'link', 'meta']);
/** Structural wrappers that are descended into when they exceed the leaf size. */
const COLLECT_WRAPPER_TAGS = new Set([
    'div', 'section', 'table', 'tbody', 'thead', 'tr',
    'center', 'details', 'summary',
]);
/**
 * Collect scoreable blocks from a DOM tree.
 *
 * Strategy: walk the tree top-down. For each element child of `parent`:
 *  - If it's a "leaf-ish" block (at most `maxLeafSize` characters of HTML, or
 *    not a wrapper tag), score it as one unit.
 *  - If it's large and a wrapper (div/section/table/...), recurse into children.
 *  - Protected elements are always recursed.
 *
 * This finds the right granularity: not scoring a 200KB wrapper div,
 * but scoring the divs/sections/p's nested 3-4 levels deep that carry
 * actual content or chrome.
 *
 * An element is either recursed into or scored, never both, so the collected
 * blocks do not overlap.
 *
 * @param $ - cheerio-style API (`$(el)` wrapper and `$.html(el)` serializer)
 * @param parent - node whose element children are examined
 * @param blocks - output array; one `{ element, tagName, htmlLength, visibleText, score }`
 *                 record is pushed per scored block
 * @param maxLeafSize - serialized HTML length above which wrapper tags are descended into
 */
function collectBlocks($, parent, blocks, maxLeafSize) {
    const children = 'children' in parent ? parent.children : [];
    for (const child of children) {
        if (child.type !== 'tag')
            continue;
        const el = child;
        const tagName = el.tagName?.toLowerCase() ?? '';
        if (COLLECT_SKIPPED_TAGS.has(tagName))
            continue;
        const $el = $(el);
        const htmlLen = ($.html($el) ?? '').length;
        // Skip extremely tiny elements (bare tags like <br>)
        if (htmlLen < 10)
            continue;
        const isProtected = PROTECTED_TAGS.has(tagName);
        const isOversizedWrapper = COLLECT_WRAPPER_TAGS.has(tagName) && htmlLen > maxLeafSize;
        if (isProtected || isOversizedWrapper) {
            // Too large or protected — recurse deeper
            collectBlocks($, el, blocks, maxLeafSize);
            continue;
        }
        // Visible text is measured on a clone so the live DOM is not mutated.
        const clone = $el.clone();
        clone.find('script, style, noscript, svg, path').remove();
        const visibleText = clone.text() ?? '';
        const trimmedText = visibleText.trim();
        const visibleTextLen = trimmedText.length;
        const textDensity = Math.min(visibleTextLen / Math.max(htmlLen, 1), 1.0);
        let linkTextLen = 0;
        if (tagName === 'a') {
            // find('a') only searches descendants, so a block that is itself a
            // link would otherwise be scored as containing no link text at all.
            linkTextLen = visibleTextLen;
        }
        else {
            $el.find('a').each((_i, a) => {
                linkTextLen += ($(a).text() ?? '').trim().length;
            });
        }
        const linkDensity = visibleTextLen > 0
            ? Math.min(linkTextLen / visibleTextLen, 1.0)
            : 0;
        const normalizedTag = normalizeTagScore(TAG_IMPORTANCE[tagName] ?? 0);
        const wordCount = trimmedText.split(/\s+/).filter(w => w.length > 0).length;
        // Logarithmic bonus that saturates at 1.0 around 1000 words.
        const wordBonus = wordCount > 0
            ? Math.min(Math.log(wordCount + 1) / Math.log(1000), 1.0)
            : 0;
        const score = (textDensity * 0.35 +
            (1 - linkDensity) * 0.25 +
            normalizedTag * 0.2 +
            wordBonus * 0.1 +
            0.1 // baseline position score (removed position bias — not useful for deep nesting)
        );
        blocks.push({
            element: el,
            tagName,
            htmlLength: htmlLen,
            visibleText,
            score,
        });
    }
}
|
|
235
|
+
// -----------------------------------------------------------------------
|
|
236
|
+
// Main export
|
|
237
|
+
// -----------------------------------------------------------------------
|
|
238
|
+
/**
 * Prune low-value HTML blocks using a multi-pass approach:
 *   0. (inputs over 20,000 characters only) regex removal of <nav>/<footer>/<aside> spans
 *   1. Semantic tag/class/role removal
 *   2. Density scoring of the remaining blocks
 *
 * @param html - Raw HTML to prune
 * @param options - Pruning configuration
 * @param options.threshold - fixed score cut-off, used only when `dynamic` is false (default 0.3)
 * @param options.minWords - blocks with fewer words than this are removal candidates (default 3)
 * @param options.dynamic - when true, the cut-off is 50% of the best block score (default true)
 * @returns `{ html, nodesRemoved, reductionPercent }`; `nodesRemoved` counts only
 *          nodes removed in passes 1 and 2, not spans stripped by the regex pre-pass
 */
export function pruneContent(html, options = {}) {
    // `options ?? {}` also tolerates an explicit null.
    const { threshold = 0.3, minWords = 3, dynamic = true } = options ?? {};
    const originalLength = html.length;
    if (!html.trim()) {
        return { html, nodesRemoved: 0, reductionPercent: 0 };
    }
    // =====================================================================
    // Pass 0: Regex pre-pass — strip obvious chrome BEFORE cheerio parse
    // =====================================================================
    // For large HTML, a fast regex pass removes <nav>, <footer> and <aside>
    // spans before we load into cheerio, saving DOM parse time.
    // The match is non-greedy and ends at the FIRST closing tag, so a nested
    // element of the same name is only partially removed; the parser and the
    // semantic pass below deal with whatever remains.
    let workingHtml = html;
    if (workingHtml.length > 20000) {
        workingHtml = workingHtml
            .replace(/<nav(\s[^>]*)?>[\s\S]*?<\/nav>/gi, '')
            .replace(/<footer(\s[^>]*)?>[\s\S]*?<\/footer>/gi, '')
            .replace(/<aside(\s[^>]*)?>[\s\S]*?<\/aside>/gi, '');
        // No safe way to strip noise <div>s by regex (nested divs break simple patterns).
        // Cheerio's semantic pass handles them reliably in Pass 1.
    }
    const $ = cheerio.load(workingHtml);
    let nodesRemoved = 0;
    // =====================================================================
    // Pass 1: Semantic removal
    // =====================================================================
    // Walk top-down from <body>; collect entire subtrees that are clearly
    // chrome. Protected tags are always descended into; other elements are
    // descended into only while depth < 6.
    const toRemoveSemantic = [];
    function walkForChrome(parent, depth) {
        const children = 'children' in parent ? parent.children : [];
        for (const child of children) {
            if (child.type !== 'tag')
                continue;
            const el = child;
            const tagName = el.tagName?.toLowerCase() ?? '';
            if (tagName === 'script' || tagName === 'style')
                continue;
            if (PROTECTED_TAGS.has(tagName)) {
                // Recurse into protected — there might be chrome inside <article>
                walkForChrome(el, depth + 1);
                continue;
            }
            if (isChromeBySemantic(el, $)) {
                toRemoveSemantic.push(el);
                continue; // don't recurse into something we'll remove
            }
            // Recurse up to a reasonable depth
            if (depth < 6) {
                walkForChrome(el, depth + 1);
            }
        }
    }
    const body = $('body').get(0);
    if (body) {
        walkForChrome(body, 0);
    }
    for (const el of toRemoveSemantic) {
        $(el).remove();
        nodesRemoved++;
    }
    // =====================================================================
    // Pass 2: Density scoring (on the remaining HTML)
    // =====================================================================
    const postPass1Length = $.html().length;
    if (postPass1Length > 100 && body) {
        const blocks = [];
        // Max leaf size: ~5KB or 30% of remaining content (whichever is smaller)
        // This ensures we find leaf blocks even in small documents.
        const maxLeafSize = Math.min(5000, Math.ceil(postPass1Length * 0.3));
        collectBlocks($, body, blocks, maxLeafSize);
        if (blocks.length >= 2) {
            let effectiveThreshold = threshold;
            if (dynamic) {
                // Fold instead of Math.max(...scores): spreading one argument per
                // block can exceed the engine's argument limit on huge documents.
                const bestScore = blocks.reduce((best, b) => Math.max(best, b.score), -Infinity);
                // Blocks scoring below 50% of the best block are candidates for removal
                effectiveThreshold = bestScore * 0.5;
            }
            // Safety: retain at least 40% of post-pass1 content
            const minRetainLength = Math.ceil(postPass1Length * 0.4);
            // Sort a copy ascending by score — remove worst first
            const worstFirst = [...blocks].sort((a, b) => a.score - b.score);
            const toRemoveDensity = new Set();
            let removedLength = 0;
            for (const b of worstFirst) {
                if (PROTECTED_TAGS.has(b.tagName) || DENSITY_SAFE_TAGS.has(b.tagName))
                    continue;
                const words = b.visibleText.trim().split(/\s+/).filter(w => w.length > 0);
                const isTiny = words.length < minWords;
                const isLow = b.score < effectiveThreshold;
                if (!isTiny && !isLow)
                    continue;
                // Check safety floor; a block that would breach it is kept, but
                // smaller candidates later in the list may still be removed.
                const remaining = postPass1Length - (removedLength + b.htmlLength);
                if (remaining < minRetainLength)
                    continue;
                toRemoveDensity.add(b.element);
                removedLength += b.htmlLength;
            }
            for (const el of toRemoveDensity) {
                $(el).remove();
                nodesRemoved++;
            }
        }
    }
    const resultHtml = $.html() ?? workingHtml;
    const resultLength = resultHtml.length;
    // Clamped at 0: the serialized result can be longer than the input.
    const reductionPercent = originalLength > 0
        ? Math.max(0, Math.round(((originalLength - resultLength) / originalLength) * 100))
        : 0;
    return {
        html: resultHtml,
        nodesRemoved,
        reductionPercent,
    };
}
|
|
371
|
+
// ---------------------------------------------------------------------------
|
|
372
|
+
// Markdown post-processing — remove UI noise leaked into markdown output
|
|
373
|
+
// ---------------------------------------------------------------------------
|
|
374
|
+
/** UI button labels that should be removed when they appear as standalone lines */
const UI_BUTTON_LABELS = /^(load more|headlines only|show more|read more|show less|collapse|expand|view more|view less|see more|see less|more stories|more articles|sign up|subscribe|log in|sign in|follow us|get started|click here|learn more)$/i;
/** An image with no alt text (empty brackets), e.g. `![](https://example.com/a.png)` */
const EMPTY_IMAGE_RE = /^\!\[\]\([^)]+\)$/;
/** A list item whose only content is an empty-alt image, e.g. `- ![](a.png)` */
const EMPTY_IMAGE_LIST_ITEM_RE = /^[-*+]\s+\!\[\]\([^)]+\)$/;
/** A thematic break: three or more `*`, `-` or `_`, optionally separated by whitespace */
const HR_LINE_RE = /^(?:(?:\*\s*){3,}|(?:-\s*){3,}|(?:_\s*){3,})$/;
/** Start of a fenced code block delimiter: a run of 3+ backticks or tildes, then the rest of the line */
const FENCE_RE = /^(`{3,}|~{3,})(.*)$/;
/**
 * Post-process markdown output to remove UI elements that leak through
 * from content scrapers (buttons, empty images, consecutive hr separators).
 *
 * Lines inside fenced code blocks (``` or ~~~) are passed through untouched,
 * so code that happens to contain a line such as `expand` or `---` is preserved.
 * An unclosed fence protects everything up to the end of the input.
 *
 * @param markdown - Raw markdown string
 * @returns Cleaned markdown string (falsy input is returned unchanged)
 */
export function pruneMarkdown(markdown) {
    if (!markdown)
        return markdown;
    const result = [];
    let consecutiveHrCount = 0;
    // Delimiter of the currently open fenced code block, or null when outside one.
    let openFence = null;
    for (const line of markdown.split('\n')) {
        const trimmed = line.trim();
        const fence = FENCE_RE.exec(trimmed);
        if (openFence) {
            result.push(line);
            // A closing fence uses the same character, is at least as long as
            // the opening one, and carries no info string.
            if (fence &&
                fence[1][0] === openFence.char &&
                fence[1].length >= openFence.length &&
                fence[2].trim() === '') {
                openFence = null;
            }
            continue;
        }
        // A backtick fence's info string may not contain backticks; otherwise
        // the line is inline code such as ```x``` rather than a fence.
        if (fence && !(fence[1][0] === '`' && fence[2].includes('`'))) {
            openFence = { char: fence[1][0], length: fence[1].length };
            consecutiveHrCount = 0;
            result.push(line);
            continue;
        }
        // Remove lines that are just UI button labels (standalone, not in a heading/list)
        if (UI_BUTTON_LABELS.test(trimmed)) {
            continue;
        }
        // Remove empty images (no alt text): ![](url)
        // But keep images with alt text: ![alt](url)
        if (EMPTY_IMAGE_RE.test(trimmed)) {
            continue;
        }
        // Remove list items whose only content is an empty image
        if (EMPTY_IMAGE_LIST_ITEM_RE.test(trimmed)) {
            continue;
        }
        // Handle consecutive HR separators ("* * *", "---", "___", "-----", ...)
        // Keep the first one, remove subsequent consecutive ones
        if (HR_LINE_RE.test(trimmed)) {
            consecutiveHrCount++;
            if (consecutiveHrCount > 1) {
                continue; // skip duplicate hr
            }
        }
        else if (trimmed !== '') {
            // Reset counter on any non-HR, non-blank line
            consecutiveHrCount = 0;
        }
        result.push(line);
    }
    return result.join('\n');
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory cookie cache with TTL.
|
|
3
|
+
*
|
|
4
|
+
* Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
|
|
5
|
+
* Cookies from challenge solves are cached here so future requests to the same
|
|
6
|
+
* domain skip the challenge entirely.
|
|
7
|
+
*
|
|
8
|
+
* Design goals:
|
|
9
|
+
* - Zero dependencies (plain Map + setInterval)
|
|
10
|
+
* - In-memory only — no disk/DB persistence
|
|
11
|
+
* - TTL per entry (default 30 min, matching cf_clearance lifetime)
|
|
12
|
+
* - Thread-safe for single-process Node.js (event loop is single-threaded)
|
|
13
|
+
*/
|
|
14
|
+
export interface CachedCookies {
|
|
15
|
+
/** Raw "Cookie: ..." header value (semicolon-separated) */
|
|
16
|
+
cookieHeader: string;
|
|
17
|
+
/** Individual cookie strings (e.g. ["cf_clearance=abc; Path=/", ...]) */
|
|
18
|
+
cookies: string[];
|
|
19
|
+
/** Unix timestamp (ms) when this cache entry expires */
|
|
20
|
+
expiresAt: number;
|
|
21
|
+
/** The domain these cookies are for */
|
|
22
|
+
domain: string;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Store cookies for a domain.
|
|
26
|
+
*
|
|
27
|
+
* @param domain Hostname (e.g. "example.com" or "sub.example.com")
|
|
28
|
+
* @param cookies Array of Set-Cookie header values or cookie strings
|
|
29
|
+
* @param ttlMs Time-to-live in ms (default: 30 min)
|
|
30
|
+
*/
|
|
31
|
+
export declare function cacheCookies(domain: string, cookies: string[], ttlMs?: number): void;
|
|
32
|
+
/**
|
|
33
|
+
* Retrieve cached cookies for a domain (or its parent domain).
|
|
34
|
+
* Returns null if no valid (non-expired) entry exists.
|
|
35
|
+
*
|
|
36
|
+
* @param domain Hostname to look up
|
|
37
|
+
*/
|
|
38
|
+
export declare function getCachedCookies(domain: string): CachedCookies | null;
|
|
39
|
+
/**
|
|
40
|
+
* Build a Cookie request header value from a URL.
|
|
41
|
+
* Returns undefined if no cached cookies exist.
|
|
42
|
+
*/
|
|
43
|
+
export declare function getCookieHeader(url: string): string | undefined;
|
|
44
|
+
/**
|
|
45
|
+
* Cache cookies from a URL's perspective.
|
|
46
|
+
* Extracts domain from URL automatically.
|
|
47
|
+
*/
|
|
48
|
+
export declare function cacheCookiesForUrl(url: string, cookies: string[], ttlMs?: number): void;
|
|
49
|
+
/**
|
|
50
|
+
* Invalidate (remove) cached cookies for a domain.
|
|
51
|
+
*/
|
|
52
|
+
export declare function invalidateCookies(domain: string): void;
|
|
53
|
+
/**
|
|
54
|
+
* Return the number of cached domains (for diagnostics).
|
|
55
|
+
*/
|
|
56
|
+
export declare function getCacheSize(): number;
|
|
57
|
+
/**
|
|
58
|
+
* Clear ALL cached cookies. Mainly for tests.
|
|
59
|
+
*/
|
|
60
|
+
export declare function clearCookieCache(): void;
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory cookie cache with TTL.
|
|
3
|
+
*
|
|
4
|
+
* Stores session cookies (especially cf_clearance, __cf_bm) keyed by domain.
|
|
5
|
+
* Cookies from challenge solves are cached here so future requests to the same
|
|
6
|
+
* domain skip the challenge entirely.
|
|
7
|
+
*
|
|
8
|
+
* Design goals:
|
|
9
|
+
* - Zero dependencies (plain Map + setInterval)
|
|
10
|
+
* - In-memory only — no disk/DB persistence
|
|
11
|
+
* - TTL per entry (default 30 min, matching cf_clearance lifetime)
|
|
12
|
+
* - Thread-safe for single-process Node.js (event loop is single-threaded)
|
|
13
|
+
*/
|
|
14
|
+
// ── Internal store ────────────────────────────────────────────────────────────
/** Cache entries keyed by normalized domain. */
const store = new Map();
/** Handle of the periodic cleanup interval, or null when it is not running. */
let cleanupTimer = null;
/** Default TTL: 30 minutes (cf_clearance lasts 30 min) */
const DEFAULT_TTL_MS = 30 * 60 * 1000;
// ── Public API ────────────────────────────────────────────────────────────────
/**
 * Store cookies for a domain, replacing any entry already cached for it.
 *
 * Nothing is stored when `cookies` is empty or when none of its items yields
 * a usable `name=value` pair, so lookups never return an empty Cookie header.
 *
 * @param domain  Hostname (e.g. "example.com" or "sub.example.com")
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Time-to-live in ms (default: 30 min)
 */
export function cacheCookies(domain, cookies, ttlMs = DEFAULT_TTL_MS) {
    if (!cookies.length)
        return;
    const cookieHeader = buildCookieHeader(cookies);
    if (!cookieHeader)
        return;
    const normalizedDomain = normalizeDomain(domain);
    store.set(normalizedDomain, {
        cookieHeader,
        // Copy so later mutation of the caller's array cannot alter the cache.
        cookies: [...cookies],
        expiresAt: Date.now() + ttlMs,
        domain: normalizedDomain,
    });
    // Start periodic cleanup if not already running
    startCleanup();
}
|
|
42
|
+
/**
 * Look up the cache entry for a domain, falling back to its parent domain
 * (the hostname with its first label removed).
 *
 * Expired entries encountered during the lookup are deleted from the store.
 *
 * @param domain Hostname to look up
 * @returns the live cache entry, or null if no valid (non-expired) entry exists
 */
export function getCachedCookies(domain) {
    const exact = normalizeDomain(domain);
    // Exact match is tried first, then the parent; empty/missing names are skipped.
    const lookupOrder = [];
    for (const name of [exact, getParentDomain(exact)]) {
        if (name) {
            lookupOrder.push(name);
        }
    }
    for (const name of lookupOrder) {
        const hit = store.get(name);
        if (!hit) {
            continue;
        }
        if (hit.expiresAt > Date.now()) {
            return hit;
        }
        // Stale entry: evict it and keep looking.
        store.delete(name);
    }
    return null;
}
|
|
64
|
+
/**
 * Resolve the cached Cookie request header value for a URL.
 *
 * @param url Absolute URL whose hostname is used for the lookup
 * @returns the cached header value, or undefined when nothing is cached
 *          or the URL cannot be parsed
 */
export function getCookieHeader(url) {
    let entry = null;
    try {
        const { hostname } = new URL(url);
        entry = getCachedCookies(hostname);
    }
    catch {
        // Unparseable URL (or failed lookup) is treated as "no cookies".
        entry = null;
    }
    return entry ? entry.cookieHeader : undefined;
}
|
|
78
|
+
/**
 * Cache cookies under the hostname of the given URL.
 * Any error (such as an invalid URL) is swallowed and nothing is cached.
 *
 * @param url     Absolute URL the cookies were received from
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @param ttlMs   Time-to-live in ms (default: 30 min)
 */
export function cacheCookiesForUrl(url, cookies, ttlMs = DEFAULT_TTL_MS) {
    try {
        const { hostname } = new URL(url);
        cacheCookies(hostname, cookies, ttlMs);
    }
    catch {
        // Invalid URL — ignore
    }
}
|
|
91
|
+
/**
 * Drop the cache entry stored under the normalized form of `domain`.
 * Only that exact key is removed; an entry cached for the parent domain stays.
 *
 * @param domain Hostname whose entry should be removed
 */
export function invalidateCookies(domain) {
    store.delete(normalizeDomain(domain));
}
|
|
98
|
+
/**
 * Report how many domains currently have an entry in the store (for diagnostics).
 * Expired entries that have not been evicted yet are included in the count.
 *
 * @returns number of entries in the store
 */
export function getCacheSize() {
    const { size } = store;
    return size;
}
|
|
104
|
+
/**
 * Empty the store and stop the periodic cleanup timer if it is running.
 * Mainly for tests.
 */
export function clearCookieCache() {
    store.clear();
    if (!cleanupTimer) {
        return;
    }
    clearInterval(cleanupTimer);
    cleanupTimer = null;
}
|
|
114
|
+
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
115
|
+
/**
 * Normalize a hostname for use as a cache key: lowercase it, strip a leading
 * "www." label, and strip a single trailing root dot.
 *
 * `new URL('https://example.com./').hostname` keeps the trailing dot, so without
 * this step "example.com." and "example.com" would be cached under different keys.
 *
 * @param domain Hostname
 * @returns normalized hostname
 */
function normalizeDomain(domain) {
    return domain
        .toLowerCase()
        .replace(/^www\./, '')
        .replace(/\.$/, '');
}
|
|
119
|
+
/**
 * Get the parent domain by stripping the first label of a hostname.
 *
 * Returns null when there is no parent to fall back to:
 *  - the hostname has two labels or fewer (treated as a root domain), or
 *  - the hostname is a dotted-quad IPv4 literal, whose dots do not separate
 *    domain labels.
 *
 * NOTE: this is purely label-based and has no public-suffix awareness, so
 * "example.co.uk" yields "co.uk".
 *
 * @param domain Normalized hostname
 * @returns parent hostname, or null
 */
function getParentDomain(domain) {
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(domain))
        return null; // IPv4 address — has no parent domain
    const parts = domain.split('.');
    if (parts.length <= 2)
        return null; // Already a root domain
    return parts.slice(1).join('.');
}
|
|
126
|
+
/**
 * Convert an array of Set-Cookie values or raw cookie strings into a single
 * Cookie request header value of the form "name=value; name2=value2".
 *
 * Only the leading "name=value" part of each item is used; attributes such as
 * Path, Secure or HttpOnly are dropped. When the same cookie name appears more
 * than once, the last value wins (keeping the position of its first
 * occurrence), so a refreshed cookie is not sent alongside its stale value.
 *
 * @param cookies Array of Set-Cookie header values or cookie strings
 * @returns header value, or an empty string when no usable pair was found
 */
function buildCookieHeader(cookies) {
    const latestByKey = new Map();
    for (const cookie of cookies) {
        // Set-Cookie format: "name=value; Path=/; Secure; HttpOnly; ..."
        // We only want the first "name=value" pair
        const pair = cookie.split(';')[0]?.trim();
        if (!pair)
            continue;
        const eq = pair.indexOf('=');
        // Prefixes keep named cookies and bare (no "=") tokens in separate key spaces.
        const key = eq === -1 ? `raw:${pair}` : `name:${pair.slice(0, eq).trim()}`;
        latestByKey.set(key, pair);
    }
    return [...latestByKey.values()].join('; ');
}
|
|
142
|
+
/**
 * Start the periodic sweep that evicts expired entries (prevents the store
 * from growing without bound). Does nothing if the sweep is already running.
 * The sweep runs every 5 minutes and stops itself once the store is empty.
 */
function startCleanup() {
    if (cleanupTimer) {
        return;
    }
    const sweep = () => {
        const now = Date.now();
        for (const [domain, entry] of store) {
            if (entry.expiresAt <= now) {
                store.delete(domain);
            }
        }
        // Stop the timer if the cache is empty
        if (store.size === 0 && cleanupTimer) {
            clearInterval(cleanupTimer);
            cleanupTimer = null;
        }
    };
    cleanupTimer = setInterval(sweep, 5 * 60 * 1000); // Run every 5 minutes
    // Don't block Node.js process exit
    if (cleanupTimer && typeof cleanupTimer.unref === 'function') {
        cleanupTimer.unref();
    }
}
|