@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,809 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML to Markdown conversion with smart cleanup
|
|
3
|
+
*/
|
|
4
|
+
import TurndownService from 'turndown';
|
|
5
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
6
|
+
import * as cheerio from 'cheerio';
|
|
7
|
+
import { pruneContent } from './content-pruner.js';
|
|
8
|
+
/**
 * CSS selectors for page chrome that carries no article content.
 * The list is grouped by purpose; the order of entries is kept stable.
 */
const JUNK_SELECTORS = [
    // --- Non-content markup: scripts, styles, embedded frames ---
    'script',
    'style',
    'noscript',
    'iframe',
    'link[rel="stylesheet"]',

    // --- Site navigation, breadcrumbs, pagination ---
    'nav',
    '[role="navigation"]',
    '[role="search"]',
    '.sidebar',
    '.topbar',
    '.top-bar',
    '.site-nav',
    '.main-nav',
    '.breadcrumb',
    '.breadcrumbs',
    '[class*="breadcrumb"]',
    '.pagination',
    '[class*="pagination"]',

    // --- Advertising and sponsorship ---
    '.advertisement',
    '.ad',
    '[class*="ad-"]',
    '[id*="ad-"]',
    '[class*="advert"]',
    '[class*="sponsor"]',
    '[class*="promo"]',

    // --- Cookie and consent notices ---
    '.cookie-banner',
    '.cookie-notice',
    '.cookie-consent',
    '[class*="cookie"]',
    '[id*="cookie"]',
    '[class*="consent"]',
    '[class*="gdpr"]',

    // --- Popups and modals (deliberately no broad banner/overlay match) ---
    '[class*="popup"]',
    '[class*="modal"]',
    '[class*="notification-bar"]',

    // --- Banners: known ad/promo banners only ---
    '.ad-banner',
    '.promo-banner',

    // --- Social sharing widgets ---
    '.social-share',
    '.share-buttons',
    '.share-widget',

    // --- Newsletter / sign-up forms and call-to-action widgets ---
    '.newsletter-signup',
    '[class*="newsletter"]',
    '.subscribe-form',
    '.subscribe-widget',
    '.signup-form',
    '.signup-widget',
    '.signup-cta',
    '[class*="call-to-action"]',

    // --- Related-content widgets (explicit ones only) ---
    '.related-posts',
    '[class*="you-may-also"]',
    '[class*="more-stories"]',

    // --- Comment sections and forms (not individual comment text) ---
    '.comments-section',
    '.comment-form',
    '#comments',

    // --- Job-site nudges: resume upload, job alerts, sign-in prompts ---
    '[class*="resume-upload"]',
    '[class*="resumeUpload"]',
    '[class*="job-alert"]',
    '[class*="jobAlert"]',
    '[class*="sign-in-gate"]',
    '[class*="signin-prompt"]',

    // --- Login/paywall gates (specific patterns so "navigate", "aggregate", etc. are not hit) ---
    '[class*="login-wall"]',
    '[class*="paywall"]',
    '[class*="signin-gate"]',
    '[class*="login-gate"]',
    '[class*="access-gate"]',
    '[class*="content-gate"]',
    '[class*="registration-wall"]',
    '.login-prompt',
    '.auth-wall',

    // --- Third-party chat widgets ---
    '[class*="chat-widget"]',
    '[class*="chatbot"]',
    '[class*="intercom"]',
    '[class*="drift-"]',
    '[class*="zendesk"]',
    '[class*="crisp"]',
    '[class*="hubspot"]',
    '#hubspot-messages-iframe-container',

    // --- Accessibility skip links ---
    '.skip-to-content',
    '.skip-link',
    '.skip-nav',
];
|
|
54
|
+
/**
 * Selector groups used for removal statistics, keyed by the category name
 * that appears in the stats object returned by countRemovedElements.
 */
const STATS_SELECTORS = {
    scripts: [
        'script',
        'noscript',
    ],
    styles: [
        'style',
        'link[rel="stylesheet"]',
    ],
    ads: [
        '.advertisement',
        '.ad',
        '[class*="ad-"]',
        '[id*="ad-"]',
        '[class*="advert"]',
        '[class*="sponsor"]',
        '[class*="promo"]',
        '.ad-banner',
        '.promo-banner',
    ],
    tracking: [
        'iframe',
        '.cookie-banner',
        '.cookie-notice',
        '.cookie-consent',
        '[class*="cookie"]',
        '[id*="cookie"]',
        '[class*="consent"]',
        '[class*="gdpr"]',
    ],
    navigation: [
        'nav',
        '[role="navigation"]',
        '[role="search"]',
        '.sidebar',
        '.topbar',
        '.top-bar',
        '.site-nav',
        '.main-nav',
        '.breadcrumb',
        '.breadcrumbs',
        '[class*="breadcrumb"]',
        '.pagination',
        '[class*="pagination"]',
    ],
    socialWidgets: [
        '.social-share',
        '.share-buttons',
        '.share-widget',
        '.newsletter-signup',
        '[class*="newsletter"]',
        '.subscribe-form',
        '.subscribe-widget',
        '.signup-form',
        '.signup-widget',
        '.signup-cta',
        '[class*="chat-widget"]',
        '[class*="chatbot"]',
        '[class*="intercom"]',
        '[class*="drift-"]',
        '[class*="zendesk"]',
        '[class*="crisp"]',
        '[class*="hubspot"]',
    ],
    popups: [
        '[class*="popup"]',
        '[class*="modal"]',
        '[class*="notification-bar"]',
    ],
};
|
|
87
|
+
/**
 * Tally, per category, the elements matched by the STATS_SELECTORS groups.
 * Intended to run on the raw HTML BEFORE cleanHTML so the numbers describe
 * what is about to be stripped.
 *
 * An element is counted once only: the first category (in the order
 * scripts, styles, ads, tracking, navigation, socialWidgets, popups)
 * whose selectors match it claims it.
 *
 * @param html - Raw HTML to inspect
 * @returns Per-category counts, their sum in `totalRemoved`, the UTF-8 byte
 *          size of the input, and `cleanedSizeBytes` / `reductionPercent`
 *          placeholders (both 0) for the caller to fill in after cleaning
 */
export function countRemovedElements(html) {
    const $ = cheerio.load(html);
    // Nodes already attributed to a category; prevents double-counting.
    const counted = new Set();
    const tally = (selectors) => {
        let hits = 0;
        selectors.forEach((selector) => {
            try {
                $(selector).each((_index, node) => {
                    if (counted.has(node)) {
                        return;
                    }
                    counted.add(node);
                    hits += 1;
                });
            }
            catch {
                // A selector the engine rejects contributes nothing.
            }
        });
        return hits;
    };
    // Category order is significant because earlier categories claim shared nodes.
    const categories = ['scripts', 'styles', 'ads', 'tracking', 'navigation', 'socialWidgets', 'popups'];
    const stats = {};
    for (const category of categories) {
        stats[category] = tally(STATS_SELECTORS[category]);
    }
    const totalRemoved = categories.reduce((sum, category) => sum + stats[category], 0);
    return {
        ...stats,
        totalRemoved,
        originalSizeBytes: Buffer.byteLength(html, 'utf8'),
        cleanedSizeBytes: 0, // set by caller after cleaning
        reductionPercent: 0, // set by caller after cleaning
    };
}
|
|
134
|
+
/**
 * Filter HTML by including or excluding specific tags/selectors.
 * Applied BEFORE markdown conversion for precise content control.
 *
 * Exclusions are applied first, then inclusions. Each included element is
 * emitted at most once: an element matched by several selectors, or nested
 * inside another matched element, is not serialized a second time (its
 * markup is already part of the first/outer match).
 *
 * @param html - HTML to filter
 * @param includeTags - Only keep content from these elements (e.g., ['article', 'main', '.content'])
 * @param excludeTags - Remove these elements (e.g., ['nav', 'footer', 'header', '.sidebar'])
 * @returns Filtered HTML; the whole (exclusion-filtered) document when no
 *          includeTags are given; '' when includeTags are given but match nothing
 */
export function filterByTags(html, includeTags, excludeTags) {
    const $ = cheerio.load(html);
    // Remove unwanted elements first so they cannot reappear inside an included subtree.
    if (excludeTags?.length) {
        for (const selector of excludeTags) {
            $(selector).remove();
        }
    }
    if (!includeTags?.length) {
        return $.html();
    }
    // A Set keeps first-seen order (selector order, then document order within
    // a selector) and drops nodes matched by more than one selector.
    const matched = new Set();
    for (const selector of includeTags) {
        $(selector).each((_, el) => {
            matched.add(el);
        });
    }
    // A match nested inside another match is already contained in the
    // ancestor's serialized markup; emitting it again would duplicate content.
    const hasMatchedAncestor = (el) => {
        for (let node = el.parent; node; node = node.parent) {
            if (matched.has(node)) {
                return true;
            }
        }
        return false;
    };
    // Joining an empty list yields '', i.e. "includeTags given but nothing matched".
    return [...matched]
        .filter((el) => !hasMatchedAncestor(el))
        .map((el) => $.html(el))
        .join('\n');
}
|
|
173
|
+
/**
 * Extract the content matching a CSS selector.
 *
 * Elements matching any selector in `exclude` are removed from the parsed
 * document before `selector` is evaluated.
 *
 * @param html - HTML to search
 * @param selector - CSS selector whose matches should be returned
 * @param exclude - Optional list of selectors to strip first
 * @returns The serialized matches joined by newlines. When `selector` matches
 *          nothing, the ORIGINAL `html` argument is returned unchanged (so the
 *          exclusions are not reflected in that fallback).
 */
export function selectContent(html, selector, exclude) {
    const $ = cheerio.load(html);
    if (exclude?.length) {
        exclude.forEach((unwanted) => {
            $(unwanted).remove();
        });
    }
    const matches = $(selector);
    // Nothing matched: fall back to the full page as it was passed in.
    if (matches.length === 0) {
        return html;
    }
    const pieces = [];
    matches.each((_, node) => {
        pieces.push($.html(node));
    });
    return pieces.join('\n');
}
|
|
192
|
+
/**
 * Clean HTML before conversion.
 * Removes navigation, ads, cookie banners, and other junk, then rewrites
 * tables into shapes the markdown converter can handle.
 *
 * Steps, in order:
 *  1. Reject inputs longer than 10MB (string length).
 *  2. Remove everything matching JUNK_SELECTORS (defined elsewhere in this file).
 *  3. Remove header/footer landmarks whose text is shorter than 200 chars.
 *  4. Remove sidebar-like <aside> elements.
 *  5. Replace small layout tables with a <div> of their links/text.
 *  6. Normalise remaining tables: strip attributes from simple ones, rebuild
 *     complex ones as a bare table (2-6 headers) or a <ul>.
 *  7. Remove childless elements that contain no text.
 *
 * @param html - Raw HTML string
 * @returns Cleaned HTML (full document serialization)
 * @throws {Error} When `html` is longer than 10MB
 */
function cleanHTML(html) {
    // SECURITY: Limit HTML size to prevent DoS
    if (html.length > 10 * 1024 * 1024) { // 10MB
        throw new Error('HTML too large to process (max 10MB)');
    }
    const $ = cheerio.load(html);
    // Text obtained via .text() has its entities decoded. It is spliced back into
    // HTML strings below, so it must be re-escaped or content such as "a < b" or
    // a quote inside an href would be re-parsed as markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');
    // Number of columns a cell occupies, clamped to [1, cap]. A missing, zero,
    // negative or non-numeric colspan counts as 1 so the cell is never dropped.
    const colspanOf = (cell, cap) => {
        const parsed = parseInt($(cell).attr('colspan') || '1', 10);
        const span = Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
        return Math.min(span, cap);
    };
    // Remove junk elements
    JUNK_SELECTORS.forEach((selector) => {
        $(selector).remove();
    });
    // Conditionally remove header/footer — keep if they have substantial content (>=200 chars)
    $('header, [role="banner"]').each((_, el) => {
        const text = $(el).text().trim();
        if (text.length < 200) {
            $(el).remove();
        }
    });
    $('footer, [role="contentinfo"]').each((_, el) => {
        const text = $(el).text().trim();
        if (text.length < 200) {
            $(el).remove();
        }
    });
    // Only remove sidebar-like asides, not all aside elements
    $('aside.sidebar, aside[role="complementary"], aside[class*="sidebar"]').remove();
    // Convert layout tables to clean divs before Turndown runs.
    // Layout tables (HN, old Reddit, email HTML etc.) use <table> for positioning,
    // not data — GFM's table plugin fails on them and emits raw HTML.
    // Detection: has presentation attributes OR contains nested <table> OR no <th>.
    $('table').each((_, tableEl) => {
        const $table = $(tableEl);
        const hasBorder = $table.attr('border') !== undefined;
        const hasCellpadding = $table.attr('cellpadding') !== undefined;
        const hasBgcolor = $table.attr('bgcolor') !== undefined;
        const hasRolePresentation = $table.attr('role') === 'presentation';
        const hasNestedTable = $table.find('table').length > 0;
        const hasTh = $table.find('th').length > 0;
        // Count rows and columns to distinguish data tables from layout tables.
        // (reduce instead of Math.max(...spread) so very long tables cannot
        // exceed the engine's argument limit.)
        const rows = $table.find('tr').toArray();
        const rowCount = rows.length;
        const maxCols = rows.reduce((max, row) => Math.max(max, $(row).children('td, th').length), 0);
        // Keep data tables: those with 3+ rows OR 3+ columns are likely real data
        // even if they lack <th>. Only strip tables that are clearly decorative.
        const isDataBySize = rowCount >= 3 || maxCols >= 3;
        const isLayoutTable = (hasBorder || hasCellpadding || hasBgcolor || hasRolePresentation || hasNestedTable || !hasTh) && !isDataBySize;
        if (!isLayoutTable) {
            return;
        }
        // Extract: links + non-empty text from each <td>
        const lines = [];
        $table.find('td').each((_i, td) => {
            const $td = $(td);
            // Preserve links found in this cell
            $td.find('a').each((_j, a) => {
                const $a = $(a);
                const href = $a.attr('href');
                const label = $a.text().trim();
                if (label && href) {
                    lines.push(`<a href="${escapeHtml(href)}">${escapeHtml(label)}</a>`);
                }
            });
            // Add non-link text if substantial (only for cells without any link)
            const nonLinkText = $td.clone().find('a').remove().end().text().trim();
            if (nonLinkText.length > 10 && !$td.find('a').length) {
                lines.push(`<p>${escapeHtml(nonLinkText)}</p>`);
            }
        });
        const replacement = `<div>${lines.join('\n')}</div>`;
        $table.replaceWith(replacement);
    });
    // Convert complex data tables to clean markdown-ready format.
    // Turndown's GFM plugin fails on tables with colspan/rowspan, missing <thead>,
    // or too many columns. Detect these and convert to readable text pre-Turndown.
    $('table').each((_, tableEl) => {
        const $table = $(tableEl);
        // Detect complexity: colspan, rowspan, no <thead>, or >8 columns
        const hasColspan = $table.find('[colspan]').length > 0;
        const hasRowspan = $table.find('[rowspan]').length > 0;
        const hasThead = $table.find('thead').length > 0;
        const firstRow = $table.find('tr').first();
        const colCount = firstRow.children('th, td').length;
        const isComplex = hasColspan || hasRowspan || !hasThead || colCount > 8;
        if (!isComplex) {
            // Simple table: just strip attributes so GFM plugin handles it
            const tableTags = ['table', 'thead', 'tbody', 'tfoot', 'tr', 'th', 'td', 'caption'];
            tableTags.forEach((tag) => {
                $table.find(tag).addBack(tag).each((_i, el) => {
                    const attrs = el.attribs || {};
                    for (const attr of Object.keys(attrs)) {
                        $(el).removeAttr(attr);
                    }
                });
            });
            return;
        }
        // Complex table: convert to structured text that reads well in markdown/chat.
        // Headers come from the <th> cells of the first row only.
        const headers = [];
        firstRow.children('th').each((_i, th) => {
            headers.push($(th).text().trim());
        });
        // If first row had <th>, treat it as header row; otherwise no headers
        const dataRows = $table.find('tr').toArray();
        const startIdx = headers.length > 0 ? 1 : 0;
        // For tables with 2-6 headers, rebuild as a clean bare HTML table
        // so Turndown's GFM plugin can convert it to a proper pipe table
        if (headers.length >= 2 && headers.length <= 6) {
            const theadRow = `<tr>${headers.map((h) => `<th>${escapeHtml(h)}</th>`).join('')}</tr>`;
            const tbodyRows = [];
            const ROW_CAP = 50;
            const totalDataRows = dataRows.length - startIdx;
            for (let r = startIdx; r < dataRows.length && (r - startIdx) < ROW_CAP; r++) {
                const cells = [];
                $(dataRows[r]).children('td, th').each((_j, td) => {
                    const span = colspanOf(td, 6);
                    const text = $(td).text().trim();
                    // Repeat the text across the columns the cell spans
                    for (let s = 0; s < span; s++) {
                        cells.push(text);
                    }
                });
                // Pad or trim to match header count
                while (cells.length < headers.length) {
                    cells.push('');
                }
                tbodyRows.push(`<tr>${cells.slice(0, headers.length).map((c) => `<td>${escapeHtml(c)}</td>`).join('')}</tr>`);
            }
            if (totalDataRows > ROW_CAP) {
                tbodyRows.push(`<tr><td colspan="${headers.length}">... (${ROW_CAP} of ${totalDataRows} rows shown)</td></tr>`);
            }
            $table.replaceWith(`<table><thead>${theadRow}</thead><tbody>${tbodyRows.join('')}</tbody></table>`);
            return;
        }
        // Wide tables or no headers: convert to HTML list so Turndown handles it properly
        // (never put pre-formatted markdown inside a div — Turndown will escape it)
        const liItems = [];
        const ROW_CAP_LIST = 50;
        const totalListRows = dataRows.length - startIdx;
        for (let r = startIdx; r < dataRows.length && (r - startIdx) < ROW_CAP_LIST; r++) {
            const cells = [];
            $(dataRows[r]).children('td, th').each((_j, td) => {
                const span = colspanOf(td, 3);
                const text = $(td).text().trim();
                for (let s = 0; s < span; s++) {
                    cells.push(text);
                }
            });
            // Skip rows whose cells are all empty
            if (cells.some((c) => c)) {
                if (headers.length > 0) {
                    const parts = cells
                        .map((c, j) => (headers[j] && c) ? `<strong>${escapeHtml(headers[j])}:</strong> ${escapeHtml(c)}` : escapeHtml(c))
                        .filter(Boolean)
                        .join(' · ');
                    liItems.push(`<li>${parts}</li>`);
                }
                else {
                    liItems.push(`<li>${cells.filter(Boolean).map(escapeHtml).join(' · ')}</li>`);
                }
            }
        }
        if (totalListRows > ROW_CAP_LIST) {
            liItems.push(`<li><em>... (${ROW_CAP_LIST} of ${totalListRows} rows shown)</em></li>`);
        }
        // A table that produced no list items is left in place untouched.
        if (liItems.length > 0) {
            $table.replaceWith(`<ul>${liItems.join('')}</ul>`);
        }
    });
    // Remove empty paragraphs and divs
    $('p:empty, div:empty').remove();
    // Remove childless elements with only whitespace. NOTE: this also removes
    // void elements such as <img>, which have neither text nor children.
    $('*').each((_, elem) => {
        const $elem = $(elem);
        const text = $elem.text().trim();
        if (!text && $elem.children().length === 0) {
            $elem.remove();
        }
    });
    return $.html();
}
|
|
366
|
+
/**
 * MAIN CONTENT SELECTORS — candidate selectors for locating the article body.
 * Order is significant: consumers walk the list from the top and the first
 * selector that qualifies wins, so the most specific entries come first.
 */
const MAIN_CONTENT_SELECTORS = [
    // Semantic landmarks, most specific first
    'article[role="main"]',
    'main article',
    '[role="main"] article',
    'article',
    '[role="main"]',
    'main',
    // Common CMS body classes
    '.post-content',
    '.article-content',
    '.article-body',
    '.entry-content',
    '.post-body',
    '.story-body',
    '.page-content',
    // Common ids
    '#content',
    '#main-content',
    '#article',
    '#post',
    // Generic containers
    '.content',
    '.main-content',
    // Framework / theme specific classes
    '.prose',
    '.markdown-body',
    '.post-text',
    '.article__body',
    '.story-content',
    '.entry-text',
    '.post-entry',
    // Structured-data hooks
    '[itemprop="articleBody"]',
    '[data-article-body]',
    // Blog templates
    '.blog-post-content',
    '.blog-content',
];
|
|
386
|
+
/**
 * Try to detect the main content area of a page.
 *
 * Strategy:
 *  1. Walk MAIN_CONTENT_SELECTORS in order; the first selector whose first match
 *     has at least 100 chars of text decides the outcome.
 *  2. Otherwise pick the <div>/<section> with the most text (>= 200 chars, and
 *     not a direct child of <body>/<html>); it is used only if it has more
 *     than 300 chars.
 * In both cases the candidate must hold at least 50% of the page's visible text
 * (text outside script/style/noscript); a narrower candidate means detection was
 * too tight and the full page is returned instead.
 *
 * @param html - HTML to inspect
 * @returns `{ html, detected }` — the candidate's outer HTML with `detected: true`,
 *          or the original `html` argument with `detected: false`.
 */
export function detectMainContent(html) {
    const $ = cheerio.load(html);
    // Helper: get visible text length (ignoring script/style/noscript)
    function visibleTextLength(root) {
        const clone = root.clone();
        clone.find('script, style, noscript').remove();
        return clone.text().trim().length;
    }
    const totalTextLen = visibleTextLength($.root());
    // True when the candidate carries less than half of the page's visible text.
    const isTooNarrow = (candidateLen) => totalTextLen > 0 && candidateLen / totalTextLen < 0.5;
    for (const selector of MAIN_CONTENT_SELECTORS) {
        const candidate = $(selector).first();
        if (candidate.length === 0) {
            continue;
        }
        // Check if it has meaningful content (at least 100 chars of text)
        if (candidate.text().trim().length < 100) {
            continue;
        }
        if (isTooNarrow(visibleTextLength(candidate))) {
            return { html, detected: false };
        }
        return { html: $.html(candidate), detected: true };
    }
    // Fallback: find the largest text block (div or section with most text).
    // Look the page wrappers up once rather than once per candidate.
    const bodyEl = $('body')[0];
    const htmlEl = $('html')[0];
    let bestEl = null;
    let bestLen = 0;
    $('div, section').each((_, elem) => {
        const $elem = $(elem);
        const text = $elem.text().trim();
        if (text.length > bestLen && text.length >= 200) {
            // Check it's not a wrapper of the whole page
            const parent = $elem.parent();
            if (parent.length && parent[0] !== bodyEl && parent[0] !== htmlEl) {
                bestEl = $elem;
                bestLen = text.length;
            }
        }
    });
    if (bestEl && bestLen > 300) {
        // Same coverage check as the selector path: compare VISIBLE text against
        // visible text. Using the raw length here would count inline script/style
        // text in the numerator only and overstate the candidate's coverage.
        if (isTooNarrow(visibleTextLength(bestEl))) {
            return { html, detected: false };
        }
        return { html: $.html(bestEl), detected: true };
    }
    return { html, detected: false };
}
|
|
440
|
+
/**
 * Calculate a content quality score in the range 0-1 (rounded to 2 decimals).
 * Measures how clean and useful the extracted content is.
 *
 * The score is a weighted sum of four factors:
 *  - 30% compression: content length relative to the original HTML length
 *  - 30% text density: share of characters that are not markdown punctuation
 *  - 20% structure: presence of headings and of several paragraphs
 *  - 20% length: penalises very short and very long content
 *
 * @param content - Extracted content (markdown/text)
 * @param originalHtml - The HTML the content was extracted from
 * @returns 0 for missing content or content shorter than 10 chars, else the score
 */
export function calculateQuality(content, originalHtml) {
    if (!content || content.length < 10) {
        return 0;
    }
    const contentLen = content.length;
    const htmlLen = originalHtml.length;

    // Factor 1: how much of the original survived, capped at 1.
    // Sweet spot: 5-40% of the original HTML is usually the real content.
    const compressionRatio = Math.min(contentLen / Math.max(htmlLen, 1), 1);
    let compressionScore;
    if (compressionRatio < 0.01) {
        compressionScore = 0.3;
    }
    else if (compressionRatio < 0.05) {
        compressionScore = 0.7;
    }
    else if (compressionRatio < 0.40) {
        compressionScore = 1.0;
    }
    else if (compressionRatio < 0.60) {
        compressionScore = 0.8;
    }
    else {
        compressionScore = 0.5;
    }

    // Factor 2: ratio of plain text to markdown formatting characters.
    const withoutMarkup = content.replace(/[#*_\[\]\(\)\-`|>]/g, '');
    const textDensity = withoutMarkup.trim().length / Math.max(contentLen, 1);
    const densityScore = Math.min(textDensity / 0.7, 1);

    // Factor 3: meaningful structure (an ATX heading, more than two blocks).
    const hasHeadings = /^#{1,6}\s/m.test(content) ? 1 : 0.7;
    const hasParagraphs = content.split('\n\n').length > 2 ? 1 : 0.8;

    // Factor 4: not too short, not too long.
    let lengthScore;
    if (contentLen < 50) {
        lengthScore = 0.3;
    }
    else if (contentLen < 200) {
        lengthScore = 0.6;
    }
    else if (contentLen < 50000) {
        lengthScore = 1.0;
    }
    else {
        lengthScore = 0.8;
    }

    // Weighted sum (term order kept fixed so floating-point results are stable).
    const quality = (compressionScore * 0.3 +
        densityScore * 0.3 +
        (hasHeadings * hasParagraphs) * 0.2 +
        lengthScore * 0.2);
    return Math.round(quality * 100) / 100;
}
|
|
474
|
+
// Module-level singleton TurndownService — configured once here and shared by
// every conversion in this module.
const turndownSingleton = (() => {
    const td = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        bulletListMarker: '-',
        emDelimiter: '_',
        strongDelimiter: '**',
    });
    // Enable GFM support (tables, strikethrough, task lists)
    td.use(gfm);
    // Custom rule: emit images that carry alt text as markdown images; skip the
    // rest. (Previously both branches returned an empty string and `src` was
    // never used, so every image was dropped.)
    td.addRule('images', {
        filter: 'img',
        replacement: (_content, node) => {
            const alt = node.alt;
            const src = node.src;
            if (alt) {
                return `![${alt}](${src})`;
            }
            // No alt text: nothing useful to show in markdown.
            return '';
        },
    });
    // Custom rule: preserve <pre><code> blocks as fenced code, carrying over a
    // `language-xxx` class as the fence info string.
    td.addRule('codeBlocks', {
        filter: (node) => {
            return node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE';
        },
        replacement: (_content, node) => {
            const codeNode = node.firstChild;
            const className = codeNode.getAttribute('class') || '';
            const language = className.match(/language-(\w+)/)?.[1] || '';
            return '\n\n```' + language + '\n' + codeNode.textContent + '\n```\n\n';
        },
    });
    return td;
})();
|
|
511
|
+
/**
 * Convert HTML to clean, readable Markdown.
 *
 * Pipeline: cleanHTML → optional density pruning → Turndown → newline collapse →
 * removal of known UI/CTA noise lines → cut of trailing "related jobs" sections.
 *
 * @param html - HTML to convert
 * @param options.raw - Skip main-content heuristics (return full page).
 *                      NOTE(review): this function does not read `options.raw`.
 * @param options.prune - Apply content density pruning (default: true)
 * @returns Markdown string, trimmed
 */
export function htmlToMarkdown(html, options) {
    let cleanedHTML = cleanHTML(html);
    // Content density pruning — runs AFTER junk selector removal, BEFORE Turndown conversion.
    // On unless the caller passes prune:false (e.g. --full-content flag).
    const shouldPrune = options?.prune !== false;
    if (shouldPrune) {
        cleanedHTML = pruneContent(cleanedHTML, { dynamic: true }).html;
    }

    let markdown;
    try {
        markdown = turndownSingleton.turndown(cleanedHTML);
    }
    catch {
        // Turndown GFM plugin crashes on malformed tables (e.g. <tr> without <table> parent).
        // Fall back to basic text extraction.
        const $ = cheerio.load(cleanedHTML);
        $('script, style, noscript, svg, iframe').remove();
        markdown = $.text().replace(/\s+/g, ' ').trim();
    }

    // SECURITY: Protect against ReDoS - limit input size before regex
    const MAX_MARKDOWN_CHARS = 1024 * 1024; // 1MB limit for markdown
    if (markdown.length > MAX_MARKDOWN_CHARS) {
        markdown = markdown.slice(0, MAX_MARKDOWN_CHARS);
    }

    // Collapse runs of blank lines to a single blank line (no regex, no backtracking).
    const rawLines = markdown.split('\n');
    const collapsed = [rawLines[0]];
    for (let i = 1; i < rawLines.length; i++) {
        const previousBlank = rawLines[i - 1].trim() === '';
        const currentBlank = rawLines[i].trim() === '';
        if (!(previousBlank && currentBlank)) {
            collapsed.push(rawLines[i]);
        }
    }
    markdown = collapsed.join('\n');

    // Lines dropped when they match exactly (after trimming, lowercasing and
    // stripping a leading markdown heading prefix).
    const noiseExact = new Set([
        // Job site CTA noise
        'upload resume',
        'upload your resume',
        'apply now',
        'apply on employer site',
        'apply on employer siteapply now',
        'easy apply',
        'save job',
        'easy apply onlyremote only',
        // Job site filter sidebar labels (standalone)
        'company rating',
        'date posted',
        'salary range',
        // Indeed profile insights noise
        'yesno',
        'yes no',
        'profile insights',
        'find out how your skills align',
        // Common UI artifacts (icons, loading, inline labels)
        'save-icon',
        'loading',
        'report job',
        'whatwherefind jobs',
    ]);
    // Lines dropped when any of these patterns matches the normalised line.
    const noisePatterns = [
        // Job site CTA noise
        /^(is your resume a good match|are you open to new opportunities)\??$/,
        /^upload your resume to increase your chances/i,
        /^use ai to find out how well/i,
        // Indeed profile insights noise
        /^do you have (experience in|a )/i,
        /^find out how your skills align/i,
        // Common UI artifacts
        /^show more(chevron down)?$/i,
        // Q&A site chrome (Stack Overflow, StackExchange, forums)
        /^\[?(share|follow|flag|report)\]?(\(.*\))?$/i,
        /^\[?improve this (question|answer)\]?/i,
        /^(sorted by|highest score|trending|date modified|date created)/i,
        /^\[?(edited|answered|asked)\s+\w+\s+\d/i,
        /^community wiki$/i,
        /^\d+\s*(answers?|votes?|views?)\s*\d*$/i,
        /^\[?reset to default\]?/i,
        // Generic interactive chrome
        /^\[?(bookmark|save|pin|mute|hide|block)\]?(\(.*\))?$/i,
        /^\[?(reply|retweet|repost|quote)\]?(\(.*\))?$/i,
        /^\[?copy\s*(link|url)?\]?(\(.*\))?$/i,
        /^(sign up|log in|create account|join now)\s*(to|for)?/i,
    ];
    const isNoiseLine = (line) => {
        // Strip markdown heading prefix before matching (e.g., "## Are you open...")
        const normalised = line.trim().toLowerCase().replace(/^#{1,6}\s*/, '');
        if (noiseExact.has(normalised)) {
            return true;
        }
        return noisePatterns.some((pattern) => pattern.test(normalised));
    };
    markdown = markdown.split('\n').filter((line) => !isNoiseLine(line)).join('\n');

    // Truncate trailing recommendation/related-jobs sections (common on job sites like Indeed).
    // These appear after the main content and add 1000+ tokens of noise.
    const trailCutPatterns = [
        /^#{1,3}\s*(explore other jobs|discover opportunities beyond)/im,
        /^#{1,3}\s*(jobs with similar titles)/im,
        /^#{1,3}\s*(similar job categories)/im,
        /^#{1,3}\s*(career guide articles)/im,
        /^#{1,3}\s*(similar jobs nearby)/im,
        /^#{1,3}\s*(company and salary information)/im,
    ];
    for (const pattern of trailCutPatterns) {
        const hit = markdown.match(pattern);
        if (!hit || hit.index === undefined) {
            continue;
        }
        // Only truncate if the noise section is in the bottom 40% of the content;
        // the first pattern that qualifies ends the search.
        if (hit.index > markdown.length * 0.6) {
            markdown = markdown.slice(0, hit.index).trim();
            break;
        }
    }

    // Remove leading/trailing whitespace
    return markdown.trim();
}
|
|
636
|
+
/**
 * Convert HTML to markdown using Turndown directly, without the full cleanHTML pipeline.
 * Useful when the caller has already cleaned the HTML and wants to preserve elements
 * (like images) that cleanHTML would strip due to empty-element detection.
 *
 * The only pre-processing done: remove script/style/noscript tags for safety.
 *
 * @param html - Pre-cleaned HTML
 * @returns Markdown with runs of blank lines collapsed, trimmed
 */
export function rawHtmlToMarkdown(html) {
    const $ = cheerio.load(html);
    // Remove scripts and styles (always)
    $('script, style, noscript').remove();

    let markdown;
    try {
        markdown = turndownSingleton.turndown($.html());
    }
    catch {
        // Turndown GFM plugin crashes on malformed tables — fall back to text
        markdown = $.text().replace(/\s+/g, ' ').trim();
    }

    // Collapse runs of blank lines: a blank line is dropped when the line
    // directly before it was blank as well.
    const rawLines = markdown.split('\n');
    const kept = [rawLines[0]];
    for (let i = 1; i < rawLines.length; i++) {
        const previousBlank = rawLines[i - 1].trim() === '';
        const currentBlank = rawLines[i].trim() === '';
        if (!(previousBlank && currentBlank)) {
            kept.push(rawLines[i]);
        }
    }
    return kept.join('\n').trim();
}
|
|
668
|
+
/**
 * Convert HTML to plain text (strip all formatting).
 *
 * The HTML is cleaned with cleanHTML, then the text of every heading,
 * paragraph and list item is emitted as its own block separated by a blank
 * line. If none of those elements yields text, the whole <body> text is used.
 *
 * NOTE: nested matches are each emitted, so text inside e.g. a <p> within an
 * <li> appears once for the <li> and once for the <p>.
 *
 * @param html - HTML to convert
 * @returns Plain text with collapsed whitespace, trimmed
 */
export function htmlToText(html) {
    const $ = cheerio.load(cleanHTML(html));
    // Get text content, preserving some structure
    const blocks = [];
    $('h1, h2, h3, h4, h5, h6, p, li').each((_, node) => {
        const blockText = $(node).text().trim();
        if (blockText) {
            blocks.push(blockText + '\n\n');
        }
    });
    let text = blocks.join('');
    // Fallback: if no structured content found, get all text
    if (!text.trim()) {
        text = $('body').text();
    }
    // Clean up excessive whitespace: at most one blank line, single spaces
    return text
        .replace(/\n{3,}/g, '\n\n')
        .replace(/[ \t]+/g, ' ')
        .trim();
}
|
|
691
|
+
/**
 * Estimate token count (very rough approximation).
 * Rule of thumb: 1 token ≈ 4 characters for English text; the result is
 * rounded up, so any non-empty text counts as at least one token.
 *
 * @param text - Text to measure
 * @returns Estimated number of tokens
 */
export function estimateTokens(text) {
    const charsPerToken = 4;
    const estimate = text.length / charsPerToken;
    return Math.ceil(estimate);
}
|
|
698
|
+
/**
 * Truncate content to fit within a token budget.
 *
 * Content already within budget is returned unchanged. Otherwise lines are
 * kept from the top until the next line would push the running estimate past
 * `maxTokens`. The first markdown heading encountered before that point is
 * kept without a budget check. A blank line and a truncation notice are
 * appended to truncated output.
 *
 * @param content - Markdown/text to truncate
 * @param maxTokens - Budget in estimated tokens (see estimateTokens)
 * @returns The original content, or the truncated content plus notice
 */
export function truncateToTokenBudget(content, maxTokens) {
    // If under budget, return as-is
    if (estimateTokens(content) <= maxTokens) {
        return content;
    }
    const kept = [];
    let usedTokens = 0;
    let headingSeen = false;
    for (const line of content.split('\n')) {
        const lineCost = estimateTokens(line);
        const isFirstHeading = !headingSeen && /^#{1,6}\s/.test(line);
        if (isFirstHeading) {
            // The first heading bypasses the budget check
            headingSeen = true;
        }
        else if (usedTokens + lineCost > maxTokens) {
            // Adding this line would exceed the budget: stop here
            break;
        }
        kept.push(line);
        usedTokens += lineCost;
    }
    // Add truncation notice after a blank separator line
    kept.push('', `[Content truncated to ~${maxTokens} tokens]`);
    return kept.join('\n');
}
|
|
738
|
+
/**
 * Strip markdown link/image syntax for clean AI-readable text.
 * Leaves heading, list, bold and italic markup untouched.
 * Removes: [text](url) → text, ![alt](url) → [Image: alt], reference links.
 *
 * The rewrites run in the order listed below; images must be handled before
 * links because an image contains link syntax.
 *
 * @param markdown - Markdown to simplify
 * @returns Simplified text, trimmed
 */
export function cleanForAI(markdown) {
    const rewrites = [
        // Convert images to descriptive text: ![alt](url) → [Image: alt] (dropped when alt is empty)
        [/!\[([^\]]*)\]\([^)]+\)/g, (_, alt) => alt ? `[Image: ${alt}]` : ''],
        // Convert links to just text: [text](url) → text
        [/\[([^\]]+)\]\([^)]+\)/g, '$1'],
        // Remove reference-style link definitions: [id]: url
        [/^\[[\w-]+\]:\s+\S+.*$/gm, ''],
        // Remove bare URLs (heuristic: a URL that is the only thing on its line)
        [/^https?:\/\/\S+$/gm, ''],
        // Remove HTML comments
        [/<!--[\s\S]*?-->/g, ''],
        // Remove empty link fragments like []
        [/\[\s*\]/g, ''],
        // Clean up citation references like [1], [2] etc (common in scraped content)
        [/\[(\d+)\]/g, ''],
        // Collapse multiple blank lines
        [/\n{3,}/g, '\n\n'],
        // Trim trailing whitespace on each line
        [/[ \t]+$/gm, ''],
    ];
    let text = markdown;
    for (const [pattern, replacement] of rewrites) {
        text = text.replace(pattern, replacement);
    }
    return text.trim();
}
|
|
765
|
+
/**
 * Clean up common markdown noise patterns produced during HTML-to-markdown conversion.
 * Removes empty links and image-only links, thins out link text repeated 5+ times
 * (keeping the first 2 occurrences), drops literal "Button Text" lines, collapses
 * excess newlines and strips trailing whitespace.
 *
 * Images (`![alt](src)`) are never treated as links here: every link pattern
 * refuses a match directly preceded by `!`, so removals cannot leave a stray `!`.
 *
 * @param content - Markdown to clean
 * @returns Cleaned markdown, trimmed
 */
export function cleanMarkdownNoise(content) {
    let result = content
        // Remove image-only links that are just UI elements: [![](img)](link).
        // This must run BEFORE the empty-link pass; otherwise that pass eats the
        // inner "[](img)" and leaves "[!](link)" behind.
        .replace(/\[!\[\]\([^)]+\)\]\([^)]+\)/g, '')
        // Remove empty links: [](url) or [ ](url) — but not empty-alt images
        .replace(/(?<!!)\[\s*\]\([^)]+\)/g, '')
        // Collapse 3+ newlines to 2
        .replace(/\n{3,}/g, '\n\n')
        // Remove trailing whitespace on lines
        .replace(/[ \t]+$/gm, '')
        .trim();
    // Count how often each link text occurs (case-insensitive) to find
    // navigation spam, e.g. "Try Claude" appearing 20+ times.
    const linkTextCounts = new Map();
    const linkPattern = /(?<!!)\[([^\]]+)\]\([^)]+\)/g;
    for (const found of result.matchAll(linkPattern)) {
        const text = found[1].trim().toLowerCase();
        linkTextCounts.set(text, (linkTextCounts.get(text) || 0) + 1);
    }
    // Remove repeated CTA links that appear 5+ times (keep first 2 occurrences)
    for (const [text, count] of linkTextCounts) {
        if (count < 5) {
            continue;
        }
        // Escape special regex characters in the link text for matching
        const escaped = text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Match the full markdown link with this text (case-insensitive)
        const spamPattern = new RegExp(`(?<!!)\\[${escaped}\\]\\([^)]+\\)`, 'gi');
        let seen = 0;
        result = result.replace(spamPattern, (match) => {
            seen++;
            return seen <= 2 ? match : '';
        });
    }
    // Remove "Button Text" placeholders (literal text from button elements)
    result = result.replace(/^Button Text\s*$/gm, '');
    // Clean up any new excess newlines from removals
    result = result.replace(/\n{3,}/g, '\n\n').trim();
    return result;
}
|