@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,1666 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel pipeline stages
|
|
3
|
+
*
|
|
4
|
+
* Each stage is an exported async function that reads from / writes to the
|
|
5
|
+
* mutable PipelineContext. The stages are called in order by peel().
|
|
6
|
+
*/
|
|
7
|
+
import { createHash } from 'crypto';
|
|
8
|
+
import { smartFetch } from './strategies.js';
|
|
9
|
+
import { htmlToMarkdown, htmlToText, cleanForAI, estimateTokens, selectContent, detectMainContent, calculateQuality, truncateToTokenBudget, filterByTags, cleanMarkdownNoise, } from './markdown.js';
|
|
10
|
+
import { pruneContent, pruneMarkdown } from './content-pruner.js';
|
|
11
|
+
import { distillToBudget } from './budget.js';
|
|
12
|
+
import { extractMetadata, extractLinks, extractImages } from './metadata.js';
|
|
13
|
+
import { autoScroll as runAutoScroll } from './actions.js';
|
|
14
|
+
import { extractStructured } from './extract.js';
|
|
15
|
+
import { isPdfContentType, isDocxContentType, extractDocumentToFormat } from './documents.js';
|
|
16
|
+
import { parseYouTubeUrl, getYouTubeTranscript } from './youtube.js';
|
|
17
|
+
import { extractDomainData, getDomainExtractor } from '../ee/domain-extractors.js';
|
|
18
|
+
import { getDomainExtractHook, getDomainExtractorHook, getSPADomainsHook, getSPAPatternsHook } from './strategy-hooks.js';
|
|
19
|
+
import { extractReadableContent } from './readability.js';
|
|
20
|
+
import { quickAnswer as runQuickAnswer } from './quick-answer.js';
|
|
21
|
+
import { Timer } from './timing.js';
|
|
22
|
+
import { chunkContent } from './chunker.js';
|
|
23
|
+
import { splitIntoBlocks, scoreBM25 } from './bm25-filter.js';
|
|
24
|
+
import { BlockedError } from '../types.js';
|
|
25
|
+
import { Errors } from '../errors.js';
|
|
26
|
+
import { sanitizeForLLM } from './prompt-guard.js';
|
|
27
|
+
import { getSourceCredibility } from './source-credibility.js';
|
|
28
|
+
import { createLogger } from './logger.js';
|
|
29
|
+
import { detectAuthWall } from './auth-detection.js';
|
|
30
|
+
import { buildAcceptLanguageHeader, detectLanguageFromUrl } from './language-detect.js';
|
|
31
|
+
const log = createLogger('pipeline');
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Hook-aware wrappers — route through premium hooks, fall back to basic stubs
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
/**
 * Report whether a domain extractor is available for a URL.
 *
 * Priority: premium hook → ee/domain-extractors. When a premium hook is
 * registered, its answer is final and the bundled lookup is not consulted.
 *
 * @param url - URL to look up.
 * @returns {boolean} true when the selected lookup yields an extractor.
 */
function hasDomainExtractor(url) {
    const lookup = getDomainExtractorHook() || getDomainExtractor;
    // Loose `!= null` on purpose: a lookup that signals "no extractor" with
    // `undefined` instead of `null` must not be reported as a match.
    return lookup(url) != null;
}
|
|
45
|
+
/**
 * Perform domain-specific extraction for a page.
 *
 * Priority: premium hook → ee/domain-extractors. Whichever implementation is
 * selected receives `(html, url)` and its result is returned to the caller.
 *
 * @param html - Page HTML handed to the extractor.
 * @param url - URL the HTML belongs to.
 * @returns {Promise<*>} Whatever the selected extractor produces.
 */
async function runDomainExtract(html, url) {
    // Pick the implementation once, then delegate with the same arguments.
    const extract = getDomainExtractHook() || extractDomainData;
    return extract(html, url);
}
|
|
55
|
+
/**
 * Build a fresh PipelineContext for one peel() run.
 *
 * The caller's `url` and `options` are stored as-is, a new Timer and the
 * current timestamp are attached, and every remaining field is seeded with
 * its default. Array/object fields are created per call, so contexts never
 * share mutable state.
 *
 * @param url - Target URL for this run.
 * @param options - Caller-supplied options (stored by reference).
 * @returns The initial context object.
 */
export function createContext(url, options) {
    // Normalized options — filled by normalizeOptions()
    const optionDefaults = {
        render: false,
        stealth: false,
        wait: 0,
        format: 'markdown',
        timeout: 30000,
        userAgent: undefined,
        screenshot: false,
        screenshotFullPage: false,
        selector: undefined,
        exclude: undefined,
        includeTags: undefined,
        excludeTags: undefined,
        headers: undefined,
        cookies: undefined,
        raw: false,
        actions: undefined,
        extract: undefined,
        maxTokens: undefined,
        extractImagesFlag: false,
        profileDir: undefined,
        headed: false,
        storageState: undefined,
        proxy: undefined,
        fullPage: false,
        autoScrollOpts: undefined,
    };
    const resultDefaults = {
        // Content type — filled by detectContentType()
        contentType: 'html',
        // Parsing results — filled by parseContent()
        content: '',
        title: '',
        metadata: {},
        links: [],
        quality: 0,
        // Link count — filled by parseContent() / buildResult
        linkCount: 0,
        // Domain API first-pass flag
        domainApiHandled: false,
        // Warnings accumulator
        warnings: [],
    };
    return {
        url,
        options,
        timer: new Timer(),
        startTime: Date.now(),
        ...optionDefaults,
        ...resultDefaults,
    };
}
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
// Stage 1: normalizeOptions
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
/**
 * Stage 1: resolve `ctx.options` into flat context fields with defaults applied.
 *
 * What this does, in order:
 *  1. In agent mode, fills `options.budget` (4000) and `options.format`
 *     ('markdown') on the options object itself when they are undefined.
 *  2. Copies every recognised option onto `ctx`, applying its default.
 *  3. Normalizes `autoScroll` (`true` becomes `{}`, an object is kept, falsy
 *     becomes `undefined`).
 *  4. Builds `ctx.headers`. An `Accept-Language` header is injected from
 *     `options.location.languages`, or otherwise from the language detected
 *     from the URL. A caller-supplied Accept-Language header always wins,
 *     whatever its letter case, because HTTP header names are case-insensitive.
 *  5. Forces `ctx.render = true` when screenshot, stealth, actions, branding,
 *     designAnalysis or autoScroll is requested.
 *  6. Otherwise auto-enables rendering for known SPA hostnames / URL patterns.
 *
 * @param {object} ctx - Pipeline context. Reads `ctx.url` and `ctx.options`; mutated in place.
 * @returns {void}
 */
export function normalizeOptions(ctx) {
    const opts = ctx.options;
    // Apply agent-mode defaults (can be overridden by explicit options).
    if (opts.agentMode) {
        if (opts.budget === undefined) {
            opts.budget = 4000;
        }
        if (opts.format === undefined) {
            opts.format = 'markdown';
        }
    }
    const {
        render = false,
        stealth = false,
        wait = 0,
        format = 'markdown',
        timeout = 30000,
        userAgent,
        screenshot = false,
        screenshotFullPage = false,
        selector,
        exclude,
        includeTags,
        excludeTags,
        headers,
        cookies,
        raw = false,
        actions,
        extract,
        maxTokens,
        images: extractImagesFlag = false,
        profileDir,
        headed = false,
        storageState,
        proxy,
        fullPage = false,
        autoScroll: autoScrollOption,
    } = opts;
    // `autoScroll: true` means "scroll with default settings".
    const autoScrollOpts = autoScrollOption
        ? (typeof autoScrollOption === 'boolean' ? {} : autoScrollOption)
        : undefined;

    ctx.render = render;
    ctx.stealth = stealth;
    ctx.wait = wait;
    ctx.format = format;
    ctx.timeout = timeout;
    ctx.userAgent = userAgent;
    ctx.screenshot = screenshot;
    ctx.screenshotFullPage = screenshotFullPage;
    ctx.selector = selector;
    ctx.exclude = exclude;
    ctx.includeTags = includeTags;
    ctx.excludeTags = excludeTags;

    // Accept-Language: explicit location.languages first, then a language
    // detected from the URL. The header is shared by HTTP and browser fetches.
    const callerSetAcceptLanguage = headers != null &&
        Object.keys(headers).some((name) => name.toLowerCase() === 'accept-language');
    if (callerSetAcceptLanguage) {
        // Never add a second, differently-cased Accept-Language next to the caller's.
        ctx.headers = headers;
    }
    else {
        const requestedLangs = opts.location?.languages;
        let acceptLangs;
        if (requestedLangs && requestedLangs.length > 0) {
            acceptLangs = requestedLangs;
        }
        else {
            const detectedLang = detectLanguageFromUrl(ctx.url);
            acceptLangs = detectedLang ? [detectedLang] : undefined;
        }
        ctx.headers = acceptLangs
            ? { 'Accept-Language': buildAcceptLanguageHeader(acceptLangs), ...headers }
            : headers;
    }

    ctx.cookies = cookies;
    ctx.raw = raw;
    ctx.actions = actions;
    ctx.extract = extract;
    ctx.maxTokens = maxTokens;
    ctx.extractImagesFlag = extractImagesFlag;
    ctx.profileDir = profileDir;
    ctx.headed = headed;
    ctx.storageState = storageState;
    ctx.proxy = proxy;
    ctx.fullPage = fullPage;
    ctx.autoScrollOpts = autoScrollOpts;

    // NOTE: PDFs/DOCX are handled via simpleFetch + document parser, so they
    // do not force browser rendering. These features do need a real browser:
    const needsBrowser = screenshot ||
        stealth ||
        (actions && actions.length > 0) ||
        opts.branding ||
        opts.designAnalysis ||
        autoScrollOpts;
    if (needsBrowser) {
        ctx.render = true;
    }

    // Auto-detect SPAs that require browser rendering (no --render flag needed).
    // This list is NOT proprietary — every developer knows these sites are SPAs.
    // The proprietary part is the domain EXTRACTORS (what data to pull), not this list.
    // A premium hook, when registered, supplies the list instead of the defaults.
    if (!ctx.render) {
        const spaDomainsHook = getSPADomainsHook();
        const spaPatternsHook = getSPAPatternsHook();
        // Full SPA domain list — always available (npm + server)
        const DEFAULT_SPA_DOMAINS = new Set([
            // Search & travel
            'www.google.com',
            'flights.google.com',
            // Travel & hospitality
            'www.airbnb.com',
            'www.booking.com',
            'www.expedia.com',
            'www.kayak.com',
            'www.skyscanner.com',
            'www.tripadvisor.com',
            // Jobs
            'www.indeed.com',
            'www.glassdoor.com',
            // Real estate
            'www.zillow.com',
            // Prediction markets (extractor handles specific paths; browser render for unknown paths)
            'polymarket.com',
            'www.polymarket.com',
            // Our own dashboard
            'app.webpeel.dev',
        ]);
        const DEFAULT_SPA_PATTERNS = [
            /google\.com\/travel/,
            /google\.com\/maps/,
            /google\.com\/shopping/,
        ];
        const SPA_DOMAINS = spaDomainsHook ? spaDomainsHook() : DEFAULT_SPA_DOMAINS;
        const SPA_URL_PATTERNS = spaPatternsHook ? spaPatternsHook() : DEFAULT_SPA_PATTERNS;
        try {
            const hostname = new URL(ctx.url).hostname;
            if (SPA_DOMAINS.has(hostname)) {
                ctx.render = true;
                log.debug(`Auto-enabling render: SPA domain detected (${hostname})`);
            }
            else if (SPA_URL_PATTERNS.some(p => p.test(ctx.url))) {
                ctx.render = true;
                log.debug(`Auto-enabling render: SPA URL pattern matched`);
            }
        }
        catch {
            // Invalid URL — skip SPA detection
        }
    }
}
|
|
250
|
+
// ---------------------------------------------------------------------------
|
|
251
|
+
// Stage 2: handleYouTube
|
|
252
|
+
// ---------------------------------------------------------------------------
|
|
253
|
+
/**
 * Stage 2: if the URL is a YouTube URL, attempt transcript extraction.
 *
 * Builds a markdown document (title, channel/duration/views/published header,
 * optional summary, key points and chapters, then the full transcript) from
 * the object returned by `getYouTubeTranscript`.
 *
 * @param {object} ctx - Pipeline context; reads `ctx.url` and `ctx.options`.
 * @returns {Promise<object|null>} A PeelResult-shaped object on success, or
 *   `null` when the URL is not a YouTube video or extraction failed, so the
 *   caller falls through to the normal pipeline.
 */
export async function handleYouTube(ctx) {
    const ytVideoId = parseYouTubeUrl(ctx.url);
    if (!ytVideoId) {
        return null;
    }
    const ytStartTime = Date.now();

    /** Compact view count: "1.2M views", "3.4K views", "812 views"; '' when unparsable. */
    const formatViews = (rawCount) => {
        const views = Number.parseInt(rawCount, 10);
        if (Number.isNaN(views)) {
            return '';
        }
        const oneDecimal = (n) => n.toFixed(1).replace(/\.0$/, '');
        if (views >= 1_000_000) {
            return `${oneDecimal(views / 1_000_000)}M views`;
        }
        if (views >= 1_000) {
            const thousands = oneDecimal(views / 1_000);
            // 999,950–999,999 round up to "1000"; show that as 1M rather than "1000K".
            return thousands === '1000' ? '1M views' : `${thousands}K views`;
        }
        return `${views.toLocaleString()} views`;
    };

    /** "Jan 5, 2024" for parsable dates; the raw value otherwise. */
    const formatPublishDate = (rawDate) => {
        try {
            const parsed = new Date(rawDate);
            // toLocaleDateString() does not throw on an invalid date — it
            // returns the string "Invalid Date" — so check explicitly.
            if (Number.isNaN(parsed.getTime())) {
                return rawDate;
            }
            return parsed.toLocaleDateString('en-US', { month: 'short', year: 'numeric', day: 'numeric' });
        }
        catch {
            return rawDate;
        }
    };

    /**
     * Strip music note symbols from YouTube auto-caption text.
     * Cleans: [♪♪♪], [🎵🎵🎵], ♪ text ♪ (keeps inner text), standalone ♪ / 🎵.
     * The `u` flag is required: without it 🎵 is two UTF-16 code units inside
     * the character class, and its high surrogate would be stripped from any
     * other emoji sharing it (e.g. 🎉), corrupting the text.
     */
    const cleanMusicNotes = (text) => text
        .replace(/\[[♪🎵]+\]/gu, '')
        .replace(/♪\s*([^♪]*?)\s*♪/gu, (_, inner) => inner.trim())
        .replace(/[♪🎵]+/gu, '')
        .replace(/\s{2,}/g, ' ')
        .trim();

    /** Start a new paragraph after sentence-ending punctuation followed by a capital. */
    const addParagraphBreaks = (text) => text.replace(/([.!?])\s+(?=[A-Z])/g, '$1\n\n');

    try {
        const transcript = await getYouTubeTranscript(ctx.url, {
            language: ctx.options.language ?? ctx.options.location?.languages?.[0]?.split('-')[0] ?? 'en',
        });
        const viewStr = transcript.viewCount ? formatViews(transcript.viewCount) : '';
        const publishStr = transcript.publishDate ? formatPublishDate(transcript.publishDate) : '';

        // Header metadata line: channel is always present, the rest are optional.
        const headerParts = [`**Channel:** ${transcript.channel}`];
        if (transcript.duration && transcript.duration !== '0:00') {
            headerParts.push(`**Duration:** ${transcript.duration}`);
        }
        if (viewStr) {
            headerParts.push(`**${viewStr}**`);
        }
        if (publishStr) {
            headerParts.push(`**Published:** ${publishStr}`);
        }

        // Readable transcript: cleaned, paragraph-broken, no runs of blank lines.
        const readableText = addParagraphBreaks(cleanMusicNotes(transcript.fullText))
            .replace(/\n{3,}/g, '\n\n');

        // Clean markdown representation of the video + transcript.
        const parts = [`# ${transcript.title}`, headerParts.join(' | ')];
        if (transcript.summary) {
            const summaryText = addParagraphBreaks(cleanMusicNotes(transcript.summary));
            parts.push(`## Summary\n\n${summaryText}`);
        }
        if (transcript.keyPoints && transcript.keyPoints.length > 0) {
            const cleanedKps = transcript.keyPoints
                .map((kp) => cleanMusicNotes(kp))
                .filter((kp) => kp.length > 0);
            if (cleanedKps.length > 0) {
                parts.push(`## Key Points\n\n${cleanedKps.map((kp) => `- ${kp}`).join('\n')}`);
            }
        }
        if (transcript.chapters && transcript.chapters.length > 0) {
            parts.push(`## Chapters\n\n${transcript.chapters.map(ch => `- ${ch.time} — ${ch.title}`).join('\n')}`);
        }
        parts.push(`## Full Transcript\n\n${readableText}`);

        const videoInfoContent = parts.join('\n\n');
        const elapsed = Date.now() - ytStartTime;
        const tokens = estimateTokens(videoInfoContent);
        const fingerprint = createHash('sha256').update(videoInfoContent).digest('hex').slice(0, 16);
        const watchUrl = `https://www.youtube.com/watch?v=${ytVideoId}`;
        return {
            url: watchUrl,
            title: transcript.title,
            content: videoInfoContent,
            metadata: {
                description: `YouTube video by ${transcript.channel}, duration ${transcript.duration}`,
                author: transcript.channel,
            },
            links: [watchUrl],
            tokens,
            method: 'simple',
            elapsed,
            contentType: 'youtube',
            quality: 1.0,
            fingerprint,
            extracted: undefined,
            structured: transcript,
        };
    }
    catch (ytError) {
        // Transcript extraction failed (no captions, page changed, etc.):
        // record why, then fall through to the normal HTML fetch pipeline.
        log.debug('YouTube transcript extraction failed, falling back to normal fetch:', ytError instanceof Error ? ytError.message : ytError);
        return null;
    }
}
|
|
358
|
+
// ---------------------------------------------------------------------------
|
|
359
|
+
// Stage 3: fetchContent
|
|
360
|
+
// ---------------------------------------------------------------------------
|
|
361
|
+
/**
 * Stage 3: fetch the URL and store the outcome in `ctx.fetchResult`.
 *
 * Strategy, in order:
 *  1. If a domain extractor exists for the URL (and `options.noDomainApi` is
 *     not set), try it first; a usable result (> 50 chars) short-circuits the
 *     fetch and sets `ctx.domainApiHandled`.
 *  2. Otherwise call `smartFetch` with the normalized options.
 *  3. If `smartFetch` throws: retry the domain extractor as a fallback; for a
 *     `BlockedError` try the search-engine fallback, and if that yields
 *     nothing return a "blocked" placeholder document instead of throwing;
 *     timeouts/aborts are rethrown as `Errors.fetchTimeout`; anything else is
 *     rethrown unchanged.
 *  4. On success: flag the fast path, run auto-scroll when requested, record
 *     `ctx.rawHtmlSize`, and attempt to solve a detected challenge page.
 *
 * NOTE(review): the synthetic `fetchResult` objects built here carry `status`,
 * while results from `smartFetch` are read via `statusCode` in this function.
 * Left unchanged — confirm which field downstream stages expect.
 *
 * @param {object} ctx - Pipeline context (already processed by normalizeOptions); mutated in place.
 * @returns {Promise<void>}
 * @throws Whatever `smartFetch` threw when no fallback applies, or the error
 *   produced by `Errors.fetchTimeout` for timeouts/aborts.
 */
export async function fetchContent(ctx) {
    const needsBranding = ctx.options.branding && ctx.render;
    const needsAutoScroll = !!ctx.autoScrollOpts && ctx.render;
    const needsDesignAnalysis = ctx.options.designAnalysis && ctx.render;
    // Skip the API shortcut if noDomainApi is set — user wants raw page content.
    const domainApiAllowed = hasDomainExtractor(ctx.url) && !ctx.options.noDomainApi;

    /** A domain-extractor result is only worth using when it has real content. */
    const isUsableDomainResult = (ddResult) => ddResult && ddResult.cleanContent.length > 50;

    /** Populate ctx from a domain-extractor result so downstream stages can run without a real fetch. */
    const adoptDomainResult = (ddResult, { method, quality, withDescription }) => {
        ctx.domainData = ddResult;
        ctx.content = ddResult.cleanContent;
        // Use the raw HTML size reported by the extractor (e.g. Wikipedia mobile-html size);
        // otherwise estimate: for API-first extractors (HN, Reddit, GitHub) the raw page is
        // typically 6-10x larger than the extracted content, so use a conservative 7x.
        ctx.rawHtmlSize = ddResult.rawHtmlSize && ddResult.rawHtmlSize > 0
            ? ddResult.rawHtmlSize
            : ddResult.cleanContent.length * 7;
        // Minimal fetchResult so downstream stages don't crash.
        ctx.fetchResult = {
            html: ddResult.cleanContent,
            url: ctx.url,
            status: 200,
            contentType: 'text/html',
            method,
        };
        ctx.title = ddResult.structured?.title || '';
        ctx.quality = quality;
        // Basic metadata so downstream stages have wordCount etc.
        const wordCount = ddResult.cleanContent.split(/\s+/).filter(Boolean).length;
        ctx.metadata = {
            ...(ctx.metadata || {}),
            title: ddResult.structured?.title || ctx.title,
            ...(withDescription
                ? { description: ddResult.structured?.description || ddResult.structured?.extract || '' }
                : {}),
            wordCount,
            language: ddResult.structured?.language || ctx.options.location?.languages?.[0]?.split('-')[0] || 'en',
        };
        ctx.domainApiHandled = true;
    };

    // Try API-based domain extraction first (Reddit, GitHub, HN use APIs, not HTML).
    // This avoids expensive browser fetches that often get blocked.
    if (domainApiAllowed) {
        try {
            ctx.timer.mark('domainApiFirst');
            let ddResult;
            try {
                ddResult = await runDomainExtract('', ctx.url);
            }
            finally {
                // End the timer even when the extractor throws.
                ctx.timer.end('domainApiFirst');
            }
            if (isUsableDomainResult(ddResult)) {
                // High quality — structured API data.
                adoptDomainResult(ddResult, { method: 'domain-api', quality: 0.95, withDescription: true });
                return; // Skip browser fetch entirely
            }
        }
        catch (e) {
            // Domain API failed — fall through to normal fetch
            const errMsg = e instanceof Error ? e.message : String(e);
            log.warn('domain API first-pass failed, falling back to fetch:', errMsg);
            ctx.warnings.push(`Domain API extraction failed: ${errMsg}`);
        }
    }

    ctx.timer.mark('fetch');
    let fetchResult;
    try {
        fetchResult = await smartFetch(ctx.url, {
            forceBrowser: ctx.render,
            stealth: ctx.stealth,
            waitMs: ctx.wait,
            userAgent: ctx.userAgent,
            timeoutMs: ctx.timeout,
            screenshot: ctx.screenshot,
            screenshotFullPage: ctx.screenshotFullPage,
            headers: ctx.headers,
            cookies: ctx.cookies,
            actions: ctx.actions,
            keepPageOpen: needsBranding || needsAutoScroll || needsDesignAnalysis,
            profileDir: ctx.profileDir,
            headed: ctx.headed,
            storageState: ctx.storageState,
            proxy: ctx.proxy,
            proxies: ctx.options.proxies,
            device: ctx.options.device,
            viewportWidth: ctx.options.viewportWidth,
            viewportHeight: ctx.options.viewportHeight,
            deviceScaleFactor: ctx.options.deviceScaleFactor,
            waitUntil: ctx.options.waitUntil,
            waitSelector: ctx.options.waitSelector,
            blockResources: ctx.options.blockResources,
            cloaked: ctx.options.cloaked,
            cycle: ctx.options.cycle,
            tls: ctx.options.tls,
            noEscalate: ctx.options.noEscalate,
        });
    }
    catch (fetchError) {
        // If fetch failed but we have a domain extractor, try it as fallback
        // (the noDomainApi flag is respected here too).
        if (domainApiAllowed) {
            try {
                const ddResult = await runDomainExtract('', ctx.url);
                if (isUsableDomainResult(ddResult)) {
                    ctx.timer.end('fetch');
                    adoptDomainResult(ddResult, { method: 'domain-api-fallback', quality: 0.90, withDescription: false });
                    return;
                }
            }
            catch (e) {
                // Domain API also failed — continue with the original error.
                log.debug('domain API fallback failed:', e instanceof Error ? e.message : e);
            }
        }
        if (fetchError instanceof BlockedError) {
            // Search-as-proxy fallback: when every fetch strategy was blocked by bot
            // protection, try to get the title/snippet from a search engine instead.
            try {
                // @ts-ignore — proprietary module, gitignored
                const { searchFallback } = await import('./search-fallback.js');
                const searchResult = await searchFallback(ctx.url);
                // If DDG/primary returned very little, also try Bing for richer snippets
                if (!searchResult.cachedContent || searchResult.cachedContent.length < 400) {
                    try {
                        const { simpleFetch } = await import('./http-fetch.js');
                        const bingUrl = `https://www.bing.com/search?q=${encodeURIComponent(ctx.url)}`;
                        const bingResult = await simpleFetch(bingUrl, ctx.userAgent, 8000);
                        if (bingResult.html && bingResult.html.length > 500) {
                            const snippetMatch = bingResult.html.match(/<p[^>]*class="[^"]*snippet[^"]*"[^>]*>(.*?)<\/p>/gi);
                            if (snippetMatch) {
                                const bingSnippet = snippetMatch.map(s => s.replace(/<[^>]+>/g, '')).join('\n');
                                searchResult.cachedContent = (searchResult.cachedContent || '') + '\n\n---\n*Additional context from Bing:*\n' + bingSnippet;
                            }
                        }
                    }
                    catch { /* Bing fallback is best-effort */ }
                }
                if (searchResult.cachedContent && searchResult.cachedContent.length > 50) {
                    ctx.timer.end('fetch');
                    ctx.content = searchResult.cachedContent;
                    ctx.title = searchResult.title || ctx.title;
                    ctx.quality = 0.4;
                    ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
                    ctx.fetchResult = {
                        html: searchResult.cachedContent,
                        url: ctx.url,
                        status: 0,
                        contentType: 'text/markdown',
                        method: 'search-fallback',
                    };
                    ctx.metadata = {
                        ...(ctx.metadata || {}),
                        title: searchResult.title || ctx.title,
                        blocked: true,
                        fallbackSource: searchResult.source,
                    };
                    return;
                }
            }
            catch (e) {
                // Search fallback unavailable or failed — use the blocked placeholder below.
                log.debug('search fallback failed:', e instanceof Error ? e.message : e);
            }
            // Instead of crashing, return a helpful response with the block info.
            ctx.timer.end('fetch');
            let host;
            try {
                // Strip only a leading "www." — not a "www." that occurs mid-hostname.
                host = new URL(ctx.url).hostname.replace(/^www\./, '');
            }
            catch {
                // Never let URL parsing mask the original block; show the raw URL.
                host = ctx.url;
            }
            ctx.content = `# ⚠️ ${host} — Access Blocked\n\nThis site uses advanced bot protection and blocked our request.\n\n**What you can try:**\n- Use a browser profile with saved login: \`webpeel login ${host}\`\n- Try an alternative site that provides similar data\n\n*Direct link: [Open in browser](${ctx.url})*`;
            ctx.title = `${host} — Blocked`;
            ctx.quality = 0.2;
            ctx.warnings.push('Site blocked automated access. Showing fallback content.');
            ctx.fetchResult = {
                html: ctx.content,
                url: ctx.url,
                status: 403,
                contentType: 'text/markdown',
                method: 'blocked-fallback',
            };
            return;
        }
        // Enhance timeout errors with actionable advice. Abort errors usually carry
        // "AbortError" in `name`, not in the message, so check both.
        const errMsg = fetchError instanceof Error ? fetchError.message : String(fetchError);
        const loweredMsg = errMsg.toLowerCase();
        const timedOut = loweredMsg.includes('timeout') ||
            loweredMsg.includes('timed out') ||
            errMsg.includes('AbortError') ||
            fetchError?.name === 'AbortError';
        if (timedOut) {
            const ms = ctx.timeout ?? 30000;
            throw Errors.fetchTimeout(ctx.url, ms);
        }
        throw fetchError;
    }
    const fetchDuration = ctx.timer.end('fetch');

    // Fast path: if a plain HTTP fetch completed quickly with real HTML content,
    // mark it so post-processing can skip expensive heuristics (challenge detection).
    // Only applies to non-browser fetches that succeeded with HTML content.
    if (fetchDuration < 500 &&
        !ctx.render &&
        fetchResult.statusCode === 200 &&
        (fetchResult.contentType || '').includes('html') &&
        (fetchResult.html?.length || 0) > 200) {
        ctx.fastPath = true;
    }

    // Auto-scroll to load lazy content, then grab fresh HTML
    if (needsAutoScroll && fetchResult.page) {
        try {
            await runAutoScroll(fetchResult.page, ctx.autoScrollOpts);
            // Capture refreshed HTML after scrolling
            fetchResult.html = await fetchResult.page.content();
        }
        catch (e) {
            // Non-fatal: auto-scroll failed, continuing with whatever HTML we have
            log.debug('auto-scroll failed:', e instanceof Error ? e.message : e);
        }
        finally {
            // Close page unless branding or design analysis also needs it
            if (!needsBranding && !needsDesignAnalysis) {
                try {
                    await fetchResult.page.close().catch(() => { });
                    if (fetchResult.browser) {
                        await fetchResult.browser.close().catch(() => { });
                    }
                }
                catch (e) {
                    // Non-fatal: page/browser cleanup after auto-scroll
                    log.debug('page/browser cleanup after auto-scroll:', e instanceof Error ? e.message : e);
                }
                fetchResult.page = undefined;
            }
        }
    }

    // Capture the HTML size before any parsing/pruning (after the optional auto-scroll refresh).
    ctx.rawHtmlSize = fetchResult.html?.length || 0;
    ctx.fetchResult = fetchResult;

    // Attempt to solve challenge/CAPTCHA page when detected
    if (fetchResult.challengeDetected) {
        const challengeWarning = 'Challenge/CAPTCHA page detected. Content may be incomplete or from a bot-detection page.';
        const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
        // Only attempt solve if we have a browser worker URL or local solving is explicitly enabled
        const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
        if (!canSolve) {
            ctx.warnings.push(challengeWarning);
            return;
        }
        try {
            const { solveChallenge } = await import('../ee/challenge-solver.js');
            const { detectChallenge } = await import('./challenge-detection.js');
            const rawHtml = fetchResult.html || '';
            const detectionResult = detectChallenge(rawHtml, fetchResult.statusCode);
            const challengeType = detectionResult.type || 'generic-block';
            const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
                timeout: 15000,
            });
            if (solveResult.solved && solveResult.html) {
                fetchResult.html = solveResult.html;
                fetchResult.challengeDetected = false;
                log.debug(`Challenge solved (${challengeType}) for ${ctx.url}`);
            }
            else {
                ctx.warnings.push(challengeWarning);
            }
        }
        catch (e) {
            ctx.warnings.push(challengeWarning);
            log.debug('Challenge solve failed:', e instanceof Error ? e.message : e);
        }
    }
}
|
|
636
|
+
// ---------------------------------------------------------------------------
|
|
637
|
+
// Stage 4: detectContentType
|
|
638
|
+
// ---------------------------------------------------------------------------
|
|
639
|
+
/**
 * Stage 4: classify the fetched response and set `ctx.contentType` to one of
 * 'image', 'document', 'html', 'json', 'xml' or 'text'.
 *
 * Classification uses the response Content-Type first and the URL's file
 * extension second. The extension is checked against both the full URL and
 * its path component, so `report.pdf?download=1` or `photo.png#top` are
 * still recognised. Unknown types fall back to 'html'.
 * Also sets `ctx.serverMarkdown = true` when the server returned `text/markdown`.
 *
 * Does nothing when `ctx.domainApiHandled` is set (the domain API already
 * provided clean content, so HTML parsing stages are skipped).
 *
 * @param {object} ctx - Pipeline context; reads `ctx.fetchResult`, writes `ctx.contentType`.
 * @returns {void}
 */
export function detectContentType(ctx) {
    if (ctx.domainApiHandled) {
        return;
    }
    const fetchResult = ctx.fetchResult;
    const ct = (fetchResult.contentType || '').toLowerCase();
    const urlLower = String(fetchResult.url ?? '').toLowerCase();
    // Path without query string / fragment, for extension sniffing.
    let pathLower;
    try {
        pathLower = new URL(fetchResult.url).pathname.toLowerCase();
    }
    catch {
        pathLower = urlLower.split(/[?#]/, 1)[0];
    }
    const hasExtension = (ext) => urlLower.endsWith(ext) || pathLower.endsWith(ext);
    // A binary response may carry no html string at all.
    const body = typeof fetchResult.html === 'string' ? fetchResult.html : '';

    // Binary document types (PDF/DOCX)
    const isDocument = isPdfContentType(ct) || isDocxContentType(ct) ||
        hasExtension('.pdf') || hasExtension('.docx');
    // Image types (for OCR text extraction)
    const IMAGE_URL_EXTS = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.tiff', '.tif', '.bmp'];
    const isImage = !isDocument && (ct.startsWith('image/') || IMAGE_URL_EXTS.some(hasExtension));
    const isTextual = !isDocument && !isImage;
    // With no Content-Type at all, sniff the body: markup starts with '<'.
    const isHTML = isTextual && (ct.includes('html') || (!ct && body.trimStart().startsWith('<')));
    const isJSON = isTextual && ct.includes('json');
    const isXML = isTextual && (ct.includes('xml') || ct.includes('rss') || ct.includes('atom'));
    const isPlainText = isTextual && (ct.includes('text/plain') || ct.includes('text/markdown') || ct.includes('text/csv') || ct.includes('text/css') || ct.includes('javascript'));

    if (isImage) {
        ctx.contentType = 'image';
    }
    else if (isDocument) {
        ctx.contentType = 'document';
    }
    else if (isHTML) {
        ctx.contentType = 'html';
    }
    else if (isJSON) {
        ctx.contentType = 'json';
    }
    else if (isXML) {
        ctx.contentType = 'xml';
    }
    else if (isPlainText) {
        ctx.contentType = 'text';
    }
    else {
        // Unknown type: treat as HTML.
        ctx.contentType = 'html';
    }
    // Flag when the server returned pre-rendered markdown — no HTML parsing needed
    if (ct.includes('text/markdown')) {
        ctx.serverMarkdown = true;
    }
}
|
|
666
|
+
// ---------------------------------------------------------------------------
|
|
667
|
+
// Stage 5: parseContent
|
|
668
|
+
// ---------------------------------------------------------------------------
|
|
669
|
+
/**
|
|
670
|
+
* Parse content from fetchResult based on the detected contentType.
|
|
671
|
+
* Sets ctx.content, ctx.title, ctx.metadata, ctx.links, ctx.quality, ctx.prunedPercent.
|
|
672
|
+
*/
|
|
673
|
+
export async function parseContent(ctx) {
|
|
674
|
+
// Skip HTML parsing stages — domain API already provided clean content
|
|
675
|
+
if (ctx.domainApiHandled)
|
|
676
|
+
return;
|
|
677
|
+
const fetchResult = ctx.fetchResult;
|
|
678
|
+
const { contentType, format, fullPage, raw, selector, exclude, includeTags, excludeTags } = ctx;
|
|
679
|
+
const hasBuffer = !!fetchResult.buffer;
|
|
680
|
+
// === Image alt-text enhancement (opt-in, heuristic) ===
|
|
681
|
+
// Runs before any conversion so both lite mode and standard mode benefit.
|
|
682
|
+
if (ctx.options.captionImages && contentType === 'html' && fetchResult.html) {
|
|
683
|
+
ctx.timer.mark('captionImages');
|
|
684
|
+
const { enhanceImageAltText } = await import('./image-caption.js');
|
|
685
|
+
fetchResult.html = enhanceImageAltText(fetchResult.html);
|
|
686
|
+
ctx.timer.end('captionImages');
|
|
687
|
+
}
|
|
688
|
+
if (contentType === 'image' && hasBuffer) {
|
|
689
|
+
// === OCR pipeline — extract text from images using Tesseract.js ===
|
|
690
|
+
ctx.timer.mark('ocr');
|
|
691
|
+
const { extractTextFromImage } = await import('./ocr.js');
|
|
692
|
+
const ocrText = await extractTextFromImage(fetchResult.buffer);
|
|
693
|
+
ctx.timer.end('ocr');
|
|
694
|
+
if (ocrText.length > 0) {
|
|
695
|
+
ctx.content = `# OCR Text Extraction\n\n${ocrText}`;
|
|
696
|
+
}
|
|
697
|
+
else {
|
|
698
|
+
ctx.content = '# OCR Text Extraction\n\n*(No text detected in image)*';
|
|
699
|
+
}
|
|
700
|
+
ctx.title = '';
|
|
701
|
+
ctx.metadata = { url: fetchResult.url, title: '' };
|
|
702
|
+
ctx.quality = ocrText.length > 10 ? 0.8 : 0.1;
|
|
703
|
+
}
|
|
704
|
+
else if (contentType === 'document' && hasBuffer) {
|
|
705
|
+
// Document parsing pipeline (PDF/DOCX)
|
|
706
|
+
// 'clean' maps to 'markdown' for extraction; cleanForAI is applied in buildResult
|
|
707
|
+
const docFormat = format === 'clean' ? 'markdown' : format;
|
|
708
|
+
const docResult = await extractDocumentToFormat(fetchResult.buffer, {
|
|
709
|
+
url: fetchResult.url,
|
|
710
|
+
contentType: fetchResult.contentType,
|
|
711
|
+
format: docFormat,
|
|
712
|
+
});
|
|
713
|
+
ctx.content = docResult.content;
|
|
714
|
+
ctx.title = docResult.metadata.title;
|
|
715
|
+
ctx.metadata = docResult.metadata;
|
|
716
|
+
ctx.quality = 1.0; // Documents are inherently structured content
|
|
717
|
+
}
|
|
718
|
+
else if (contentType === 'html') {
|
|
719
|
+
// === Lite mode — minimal processing, maximum speed ===
|
|
720
|
+
// Skips pruning, metadata, quality scoring, JSON-LD. Just fetch → markdown.
|
|
721
|
+
if (ctx.options.lite) {
|
|
722
|
+
let liteHtml = fetchResult.html;
|
|
723
|
+
if (selector) {
|
|
724
|
+
liteHtml = selectContent(liteHtml, selector, exclude);
|
|
725
|
+
}
|
|
726
|
+
ctx.timer.mark('convert');
|
|
727
|
+
switch (format) {
|
|
728
|
+
case 'html':
|
|
729
|
+
ctx.content = liteHtml;
|
|
730
|
+
break;
|
|
731
|
+
case 'text':
|
|
732
|
+
ctx.content = htmlToText(liteHtml);
|
|
733
|
+
break;
|
|
734
|
+
case 'clean':
|
|
735
|
+
ctx.content = cleanForAI(htmlToMarkdown(liteHtml, { raw, prune: false }));
|
|
736
|
+
break;
|
|
737
|
+
default:
|
|
738
|
+
ctx.content = htmlToMarkdown(liteHtml, { raw, prune: false });
|
|
739
|
+
break;
|
|
740
|
+
}
|
|
741
|
+
ctx.timer.end('convert');
|
|
742
|
+
ctx.title = liteHtml.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.trim() || '';
|
|
743
|
+
ctx.quality = 0.5; // Unknown quality in lite mode
|
|
744
|
+
return;
|
|
745
|
+
}
|
|
746
|
+
// === JSON-LD extraction — first-class content source ===
|
|
747
|
+
// Many sites (recipes, products, articles) embed structured data that's
|
|
748
|
+
// more reliable than DOM parsing, especially on JS-heavy SPAs.
|
|
749
|
+
if (!raw && !selector) {
|
|
750
|
+
const { extractJsonLd } = await import('./json-ld.js');
|
|
751
|
+
const jsonLdResult = extractJsonLd(fetchResult.html);
|
|
752
|
+
if (jsonLdResult && jsonLdResult.found && jsonLdResult.content.length > 100) {
|
|
753
|
+
ctx.content = jsonLdResult.content;
|
|
754
|
+
ctx.title = jsonLdResult.title || ctx.title;
|
|
755
|
+
ctx.jsonLdType = jsonLdResult.type;
|
|
756
|
+
ctx.quality = 0.95; // Structured data is high quality
|
|
757
|
+
// Still extract metadata and links from HTML
|
|
758
|
+
ctx.timer.mark('metadata');
|
|
759
|
+
const meta = extractMetadata(fetchResult.html, fetchResult.url);
|
|
760
|
+
ctx.metadata = meta.metadata;
|
|
761
|
+
if (!ctx.title)
|
|
762
|
+
ctx.title = meta.title;
|
|
763
|
+
const htmlForLinks = fetchResult.html.length > 100000
|
|
764
|
+
? fetchResult.html.slice(0, 100000)
|
|
765
|
+
: fetchResult.html;
|
|
766
|
+
ctx.links = extractLinks(htmlForLinks, fetchResult.url);
|
|
767
|
+
ctx.linkCount = ctx.links.length;
|
|
768
|
+
ctx.timer.end('metadata');
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
}
|
|
772
|
+
// === Readable mode fast-path ===
|
|
773
|
+
// Run readability on raw HTML directly, skipping expensive prune + convert stages.
|
|
774
|
+
// Readability handles its own noise removal and outputs markdown, making prune/convert redundant.
|
|
775
|
+
if (ctx.options.readable && !raw && !selector && !fullPage) {
|
|
776
|
+
// Run readability and metadata extraction in parallel
|
|
777
|
+
const [readResult, metaResult] = await Promise.all([
|
|
778
|
+
Promise.resolve().then(() => {
|
|
779
|
+
ctx.timer.mark('readability');
|
|
780
|
+
const result = extractReadableContent(fetchResult.html, fetchResult.url);
|
|
781
|
+
ctx.timer.end('readability');
|
|
782
|
+
return result;
|
|
783
|
+
}),
|
|
784
|
+
Promise.resolve().then(() => {
|
|
785
|
+
ctx.timer.mark('metadata');
|
|
786
|
+
const meta = extractMetadata(fetchResult.html, fetchResult.url);
|
|
787
|
+
const htmlForLinks = fetchResult.html.length > 100000
|
|
788
|
+
? fetchResult.html.slice(0, 100000)
|
|
789
|
+
: fetchResult.html;
|
|
790
|
+
const links = extractLinks(htmlForLinks, fetchResult.url);
|
|
791
|
+
ctx.timer.end('metadata');
|
|
792
|
+
return { meta, links };
|
|
793
|
+
}),
|
|
794
|
+
]);
|
|
795
|
+
// Quality check: if readability result is < 15% of the HTML body text, it likely failed
|
|
796
|
+
// (picked footnotes, sidebar, or wrong section as "main content" — e.g. aosabook.org)
|
|
797
|
+
const htmlTextLen = fetchResult.html.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim().length;
|
|
798
|
+
const readableLen = readResult.content?.length || 0;
|
|
799
|
+
const readabilityFailed = htmlTextLen > 2000 && readableLen > 0 && readableLen < htmlTextLen * 0.15;
|
|
800
|
+
if (readabilityFailed) {
|
|
801
|
+
log.debug(`Readability returned only ${Math.round(readableLen / htmlTextLen * 100)}% of content — falling through to standard extraction`);
|
|
802
|
+
// Don't return early — fall through to standard HTML pipeline below
|
|
803
|
+
}
|
|
804
|
+
else {
|
|
805
|
+
ctx.readabilityResult = readResult;
|
|
806
|
+
ctx.content = readResult.content;
|
|
807
|
+
ctx.title = readResult.title || metaResult.meta.title || ctx.title;
|
|
808
|
+
ctx.metadata = {
|
|
809
|
+
...metaResult.meta.metadata,
|
|
810
|
+
title: readResult.title || metaResult.meta.title,
|
|
811
|
+
...(readResult.author ? { author: readResult.author } : {}),
|
|
812
|
+
...(readResult.date ? { publishedDate: readResult.date } : {}),
|
|
813
|
+
};
|
|
814
|
+
ctx.links = metaResult.links;
|
|
815
|
+
ctx.linkCount = metaResult.links.length;
|
|
816
|
+
ctx.quality = readResult.content.length > 200 ? 0.95 : 0.5;
|
|
817
|
+
return;
|
|
818
|
+
}
|
|
819
|
+
}
|
|
820
|
+
// Standard HTML pipeline
|
|
821
|
+
let html = fetchResult.html;
|
|
822
|
+
// Apply include/exclude tags filtering first (before selector)
|
|
823
|
+
if (includeTags || excludeTags) {
|
|
824
|
+
html = filterByTags(html, includeTags, excludeTags);
|
|
825
|
+
}
|
|
826
|
+
if (selector) {
|
|
827
|
+
html = selectContent(html, selector, exclude);
|
|
828
|
+
}
|
|
829
|
+
else if (exclude?.length) {
|
|
830
|
+
// Apply exclude selectors even without a specific selector
|
|
831
|
+
const cheerio = await import('cheerio');
|
|
832
|
+
const $doc = cheerio.load(html);
|
|
833
|
+
exclude.forEach(sel => $doc(sel).remove());
|
|
834
|
+
html = $doc.html() || html;
|
|
835
|
+
}
|
|
836
|
+
// Smart main content detection (unless raw or selector specified)
|
|
837
|
+
let contentHtml = html;
|
|
838
|
+
if (!raw && !selector) {
|
|
839
|
+
const detected = detectMainContent(html);
|
|
840
|
+
if (detected.detected) {
|
|
841
|
+
contentHtml = detected.html;
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
const metadataTask = Promise.resolve().then(() => {
|
|
845
|
+
ctx.timer.mark('metadata');
|
|
846
|
+
const meta = extractMetadata(html, fetchResult.url);
|
|
847
|
+
// When budget is set, use pre-truncated HTML for link extraction (faster)
|
|
848
|
+
const htmlForLinks = (ctx.options.budget && ctx.options.budget > 0 && html.length > 100000)
|
|
849
|
+
? html.slice(0, 100000)
|
|
850
|
+
: html;
|
|
851
|
+
const result = {
|
|
852
|
+
title: meta.title,
|
|
853
|
+
metadata: meta.metadata,
|
|
854
|
+
links: extractLinks(htmlForLinks, fetchResult.url),
|
|
855
|
+
};
|
|
856
|
+
ctx.timer.end('metadata');
|
|
857
|
+
return result;
|
|
858
|
+
});
|
|
859
|
+
// Content density pruning — runs on HTML before markdown conversion.
|
|
860
|
+
// Removes low-value blocks (sidebars, footers, ads) CSS selectors miss.
|
|
861
|
+
// OFF when fullPage=true, format !== markdown, or content is small (< 20K chars — overhead not worth it).
|
|
862
|
+
if (format === 'markdown' && !fullPage && contentHtml.length >= 20000) {
|
|
863
|
+
ctx.timer.mark('prune');
|
|
864
|
+
const pruned = pruneContent(contentHtml, { dynamic: true });
|
|
865
|
+
ctx.timer.end('prune');
|
|
866
|
+
contentHtml = pruned.html;
|
|
867
|
+
if (pruned.nodesRemoved > 0) {
|
|
868
|
+
ctx.prunedPercent = pruned.reductionPercent;
|
|
869
|
+
}
|
|
870
|
+
}
|
|
871
|
+
// OPTIMIZATION: When budget is set, pre-truncate HTML before markdown conversion.
|
|
872
|
+
// Converting 332K chars → markdown takes ~450ms. If budget=4000 tokens (~16K chars),
|
|
873
|
+
// we only need ~50K chars of HTML (3x overhead for tags/attributes).
|
|
874
|
+
// This cuts convert time from ~450ms to ~30ms on large pages.
|
|
875
|
+
let htmlForConvert = contentHtml;
|
|
876
|
+
// Skip pre-truncation when question is specified — QA needs full content to find answers
|
|
877
|
+
// that may be deep in the article (e.g., "Who coined AI?" → History section of Wikipedia)
|
|
878
|
+
const hasQuestion = !!ctx.options.question;
|
|
879
|
+
if (!hasQuestion && ctx.options.budget && ctx.options.budget > 0 && contentHtml.length > 50000) {
|
|
880
|
+
const estimatedCharsNeeded = ctx.options.budget * 12; // ~12 chars HTML per output token
|
|
881
|
+
const minChars = Math.max(estimatedCharsNeeded, 50000); // at least 50K to ensure quality
|
|
882
|
+
if (contentHtml.length > minChars) {
|
|
883
|
+
// Truncate at a block boundary (</p>, </div>, </li>, </tr>) to avoid broken HTML
|
|
884
|
+
const truncPoint = contentHtml.lastIndexOf('</', minChars);
|
|
885
|
+
if (truncPoint > minChars * 0.8) {
|
|
886
|
+
// Find the end of this closing tag
|
|
887
|
+
const tagEnd = contentHtml.indexOf('>', truncPoint);
|
|
888
|
+
htmlForConvert = contentHtml.slice(0, tagEnd > 0 ? tagEnd + 1 : minChars);
|
|
889
|
+
}
|
|
890
|
+
else {
|
|
891
|
+
htmlForConvert = contentHtml.slice(0, minChars);
|
|
892
|
+
}
|
|
893
|
+
if (process.env.DEBUG) {
|
|
894
|
+
log.debug(`budget pre-truncate: ${contentHtml.length} → ${htmlForConvert.length} chars`);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
const contentTask = Promise.resolve().then(() => {
|
|
899
|
+
ctx.timer.mark('convert');
|
|
900
|
+
let converted;
|
|
901
|
+
switch (format) {
|
|
902
|
+
case 'html':
|
|
903
|
+
converted = htmlForConvert;
|
|
904
|
+
break;
|
|
905
|
+
case 'text':
|
|
906
|
+
converted = htmlToText(htmlForConvert);
|
|
907
|
+
break;
|
|
908
|
+
case 'clean': {
|
|
909
|
+
// First convert to markdown, then strip link syntax
|
|
910
|
+
const md = htmlToMarkdown(htmlForConvert, { raw, prune: false });
|
|
911
|
+
converted = cleanForAI(md);
|
|
912
|
+
break;
|
|
913
|
+
}
|
|
914
|
+
case 'markdown':
|
|
915
|
+
default:
|
|
916
|
+
// prune:false — already pruned above; avoid double-pruning in htmlToMarkdown
|
|
917
|
+
converted = htmlToMarkdown(htmlForConvert, { raw, prune: false });
|
|
918
|
+
break;
|
|
919
|
+
}
|
|
920
|
+
ctx.timer.end('convert');
|
|
921
|
+
return converted;
|
|
922
|
+
});
|
|
923
|
+
const [metaResult, convertedContent] = await Promise.all([metadataTask, contentTask]);
|
|
924
|
+
ctx.title = metaResult.title;
|
|
925
|
+
ctx.metadata = metaResult.metadata;
|
|
926
|
+
ctx.links = metaResult.links;
|
|
927
|
+
ctx.content = convertedContent;
|
|
928
|
+
// Safety net: if budget pre-truncation produced thin content but the full HTML
|
|
929
|
+
// has substantial content, redo conversion WITHOUT pre-truncation.
|
|
930
|
+
// This catches pages where the actual content is in the second half of the HTML
|
|
931
|
+
// (common for listing/index pages, SPAs with shell-first layouts).
|
|
932
|
+
if (htmlForConvert !== contentHtml && convertedContent.length < 200 && contentHtml.length > 20000) {
|
|
933
|
+
if (process.env.DEBUG) {
|
|
934
|
+
log.debug(`budget pre-truncation produced thin content (${convertedContent.length} chars from ${htmlForConvert.length} HTML). Retrying with full HTML (${contentHtml.length} chars).`);
|
|
935
|
+
}
|
|
936
|
+
ctx.timer.mark('convert-retry');
|
|
937
|
+
let retryConverted;
|
|
938
|
+
switch (format) {
|
|
939
|
+
case 'html':
|
|
940
|
+
retryConverted = contentHtml;
|
|
941
|
+
break;
|
|
942
|
+
case 'text':
|
|
943
|
+
retryConverted = htmlToText(contentHtml);
|
|
944
|
+
break;
|
|
945
|
+
case 'clean':
|
|
946
|
+
retryConverted = cleanForAI(htmlToMarkdown(contentHtml, { raw, prune: false }));
|
|
947
|
+
break;
|
|
948
|
+
case 'markdown':
|
|
949
|
+
default:
|
|
950
|
+
retryConverted = htmlToMarkdown(contentHtml, { raw, prune: false });
|
|
951
|
+
break;
|
|
952
|
+
}
|
|
953
|
+
ctx.timer.end('convert-retry');
|
|
954
|
+
ctx.content = retryConverted;
|
|
955
|
+
}
|
|
956
|
+
// Clean up markdown noise (empty links, excess newlines, trailing spaces)
|
|
957
|
+
if (format === 'markdown') {
|
|
958
|
+
ctx.content = cleanMarkdownNoise(ctx.content);
|
|
959
|
+
ctx.content = pruneMarkdown(ctx.content);
|
|
960
|
+
}
|
|
961
|
+
ctx.quality = calculateQuality(ctx.content, fetchResult.html);
|
|
962
|
+
}
|
|
963
|
+
else if (contentType === 'json') {
|
|
964
|
+
// JSON content — format nicely
|
|
965
|
+
try {
|
|
966
|
+
const parsed = JSON.parse(fetchResult.html);
|
|
967
|
+
ctx.content = JSON.stringify(parsed, null, 2);
|
|
968
|
+
ctx.title = 'JSON Response';
|
|
969
|
+
// Extract any URLs from JSON for links
|
|
970
|
+
const urlRegex = /https?:\/\/[^\s"'`,\]})]+/g;
|
|
971
|
+
const found = ctx.content.match(urlRegex) || [];
|
|
972
|
+
ctx.links = [...new Set(found)];
|
|
973
|
+
}
|
|
974
|
+
catch (e) {
|
|
975
|
+
// Non-fatal: JSON parse failed, treating as malformed
|
|
976
|
+
log.debug('JSON parse failed:', e instanceof Error ? e.message : e);
|
|
977
|
+
ctx.content = fetchResult.html;
|
|
978
|
+
ctx.title = 'JSON Response (malformed)';
|
|
979
|
+
}
|
|
980
|
+
ctx.quality = 1.0; // JSON is structured, always "clean"
|
|
981
|
+
}
|
|
982
|
+
else if (contentType === 'xml') {
|
|
983
|
+
// XML/RSS/Atom — convert to readable format
|
|
984
|
+
try {
|
|
985
|
+
const $ = (await import('cheerio')).load(fetchResult.html, { xml: true });
|
|
986
|
+
// Check if RSS/Atom feed
|
|
987
|
+
const items = $('item, entry');
|
|
988
|
+
if (items.length > 0) {
|
|
989
|
+
ctx.title = $('channel > title, feed > title').first().text() || 'RSS/Atom Feed';
|
|
990
|
+
const feedItems = [];
|
|
991
|
+
items.each((_, el) => {
|
|
992
|
+
const itemTitle = $(el).find('title').first().text();
|
|
993
|
+
const itemLink = $(el).find('link').first().text() || $(el).find('link').first().attr('href') || '';
|
|
994
|
+
const itemDesc = $(el).find('description, summary, content').first().text().slice(0, 200);
|
|
995
|
+
feedItems.push(`## ${itemTitle}\n${itemLink}\n${itemDesc}`);
|
|
996
|
+
if (itemLink)
|
|
997
|
+
ctx.links.push(itemLink);
|
|
998
|
+
});
|
|
999
|
+
ctx.content = `# ${ctx.title}\n\n${feedItems.join('\n\n---\n\n')}`;
|
|
1000
|
+
}
|
|
1001
|
+
else {
|
|
1002
|
+
ctx.content = fetchResult.html;
|
|
1003
|
+
ctx.title = $('title').first().text() || 'XML Document';
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
catch (e) {
|
|
1007
|
+
// Non-fatal: XML/RSS parse failed, using raw content
|
|
1008
|
+
log.debug('XML/RSS parse failed:', e instanceof Error ? e.message : e);
|
|
1009
|
+
ctx.content = fetchResult.html;
|
|
1010
|
+
ctx.title = 'XML Document';
|
|
1011
|
+
}
|
|
1012
|
+
ctx.quality = 0.9;
|
|
1013
|
+
}
|
|
1014
|
+
else {
|
|
1015
|
+
// Plain text, CSS, JS, etc — return as-is
|
|
1016
|
+
ctx.content = fetchResult.html;
|
|
1017
|
+
ctx.title = fetchResult.url.split('/').pop() || 'Text Document';
|
|
1018
|
+
// Extract URLs from plain text
|
|
1019
|
+
const urlRegex = /https?:\/\/[^\s"'`,\]})]+/g;
|
|
1020
|
+
const found = ctx.content.match(urlRegex) || [];
|
|
1021
|
+
ctx.links = [...new Set(found)];
|
|
1022
|
+
ctx.quality = 1.0;
|
|
1023
|
+
}
|
|
1024
|
+
// --- Auth wall detection ---
|
|
1025
|
+
// Run after content extraction. Only check when content is sparse OR quality is low,
|
|
1026
|
+
// and we're not already in a blocked state, and we have HTML to analyze.
|
|
1027
|
+
if (ctx.fetchResult?.html &&
|
|
1028
|
+
!ctx.metadata?.blocked &&
|
|
1029
|
+
!ctx.authRequired &&
|
|
1030
|
+
(ctx.content.length < 800 || (ctx.quality ?? 1) < 0.3)) {
|
|
1031
|
+
const authCheck = detectAuthWall(ctx.fetchResult.html, ctx.url, ctx.fetchResult.statusCode ?? ctx.fetchResult.status);
|
|
1032
|
+
if (authCheck.isAuthWall) {
|
|
1033
|
+
ctx.authRequired = true;
|
|
1034
|
+
const host = (() => { try {
|
|
1035
|
+
return new URL(ctx.url).hostname.replace('www.', '');
|
|
1036
|
+
}
|
|
1037
|
+
catch {
|
|
1038
|
+
return ctx.url;
|
|
1039
|
+
} })();
|
|
1040
|
+
ctx.warnings.push(`Authentication required. This page is behind a login wall. ` +
|
|
1041
|
+
`Use a browser profile: webpeel profile create ${host} && webpeel "${ctx.url}" --profile ${host}`);
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
// ---------------------------------------------------------------------------
|
|
1046
|
+
// Stage 6: postProcess
|
|
1047
|
+
// ---------------------------------------------------------------------------
|
|
1048
|
+
/**
 * Stage 6: post-process the content produced by the parse stage.
 *
 * Runs the following steps in order, mutating `ctx` in place:
 *   1. Readability extraction (only when requested and not already done upstream).
 *   2. Image extraction and structured extraction (LLM- or CSS-based).
 *   3. Quick answer for `options.question`.
 *   4. `maxTokens` truncation, then budget distillation.
 *   5. Domain-aware extraction for URLs that have a registered extractor.
 *   6. Post-extraction challenge / bot-protection detection, with challenge-solver
 *      and search-cache fallbacks.
 *   7. Domain verification (best-effort).
 *   8. Zero-token safety net: when content is still empty, try JSON-LD, then
 *      title + meta description, then raw HTML text, then the search cache.
 *
 * When `options.lite` is set, the function returns immediately without touching `ctx`.
 *
 * @param {object} ctx - Pipeline context. Reads (among others) `fetchResult`, `contentType`,
 *   `options`, `content`, `metadata`, `title`, `url`, `format`, `timer` and `warnings`;
 *   writes `content`, `title`, `metadata`, `quality`, `readabilityResult`, `imagesList`,
 *   `extracted`, `quickAnswerResult`, `budgetFallback`, `domainData`, `jsonLdType`
 *   and `domainVerification`.
 * @returns {Promise<void>}
 */
export async function postProcess(ctx) {
    const fetchResult = ctx.fetchResult;
    const { contentType, options } = ctx;
    const isHTML = contentType === 'html';
    // Lite mode — skip all post-processing (no readability, no QA, no budget, no domain extract)
    if (options.lite)
        return;
    // Readability mode — skip if an earlier stage already stored a readability result.
    // Also skip if a selector was used — the user explicitly chose content, don't override it.
    if (options.readable && isHTML && fetchResult.html && !ctx.readabilityResult && !ctx.selector) {
        ctx.timer.mark('readability');
        try {
            const readResult = extractReadableContent(fetchResult.html, fetchResult.url);
            // Quality check: if the readability result is < 15% of the full content, it likely
            // failed (picked footnotes, sidebar, or the wrong section as "main content").
            const fullContentLen = ctx.content?.length || 0;
            const readableLen = readResult.content?.length || 0;
            if (fullContentLen > 0 && readableLen > 0 && readableLen < fullContentLen * 0.15) {
                // Readability failed — keep the full content (already in ctx.content)
                log.debug(`Readability returned only ${Math.round(readableLen / fullContentLen * 100)}% of content — using full extraction instead`);
            }
            else {
                ctx.readabilityResult = readResult;
                ctx.content = readResult.content;
                ctx.metadata = {
                    ...ctx.metadata,
                    title: readResult.title || ctx.metadata?.title,
                    // Only override author/date when readability actually found them, so values
                    // already present in ctx.metadata are not erased.
                    ...(readResult.author ? { author: readResult.author } : {}),
                    ...(readResult.date ? { publishedDate: readResult.date } : {}),
                };
                ctx.title = readResult.title || ctx.title;
            }
        }
        catch (readErr) {
            // Readability can crash on complex DOMs — gracefully fall back to standard content
            log.debug('Readability failed, using standard content:', readErr instanceof Error ? readErr.message : readErr);
        }
        ctx.timer.end('readability');
    }
    // Extract images if requested
    if (ctx.extractImagesFlag && isHTML) {
        ctx.imagesList = extractImages(fetchResult.html, fetchResult.url);
    }
    // Extract structured data if requested
    if (ctx.extract && isHTML) {
        if (ctx.extract.llmApiKey && (ctx.extract.prompt || ctx.extract.schema)) {
            // LLM-powered extraction
            const { extractWithLLM } = await import('./extract.js');
            ctx.extracted = await extractWithLLM(ctx.content, ctx.extract);
        }
        else if (ctx.extract.selectors || ctx.extract.schema) {
            // CSS-based extraction
            ctx.extracted = extractStructured(fetchResult.html, ctx.extract);
        }
    }
    // Quick answer (LLM-free) — tries the extracted content first, then falls back to the
    // full raw HTML text if confidence is low (catches answers deep in the document that
    // earlier pruning may have removed).
    if (options.question && ctx.content) {
        ctx.timer.mark('quickAnswer');
        let qa = runQuickAnswer({
            question: options.question,
            content: ctx.content,
            url: fetchResult.url,
        });
        // Retry on the full text only when confidence is below 0.91 and the raw HTML is
        // more than twice the size of the extracted content.
        if (qa.confidence < 0.91 && fetchResult.html && fetchResult.html.length > ctx.content.length * 2) {
            const { htmlToText } = await import('./markdown.js');
            const fullText = htmlToText(fetchResult.html);
            const qaFull = runQuickAnswer({
                question: options.question,
                content: fullText,
                url: fetchResult.url,
            });
            // Use the full-text answer if it's more confident
            if (qaFull.confidence > qa.confidence) {
                qa = qaFull;
            }
        }
        ctx.timer.end('quickAnswer');
        ctx.quickAnswerResult = qa;
    }
    // Truncate to token budget if requested (simple truncation)
    if (ctx.maxTokens && ctx.maxTokens > 0) {
        ctx.content = truncateToTokenBudget(ctx.content, ctx.maxTokens);
    }
    // Smart budget distillation — applied AFTER maxTokens truncation.
    // Skipped when domain-extracted data is already present on the context.
    if (options.budget && options.budget > 0 && !ctx.domainData) {
        const budgetFormat = ctx.contentType === 'json' ? 'json' :
            ctx.format === 'text' ? 'text' : 'markdown';
        const originalContent = ctx.content;
        ctx.timer.mark('budget');
        let budgetedContent = distillToBudget(ctx.content, options.budget, budgetFormat);
        ctx.timer.end('budget');
        if (process.env.DEBUG) {
            log.debug(`budget result: ${originalContent.length} → ${budgetedContent.length} chars`);
        }
        // Safety net: if distillation stripped too much (< 10% of original) on a
        // substantial page, fall back to simple head truncation.
        if (budgetedContent.length < originalContent.length * 0.10 && originalContent.length > 500) {
            // Remember the distilled size before it is overwritten so the log below is accurate.
            const distilledLength = budgetedContent.length;
            const estimatedChars = options.budget * 4; // rough: 1 token ≈ 4 chars
            // Trim at a word boundary to avoid cutting mid-word
            let truncated = originalContent.slice(0, estimatedChars);
            const lastSpace = truncated.lastIndexOf(' ');
            if (lastSpace > estimatedChars * 0.8) {
                truncated = truncated.slice(0, lastSpace);
            }
            budgetedContent = truncated;
            ctx.budgetFallback = true;
            ctx.warnings.push('Content was truncated to fit budget using head truncation (BM25 distillation produced insufficient content)');
            if (process.env.DEBUG) {
                log.debug(`budget distillation fallback: BM25 produced ${distilledLength} chars (< 10% of ${originalContent.length}), using head truncation`);
            }
        }
        ctx.content = budgetedContent;
    }
    // Domain-aware structured extraction.
    // Fires when the URL matches a known domain. Replaces content with the extractor's output.
    if (hasDomainExtractor(fetchResult.url) && !ctx.domainApiHandled && !ctx.options.noDomainApi) {
        try {
            ctx.timer.mark('domainExtract');
            // Try raw HTML first, then fall back to the already-processed content
            let ddResult = await runDomainExtract(fetchResult.html, fetchResult.url);
            if (!ddResult && ctx.content) {
                ddResult = await runDomainExtract(ctx.content, fetchResult.url);
            }
            ctx.timer.end('domainExtract');
            if (ddResult) {
                ctx.domainData = ddResult;
                ctx.content = ddResult.cleanContent;
                // Title from the domain extractor takes precedence over the HTML page title
                if (ddResult.structured?.title) {
                    ctx.title = ddResult.structured.title;
                }
            }
        }
        catch (e) {
            // Domain extraction failure is non-fatal; continue with normal content
            const errMsg2 = e instanceof Error ? e.message : String(e);
            log.warn('domain extraction (second pass) failed:', errMsg2);
            ctx.warnings.push(`Domain extraction (second pass) failed: ${errMsg2}`);
        }
    }
    // === Challenge / bot-protection page detection (post-extraction) ===
    // Verify the raw HTML isn't actually a challenge/block page that slipped through
    // earlier checks. Detection runs on raw HTML rather than on the extracted markdown.
    // Skipped entirely when ctx.fastPath is set.
    if (!ctx.fastPath && ctx.fetchResult?.html) {
        const { detectChallenge } = await import('./challenge-detection.js');
        const rawHtml = ctx.fetchResult.html;
        const statusCode = ctx.fetchResult.statusCode ?? ctx.fetchResult.status;
        const postExtractChallenge = detectChallenge(rawHtml, statusCode);
        // Also flag very thin browser/stealth results: the page rendered but almost
        // no text was extracted.
        const isThinBrowserResult = ctx.content
            && ctx.content.length < 100
            && (ctx.stealth || ctx.fetchResult?.method === 'stealth' || ctx.fetchResult?.method === 'browser');
        const isChallengeContent = (postExtractChallenge.isChallenge && postExtractChallenge.confidence >= 0.7)
            || isThinBrowserResult;
        if (isChallengeContent) {
            const challengeType = postExtractChallenge.type || 'generic-block';
            log.debug(`Post-extraction challenge detected: ${challengeType} (confidence: ${postExtractChallenge.confidence.toFixed(2)}) for ${ctx.url}`);
            if (ctx.metadata) {
                ctx.metadata.blocked = true;
                ctx.metadata.challengeDetected = true;
            }
            // Try challenge solver first (if browser worker available or local solve enabled)
            let solvedViaChallengeSolver = false;
            const hasBrowserWorker = !!process.env.BROWSER_WORKER_URL;
            const canSolve = hasBrowserWorker || process.env.ENABLE_LOCAL_CHALLENGE_SOLVE === 'true';
            if (canSolve) {
                try {
                    const { solveChallenge } = await import('../ee/challenge-solver.js');
                    const solveResult = await solveChallenge(ctx.url, challengeType, rawHtml, {
                        timeout: 15000,
                    });
                    if (solveResult.solved && solveResult.html) {
                        // Re-parse the solved HTML
                        const { htmlToMarkdown, htmlToText, cleanForAI } = await import('./markdown.js');
                        const fmt = ctx.format || 'markdown';
                        if (fmt === 'text') {
                            ctx.content = htmlToText(solveResult.html);
                        }
                        else if (fmt === 'clean') {
                            // cleanForAI is applied to markdown everywhere else in this file,
                            // so convert the HTML to markdown first.
                            ctx.content = cleanForAI(htmlToMarkdown(solveResult.html));
                        }
                        else {
                            ctx.content = htmlToMarkdown(solveResult.html);
                        }
                        ctx.fetchResult.html = solveResult.html;
                        if (ctx.metadata) {
                            ctx.metadata.blocked = false;
                            ctx.metadata.challengeDetected = false;
                            ctx.metadata.challengeSolved = true;
                        }
                        solvedViaChallengeSolver = true;
                        log.debug(`Content-level challenge solved for ${ctx.url}`);
                    }
                }
                catch (e) {
                    log.debug('Content-level challenge solve failed:', e instanceof Error ? e.message : e);
                }
            }
            if (!solvedViaChallengeSolver) {
                // Only warn when the content really is still the challenge page; a solved
                // challenge must not carry a "not the actual page content" warning.
                ctx.warnings.push('Bot protection detected. Content is a challenge page, not the actual page content.');
                // Fall back to the search cache; on failure keep the challenge page content.
                await applySearchCacheFallback(ctx);
            }
        }
    }
    // === Active domain verification ===
    // Runs for every URL. Best-effort: any failure (import, sync throw, rejection)
    // results in a null verification instead of aborting the pipeline.
    try {
        const { verifyDomain } = await import('./domain-verify.js');
        const existingHeaders = ctx.fetchResult?.responseHeaders || undefined;
        ctx.domainVerification = await verifyDomain(ctx.url, existingHeaders);
    }
    catch (e) {
        log.debug('Domain verification failed:', e instanceof Error ? e.message : e);
        ctx.domainVerification = null;
    }
    // === Zero-token safety net ===
    // If the pipeline produced nothing, walk through progressively weaker fallbacks.
    if (!ctx.content || ctx.content.trim().length === 0) {
        ctx.warnings.push('Primary extraction failed; content sourced from fallback (meta description or raw HTML)');
        // Try 1: JSON-LD
        if (fetchResult.html) {
            const { extractJsonLd } = await import('./json-ld.js');
            const jsonLd = extractJsonLd(fetchResult.html);
            if (jsonLd?.content && jsonLd.content.length > 50) {
                ctx.content = jsonLd.content;
                ctx.title = jsonLd.title || ctx.title;
                ctx.jsonLdType = jsonLd.type;
                ctx.quality = 0.90;
                return;
            }
        }
        // Try 2: Meta description + title as minimal content
        const metaDesc = ctx.metadata?.description || ctx.metadata?.ogDescription;
        const pageTitle = ctx.title || ctx.metadata?.title;
        if (metaDesc || pageTitle) {
            const parts = [];
            if (pageTitle)
                parts.push(`# ${pageTitle}\n`);
            if (metaDesc)
                parts.push(metaDesc);
            ctx.content = parts.join('\n');
            ctx.quality = 0.3; // Low quality — we only got metadata
            return;
        }
        // Try 3: Raw text from HTML
        if (fetchResult.html && fetchResult.html.length > 100) {
            const { htmlToText } = await import('./markdown.js');
            const rawText = htmlToText(fetchResult.html);
            if (rawText.trim().length > 50) {
                ctx.content = rawText.slice(0, 10000); // Cap at 10K chars
                ctx.quality = 0.2; // Very low quality
                return;
            }
        }
        // Try 4: Search-as-proxy fallback — use a search engine's cached copy of the page.
        // On failure, content stays empty for the caller to handle.
        await applySearchCacheFallback(ctx, { markBlocked: true });
    }
}
/**
 * Replace `ctx.content` with a search-engine cached copy of `ctx.url`, when one
 * longer than 50 characters is available.
 *
 * On success sets `ctx.content`, `ctx.title`, `ctx.quality` (0.4), appends a warning
 * and records `metadata.fallbackSource`. Any failure (including the optional
 * `./search-fallback.js` module being absent) is logged at debug level and
 * leaves `ctx` as it was at the point of failure.
 *
 * @param {object} ctx - Pipeline context to update in place.
 * @param {{ markBlocked?: boolean }} [opts] - When `markBlocked` is true, also set
 *   `ctx.metadata.blocked = true` on success.
 * @returns {Promise<boolean>} true when cached content was applied.
 */
async function applySearchCacheFallback(ctx, { markBlocked = false } = {}) {
    try {
        // @ts-ignore — proprietary module, gitignored
        const { searchFallback } = await import('./search-fallback.js');
        const searchResult = await searchFallback(ctx.url);
        const cached = searchResult.cachedContent;
        if (!cached || cached.length <= 50)
            return false;
        ctx.content = cached;
        ctx.title = searchResult.title || ctx.title;
        ctx.quality = 0.4; // Low quality — it's a search snippet, not the full page
        ctx.warnings.push('Content retrieved from search engine cache because the original page blocked direct access. Results may be incomplete.');
        if (ctx.metadata) {
            if (markBlocked)
                ctx.metadata.blocked = true;
            ctx.metadata.fallbackSource = searchResult.source;
        }
        return true;
    }
    catch (e) {
        log.debug('Search fallback failed:', e instanceof Error ? e.message : e);
        return false;
    }
}
|
|
1357
|
+
// ---------------------------------------------------------------------------
|
|
1358
|
+
// Stage 7: finalize
|
|
1359
|
+
// ---------------------------------------------------------------------------
|
|
1360
|
+
/**
 * Stage 7 of the pipeline: post-processing that runs after content extraction.
 *
 * Steps, in order:
 *  1. Convert the screenshot buffer (if any) to base64 -> `ctx.screenshotBase64`.
 *  2. Run the page-based extractors (branding, design analysis) against the
 *     kept-open browser page, then close the page and browser exactly once.
 *  3. Track content changes (`options.changeTracking`) -> `ctx.changeResult`.
 *  4. Generate an AI summary (`options.summary` + `options.llm`) -> `ctx.summaryText`.
 *
 * Steps 2-4 are best-effort: a failure is logged and the remaining steps still run.
 *
 * @param {object} ctx - Pipeline context. Reads `fetchResult`, `options`, `render` and `content`.
 * @returns {Promise<void>} Resolves once every requested step has been attempted.
 */
export async function finalize(ctx) {
    const fetchResult = ctx.fetchResult;
    const { options } = ctx;

    // Convert screenshot buffer to base64 if present
    ctx.screenshotBase64 = fetchResult.screenshot?.toString('base64');

    // Both extractors below need the live browser page that the fetch stage kept open.
    const hasLivePage = Boolean(ctx.render && fetchResult.page);
    const wantsBranding = Boolean(options.branding) && hasLivePage;
    const wantsDesignAnalysis = Boolean(options.designAnalysis) && hasLivePage;

    if (wantsBranding || wantsDesignAnalysis) {
        try {
            if (wantsBranding) {
                try {
                    const { extractBranding } = await import('./branding.js');
                    ctx.brandingProfile = await extractBranding(fetchResult.page);
                }
                catch (error) {
                    log.error('Branding extraction failed:', error);
                }
            }
            if (wantsDesignAnalysis) {
                try {
                    const { extractDesignAnalysis } = await import('./design-analysis.js');
                    ctx.designAnalysisResult = await extractDesignAnalysis(fetchResult.page);
                }
                catch (error) {
                    log.error('Design analysis extraction failed:', error);
                }
            }
        }
        finally {
            // Close only after BOTH extractors have had their turn. Closing right after
            // branding would hand design analysis an already-closed page.
            try {
                await fetchResult.page.close().catch(() => { });
                if (fetchResult.browser) {
                    await fetchResult.browser.close().catch(() => { });
                }
            }
            catch (e) {
                // Non-fatal: page/browser cleanup after page-based extraction
                log.debug('page/browser cleanup after page-based extraction:', e instanceof Error ? e.message : e);
            }
        }
    }

    // Track content changes if requested
    if (options.changeTracking) {
        try {
            // Short fingerprint: first 16 hex chars of the SHA-256 of the content.
            const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);
            const { trackChange } = await import('./change-tracking.js');
            ctx.changeResult = await trackChange(fetchResult.url, ctx.content, fingerprint);
        }
        catch (error) {
            log.error('Change tracking failed:', error);
        }
    }

    // Generate AI summary if requested (requires LLM settings)
    if (options.summary && options.llm) {
        try {
            const { summarizeContent } = await import('./summarize.js');
            // `summary` may be `true` or an options object; fall back to 150 when no maxLength is given.
            const maxLength = typeof options.summary === 'object' && options.summary.maxLength
                ? options.summary.maxLength
                : 150;
            ctx.summaryText = await summarizeContent(ctx.content, {
                apiKey: options.llm.apiKey,
                model: options.llm.model,
                apiBase: options.llm.baseUrl,
                maxWords: maxLength,
            });
        }
        catch (error) {
            log.error('Summary generation failed:', error);
        }
    }
}
|
|
1445
|
+
// ---------------------------------------------------------------------------
|
|
1446
|
+
// Stage 8: buildResult
|
|
1447
|
+
// ---------------------------------------------------------------------------
|
|
1448
|
+
/**
 * Stage 8 of the pipeline: turn the accumulated context into the final PeelResult.
 *
 * Besides copying fields off `ctx`, this stage:
 *  - runs the prompt-injection scan and swaps in the cleaned content when something was found;
 *  - derives the `trust` object (source credibility, content safety, optional
 *    safe-browsing / threat-feed data, composite score clamped to 0..1);
 *  - computes token, fingerprint and token-savings metrics;
 *  - emits thin-content / blocked-page warnings;
 *  - applies the `clean` format, query highlights and chunking when requested.
 *
 * Note: mutates `ctx.content`, `ctx.warnings` and `ctx.metadata` along the way.
 *
 * @param {object} ctx - Pipeline context populated by the earlier stages.
 * @returns {object} The assembled result object.
 */
export function buildResult(ctx) {
    const { fetchResult } = ctx;
    const elapsed = Date.now() - ctx.startTime;

    // ----- Trust & safety: prompt-injection scan on the final content -----
    const scan = sanitizeForLLM(ctx.content);
    const injectionDetected = scan.injectionDetected;
    if (injectionDetected) {
        // Prefer the scrubbed text whenever the scanner flagged something.
        ctx.content = scan.content;
        ctx.warnings.push('Prompt injection patterns detected and stripped from content.');
    }

    // ----- Source credibility, optionally boosted by active domain verification -----
    const credibility = getSourceCredibility(ctx.url);
    const verification = ctx.domainVerification ?? null;
    const sourceScore = Math.min(100, credibility.score + (verification?.verificationScore ?? 0));
    const sourceSignals = [...(credibility.signals ?? []), ...(verification?.signals ?? [])];
    const sourceWarnings = [...(credibility.warnings ?? []), ...(verification?.warnings ?? [])];

    // Composite trust score: credibility normalised to 0..1, minus content penalties.
    let trustScore = sourceScore / 100;
    if (injectionDetected) {
        trustScore -= 0.3;
    }
    if ((ctx.quality ?? 1.0) < 0.5) {
        trustScore -= 0.1;
    }
    trustScore = Math.round(Math.max(0, Math.min(1, trustScore)) * 100) / 100;

    // Trust warnings start from the credibility warnings and add situational notes.
    const trustWarnings = sourceWarnings.slice();
    switch (credibility.tier) {
        case 'new':
            trustWarnings.push('Domain has limited verifiable presence — exercise caution.');
            break;
        case 'suspicious':
            trustWarnings.push('Domain shows suspicious signals — treat content with caution.');
            break;
        default:
            break;
    }
    if (injectionDetected) {
        trustWarnings.push(`Prompt injection detected: ${scan.detectedPatterns.join(', ')}`);
    }
    if (scan.strippedChars > 0) {
        trustWarnings.push(`Stripped ${scan.strippedChars} suspicious characters (zero-width/Unicode smuggling).`);
    }

    // Compact verification summary: only a handful of fields per section.
    let verificationData;
    if (verification) {
        const { tls, dns, headers } = verification;
        verificationData = {
            tls: tls ? { valid: tls.valid, issuer: tls.issuer, daysRemaining: tls.daysRemaining } : null,
            dns: dns ? { hasMx: dns.hasMx, hasDmarc: dns.hasDmarc, hasSpf: dns.hasSpf } : null,
            headers: headers ? { hsts: headers.hsts, csp: headers.csp, server: headers.server } : null,
        };
    }

    // Assemble the trust object step by step; insertion order defines key order.
    const trustSource = {
        tier: credibility.tier,
        score: sourceScore,
        label: credibility.label,
        signals: sourceSignals,
        warnings: sourceWarnings,
    };
    if (verificationData) {
        trustSource.verification = verificationData;
    }
    const trust = {
        source: trustSource,
        contentSafety: {
            clean: !injectionDetected,
            injectionDetected,
            detectedPatterns: scan.detectedPatterns,
            strippedCount: scan.strippedChars,
        },
    };
    const safeBrowsing = ctx.safeBrowsingResult;
    if (safeBrowsing) {
        trust.safeBrowsing = {
            safe: safeBrowsing.safe,
            threats: safeBrowsing.threats,
            source: safeBrowsing.source,
        };
    }
    const feeds = safeBrowsing?.threatFeeds;
    if (feeds) {
        const threatFeeds = { safe: feeds.safe, threats: feeds.threats, source: feeds.source };
        if (feeds.details) {
            threatFeeds.details = feeds.details;
        }
        trust.threatFeeds = threatFeeds;
    }
    trust.score = trustScore;
    trust.warnings = trustWarnings;

    // ----- Size metrics (taken before any `clean` formatting below) -----
    const tokens = estimateTokens(ctx.content);
    const fingerprint = createHash('sha256').update(ctx.content).digest('hex').slice(0, 16);

    // Token savings are only reported when a raw HTML size was captured.
    const rawHtmlSize = ctx.rawHtmlSize ?? 0;
    let rawTokenEstimate;
    let tokenSavingsPercent;
    if (rawHtmlSize > 0) {
        rawTokenEstimate = Math.round(rawHtmlSize / 4);
        if (rawTokenEstimate > 0) {
            tokenSavingsPercent = Math.max(0, Math.round((1 - tokens / rawTokenEstimate) * 100));
        }
    }

    // ----- Freshness, derived from the response headers -----
    const responseHeaders = fetchResult.responseHeaders;
    const freshness = {};
    if (responseHeaders?.['last-modified']) {
        freshness.lastModified = responseHeaders['last-modified'];
    }
    if (responseHeaders?.['etag']) {
        freshness.etag = responseHeaders['etag'];
    }
    freshness.fetchedAt = new Date().toISOString();
    if (responseHeaders?.['cache-control']) {
        freshness.cacheControl = responseHeaders['cache-control'];
    }

    // ----- Content-quality heuristics -----
    const contentLen = ctx.content.length;
    const htmlLen = ctx.fetchResult?.html?.length || 0;
    const wordCount = ctx.content.trim().split(/\s+/).filter(Boolean).length;
    if (wordCount > 0 && wordCount < 100) {
        ctx.warnings.push(`Content is thin (${wordCount} words). The page may be paywalled, require authentication, or block automated access.`);
        if (ctx.metadata) {
            ctx.metadata.contentQuality = 'thin';
        }
    }

    let warning;
    if (contentLen < 100 && htmlLen > 1000) {
        warning = 'Content extraction produced very little text from a substantial page. The site may use heavy JavaScript rendering. Try adding render: true.';
    }
    else if (ctx.budgetFallback) {
        warning = 'Budget distillation was unable to identify key content. Showing first portion of page instead. This may be a listing or index page — try fetching without a budget for full content.';
    }
    else if (contentLen < 50) {
        // Near-empty result: decide whether it looks like an outright block.
        const fetchMethod = ctx.fetchResult?.method || 'unknown';
        const triedEvasion = fetchMethod === 'stealth' || ctx.options.stealth
            || fetchMethod === 'browser' || ctx.options.render;
        if (!triedEvasion) {
            warning = 'Very little content extracted. The page may require JavaScript rendering (try --render), be behind a login wall, or use bot protection.';
        }
        else {
            warning = 'This site appears to use bot protection (Cloudflare, Akamai, PerimeterX). Try: --cloaked flag, a residential proxy (--proxy), or check if the URL requires authentication.';
            if (ctx.metadata) {
                ctx.metadata.blocked = true;
            }
        }
    }

    // ----- Clean format is applied last, after all measurements -----
    if (ctx.format === 'clean' && ctx.content) {
        ctx.content = cleanForAI(ctx.content);
    }

    // ----- Query-dependent highlights (BM25 ranking over content blocks) -----
    let highlights;
    let highlightedContent;
    if (ctx.options.highlightQuery && ctx.content) {
        const charBudget = ctx.options.highlightMaxChars ?? 1000;
        const loweredQuery = ctx.options.highlightQuery.toLowerCase();
        const queryTerms = loweredQuery.replace(/[^\w\s]/g, ' ').split(/\s+/).filter((term) => term.length > 0);
        if (queryTerms.length > 0) {
            const blocks = splitIntoBlocks(ctx.content);
            const scores = scoreBM25(blocks, queryTerms);
            const ranked = blocks
                .map((block, i) => ({ text: block.raw, score: scores[i] }))
                .sort((a, b) => b.score - a.score);
            // Walk best-first; stop at the first non-positive score or once the
            // character budget would be exceeded (the first block is always kept).
            const picked = [];
            let usedChars = 0;
            for (let i = 0; i < ranked.length; i += 1) {
                const { text, score } = ranked[i];
                if (score <= 0 || (usedChars + text.length > charBudget && picked.length > 0)) {
                    break;
                }
                picked.push({ text, score: Math.round(score * 10000) / 10000 });
                usedChars += text.length;
            }
            if (picked.length > 0) {
                highlights = picked;
                highlightedContent = picked.map((entry) => entry.text).join('\n\n');
            }
        }
    }

    // ----- Chunking for RAG pipelines -----
    let ragChunks;
    if (ctx.options.chunk) {
        let chunkOpts = {};
        if (typeof ctx.options.chunk === 'object') {
            chunkOpts = ctx.options.chunk;
        }
        ragChunks = chunkContent(ctx.content, chunkOpts).chunks;
    }

    // Reported method: images are labelled 'ocr', cached fetches are reported as 'simple'.
    let method = fetchResult.method;
    if (ctx.contentType === 'image') {
        method = 'ocr';
    }
    else if (method === 'cached') {
        method = 'simple';
    }

    // Optional keys are spread in only when present; spreading a falsy value adds nothing.
    return {
        url: fetchResult.url,
        title: ctx.title,
        content: ctx.content,
        metadata: ctx.metadata,
        links: ctx.links,
        tokens,
        method,
        elapsed,
        screenshot: ctx.screenshotBase64,
        contentType: ctx.contentType,
        quality: ctx.quality,
        fingerprint,
        extracted: ctx.extracted,
        branding: ctx.brandingProfile,
        designAnalysis: ctx.designAnalysisResult,
        changeTracking: ctx.changeResult,
        summary: ctx.summaryText,
        images: ctx.imagesList,
        linkCount: ctx.links.length,
        freshness,
        ...(warning !== undefined && { warning }),
        ...(ctx.metadata?.blocked && { blocked: true }),
        ...(ctx.authRequired && { authRequired: true }),
        ...(ctx.prunedPercent !== undefined && { prunedPercent: ctx.prunedPercent }),
        ...(ctx.domainData !== undefined && { domainData: ctx.domainData }),
        ...(ctx.readabilityResult !== undefined && { readability: ctx.readabilityResult }),
        ...(ctx.quickAnswerResult !== undefined && { quickAnswer: ctx.quickAnswerResult }),
        timing: ctx.timer.toTiming(),
        ...(ctx.jsonLdType !== undefined && { jsonLdType: ctx.jsonLdType }),
        ...(ctx.warnings.length > 0 && { warnings: ctx.warnings }),
        ...(ragChunks !== undefined && { chunks: ragChunks }),
        ...(highlights !== undefined && { highlights }),
        ...(highlightedContent !== undefined && { highlightedContent }),
        ...(ctx.serverMarkdown && { serverMarkdown: true }),
        ...(rawTokenEstimate !== undefined && { rawTokenEstimate }),
        ...(tokenSavingsPercent !== undefined && { tokenSavingsPercent }),
        ...(fetchResult.autoInteract !== undefined && { autoInteract: fetchResult.autoInteract }),
        trust,
    };
}
|