@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,1345 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fetch commands: default URL handler, read, pipe
|
|
3
|
+
*/
|
|
4
|
+
import ora from 'ora';
|
|
5
|
+
import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|
6
|
+
import { getProfilePath, loadStorageState, touchProfile } from '../../core/profiles.js';
|
|
7
|
+
import { shouldForceBrowser } from '../../core/strategies.js';
|
|
8
|
+
import { peel, cleanup } from '../../index.js';
|
|
9
|
+
import { checkUsage, showUsageFooter, loadConfig } from '../../cli-auth.js';
|
|
10
|
+
import { getCache, setCache, parseTTL } from '../../cache.js';
|
|
11
|
+
import { estimateTokens, htmlToMarkdown } from '../../core/markdown.js';
|
|
12
|
+
import { distillToBudget, budgetListings } from '../../core/budget.js';
|
|
13
|
+
import { parseActions, formatError, fetchViaApi, outputResult, writeStdout, buildEnvelope, classifyErrorCode, formatListingsCsv, normaliseExtractedToRows, } from '../utils.js';
|
|
14
|
+
// ─── readStdin ────────────────────────────────────────────────────────────────
|
|
15
|
+
/**
 * Read everything available on `process.stdin` and return it as one string.
 *
 * Chunks are gathered as Buffers and decoded as UTF-8 once, after the stream
 * ends, so a multi-byte character that arrives split across two Buffer chunks
 * is still decoded correctly.
 *
 * @returns {Promise} Resolves with the complete stdin contents as a string
 *   (the empty string when the stream ends without yielding any data).
 */
async function readStdin() {
    const chunks = [];
    for await (const chunk of process.stdin) {
        // Buffer.from(buffer) makes a copy; keep Buffer chunks as they are and
        // only convert chunks that are not Buffers yet (e.g. strings).
        chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    }
    return Buffer.concat(chunks).toString('utf-8');
}
|
|
22
|
+
// ─── runStdin ─────────────────────────────────────────────────────────────────
|
|
23
|
+
// Read HTML from stdin, convert to markdown, and output
|
|
24
|
+
/**
 * Convert HTML received on stdin to markdown and print it on stdout.
 *
 * With `options.json` set, the result is printed as a single-line JSON object
 * `{ success, content, tokens }`; otherwise the markdown itself is printed.
 * Empty (whitespace-only) input and any thrown error are reported on stderr
 * and terminate the process with exit code 1.
 *
 * @param {object} [options] - CLI flags; only `json` is read here. A missing
 *   value is treated as "no flags" instead of failing.
 * @returns {Promise} Resolves once the output has been handed to stdout.
 */
async function runStdin(options) {
    // Tolerate a missing options object so a bare call prints plain markdown.
    const wantsJson = Boolean(options && options.json);
    try {
        const html = await readStdin();
        if (!html.trim()) {
            process.stderr.write('Error: No input received on stdin\n');
            process.exit(1);
        }
        const markdown = htmlToMarkdown(html, { raw: false, prune: true });
        if (wantsJson) {
            const tokens = estimateTokens(markdown);
            process.stdout.write(JSON.stringify({ success: true, content: markdown, tokens }) + '\n');
        }
        else {
            process.stdout.write(markdown + '\n');
        }
    }
    catch (err) {
        // Anything can be thrown, not just Error objects; fall back to the
        // stringified value so the user never sees "Error: undefined".
        const hasMessage = err !== null && err !== undefined && typeof err.message === 'string';
        const message = hasMessage ? err.message : String(err);
        process.stderr.write(`Error: ${message}\n`);
        process.exit(1);
    }
}
|
|
45
|
+
// ─── runFetch ─────────────────────────────────────────────────────────────────
|
|
46
|
+
// Main fetch handler — shared with the `pipe` and `ask` subcommands
|
|
47
|
+
export async function runFetch(url, options) {
|
|
48
|
+
// --silent: suppress all log output (set env var before any logger fires)
|
|
49
|
+
if (options.silent && !process.env.WEBPEEL_LOG_LEVEL) {
|
|
50
|
+
process.env.WEBPEEL_LOG_LEVEL = 'silent';
|
|
51
|
+
}
|
|
52
|
+
// --content-only: override all output flags — we just want raw content
|
|
53
|
+
if (options.contentOnly) {
|
|
54
|
+
options.silent = true;
|
|
55
|
+
// Disable json/text/html — we output content directly
|
|
56
|
+
options.json = false;
|
|
57
|
+
options.html = false;
|
|
58
|
+
options.text = false;
|
|
59
|
+
}
|
|
60
|
+
// Handle --format flag: maps to existing boolean flags
|
|
61
|
+
if (options.format) {
|
|
62
|
+
const fmt = options.format.toLowerCase();
|
|
63
|
+
if (fmt === 'text')
|
|
64
|
+
options.text = true;
|
|
65
|
+
else if (fmt === 'html')
|
|
66
|
+
options.html = true;
|
|
67
|
+
else if (fmt === 'json')
|
|
68
|
+
options.json = true;
|
|
69
|
+
else if (fmt === 'markdown' || fmt === 'md') { /* default, do nothing */ }
|
|
70
|
+
else {
|
|
71
|
+
console.error(`Unknown format: ${options.format}. Use: text, markdown, html, or json`);
|
|
72
|
+
process.exit(1);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// Smart defaults: when piped (not a TTY), default to silent JSON + budget
|
|
76
|
+
// BUT respect explicit --format flag (user chose the output format)
|
|
77
|
+
// AND respect --content-only (raw content output, no JSON wrapper)
|
|
78
|
+
const isPiped = !process.stdout.isTTY;
|
|
79
|
+
const hasExplicitFormat = options.format && ['text', 'html', 'markdown', 'md'].includes(options.format.toLowerCase());
|
|
80
|
+
if (isPiped && !options.html && !options.text && !hasExplicitFormat && !options.contentOnly) {
|
|
81
|
+
if (!options.json)
|
|
82
|
+
options.json = true;
|
|
83
|
+
if (!options.silent)
|
|
84
|
+
options.silent = true;
|
|
85
|
+
// Auto-enable readability for AI consumers — clean content by default
|
|
86
|
+
if (!options.readable && !options.fullNav) {
|
|
87
|
+
options.readable = true;
|
|
88
|
+
}
|
|
89
|
+
// Auto token budget for piped mode (AI consumers want concise content)
|
|
90
|
+
if (options.budget === undefined && !options.fullContent && !options.raw && !options.full) {
|
|
91
|
+
options.budget = 4000;
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
// --full alias: sets raw + fullContent
|
|
95
|
+
if (options.full) {
|
|
96
|
+
options.raw = true;
|
|
97
|
+
options.fullContent = true;
|
|
98
|
+
}
|
|
99
|
+
// Smart defaults for terminal (interactive) mode
|
|
100
|
+
const isTerminal = process.stdout.isTTY && !isPiped;
|
|
101
|
+
if (isTerminal && !options.raw && !options.html && !options.text) {
|
|
102
|
+
// Auto-readable: clean content by default (like browser Reader Mode)
|
|
103
|
+
if (!options.readable && !options.fullNav && !options.selector) {
|
|
104
|
+
options.readable = true;
|
|
105
|
+
}
|
|
106
|
+
// Default token budget: don't flood the terminal with 20K tokens
|
|
107
|
+
if (options.budget === undefined && !options.fullContent && !options.raw) {
|
|
108
|
+
options.budget = 4000;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
// --agent sets sensible defaults for AI agents; explicit flags override
|
|
112
|
+
if (options.agent) {
|
|
113
|
+
if (!options.json)
|
|
114
|
+
options.json = true;
|
|
115
|
+
if (!options.silent)
|
|
116
|
+
options.silent = true;
|
|
117
|
+
if (!options.extractAll)
|
|
118
|
+
options.extractAll = true;
|
|
119
|
+
if (options.budget === undefined)
|
|
120
|
+
options.budget = 4000;
|
|
121
|
+
// Agent mode = clean content by default
|
|
122
|
+
if (!options.readable && !options.fullNav) {
|
|
123
|
+
options.readable = true;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
const isJson = options.json;
|
|
127
|
+
// --- --list-schemas: print all available schemas and exit ---
|
|
128
|
+
if (options.listSchemas) {
|
|
129
|
+
const { loadBundledSchemas } = await import('../../core/schema-extraction.js');
|
|
130
|
+
const schemas = loadBundledSchemas();
|
|
131
|
+
if (isJson) {
|
|
132
|
+
await writeStdout(JSON.stringify(schemas.map(s => ({
|
|
133
|
+
name: s.name,
|
|
134
|
+
version: s.version,
|
|
135
|
+
domains: s.domains,
|
|
136
|
+
urlPatterns: s.urlPatterns,
|
|
137
|
+
})), null, 2) + '\n');
|
|
138
|
+
}
|
|
139
|
+
else {
|
|
140
|
+
console.log(`\nAvailable extraction schemas (${schemas.length}):\n`);
|
|
141
|
+
for (const s of schemas) {
|
|
142
|
+
console.log(` ${s.name} (v${s.version})`);
|
|
143
|
+
console.log(` Domains: ${s.domains.join(', ')}`);
|
|
144
|
+
if (s.urlPatterns && s.urlPatterns.length > 0) {
|
|
145
|
+
console.log(` URL patterns: ${s.urlPatterns.join(', ')}`);
|
|
146
|
+
}
|
|
147
|
+
console.log('');
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
process.exit(0);
|
|
151
|
+
}
|
|
152
|
+
// --- #4b: Read URL from stdin (pipe mode) if no URL argument provided ---
|
|
153
|
+
if ((!url || url.trim() === '') && !process.stdin.isTTY) {
|
|
154
|
+
try {
|
|
155
|
+
const stdinData = await readStdin();
|
|
156
|
+
const stdinUrl = stdinData.trim().split('\n')[0].trim();
|
|
157
|
+
if (stdinUrl && (stdinUrl.startsWith('http://') || stdinUrl.startsWith('https://'))) {
|
|
158
|
+
url = stdinUrl;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
catch { /* ignore stdin read errors */ }
|
|
162
|
+
}
|
|
163
|
+
// --- #5: Concise error for missing URL (no help dump) ---
|
|
164
|
+
if (!url || url.trim() === '') {
|
|
165
|
+
if (isJson) {
|
|
166
|
+
await writeStdout(JSON.stringify({ success: false, error: { type: 'invalid_request', message: 'URL is required' } }) + '\n');
|
|
167
|
+
}
|
|
168
|
+
else {
|
|
169
|
+
console.error('Error: URL is required');
|
|
170
|
+
console.error('Usage: webpeel <url> [options]');
|
|
171
|
+
console.error('Run "webpeel --help" for full usage.');
|
|
172
|
+
}
|
|
173
|
+
process.exit(1);
|
|
174
|
+
}
|
|
175
|
+
// --- #6: Helper to output JSON errors and exit ---
|
|
176
|
+
/**
 * Report a fatal error in the active output format, then terminate the
 * process with exit code 1.
 *
 * In JSON mode (closure variable `isJson` is truthy) a single-line envelope
 * `{ success: false, error: { type, message } }` is written to stdout, where
 * `type` is `code` lower-cased. Otherwise `Error: <message>` is printed to
 * stderr.
 *
 * @param {string} message - Human-readable description of the failure.
 * @param {string} code - Error code such as 'INVALID_URL'; only used (lower-cased) in JSON mode.
 */
function exitWithJsonError(message, code) {
    if (!isJson) {
        // Plain-text mode: keep stdout clean and report on stderr.
        console.error(`Error: ${message}`);
    }
    else {
        // JSON mode: machine-readable error envelope on stdout.
        const envelope = {
            success: false,
            error: { type: code.toLowerCase(), message },
        };
        process.stdout.write(JSON.stringify(envelope) + '\n');
    }
    process.exit(1);
}
|
|
188
|
+
// SECURITY: Enhanced URL validation
|
|
189
|
+
if (url.length > 2048) {
|
|
190
|
+
exitWithJsonError('URL too long (max 2048 characters)', 'INVALID_URL');
|
|
191
|
+
}
|
|
192
|
+
// Check for control characters
|
|
193
|
+
if (/[\x00-\x1F\x7F]/.test(url)) {
|
|
194
|
+
exitWithJsonError('URL contains invalid control characters', 'INVALID_URL');
|
|
195
|
+
}
|
|
196
|
+
// Validate URL format
|
|
197
|
+
try {
|
|
198
|
+
const parsed = new URL(url);
|
|
199
|
+
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
200
|
+
exitWithJsonError('Only HTTP and HTTPS protocols are allowed', 'INVALID_URL');
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
catch {
|
|
204
|
+
// Check if it looks like a command/verb the user typed by mistake
|
|
205
|
+
const commonVerbs = ['fetch', 'get', 'scrape', 'read', 'download', 'curl', 'wget', 'peel'];
|
|
206
|
+
if (commonVerbs.includes(url.toLowerCase())) {
|
|
207
|
+
exitWithJsonError(`Did you mean: webpeel "${process.argv[3] || '<url>'}"?\nThe URL goes directly after webpeel — no verb needed.\nExample: webpeel "https://example.com" --json`, 'INVALID_URL');
|
|
208
|
+
}
|
|
209
|
+
else {
|
|
210
|
+
exitWithJsonError(`Invalid URL: "${url}"\nMake sure to include the protocol (https://)\nExample: webpeel "https://${url}" --json`, 'INVALID_URL');
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
const useStealth = options.stealth || false;
|
|
214
|
+
// Check usage quota
|
|
215
|
+
const usageCheck = await checkUsage();
|
|
216
|
+
if (!usageCheck.allowed) {
|
|
217
|
+
if (isJson) {
|
|
218
|
+
await writeStdout(JSON.stringify({ success: false, error: { type: 'rate_limited', message: usageCheck.message } }) + '\n');
|
|
219
|
+
process.exit(1);
|
|
220
|
+
}
|
|
221
|
+
console.error(usageCheck.message);
|
|
222
|
+
process.exit(1);
|
|
223
|
+
}
|
|
224
|
+
// ── --export: YouTube transcript download (early exit) ────────────────
|
|
225
|
+
if (options.export) {
|
|
226
|
+
const exportFmt = options.export.toLowerCase();
|
|
227
|
+
const validExportFmts = ['srt', 'txt', 'md', 'json'];
|
|
228
|
+
if (!validExportFmts.includes(exportFmt)) {
|
|
229
|
+
console.error(`Error: --export format must be one of: ${validExportFmts.join(', ')}`);
|
|
230
|
+
process.exit(1);
|
|
231
|
+
}
|
|
232
|
+
const exportCfg = loadConfig();
|
|
233
|
+
const exportApiKey = exportCfg.apiKey || process.env.WEBPEEL_API_KEY;
|
|
234
|
+
const exportApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
|
|
235
|
+
if (!exportApiKey) {
|
|
236
|
+
console.error('No API key configured. Run: webpeel auth <your-key>');
|
|
237
|
+
console.error('Get a free key at: https://app.webpeel.dev/keys');
|
|
238
|
+
process.exit(2);
|
|
239
|
+
}
|
|
240
|
+
const lang = options.language || 'en';
|
|
241
|
+
const exportUrl = `${exportApiUrl}/v1/transcript/export?url=${encodeURIComponent(url)}&format=${exportFmt}&language=${lang}`;
|
|
242
|
+
const exportRes = await fetch(exportUrl, {
|
|
243
|
+
headers: { 'Authorization': `Bearer ${exportApiKey}` },
|
|
244
|
+
signal: AbortSignal.timeout(options.timeout ?? 90000),
|
|
245
|
+
});
|
|
246
|
+
if (!exportRes.ok) {
|
|
247
|
+
const errBody = await exportRes.text().catch(() => '');
|
|
248
|
+
try {
|
|
249
|
+
const errJson = JSON.parse(errBody);
|
|
250
|
+
const msg = errJson?.error?.message || errJson?.message || exportRes.statusText;
|
|
251
|
+
console.error(`Export failed (${exportRes.status}): ${msg}`);
|
|
252
|
+
}
|
|
253
|
+
catch {
|
|
254
|
+
console.error(`Export failed (${exportRes.status}): ${exportRes.statusText}`);
|
|
255
|
+
}
|
|
256
|
+
process.exit(1);
|
|
257
|
+
}
|
|
258
|
+
const exportContent = await exportRes.text();
|
|
259
|
+
if (options.output) {
|
|
260
|
+
writeFileSync(options.output, exportContent, 'utf-8');
|
|
261
|
+
if (!options.silent) {
|
|
262
|
+
console.error(`Transcript saved to: ${options.output}`);
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
266
|
+
process.stdout.write(exportContent);
|
|
267
|
+
if (!exportContent.endsWith('\n'))
|
|
268
|
+
process.stdout.write('\n');
|
|
269
|
+
}
|
|
270
|
+
await cleanup();
|
|
271
|
+
process.exit(0);
|
|
272
|
+
}
|
|
273
|
+
// Check cache first (before spinner/network)
|
|
274
|
+
// Default: 5m TTL for all CLI fetches unless --no-cache is set
|
|
275
|
+
let cacheTtlMs;
|
|
276
|
+
const cacheDisabled = options.cache === false; // --no-cache sets options.cache to false
|
|
277
|
+
const explicitTtl = typeof options.cache === 'string' ? options.cache : undefined;
|
|
278
|
+
if (!cacheDisabled) {
|
|
279
|
+
const ttlStr = explicitTtl || '5m';
|
|
280
|
+
try {
|
|
281
|
+
cacheTtlMs = parseTTL(ttlStr);
|
|
282
|
+
}
|
|
283
|
+
catch (e) {
|
|
284
|
+
exitWithJsonError(e.message, 'FETCH_FAILED');
|
|
285
|
+
}
|
|
286
|
+
const cacheOptions = {
|
|
287
|
+
render: options.render,
|
|
288
|
+
stealth: options.stealth,
|
|
289
|
+
selector: options.selector,
|
|
290
|
+
format: options.html ? 'html' : options.text ? 'text' : options.clean ? 'clean' : 'markdown',
|
|
291
|
+
budget: null, // Budget excluded from cache key — cache stores full content
|
|
292
|
+
readable: options.readable || false,
|
|
293
|
+
noDomainApi: options.skipDomainApi || false, // Different cache for domain-api bypass
|
|
294
|
+
};
|
|
295
|
+
const cachedResult = getCache(url, cacheOptions);
|
|
296
|
+
if (cachedResult) {
|
|
297
|
+
if (!options.silent) {
|
|
298
|
+
console.error(`\x1b[36m⚡ Cache hit\x1b[0m (TTL: ${ttlStr})`);
|
|
299
|
+
}
|
|
300
|
+
// Apply budget to cached content (cache stores full, budget is post-process)
|
|
301
|
+
if (options.budget && options.budget > 0 && cachedResult.content) {
|
|
302
|
+
const fmt = options.text ? 'text' : 'markdown';
|
|
303
|
+
cachedResult.content = distillToBudget(cachedResult.content, options.budget, fmt);
|
|
304
|
+
cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
|
|
305
|
+
}
|
|
306
|
+
// LLM extraction from cached content
|
|
307
|
+
if (options.llmExtract || options.extractSchema) {
|
|
308
|
+
const { extractWithLLM } = await import('../../core/llm-extract.js');
|
|
309
|
+
const llmCfgCached = loadConfig();
|
|
310
|
+
const llmApiKeyCached = options.llmKey || llmCfgCached.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
311
|
+
if (!llmApiKeyCached) {
|
|
312
|
+
console.error('Error: LLM extraction requires an API key.\nSet OPENAI_API_KEY environment variable or use --llm-key <key>');
|
|
313
|
+
process.exit(1);
|
|
314
|
+
}
|
|
315
|
+
const llmModelCached = options.llmModel || llmCfgCached.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
316
|
+
const llmBaseUrlCached = options.llmBaseUrl || llmCfgCached.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
317
|
+
const llmInstructionCached = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
318
|
+
// Parse schema if provided
|
|
319
|
+
let llmSchemaCached;
|
|
320
|
+
if (options.extractSchema) {
|
|
321
|
+
let schemaStr = options.extractSchema;
|
|
322
|
+
if (schemaStr.startsWith('@')) {
|
|
323
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
324
|
+
}
|
|
325
|
+
try {
|
|
326
|
+
llmSchemaCached = JSON.parse(schemaStr);
|
|
327
|
+
}
|
|
328
|
+
catch {
|
|
329
|
+
console.error('Error: --extract-schema must be valid JSON or a valid @file.json path');
|
|
330
|
+
process.exit(1);
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
const llmResultCached = await extractWithLLM({
|
|
334
|
+
content: cachedResult.content,
|
|
335
|
+
instruction: llmInstructionCached,
|
|
336
|
+
schema: llmSchemaCached,
|
|
337
|
+
apiKey: llmApiKeyCached,
|
|
338
|
+
model: llmModelCached,
|
|
339
|
+
baseUrl: llmBaseUrlCached,
|
|
340
|
+
});
|
|
341
|
+
await writeStdout(JSON.stringify(llmResultCached.items, null, 2) + '\n');
|
|
342
|
+
if (!options.silent) {
|
|
343
|
+
const { input, output } = llmResultCached.tokensUsed;
|
|
344
|
+
const costStr = llmResultCached.cost !== undefined ? ` | Est. cost: $${llmResultCached.cost.toFixed(6)}` : '';
|
|
345
|
+
console.error(`\n🤖 LLM extraction: ${llmResultCached.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResultCached.model}`);
|
|
346
|
+
}
|
|
347
|
+
process.exit(0);
|
|
348
|
+
}
|
|
349
|
+
// --- LLM-free Quick Answer (also on cached content) ---
|
|
350
|
+
if (options.question && cachedResult.content) {
|
|
351
|
+
const { quickAnswer } = await import('../../core/quick-answer.js');
|
|
352
|
+
const qa = quickAnswer({
|
|
353
|
+
question: options.question,
|
|
354
|
+
content: cachedResult.content,
|
|
355
|
+
url: cachedResult.url,
|
|
356
|
+
});
|
|
357
|
+
cachedResult.quickAnswer = qa;
|
|
358
|
+
if (!isJson) {
|
|
359
|
+
const conf = (qa.confidence * 100).toFixed(0);
|
|
360
|
+
await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
|
|
361
|
+
if (qa.answer) {
|
|
362
|
+
await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
|
|
363
|
+
}
|
|
364
|
+
else {
|
|
365
|
+
await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
|
|
366
|
+
}
|
|
367
|
+
if (qa.passages && qa.passages.length > 1) {
|
|
368
|
+
await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
|
|
369
|
+
for (const p of qa.passages.slice(1, 4)) {
|
|
370
|
+
await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
|
|
371
|
+
}
|
|
372
|
+
}
|
|
373
|
+
await writeStdout('\n');
|
|
374
|
+
await cleanup();
|
|
375
|
+
process.exit(0);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
// --- BM25 Schema Template Extraction (cached path) ---
|
|
379
|
+
if (options.schema && cachedResult.content) {
|
|
380
|
+
const { getSchemaTemplate: getSchTmplCached } = await import('../../core/schema-templates.js');
|
|
381
|
+
const schTemplateCached = getSchTmplCached(options.schema);
|
|
382
|
+
if (schTemplateCached) {
|
|
383
|
+
const { quickAnswer: qaCached } = await import('../../core/quick-answer.js');
|
|
384
|
+
const { smartExtractSchemaFields: smartExtractCached } = await import('../../core/schema-postprocess.js');
|
|
385
|
+
const extractedCached = smartExtractCached(cachedResult.content, schTemplateCached.fields, qaCached, {
|
|
386
|
+
pageTitle: cachedResult.title,
|
|
387
|
+
pageUrl: cachedResult.url,
|
|
388
|
+
metadata: cachedResult.metadata,
|
|
389
|
+
});
|
|
390
|
+
cachedResult.extracted = extractedCached;
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
if (options.contentOnly) {
|
|
394
|
+
await writeStdout(cachedResult.content + '\n');
|
|
395
|
+
}
|
|
396
|
+
else {
|
|
397
|
+
await outputResult(cachedResult, options, { cached: true });
|
|
398
|
+
}
|
|
399
|
+
process.exit(0);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
// --progress: show escalation steps on stderr (overrides spinner)
|
|
403
|
+
let progressInterval;
|
|
404
|
+
const progressStart = Date.now();
|
|
405
|
+
if (options.progress) {
|
|
406
|
+
process.stderr.write(`[simple] Fetching ${url}...\n`);
|
|
407
|
+
// Show escalation hints based on elapsed time (best-effort approximations)
|
|
408
|
+
const progressSteps = [
|
|
409
|
+
{ afterMs: 2500, message: '[simple] Waiting for response...' },
|
|
410
|
+
{ afterMs: 6000, message: '[browser] Simple too slow — escalating to browser render...' },
|
|
411
|
+
{ afterMs: 12000, message: '[browser] Rendering with Chromium...' },
|
|
412
|
+
{ afterMs: 20000, message: '[stealth] Escalating to stealth mode...' },
|
|
413
|
+
];
|
|
414
|
+
let stepIdx = 0;
|
|
415
|
+
progressInterval = setInterval(() => {
|
|
416
|
+
const elapsed = Date.now() - progressStart;
|
|
417
|
+
while (stepIdx < progressSteps.length && elapsed >= progressSteps[stepIdx].afterMs) {
|
|
418
|
+
process.stderr.write(`${progressSteps[stepIdx].message}\n`);
|
|
419
|
+
stepIdx++;
|
|
420
|
+
}
|
|
421
|
+
}, 500);
|
|
422
|
+
}
|
|
423
|
+
// Suppress spinner when --progress is active (progress lines replace it)
|
|
424
|
+
const spinner = (options.silent || options.progress) ? null : ora('Fetching...').start();
|
|
425
|
+
// Auto progress: after 3 s, update spinner text with elapsed time + method hints
|
|
426
|
+
// Updated every 2 s so the user knows we're still working.
|
|
427
|
+
const autoProgressStart = Date.now();
|
|
428
|
+
const autoProgressSteps = [
|
|
429
|
+
{ afterMs: 3000, text: '⏳ Fetching... (slow response)' },
|
|
430
|
+
{ afterMs: 6000, text: '⏳ Fetching with browser... ({s}s)' },
|
|
431
|
+
{ afterMs: 12000, text: '⏳ Fetching with browser... ({s}s — stealth may be needed)' },
|
|
432
|
+
{ afterMs: 20000, text: '⏳ Fetching with stealth browser + proxy... ({s}s)' },
|
|
433
|
+
];
|
|
434
|
+
let autoProgressStepIdx = 0;
|
|
435
|
+
const autoProgressInterval = spinner ? setInterval(() => {
|
|
436
|
+
const elapsed = Date.now() - autoProgressStart;
|
|
437
|
+
const secs = Math.round(elapsed / 1000);
|
|
438
|
+
while (autoProgressStepIdx < autoProgressSteps.length &&
|
|
439
|
+
elapsed >= autoProgressSteps[autoProgressStepIdx].afterMs) {
|
|
440
|
+
autoProgressStepIdx++;
|
|
441
|
+
}
|
|
442
|
+
if (autoProgressStepIdx > 0 && spinner) {
|
|
443
|
+
const tmpl = autoProgressSteps[autoProgressStepIdx - 1].text;
|
|
444
|
+
spinner.text = tmpl.replace('{s}', String(secs));
|
|
445
|
+
}
|
|
446
|
+
}, 2000) : null;
|
|
447
|
+
try {
|
|
448
|
+
// Validate options
|
|
449
|
+
if (options.wait && (options.wait < 0 || options.wait > 60000)) {
|
|
450
|
+
throw Object.assign(new Error('Wait time must be between 0 and 60000ms'), { _code: 'FETCH_FAILED' });
|
|
451
|
+
}
|
|
452
|
+
// Parse custom headers
|
|
453
|
+
let headers;
|
|
454
|
+
if (options.header && options.header.length > 0) {
|
|
455
|
+
headers = {};
|
|
456
|
+
for (const header of options.header) {
|
|
457
|
+
const colonIndex = header.indexOf(':');
|
|
458
|
+
if (colonIndex === -1) {
|
|
459
|
+
throw Object.assign(new Error(`Invalid header format: ${header}. Expected "Key: Value"`), { _code: 'FETCH_FAILED' });
|
|
460
|
+
}
|
|
461
|
+
const key = header.slice(0, colonIndex).trim();
|
|
462
|
+
const value = header.slice(colonIndex + 1).trim();
|
|
463
|
+
headers[key] = value;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
// Parse actions
|
|
467
|
+
let actions;
|
|
468
|
+
if (options.action && options.action.length > 0) {
|
|
469
|
+
try {
|
|
470
|
+
actions = parseActions(options.action);
|
|
471
|
+
}
|
|
472
|
+
catch (e) {
|
|
473
|
+
throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
// --extract-schema auto-enables JSON output
|
|
477
|
+
if (options.extractSchema) {
|
|
478
|
+
options.json = true;
|
|
479
|
+
}
|
|
480
|
+
// Parse extract
|
|
481
|
+
let extract;
|
|
482
|
+
if (options.llmExtract || options.extractSchema) {
|
|
483
|
+
// LLM-based extraction is handled post-fetch (after peel returns markdown).
|
|
484
|
+
// Early-validate that an API key is available so we fail fast.
|
|
485
|
+
const llmCfg = loadConfig();
|
|
486
|
+
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
487
|
+
if (!llmApiKey) {
|
|
488
|
+
throw Object.assign(new Error('LLM extraction requires an API key.\n' +
|
|
489
|
+
'Set OPENAI_API_KEY environment variable or use --llm-key <key>'), { _code: 'FETCH_FAILED' });
|
|
490
|
+
}
|
|
491
|
+
// Do NOT set extract here — peel runs normally, LLM extraction happens below.
|
|
492
|
+
}
|
|
493
|
+
else if (options.extract) {
|
|
494
|
+
// Smart extract: detect schema format vs CSS selectors
|
|
495
|
+
let extractJson;
|
|
496
|
+
try {
|
|
497
|
+
extractJson = JSON.parse(options.extract);
|
|
498
|
+
}
|
|
499
|
+
catch {
|
|
500
|
+
throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\' or \'{"company": "string"}\')'), { _code: 'FETCH_FAILED' });
|
|
501
|
+
}
|
|
502
|
+
// If all values are type names (string/boolean/number/array/object),
|
|
503
|
+
// treat as structured schema extraction (routed to extractStructured after fetch).
|
|
504
|
+
// Otherwise treat as CSS selector map.
|
|
505
|
+
const { isTypeSchema } = await import('../../core/structured-extract.js');
|
|
506
|
+
if (isTypeSchema(extractJson)) {
|
|
507
|
+
// Mark for post-fetch structured extraction (handled below)
|
|
508
|
+
options._structuredSchema = extractJson;
|
|
509
|
+
}
|
|
510
|
+
else {
|
|
511
|
+
// CSS-based extraction
|
|
512
|
+
extract = { selectors: extractJson };
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
// Validate maxTokens
|
|
516
|
+
if (options.maxTokens !== undefined) {
|
|
517
|
+
if (isNaN(options.maxTokens) || options.maxTokens < 100) {
|
|
518
|
+
throw Object.assign(new Error('--max-tokens must be at least 100'), { _code: 'FETCH_FAILED' });
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
// Parse include-tags and exclude-tags
|
|
522
|
+
let includeTags;
|
|
523
|
+
let excludeTags;
|
|
524
|
+
if (options.onlyMainContent) {
|
|
525
|
+
includeTags = ['main', 'article'];
|
|
526
|
+
}
|
|
527
|
+
else if (options.includeTags) {
|
|
528
|
+
includeTags = options.includeTags.split(',').map((t) => t.trim());
|
|
529
|
+
}
|
|
530
|
+
if (options.excludeTags) {
|
|
531
|
+
excludeTags = options.excludeTags.split(',').map((t) => t.trim());
|
|
532
|
+
}
|
|
533
|
+
// Build location options
|
|
534
|
+
let locationOptions;
|
|
535
|
+
if (options.location || options.language) {
|
|
536
|
+
locationOptions = {};
|
|
537
|
+
if (options.location) {
|
|
538
|
+
locationOptions.country = options.location;
|
|
539
|
+
}
|
|
540
|
+
if (options.language) {
|
|
541
|
+
locationOptions.languages = [options.language];
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
// ── Resolve --profile: name → path + storage state ─────────────────
|
|
545
|
+
let resolvedProfileDir;
|
|
546
|
+
let resolvedStorageState;
|
|
547
|
+
let resolvedProfileName;
|
|
548
|
+
if (options.profile) {
|
|
549
|
+
const profilePath = getProfilePath(options.profile);
|
|
550
|
+
if (profilePath) {
|
|
551
|
+
// It's a named profile in ~/.webpeel/profiles/
|
|
552
|
+
resolvedProfileDir = profilePath;
|
|
553
|
+
resolvedStorageState = loadStorageState(options.profile) ?? undefined;
|
|
554
|
+
resolvedProfileName = options.profile;
|
|
555
|
+
}
|
|
556
|
+
else if (existsSync(options.profile)) {
|
|
557
|
+
// It's a raw directory path (backward compat)
|
|
558
|
+
resolvedProfileDir = options.profile;
|
|
559
|
+
}
|
|
560
|
+
else {
|
|
561
|
+
exitWithJsonError(`Profile "${options.profile}" not found. Run "webpeel profile list" to see available profiles.`, 'PROFILE_NOT_FOUND');
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
// Build peel options
|
|
565
|
+
// --stealth auto-enables --render (stealth requires browser)
|
|
566
|
+
// --action auto-enables --render (actions require browser)
|
|
567
|
+
// --scroll-extract implies --render (needs browser)
|
|
568
|
+
//
|
|
569
|
+
// Bare --scroll-extract (no number) → smart autoScroll (detects stable height)
|
|
570
|
+
// --scroll-extract N (with number) → legacy fixed N scrolls via actions
|
|
571
|
+
const scrollExtractRaw = options.scrollExtract;
|
|
572
|
+
const isAutoScroll = scrollExtractRaw !== undefined && typeof scrollExtractRaw !== 'number';
|
|
573
|
+
const scrollExtractCount = isAutoScroll
|
|
574
|
+
? 0
|
|
575
|
+
: (scrollExtractRaw !== undefined ? scrollExtractRaw : 0);
|
|
576
|
+
const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || isAutoScroll
|
|
577
|
+
|| (options.device && options.device !== 'desktop')
|
|
578
|
+
|| !!options.viewport
|
|
579
|
+
|| !!options.waitUntil
|
|
580
|
+
|| !!options.waitSelector
|
|
581
|
+
|| !!options.blockResources
|
|
582
|
+
|| !!options.screenshot // Auto-enable render for screenshot (needs browser)
|
|
583
|
+
|| false;
|
|
584
|
+
// Inject scroll actions when --scroll-extract N (fixed count) is used
|
|
585
|
+
if (scrollExtractCount > 0) {
|
|
586
|
+
const scrollActions = [];
|
|
587
|
+
for (let i = 0; i < scrollExtractCount; i++) {
|
|
588
|
+
scrollActions.push({ type: 'scroll', to: 'bottom' });
|
|
589
|
+
scrollActions.push({ type: 'wait', ms: 1500 });
|
|
590
|
+
}
|
|
591
|
+
actions = actions ? [...actions, ...scrollActions] : scrollActions;
|
|
592
|
+
}
|
|
593
|
+
const peelOptions = {
|
|
594
|
+
render: useRender,
|
|
595
|
+
stealth: options.stealth || false,
|
|
596
|
+
wait: options.wait || 0,
|
|
597
|
+
timeout: options.timeout,
|
|
598
|
+
userAgent: options.ua,
|
|
599
|
+
screenshot: options.screenshot !== undefined,
|
|
600
|
+
screenshotFullPage: options.fullPage || false,
|
|
601
|
+
selector: options.selector,
|
|
602
|
+
exclude: options.exclude,
|
|
603
|
+
includeTags,
|
|
604
|
+
excludeTags,
|
|
605
|
+
headers,
|
|
606
|
+
cookies: options.cookie,
|
|
607
|
+
raw: options.raw || false,
|
|
608
|
+
noDomainApi: options.skipDomainApi || false,
|
|
609
|
+
lite: options.lite || false,
|
|
610
|
+
actions,
|
|
611
|
+
maxTokens: options.maxTokens,
|
|
612
|
+
// Note: budget is applied AFTER caching (so cache stores full content)
|
|
613
|
+
// We pass it to peel() for programmatic API compatibility, but the CLI
|
|
614
|
+
// also applies it post-fetch (see below) to ensure cache stores full result.
|
|
615
|
+
extract,
|
|
616
|
+
images: options.images || false,
|
|
617
|
+
location: locationOptions,
|
|
618
|
+
profileDir: resolvedProfileDir,
|
|
619
|
+
headed: options.headed || false,
|
|
620
|
+
storageState: resolvedStorageState,
|
|
621
|
+
proxy: options.proxy,
|
|
622
|
+
proxies: options.proxies,
|
|
623
|
+
fullPage: options.fullContent || false,
|
|
624
|
+
readable: options.readable || false,
|
|
625
|
+
// Smart auto-scroll (bare --scroll-extract flag)
|
|
626
|
+
autoScroll: isAutoScroll
|
|
627
|
+
? { timeout: options.scrollExtractTimeout }
|
|
628
|
+
: undefined,
|
|
629
|
+
device: options.device,
|
|
630
|
+
viewportWidth: options.viewport ? options.viewport.width : undefined,
|
|
631
|
+
viewportHeight: options.viewport ? options.viewport.height : undefined,
|
|
632
|
+
deviceScaleFactor: options.scale,
|
|
633
|
+
waitUntil: options.waitUntil,
|
|
634
|
+
waitSelector: options.waitSelector,
|
|
635
|
+
blockResources: options.blockResources ? options.blockResources.split(',').map((s) => s.trim()) : undefined,
|
|
636
|
+
cloaked: options.cloaked ? true : undefined,
|
|
637
|
+
cycle: options.cycle ? true : undefined,
|
|
638
|
+
tls: (options.tls || options.cycle) ? true : undefined,
|
|
639
|
+
highlightQuery: options.highlightQuery,
|
|
640
|
+
highlightMaxChars: options.highlightMaxChars,
|
|
641
|
+
};
|
|
642
|
+
if (options.cloaked) {
|
|
643
|
+
peelOptions.render = true; // CloakBrowser is a browser
|
|
644
|
+
}
|
|
645
|
+
// Add chunk option if requested
|
|
646
|
+
if (options.chunk) {
|
|
647
|
+
peelOptions.chunk = {
|
|
648
|
+
maxTokens: options.chunkSize || 512,
|
|
649
|
+
overlap: options.chunkOverlap || 50,
|
|
650
|
+
strategy: options.chunkStrategy || 'section',
|
|
651
|
+
};
|
|
652
|
+
}
|
|
653
|
+
// Add summary option if requested
|
|
654
|
+
if (options.summary) {
|
|
655
|
+
const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
|
|
656
|
+
if (!llmApiKey) {
|
|
657
|
+
throw Object.assign(new Error('--summary requires --llm-key or OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
|
|
658
|
+
}
|
|
659
|
+
peelOptions.summary = true;
|
|
660
|
+
peelOptions.llm = {
|
|
661
|
+
apiKey: llmApiKey,
|
|
662
|
+
model: process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini',
|
|
663
|
+
baseUrl: process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1',
|
|
664
|
+
};
|
|
665
|
+
}
|
|
666
|
+
// Determine format
|
|
667
|
+
if (options.html) {
|
|
668
|
+
peelOptions.format = 'html';
|
|
669
|
+
}
|
|
670
|
+
else if (options.text) {
|
|
671
|
+
peelOptions.format = 'text';
|
|
672
|
+
}
|
|
673
|
+
else if (options.clean) {
|
|
674
|
+
peelOptions.format = 'clean';
|
|
675
|
+
// --clean implies readable mode (article content only, no navs/footers)
|
|
676
|
+
peelOptions.readable = true;
|
|
677
|
+
}
|
|
678
|
+
else {
|
|
679
|
+
peelOptions.format = 'markdown';
|
|
680
|
+
}
|
|
681
|
+
// Fetch the page — route through the API if a key is configured and no local browser is needed; otherwise run peel() locally
|
|
682
|
+
const fetchCfg = loadConfig();
|
|
683
|
+
const fetchApiKey = fetchCfg.apiKey || process.env.WEBPEEL_API_KEY;
|
|
684
|
+
const fetchApiUrl = process.env.WEBPEEL_API_URL || 'https://api.webpeel.dev';
|
|
685
|
+
// Features that require a local browser and cannot be delegated to the remote API.
|
|
686
|
+
// Also include domains (like amazon.com) that require stealth/browser rendering —
|
|
687
|
+
// the remote API won't render them correctly without special flags, so route locally.
|
|
688
|
+
const domainNeedsLocalBrowser = !!(shouldForceBrowser(url));
|
|
689
|
+
const needsLocalBrowser = !!(peelOptions.screenshot ||
|
|
690
|
+
peelOptions.actions?.length ||
|
|
691
|
+
peelOptions.profileDir ||
|
|
692
|
+
peelOptions.headed ||
|
|
693
|
+
peelOptions.storageState ||
|
|
694
|
+
peelOptions.cloaked ||
|
|
695
|
+
domainNeedsLocalBrowser);
|
|
696
|
+
let result;
|
|
697
|
+
if (fetchApiKey && !needsLocalBrowser) {
|
|
698
|
+
// Use the WebPeel API — no local Playwright needed
|
|
699
|
+
result = await fetchViaApi(url, peelOptions, fetchApiKey, fetchApiUrl);
|
|
700
|
+
}
|
|
701
|
+
else {
|
|
702
|
+
// No API key, or a feature/domain that needs a local browser — run peel() locally (no API call)
|
|
703
|
+
if (spinner)
|
|
704
|
+
spinner.text = 'Fetching locally (no API key)…';
|
|
705
|
+
const startLocal = Date.now();
|
|
706
|
+
const { peel } = await import('../../index.js');
|
|
707
|
+
const localResult = await peel(url, peelOptions);
|
|
708
|
+
const elapsed = Date.now() - startLocal;
|
|
709
|
+
// Normalize to the shape fetchViaApi returns
|
|
710
|
+
result = {
|
|
711
|
+
...localResult,
|
|
712
|
+
elapsed: localResult.elapsed ?? elapsed,
|
|
713
|
+
method: localResult.method ?? 'local',
|
|
714
|
+
tokens: localResult.tokens ?? Math.ceil((localResult.content?.length ?? 0) / 4),
|
|
715
|
+
cached: false,
|
|
716
|
+
};
|
|
717
|
+
}
|
|
718
|
+
// Update lastUsed timestamp for named profiles
|
|
719
|
+
if (resolvedProfileName) {
|
|
720
|
+
touchProfile(resolvedProfileName);
|
|
721
|
+
}
|
|
722
|
+
// Stop progress intervals and show final result
|
|
723
|
+
if (progressInterval) {
|
|
724
|
+
clearInterval(progressInterval);
|
|
725
|
+
progressInterval = undefined;
|
|
726
|
+
}
|
|
727
|
+
if (autoProgressInterval)
|
|
728
|
+
clearInterval(autoProgressInterval);
|
|
729
|
+
if (options.progress) {
|
|
730
|
+
const method = result.method || 'simple';
|
|
731
|
+
const elapsedSec = ((result.elapsed || (Date.now() - progressStart)) / 1000).toFixed(1);
|
|
732
|
+
const tokenCount = (result.tokens || 0).toLocaleString();
|
|
733
|
+
// Show escalation arrow if browser/stealth was needed
|
|
734
|
+
if (method !== 'simple') {
|
|
735
|
+
process.stderr.write(`[simple] → [${method}] escalated\n`);
|
|
736
|
+
}
|
|
737
|
+
process.stderr.write(`[${method}] Done — ${tokenCount} tokens in ${elapsedSec}s\n`);
|
|
738
|
+
}
|
|
739
|
+
else if (spinner) {
|
|
740
|
+
const domainTag = result.domainData
|
|
741
|
+
? ` [${result.domainData.domain}:${result.domainData.type}]`
|
|
742
|
+
: '';
|
|
743
|
+
spinner.succeed(`Fetched in ${result.elapsed}ms using ${result.method} method${domainTag}`);
|
|
744
|
+
// Smart hints — suggest features the user might not know about
|
|
745
|
+
if (!options.silent && !options.json && !options.skipDomainApi) {
|
|
746
|
+
if (result.method === 'domain-api') {
|
|
747
|
+
const extractorName = result.domainData?.domain || new URL(url).hostname.replace('www.', '') || 'domain';
|
|
748
|
+
console.error(`\x1b[33m💡 Tip: Using our ${extractorName} extractor. Want the raw page instead? Add --skip-domain-api\x1b[0m`);
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
if (!options.silent && !options.json && result.tokens && result.tokens < 50 && !options.render) {
|
|
752
|
+
console.error(`\x1b[33m💡 Tip: Very little content extracted. This may be a JavaScript-rendered page.\x1b[0m`);
|
|
753
|
+
console.error(`\x1b[33m Try: webpeel "${url}" --render\x1b[0m`);
|
|
754
|
+
console.error(`\x1b[33m For infinite scroll/SPAs: --action 'scroll:bottom' --action 'wait:2000'\x1b[0m`);
|
|
755
|
+
console.error(`\x1b[33m Or use --stealth if the site blocks bots.\x1b[0m`);
|
|
756
|
+
}
|
|
757
|
+
// Auth wall detection hint
|
|
758
|
+
if (!options.json && result.authRequired) {
|
|
759
|
+
let authHost = url;
|
|
760
|
+
try {
|
|
761
|
+
authHost = new URL(url).hostname.replace('www.', '');
|
|
762
|
+
}
|
|
763
|
+
catch { /* ignore */ }
|
|
764
|
+
console.error('');
|
|
765
|
+
console.error('\x1b[33m🔐 This page requires authentication.\x1b[0m');
|
|
766
|
+
console.error(`\x1b[36m 1. Create a login profile: webpeel profile create ${authHost}\x1b[0m`);
|
|
767
|
+
console.error('\x1b[36m 2. Log in to the site in the browser that opens\x1b[0m');
|
|
768
|
+
console.error('\x1b[36m 3. Press Ctrl+C when done\x1b[0m');
|
|
769
|
+
console.error(`\x1b[36m 4. Re-run with: webpeel "${url}" --profile ${authHost}\x1b[0m`);
|
|
770
|
+
console.error('');
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
// Trust & safety warnings — shown prominently in non-JSON mode
|
|
774
|
+
if (!options.silent && !options.json) {
|
|
775
|
+
const trustData = result.trust;
|
|
776
|
+
const sbData = result.safeBrowsing;
|
|
777
|
+
// Unsafe: safe browsing threats detected
|
|
778
|
+
const allThreats = [
|
|
779
|
+
...(sbData?.threats ?? []),
|
|
780
|
+
...(trustData?.threatFeeds?.threats ?? []),
|
|
781
|
+
].filter((t, i, a) => a.indexOf(t) === i);
|
|
782
|
+
if (sbData && !sbData.safe && allThreats.length > 0) {
|
|
783
|
+
console.error(`\x1b[31m🚨 UNSAFE — Threats detected: ${allThreats.join(', ')}\x1b[0m`);
|
|
784
|
+
}
|
|
785
|
+
else if (trustData?.threatFeeds && !trustData.threatFeeds.safe && trustData.threatFeeds.threats.length > 0) {
|
|
786
|
+
console.error(`\x1b[31m🚨 UNSAFE — Threat feeds flagged: ${trustData.threatFeeds.threats.join(', ')}\x1b[0m`);
|
|
787
|
+
if (trustData.threatFeeds.details) {
|
|
788
|
+
console.error(`\x1b[31m ${trustData.threatFeeds.details}\x1b[0m`);
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
else if (trustData && trustData.score < 0.5) {
|
|
792
|
+
// Low trust score
|
|
793
|
+
const tier = trustData.source?.tier ?? 'unknown';
|
|
794
|
+
const label = trustData.source?.label ?? '';
|
|
795
|
+
const reason = tier === 'suspicious'
|
|
796
|
+
? 'Domain shows suspicious signals'
|
|
797
|
+
: tier === 'new'
|
|
798
|
+
? 'Domain has limited verifiable presence'
|
|
799
|
+
: label || 'Low credibility domain';
|
|
800
|
+
console.error(`\x1b[33m⚠️ Low trust score (${trustData.score.toFixed(2)}) — ${reason}\x1b[0m`);
|
|
801
|
+
}
|
|
802
|
+
// Show any trust warnings
|
|
803
|
+
if (trustData?.warnings && trustData.warnings.length > 0) {
|
|
804
|
+
for (const warn of trustData.warnings) {
|
|
805
|
+
console.error(`\x1b[33m⚠️ ${warn}\x1b[0m`);
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
}
|
|
809
|
+
// Show metadata header
|
|
810
|
+
const pageTitle = result.metadata?.title || result.title;
|
|
811
|
+
if (!options.silent && !options.json && pageTitle) {
|
|
812
|
+
const parts = [];
|
|
813
|
+
if (result.metadata?.author)
|
|
814
|
+
parts.push(`by ${result.metadata.author}`);
|
|
815
|
+
if (result.readability?.readingTime)
|
|
816
|
+
parts.push(result.readability.readingTime);
|
|
817
|
+
if (result.tokens)
|
|
818
|
+
parts.push(`${result.tokens.toLocaleString()} tokens`);
|
|
819
|
+
const subtitle = parts.length ? ` · ${parts.join(' · ')}` : '';
|
|
820
|
+
console.error(`\x1b[36m📄 ${pageTitle}${subtitle}\x1b[0m`);
|
|
821
|
+
}
|
|
822
|
+
// Show usage footer for free/anonymous users
|
|
823
|
+
if (usageCheck.usageInfo && !options.silent) {
|
|
824
|
+
showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, useStealth);
|
|
825
|
+
}
|
|
826
|
+
// Handle screenshot saving
|
|
827
|
+
if (options.screenshot && result.screenshot) {
|
|
828
|
+
const screenshotPath = typeof options.screenshot === 'string'
|
|
829
|
+
? options.screenshot
|
|
830
|
+
: 'screenshot.png';
|
|
831
|
+
const screenshotBuffer = Buffer.from(result.screenshot, 'base64');
|
|
832
|
+
writeFileSync(screenshotPath, screenshotBuffer);
|
|
833
|
+
if (!options.silent) {
|
|
834
|
+
console.error(`Screenshot saved to: ${screenshotPath}`);
|
|
835
|
+
}
|
|
836
|
+
// Remove screenshot from JSON output if saving to file
|
|
837
|
+
if (typeof options.screenshot === 'string') {
|
|
838
|
+
delete result.screenshot;
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
// Store full result in cache (before budget distillation so cache is reusable)
|
|
842
|
+
if (cacheTtlMs && !cacheDisabled) {
|
|
843
|
+
setCache(url, result, cacheTtlMs, {
|
|
844
|
+
render: options.render,
|
|
845
|
+
stealth: useStealth,
|
|
846
|
+
selector: options.selector,
|
|
847
|
+
format: peelOptions.format,
|
|
848
|
+
budget: null, // Budget excluded — cache stores full content, budget applied post-cache
|
|
849
|
+
readable: options.readable || false,
|
|
850
|
+
});
|
|
851
|
+
}
|
|
852
|
+
// Apply smart budget distillation AFTER caching (cache always stores full content)
|
|
853
|
+
// When --agent is set, always apply budget even with --extract-all (listings will be budgeted
|
|
854
|
+
// separately, but if no listings are found the content itself still needs trimming).
|
|
855
|
+
const skipBudgetForExtract = (options.extractAll || options.scrollExtract !== undefined) && !options.agent;
|
|
856
|
+
let contentTruncated = false;
|
|
857
|
+
if (options.budget && options.budget > 0 && !skipBudgetForExtract) {
|
|
858
|
+
const budgetFormat = peelOptions.format === 'text' ? 'text' : 'markdown';
|
|
859
|
+
const distilled = distillToBudget(result.content, options.budget, budgetFormat);
|
|
860
|
+
if (distilled !== result.content) {
|
|
861
|
+
contentTruncated = true;
|
|
862
|
+
result.content = distilled;
|
|
863
|
+
result.tokens = estimateTokens(distilled);
|
|
864
|
+
}
|
|
865
|
+
}
|
|
866
|
+
// --- BM25 Query-Focused Filtering ---
|
|
867
|
+
if (options.focus && result.content) {
|
|
868
|
+
const { filterByRelevance } = await import('../../core/bm25-filter.js');
|
|
869
|
+
const focusResult = filterByRelevance(result.content, { query: options.focus });
|
|
870
|
+
result.content = focusResult.content;
|
|
871
|
+
result.tokens = estimateTokens(focusResult.content);
|
|
872
|
+
if (isJson) {
|
|
873
|
+
result.focusQuery = options.focus;
|
|
874
|
+
result.focusReduction = focusResult.reductionPercent;
|
|
875
|
+
}
|
|
876
|
+
}
|
|
877
|
+
// --- LLM-free Quick Answer ---
|
|
878
|
+
if (options.question && result.content) {
|
|
879
|
+
const { quickAnswer } = await import('../../core/quick-answer.js');
|
|
880
|
+
const qa = quickAnswer({
|
|
881
|
+
question: options.question,
|
|
882
|
+
content: result.content,
|
|
883
|
+
url: result.url,
|
|
884
|
+
});
|
|
885
|
+
result.quickAnswer = qa;
|
|
886
|
+
if (!isJson) {
|
|
887
|
+
// Display answer prominently in human-readable mode
|
|
888
|
+
const conf = (qa.confidence * 100).toFixed(0);
|
|
889
|
+
await writeStdout(`\n\x1b[36m📋 ${qa.question}\x1b[0m\n\n`);
|
|
890
|
+
if (qa.answer) {
|
|
891
|
+
await writeStdout(`\x1b[32m💡 Answer (${conf}% confidence):\x1b[0m\n${qa.answer}\n`);
|
|
892
|
+
}
|
|
893
|
+
else {
|
|
894
|
+
await writeStdout(`\x1b[33m💡 No relevant answer found (${conf}% confidence)\x1b[0m\n`);
|
|
895
|
+
}
|
|
896
|
+
if (qa.passages && qa.passages.length > 1) {
|
|
897
|
+
await writeStdout(`\n\x1b[33m📝 Supporting evidence:\x1b[0m\n`);
|
|
898
|
+
for (const p of qa.passages.slice(1, 4)) {
|
|
899
|
+
await writeStdout(` • [${(p.score * 100).toFixed(0)}%] ${p.text.substring(0, 200)}${p.text.length > 200 ? '...' : ''}\n`);
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
await writeStdout('\n');
|
|
903
|
+
await cleanup();
|
|
904
|
+
process.exit(0);
|
|
905
|
+
}
|
|
906
|
+
}
|
|
907
|
+
// --- RAG Chunking output (chunks come from pipeline via peelOptions.chunk) ---
|
|
908
|
+
if (result.chunks && result.chunks.length > 0 && !isJson) {
|
|
909
|
+
console.log(`\n${'─'.repeat(60)}`);
|
|
910
|
+
console.log(`📦 ${result.chunks.length} chunks (${options.chunkStrategy || 'section'} strategy)\n`);
|
|
911
|
+
for (const chunk of result.chunks) {
|
|
912
|
+
const sectionLabel = chunk.section ? ` [${chunk.section}]` : '';
|
|
913
|
+
console.log(`── Chunk ${chunk.index + 1}${sectionLabel} (${chunk.tokenCount} tokens, ${chunk.wordCount} words) ──`);
|
|
914
|
+
console.log(chunk.text.substring(0, 200) + (chunk.text.length > 200 ? '...' : ''));
|
|
915
|
+
console.log('');
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
// --- #4: Content quality warning ---
|
|
919
|
+
const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
|
|
920
|
+
const isRedirect = false; // peel() follows redirects — final result is always 200
|
|
921
|
+
if (result.tokens < 20 && !useRender && isHtmlContent && !isRedirect) {
|
|
922
|
+
const warningMsg = `Low content detected (${result.tokens} tokens). Try: webpeel ${url} --render`;
|
|
923
|
+
if (isJson) {
|
|
924
|
+
result.warning = warningMsg;
|
|
925
|
+
}
|
|
926
|
+
else {
|
|
927
|
+
console.error(`⚠ ${warningMsg}`);
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
// --- Structured schema extraction (--extract with type schema or --extract-prompt) ---
|
|
931
|
+
if (options._structuredSchema || options.extractPrompt) {
|
|
932
|
+
const { extractStructured, simpleToExtractionSchema } = await import('../../core/structured-extract.js');
|
|
933
|
+
const rawSchema = options._structuredSchema;
|
|
934
|
+
const schema = rawSchema
|
|
935
|
+
? simpleToExtractionSchema(rawSchema)
|
|
936
|
+
: { type: 'object', properties: { result: { type: 'string', description: options.extractPrompt } } };
|
|
937
|
+
const strResult = await extractStructured(result.content, schema, undefined, // No LLM config — use heuristic (no key needed)
|
|
938
|
+
options.extractPrompt);
|
|
939
|
+
if (isJson) {
|
|
940
|
+
await writeStdout(JSON.stringify({
|
|
941
|
+
success: true,
|
|
942
|
+
data: strResult.data,
|
|
943
|
+
confidence: strResult.confidence,
|
|
944
|
+
method: 'heuristic',
|
|
945
|
+
}, null, 2) + '\n');
|
|
946
|
+
}
|
|
947
|
+
else {
|
|
948
|
+
await writeStdout(JSON.stringify(strResult.data, null, 2) + '\n');
|
|
949
|
+
if (!options.silent) {
|
|
950
|
+
console.error(`\n📊 Structured extraction: confidence=${(strResult.confidence * 100).toFixed(0)}% (heuristic)`);
|
|
951
|
+
}
|
|
952
|
+
}
|
|
953
|
+
await cleanup();
|
|
954
|
+
process.exit(0);
|
|
955
|
+
}
|
|
956
|
+
// --- LLM-based extraction (post-peel) ---
|
|
957
|
+
if (options.llmExtract || options.extractSchema) {
|
|
958
|
+
const { extractWithLLM } = await import('../../core/llm-extract.js');
|
|
959
|
+
const llmCfg = loadConfig();
|
|
960
|
+
const llmApiKey = options.llmKey || llmCfg.llm?.apiKey || process.env.OPENAI_API_KEY;
|
|
961
|
+
const llmModel = options.llmModel || llmCfg.llm?.model || process.env.WEBPEEL_LLM_MODEL || 'gpt-4o-mini';
|
|
962
|
+
const llmBaseUrl = options.llmBaseUrl || llmCfg.llm?.baseUrl || process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1';
|
|
963
|
+
const llmInstruction = typeof options.llmExtract === 'string' ? options.llmExtract : undefined;
|
|
964
|
+
// Parse --extract-schema if provided
|
|
965
|
+
let llmSchema;
|
|
966
|
+
if (options.extractSchema) {
|
|
967
|
+
let schemaStr = options.extractSchema;
|
|
968
|
+
if (schemaStr.startsWith('@')) {
|
|
969
|
+
schemaStr = readFileSync(schemaStr.slice(1), 'utf-8');
|
|
970
|
+
}
|
|
971
|
+
try {
|
|
972
|
+
llmSchema = JSON.parse(schemaStr);
|
|
973
|
+
}
|
|
974
|
+
catch {
|
|
975
|
+
exitWithJsonError('--extract-schema must be valid JSON or a valid @file.json path', 'FETCH_FAILED');
|
|
976
|
+
}
|
|
977
|
+
}
|
|
978
|
+
const llmResult = await extractWithLLM({
|
|
979
|
+
content: result.content,
|
|
980
|
+
instruction: llmInstruction,
|
|
981
|
+
schema: llmSchema,
|
|
982
|
+
apiKey: llmApiKey,
|
|
983
|
+
model: llmModel,
|
|
984
|
+
baseUrl: llmBaseUrl,
|
|
985
|
+
});
|
|
986
|
+
// Output structured items as JSON
|
|
987
|
+
await writeStdout(JSON.stringify(llmResult.items, null, 2) + '\n');
|
|
988
|
+
// Show token usage and estimated cost
|
|
989
|
+
if (!options.silent) {
|
|
990
|
+
const { input, output } = llmResult.tokensUsed;
|
|
991
|
+
const costStr = llmResult.cost !== undefined
|
|
992
|
+
? ` | Est. cost: $${llmResult.cost.toFixed(6)}`
|
|
993
|
+
: '';
|
|
994
|
+
console.error(`\n🤖 LLM extraction: ${llmResult.items.length} items | ${input} input + ${output} output tokens${costStr} | model: ${llmResult.model}`);
|
|
995
|
+
}
|
|
996
|
+
await cleanup();
|
|
997
|
+
process.exit(0);
|
|
998
|
+
}
|
|
999
|
+
// --- Extract-all / pagination / output formatting ---
|
|
1000
|
+
const wantsExtractAll = options.extractAll || options.scrollExtract !== undefined;
|
|
1001
|
+
const pagesCount = Math.min(Math.max(options.pages || 1, 1), 10);
|
|
1002
|
+
if (wantsExtractAll) {
|
|
1003
|
+
const { extractListings } = await import('../../core/extract-listings.js');
|
|
1004
|
+
const { findNextPageUrl } = await import('../../core/paginate.js');
|
|
1005
|
+
const { findSchemaForUrl, extractWithSchema, loadBundledSchemas } = await import('../../core/schema-extraction.js');
|
|
1006
|
+
// Resolve which schema to use (explicit --schema flag or auto-detect)
|
|
1007
|
+
let activeSchema = null;
|
|
1008
|
+
if (options.schema) {
|
|
1009
|
+
// Find schema by name or domain match
|
|
1010
|
+
const schemaQuery = options.schema.toLowerCase();
|
|
1011
|
+
const allSchemas = loadBundledSchemas();
|
|
1012
|
+
activeSchema = allSchemas.find(s => s.name.toLowerCase().includes(schemaQuery) ||
|
|
1013
|
+
s.domains.some(d => d.toLowerCase().includes(schemaQuery))) ?? null;
|
|
1014
|
+
if (!activeSchema && !options.silent) {
|
|
1015
|
+
console.error(`Warning: No schema found for "${options.schema}", falling back to auto-detection`);
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
else {
|
|
1019
|
+
// Auto-detect from URL
|
|
1020
|
+
activeSchema = findSchemaForUrl(result.url || url);
|
|
1021
|
+
}
|
|
1022
|
+
// We need the raw HTML for extraction. Re-fetch with format=html if needed.
|
|
1023
|
+
let allListings = [];
|
|
1024
|
+
// Fetch HTML for extraction
|
|
1025
|
+
const htmlResult = peelOptions.format === 'html'
|
|
1026
|
+
? result
|
|
1027
|
+
: await peel(url, { ...peelOptions, format: 'html', maxTokens: undefined });
|
|
1028
|
+
// Try schema extraction first, fall back to generic
|
|
1029
|
+
if (activeSchema) {
|
|
1030
|
+
const schemaListings = extractWithSchema(htmlResult.content, activeSchema, result.url);
|
|
1031
|
+
if (schemaListings.length > 0) {
|
|
1032
|
+
allListings.push(...schemaListings);
|
|
1033
|
+
}
|
|
1034
|
+
else {
|
|
1035
|
+
// Schema returned nothing — fall back to generic
|
|
1036
|
+
allListings.push(...extractListings(htmlResult.content, result.url));
|
|
1037
|
+
}
|
|
1038
|
+
}
|
|
1039
|
+
else {
|
|
1040
|
+
allListings.push(...extractListings(htmlResult.content, result.url));
|
|
1041
|
+
}
|
|
1042
|
+
// Pagination: follow "Next" links
|
|
1043
|
+
if (pagesCount > 1) {
|
|
1044
|
+
let currentHtml = htmlResult.content;
|
|
1045
|
+
let currentUrl = result.url;
|
|
1046
|
+
for (let page = 1; page < pagesCount; page++) {
|
|
1047
|
+
const nextUrl = findNextPageUrl(currentHtml, currentUrl);
|
|
1048
|
+
if (!nextUrl)
|
|
1049
|
+
break;
|
|
1050
|
+
try {
|
|
1051
|
+
const nextResult = await peel(nextUrl, { ...peelOptions, format: 'html', maxTokens: undefined });
|
|
1052
|
+
let pageListings;
|
|
1053
|
+
if (activeSchema) {
|
|
1054
|
+
const schemaPage = extractWithSchema(nextResult.content, activeSchema, nextResult.url);
|
|
1055
|
+
pageListings = schemaPage.length > 0
|
|
1056
|
+
? schemaPage
|
|
1057
|
+
: extractListings(nextResult.content, nextResult.url);
|
|
1058
|
+
}
|
|
1059
|
+
else {
|
|
1060
|
+
pageListings = extractListings(nextResult.content, nextResult.url);
|
|
1061
|
+
}
|
|
1062
|
+
allListings.push(...pageListings);
|
|
1063
|
+
currentHtml = nextResult.content;
|
|
1064
|
+
currentUrl = nextResult.url;
|
|
1065
|
+
}
|
|
1066
|
+
catch {
|
|
1067
|
+
break; // Stop paginating on error
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
// Apply budget to listings if requested
|
|
1072
|
+
let listingsTruncated = false;
|
|
1073
|
+
let totalAvailableListings;
|
|
1074
|
+
if (options.budget && options.budget > 0 && allListings.length > 0) {
|
|
1075
|
+
const { maxItems, truncated, totalAvailable } = budgetListings(allListings.length, options.budget);
|
|
1076
|
+
if (truncated) {
|
|
1077
|
+
listingsTruncated = true;
|
|
1078
|
+
totalAvailableListings = totalAvailable;
|
|
1079
|
+
allListings = allListings.slice(0, maxItems);
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
// Output based on format flags
|
|
1083
|
+
if (options.csv) {
|
|
1084
|
+
const csvOutput = formatListingsCsv(allListings);
|
|
1085
|
+
await writeStdout(csvOutput);
|
|
1086
|
+
}
|
|
1087
|
+
else if (options.table) {
|
|
1088
|
+
const { formatTable } = await import('../../core/table-format.js');
|
|
1089
|
+
const tableRows = allListings.map(item => {
|
|
1090
|
+
const row = {};
|
|
1091
|
+
for (const [k, v] of Object.entries(item)) {
|
|
1092
|
+
if (v !== undefined)
|
|
1093
|
+
row[k] = v;
|
|
1094
|
+
}
|
|
1095
|
+
return row;
|
|
1096
|
+
});
|
|
1097
|
+
await writeStdout(formatTable(tableRows) + '\n');
|
|
1098
|
+
}
|
|
1099
|
+
else if (isJson) {
|
|
1100
|
+
// Use unified envelope for JSON output
|
|
1101
|
+
const structured = allListings;
|
|
1102
|
+
const envelope = buildEnvelope(result, {
|
|
1103
|
+
cached: false,
|
|
1104
|
+
structured,
|
|
1105
|
+
truncated: listingsTruncated || undefined,
|
|
1106
|
+
totalAvailable: totalAvailableListings,
|
|
1107
|
+
});
|
|
1108
|
+
// Also include legacy fields for backward compat
|
|
1109
|
+
envelope.listings = allListings;
|
|
1110
|
+
envelope.count = allListings.length;
|
|
1111
|
+
await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
|
|
1112
|
+
}
|
|
1113
|
+
else {
|
|
1114
|
+
// Formatted text output
|
|
1115
|
+
if (allListings.length === 0) {
|
|
1116
|
+
await writeStdout('No listings found.\n');
|
|
1117
|
+
}
|
|
1118
|
+
else {
|
|
1119
|
+
const truncNote = listingsTruncated && totalAvailableListings
|
|
1120
|
+
? ` (${totalAvailableListings} total — budget limited to ${allListings.length})`
|
|
1121
|
+
: '';
|
|
1122
|
+
await writeStdout(`Found ${allListings.length} listings${truncNote}:\n\n`);
|
|
1123
|
+
allListings.forEach((item, i) => {
|
|
1124
|
+
const pricePart = item.price ? ` — ${item.price}` : '';
|
|
1125
|
+
const line = `${i + 1}. ${item.title}${pricePart}\n`;
|
|
1126
|
+
process.stdout.write(line);
|
|
1127
|
+
if (item.link) {
|
|
1128
|
+
process.stdout.write(` ${item.link}\n`);
|
|
1129
|
+
}
|
|
1130
|
+
process.stdout.write('\n');
|
|
1131
|
+
});
|
|
1132
|
+
}
|
|
1133
|
+
}
|
|
1134
|
+
}
|
|
1135
|
+
else if (options.csv || options.table) {
|
|
1136
|
+
// CSV / table output for --extract (CSS selector extraction)
|
|
1137
|
+
if (result.extracted) {
|
|
1138
|
+
const rows = normaliseExtractedToRows(result.extracted);
|
|
1139
|
+
if (options.csv) {
|
|
1140
|
+
await writeStdout(formatListingsCsv(rows));
|
|
1141
|
+
}
|
|
1142
|
+
else {
|
|
1143
|
+
const { formatTable } = await import('../../core/table-format.js');
|
|
1144
|
+
await writeStdout(formatTable(rows) + '\n');
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
else {
|
|
1148
|
+
console.error('--csv / --table require --extract-all or --extract to produce structured data.');
|
|
1149
|
+
}
|
|
1150
|
+
}
|
|
1151
|
+
else {
|
|
1152
|
+
// --- BM25 Schema Template Extraction (no LLM needed) ---
|
|
1153
|
+
if (options.schema && result.content) {
|
|
1154
|
+
const { getSchemaTemplate: getSchTmpl } = await import('../../core/schema-templates.js');
|
|
1155
|
+
const schTemplate = getSchTmpl(options.schema);
|
|
1156
|
+
if (schTemplate) {
|
|
1157
|
+
const { quickAnswer: qa } = await import('../../core/quick-answer.js');
|
|
1158
|
+
const { smartExtractSchemaFields } = await import('../../core/schema-postprocess.js');
|
|
1159
|
+
const extracted = smartExtractSchemaFields(result.content, schTemplate.fields, qa, {
|
|
1160
|
+
pageTitle: result.title,
|
|
1161
|
+
pageUrl: result.url,
|
|
1162
|
+
metadata: result.metadata,
|
|
1163
|
+
});
|
|
1164
|
+
result.extracted = extracted;
|
|
1165
|
+
}
|
|
1166
|
+
}
|
|
1167
|
+
// --content-only: output raw content only, no wrapper
|
|
1168
|
+
if (options.contentOnly) {
|
|
1169
|
+
await writeStdout(result.content + '\n');
|
|
1170
|
+
}
|
|
1171
|
+
else {
|
|
1172
|
+
// Output results (default path)
|
|
1173
|
+
await outputResult(result, options, {
|
|
1174
|
+
cached: false,
|
|
1175
|
+
truncated: contentTruncated || undefined,
|
|
1176
|
+
});
|
|
1177
|
+
// Token savings display (our unique selling point)
|
|
1178
|
+
if (!options.json && !options.silent && result.tokenSavingsPercent) {
|
|
1179
|
+
const savings = result.tokenSavingsPercent;
|
|
1180
|
+
const raw = result.rawTokenEstimate;
|
|
1181
|
+
const optimized = result.tokens || 0;
|
|
1182
|
+
if (savings > 0) {
|
|
1183
|
+
const rawStr = raw ? `${raw.toLocaleString()}→${optimized.toLocaleString()} tokens` : `${optimized.toLocaleString()} tokens`;
|
|
1184
|
+
process.stderr.write(`\x1b[32m💰 Token savings: ${savings}% smaller than raw HTML (${rawStr})\x1b[0m\n`);
|
|
1185
|
+
}
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
// Clean up and exit
|
|
1190
|
+
await cleanup();
|
|
1191
|
+
process.exit(0);
|
|
1192
|
+
}
|
|
1193
|
+
catch (error) {
|
|
1194
|
+
if (autoProgressInterval)
|
|
1195
|
+
clearInterval(autoProgressInterval);
|
|
1196
|
+
if (spinner) {
|
|
1197
|
+
spinner.fail('Failed to fetch');
|
|
1198
|
+
}
|
|
1199
|
+
// --- #6: Consistent JSON error output ---
|
|
1200
|
+
if (isJson) {
|
|
1201
|
+
const errMsg = error instanceof Error ? error.message : 'Unknown error';
|
|
1202
|
+
const errCode = classifyErrorCode(error);
|
|
1203
|
+
await writeStdout(JSON.stringify({ success: false, error: { type: errCode.toLowerCase(), message: errMsg } }) + '\n');
|
|
1204
|
+
await cleanup();
|
|
1205
|
+
process.exit(1);
|
|
1206
|
+
}
|
|
1207
|
+
if (error instanceof Error) {
|
|
1208
|
+
console.error('\n' + formatError(error, url || '', options));
|
|
1209
|
+
}
|
|
1210
|
+
else {
|
|
1211
|
+
console.error('\x1b[31m✖ Unknown error occurred\x1b[0m');
|
|
1212
|
+
}
|
|
1213
|
+
await cleanup();
|
|
1214
|
+
process.exit(1);
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
// ─── registerFetchCommands ───────────────────────────────────────────────────
|
|
1218
|
+
/**
 * Register the fetch-related commands on the CLI program:
 *   - the default command (`webpeel [url]`) with the full option set,
 *   - `read <url>`  — reader mode with a default token budget of 4000,
 *   - `pipe <url>`  — same as the default command with JSON + silent forced on.
 *
 * @param {object} program - Chainable command builder exposing `argument`,
 *   `option`, `command`, `description` and `action` (presumably a Commander
 *   `Command` instance — confirm against the caller).
 * @returns {void}
 */
export function registerFetchCommands(program) {
    // Option parsers are invoked as (value, previousValue). Passing `parseInt`
    // directly would treat the previous value as the radix and return NaN when
    // a flag is repeated or has a default, so always pin the radix explicitly.
    const toInt = (value) => Number.parseInt(value, 10);
    // Token budget used by `read` when the user does not supply a usable one.
    const DEFAULT_READ_BUDGET = 4000;
    // ── Default command: fetch a URL ─────────────────────────────────────────
    program
        .argument('[url]', 'URL to fetch')
        .option('-r, --render', 'Use headless browser (for JS-heavy sites)')
        .option('--stealth', 'Use stealth mode to bypass bot detection (auto-enables --render)')
        .option('--cloaked', 'Use CloakBrowser stealth (requires: npm install cloakbrowser)')
        .option('--tls', 'Use PeelTLS TLS fingerprint spoofing (built-in, no install needed)')
        .option('--cycle', 'Use PeelTLS TLS fingerprint spoofing (alias for --tls)', false)
        .option('--proxy <url>', 'Proxy URL for requests (http://host:port, socks5://user:pass@host:port)')
        .option('--proxies <urls>', 'Comma-separated list of proxy URLs for rotation (tried in order on failure)', (val) => val.split(',').map((s) => s.trim()).filter(Boolean))
        .option('-w, --wait <ms>', 'Wait time after page load (ms)', toInt)
        .option('--html', 'Output raw HTML instead of markdown')
        .option('--text', 'Output plain text instead of markdown')
        .option('--clean', 'Clean output — article content only, no links or metadata (alias for --readable with URL-stripped markdown)')
        .option('--json', 'Output as JSON')
        .option('-t, --timeout <ms>', 'Request timeout (ms)', toInt, 30000)
        .option('--ua <agent>', 'Custom user agent')
        .option('-s, --silent', 'Silent mode (no spinner)')
        .option('--screenshot [path]', 'Take a screenshot (optionally save to file path)')
        .option('--full-page', 'Full-page screenshot (use with --screenshot)')
        .option('--selector <css>', 'CSS selector to extract (e.g., "article", ".content")')
        .option('--exclude <selectors...>', 'CSS selectors to exclude (e.g., ".sidebar" ".ads")')
        .option('--include-tags <tags>', 'Comma-separated HTML tags/selectors to include (e.g., "main,article,.content")')
        .option('--exclude-tags <tags>', 'Comma-separated HTML tags/selectors to exclude (e.g., "nav,footer,aside")')
        .option('--only-main-content', 'Shortcut for --include-tags main,article')
        .option('--full-content', 'Return full page content (disable automatic content density pruning)')
        .option('--readable', 'Reader mode — extract only the main article content, strip all noise (like browser Reader Mode)')
        .option('--full-nav', 'Keep full navigation/content (disable auto-readability when piped or in agent mode)')
        .option('--focus <query>', 'Query-focused filtering — only return content relevant to this query (BM25 ranking)')
        .option('--chunk', 'Split content into RAG-ready chunks')
        .option('--chunk-size <tokens>', 'Max tokens per chunk (default: 512)', toInt)
        .option('--chunk-overlap <tokens>', 'Overlap tokens between chunks (default: 50)', toInt)
        .option('--chunk-strategy <strategy>', 'Chunking strategy: section (default), paragraph, fixed')
        .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
        .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
        .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
        .option('--no-cache', 'Disable automatic caching for this request')
        .option('--links', 'Output only the links found on the page')
        .option('--images', 'Output image URLs from the page')
        .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
        .option('--raw', 'Return full page without smart content extraction')
        .option('--skip-domain-api', 'Bypass domain-specific API extractors — force actual page scraping')
        .option('--full', 'Alias for --raw — full page content, no budget')
        .option('--lite', 'Lite mode — minimal processing, maximum speed (skip pruning, budget, metadata)')
        .option('--action <actions...>', 'Page actions before scraping (e.g., "click:.btn" "wait:2000" "scroll:bottom")')
        .option('--extract <json>', 'Extract structured data using CSS selectors or type schema (e.g., \'{"title": "h1"}\' for CSS, \'{"name": "string"}\' for schema)')
        .option('--extract-prompt <prompt>', 'Natural language prompt for structured extraction (no LLM key needed — uses heuristics)')
        .option('--llm-extract [instruction]', 'Extract structured data using LLM (optional instruction, e.g. "extract hotel names and prices")')
        .option('--extract-schema <schema>', 'JSON schema for structured extraction (requires LLM key). Pass inline JSON or @file.json')
        .option('--llm-key <key>', 'LLM API key for AI features (or use OPENAI_API_KEY env var)')
        .option('--llm-model <model>', 'LLM model to use (default: gpt-4o-mini)')
        .option('--llm-base-url <url>', 'LLM API base URL (default: https://api.openai.com/v1)')
        .option('--summary', 'Generate AI summary of content (requires --llm-key or OPENAI_API_KEY)')
        .option('--location <country>', 'ISO country code for geo-targeting (e.g., "US", "DE", "JP")')
        .option('--language <lang>', 'Language preference (e.g., "en", "de", "ja")')
        .option('--max-tokens <n>', 'Maximum token count for output (truncate if exceeded)', toInt)
        .option('--budget <n>', 'Smart token budget — distill content to fit within N tokens (heuristic, no LLM key needed)', toInt)
        .option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
        .option('--schema <name>', 'Force a specific extraction schema by name or domain (e.g., "booking.com", "amazon")')
        .option('--list-schemas', 'List all available extraction schemas and their supported domains')
        .option('--scroll-extract [count]', 'Scroll page N times to load lazy content (bare flag = smart auto-scroll until stable), then extract (implies --render)', toInt)
        .option('--scroll-extract-timeout <ms>', 'Total timeout in ms for auto-scroll (default: 30000, only used with bare --scroll-extract)', toInt)
        .option('--csv', 'Output extraction results as CSV')
        .option('--table', 'Output extraction results as a formatted table')
        .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', toInt)
        .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
        .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
        .option('-q, --question <q>', 'Ask a question about the page content (BM25-powered, no LLM key needed)')
        .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
        .option('--device <type>', 'Device emulation: desktop (default), mobile, tablet (auto-enables --render)')
        .option('--viewport <WxH>', 'Browser viewport size (e.g., "1920x1080") (auto-enables --render)', (val) => {
            // "WxH" → { width, height }; a malformed value yields NaN/undefined fields.
            const [w, h] = val.split('x').map(Number);
            return { width: w, height: h };
        })
        .option('--scale <factor>', 'Device scale factor (pixel density) for screenshots (default: auto from device profile)', parseFloat)
        .option('--wait-until <event>', 'Page load event: domcontentloaded, networkidle, load, commit (auto-enables --render)')
        .option('--wait-selector <css>', 'Wait for CSS selector before extracting (auto-enables --render)')
        .option('--block-resources <types>', 'Block resource types, comma-separated: image,stylesheet,font,media,script (auto-enables --render)')
        .option('--format <type>', 'Output format: markdown (default), text, html, json')
        .option('--content-only', 'Output only the raw content field (no metadata, no JSON wrapper) — ideal for piping to LLMs')
        .option('--progress', 'Show engine escalation steps (simple → browser → stealth) with timing')
        .option('--stdin', 'Read HTML from stdin instead of fetching a URL — converts to markdown')
        .option('--export <format>', 'Export YouTube transcript in the given format: srt, txt, md, json')
        .option('--output <file>', 'Write output to a file instead of stdout')
        .action(async (url, options) => {
            // --stdin takes precedence over any URL argument.
            if (options.stdin) {
                await runStdin(options);
                return;
            }
            await runFetch(url, options);
        });
    // ── read subcommand (explicit readable mode) ─────────────────────────────
    program
        .command('read <url>')
        .description('Read a page in clean reader mode (like browser Reader View)')
        .option('--json', 'Output as JSON')
        .option('-s, --silent', 'Silent mode')
        .option('--budget <n>', 'Token budget (default: 4000)', toInt)
        .option('--focus <query>', 'Focus on content relevant to this query')
        .option('--highlight-query <query>', 'Extract only passages relevant to this query (BM25-powered)')
        .option('--highlight-max-chars <n>', 'Max characters for highlights (default: 1000)', toInt)
        .action(async (url, opts) => {
            await runFetch(url, {
                ...opts,
                readable: true,
                // Honour an explicit --budget; fall back to the documented default
                // only when the flag is absent or did not parse to a number.
                budget: Number.isFinite(opts.budget) ? opts.budget : DEFAULT_READ_BUDGET,
            });
        });
    // ── pipe subcommand — always JSON, no UI (agent-friendly) ────────────────
    program
        .command('pipe <url>')
        .description('Pipe-friendly fetch (always JSON, no UI). Alias for: webpeel <url> --json --silent')
        .option('-r, --render', 'Use headless browser')
        .option('--stealth', 'Stealth mode')
        .option('--budget <n>', 'Token budget', toInt)
        .option('--clean', 'Clean format for AI')
        .option('-q, --question <q>', 'Quick answer')
        .option('--proxy <url>', 'Proxy URL')
        .option('--timeout <ms>', 'Timeout in ms', toInt)
        .option('-s, --silent', 'Silent mode (always on for pipe, accepted for compatibility)')
        .action(async (url, opts) => {
            // Force JSON + silent — always, unconditionally
            await runFetch(url, { ...opts, json: true, silent: true });
        });
}
|