@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,1629 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search provider abstraction
|
|
3
|
+
*
|
|
4
|
+
* WebPeel supports multiple web search backends. DuckDuckGo is the default
|
|
5
|
+
* (no API key required). The StealthSearchProvider uses WebPeel's own stealth
|
|
6
|
+
* browser to scrape multiple search engines in parallel — fully self-hosted,
|
|
7
|
+
* no external API keys required.
|
|
8
|
+
*
|
|
9
|
+
* Provider fallback chain (DDG):
|
|
10
|
+
* DDG HTTP → DDG Lite → Brave (if key) → StealthSearchProvider (Bing + Ecosia)
|
|
11
|
+
*
|
|
12
|
+
* In production with no API keys configured, getBestSearchProvider() returns
|
|
13
|
+
* StealthSearchProvider since DDG HTTP is often blocked on datacenter IPs.
|
|
14
|
+
*/
|
|
15
|
+
import { fetch as undiciFetch, ProxyAgent } from 'undici';
|
|
16
|
+
import { load } from 'cheerio';
|
|
17
|
+
import { getStealthBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { getWebshareProxy, getWebshareProxyUrl } from './proxy-config.js';
|
|
19
|
+
import { createLogger } from './logger.js';
|
|
20
|
+
import { searchViaSearXNG } from './searxng-provider.js';
|
|
21
|
+
// Module-scoped logger, created through createLogger with the 'search' label.
const log = createLogger('search');
|
|
22
|
+
/**
 * Decode the handful of HTML entities that occasionally leak into scraped
 * search-result text.
 *
 * Supported forms:
 *   - named:   `&nbsp;` `&amp;` `&lt;` `&gt;` `&quot;` (case-insensitive)
 *   - decimal: `&#39;`, `&#8230;`, ...
 *   - hex:     `&#x27;`, `&#X1F600;`, ...
 *
 * Decoding is done in a SINGLE pass so the output of one replacement is never
 * re-scanned. A chain of sequential `.replace()` calls would turn the escaped
 * text `&amp;lt;` into `<` (double decoding) instead of the correct `&lt;`.
 *
 * Numeric references outside the Unicode range (> 0x10FFFF) and anything that
 * is not a recognised entity are left untouched.
 *
 * @param {string} input - Text that may contain HTML entities.
 * @returns {string} The text with recognised entities decoded.
 */
function decodeHtmlEntities(input) {
    // `&nbsp;` is deliberately mapped to a plain space: callers collapse
    // whitespace afterwards, so a non-breaking space would only get in the way.
    const namedEntities = { nbsp: ' ', amp: '&', lt: '<', gt: '>', quot: '"' };
    const entityPattern = /&(?:(nbsp|amp|lt|gt|quot)|#x([0-9a-f]+)|#(\d+));/gi;
    return input.replace(entityPattern, (match, name, hex, dec) => {
        if (name !== undefined) {
            return namedEntities[name.toLowerCase()];
        }
        const codePoint = hex !== undefined
            ? Number.parseInt(hex, 16)
            : Number.parseInt(dec, 10);
        // Very long digit runs parse to Infinity; out-of-range values would make
        // String.fromCodePoint throw. Keep the original text in both cases.
        if (!Number.isFinite(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
            return match;
        }
        return String.fromCodePoint(codePoint);
    });
}
|
|
55
|
+
/**
 * Normalise a scraped title/snippet: decode HTML entities, collapse runs of
 * whitespace to single spaces, optionally strip leading/trailing ellipsis
 * padding, and cap the length.
 *
 * @param {string} input - Raw text.
 * @param {{ maxLen?: number, stripEllipsisPadding?: boolean }} [opts]
 *   `stripEllipsisPadding` removes leading/trailing "..." or "…" padding;
 *   `maxLen` caps the result length in UTF-16 code units. When `opts` (or
 *   `maxLen`) is omitted no truncation is applied.
 * @returns {string} The cleaned text.
 */
function cleanText(input, opts) {
    // Tolerate a missing options bag instead of throwing on property access.
    const { stripEllipsisPadding, maxLen } = opts ?? {};
    let text = decodeHtmlEntities(input).replace(/\s+/g, ' ').trim();
    if (stripEllipsisPadding) {
        // Remove leading/trailing "..." or Unicode ellipsis padding.
        text = text
            .replace(/^(?:\.{3,}|…)+\s*/, '')
            .replace(/\s*(?:\.{3,}|…)+$/, '')
            .trim();
    }
    if (text.length > maxLen) {
        let end = maxLen;
        if (end > 0) {
            const before = text.charCodeAt(end - 1);
            const after = text.charCodeAt(end);
            const cutsSurrogatePair = before >= 0xd800 && before <= 0xdbff &&
                after >= 0xdc00 && after <= 0xdfff;
            // Never leave a dangling high surrogate (half an emoji / astral
            // character) at the end of the truncated string.
            if (cutsSurrogatePair) {
                end -= 1;
            }
        }
        text = text.slice(0, end);
    }
    return text;
}
|
|
69
|
+
/**
 * Resolve a DuckDuckGo result href to the URL it ultimately points at.
 *
 * - Protocol-relative hrefs (`//host/...`) are given an `https:` scheme.
 * - `duckduckgo.com/l/?uddg=<target>` redirects yield the `uddg` target.
 * - Any other `duckduckgo.com` URL (e.g. `/y.js` ad redirects) yields `''`.
 * - Input that cannot be parsed as a URL is returned unchanged.
 *
 * @param {string} rawUrl - The href as found in the results page.
 * @returns {string} Destination URL, or `''` for DuckDuckGo-internal links.
 */
function decodeDdgUrl(rawUrl) {
    let absolute;
    let parsed;
    try {
        absolute = rawUrl.startsWith('//') ? 'https:' + rawUrl : rawUrl;
        parsed = new URL(absolute);
    }
    catch {
        return rawUrl;
    }
    if (parsed.hostname !== 'duckduckgo.com') {
        return absolute;
    }
    // searchParams.get() hands back the value already percent-decoded.
    const target = parsed.pathname === '/l/' ? parsed.searchParams.get('uddg') : null;
    return target || '';
}
|
|
89
|
+
/**
 * Decide whether a URL is a DuckDuckGo ad or tracking link.
 *
 * A URL is flagged when its host is exactly `duckduckgo.com`, or when its
 * query string carries any of the `ad_domain`, `ad_provider` or `ad_type`
 * parameters. Unparseable input is never flagged.
 *
 * @param {string} url - Candidate result URL.
 * @returns {boolean} `true` when the URL should be treated as an ad link.
 */
function isDdgAdUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return false;
    }
    if (parsed.hostname === 'duckduckgo.com') {
        return true;
    }
    const adParams = ['ad_domain', 'ad_provider', 'ad_type'];
    return adParams.some((key) => parsed.searchParams.has(key));
}
|
|
107
|
+
/**
 * Decide whether a result snippet is DuckDuckGo ad copy.
 *
 * Matches when the snippet contains the "Ad ·" marker or DuckDuckGo's ad
 * privacy notice anywhere, or starts with "ad ·" in any letter case.
 *
 * @param {string} snippet - Snippet text of a search result.
 * @returns {boolean} `true` when the snippet looks like an ad.
 */
function isDdgAdSnippet(snippet) {
    const adMarkers = [
        'Ad ·',
        'Ad Viewing ads is privacy protected by DuckDuckGo',
    ];
    if (adMarkers.some((marker) => snippet.includes(marker))) {
        return true;
    }
    return snippet.toLowerCase().startsWith('ad ·');
}
|
|
113
|
+
/**
 * Sliding-window success/failure bookkeeping per search source.
 *
 * Each source keeps at most `windowSize` most-recent attempts. A source is
 * recommended for skipping once its failure rate over attempts newer than
 * `decayMs` reaches `failThreshold`, provided at least `minSamples` such
 * attempts exist.
 */
class ProviderStatsTracker {
    history = new Map();
    windowSize;
    failThreshold;
    minSamples;
    decayMs; // attempts older than this many ms are ignored by getFailureRate
    /**
     * @param {number} [windowSize=10] - Max attempts remembered per source.
     * @param {number} [failThreshold=0.8] - Failure rate at which to skip.
     * @param {number} [minSamples=5] - Attempts required before judging.
     * @param {number} [decayMs=300000] - Age limit (ms) for counted attempts;
     *   the 5-minute default keeps old failures from locking a provider out.
     */
    constructor(windowSize = 10, failThreshold = 0.8, minSamples = 5, decayMs = 5 * 60 * 1000) {
        Object.assign(this, { windowSize, failThreshold, minSamples, decayMs });
    }
    /**
     * Append one attempt outcome for `sourceId`, trimming the oldest entries
     * so that no more than `windowSize` remain.
     */
    record(sourceId, success) {
        const attempts = this.history.get(sourceId) ?? [];
        attempts.push({ success, ts: Date.now() });
        const overflow = attempts.length - this.windowSize;
        if (overflow > 0) {
            attempts.splice(0, overflow);
        }
        this.history.set(sourceId, attempts);
    }
    /**
     * Failure rate (0–1) over the attempts recorded within the last `decayMs`.
     * Yields 0 when the source is unknown or when fewer than `minSamples`
     * attempts (in total, or within the decay window) are available.
     */
    getFailureRate(sourceId) {
        const attempts = this.history.get(sourceId);
        if (!attempts || attempts.length < this.minSamples) {
            return 0;
        }
        const cutoff = Date.now() - this.decayMs;
        let recentCount = 0;
        let recentFailures = 0;
        for (const attempt of attempts) {
            if (attempt.ts >= cutoff) {
                recentCount += 1;
                if (!attempt.success) {
                    recentFailures += 1;
                }
            }
        }
        if (recentCount < this.minSamples) {
            return 0; // too few fresh samples to judge
        }
        return recentFailures / recentCount;
    }
    /** True when the recent failure rate has reached `failThreshold`. */
    shouldSkip(sourceId) {
        const rate = this.getFailureRate(sourceId);
        return rate >= this.failThreshold;
    }
    /**
     * Debug snapshot for a source. `attempts`, `failures` and `failureRate`
     * cover the whole stored window (no decay applied); `skipRecommended`
     * mirrors shouldSkip().
     */
    getStats(sourceId) {
        const entries = this.history.get(sourceId) ?? [];
        const attempts = entries.length;
        const failures = entries.reduce((count, entry) => (entry.success ? count : count + 1), 0);
        return {
            attempts,
            failures,
            failureRate: attempts === 0 ? 0 : failures / attempts,
            skipRecommended: this.shouldSkip(sourceId),
        };
    }
    /** Forget one source's history, or everything when called with no id. */
    reset(sourceId) {
        if (sourceId === undefined) {
            this.history.clear();
            return;
        }
        this.history.delete(sourceId);
    }
}
|
|
173
|
+
/**
|
|
174
|
+
* Module-level singleton. Exported so callers can inspect or reset stats
|
|
175
|
+
* (e.g. in tests) and to log diagnostics.
|
|
176
|
+
*/
|
|
177
|
+
export const providerStats = new ProviderStatsTracker();
|
|
178
|
+
/**
 * Create an AbortSignal that aborts once `timeoutMs` has elapsed.
 * When a `parent` signal is supplied, the returned signal also aborts
 * as soon as the parent does.
 *
 * @param {number} timeoutMs - Timeout in milliseconds.
 * @param {AbortSignal} [parent] - Optional signal to combine with the timeout.
 * @returns {AbortSignal}
 */
function createTimeoutSignal(timeoutMs, parent) {
    const timeoutSignal = AbortSignal.timeout(timeoutMs);
    // AbortSignal.any requires Node.js ≥ 20.3
    return parent ? AbortSignal.any([parent, timeoutSignal]) : timeoutSignal;
}
|
|
189
|
+
/**
 * Reduce a URL to a comparison key used for de-duplicating results.
 *
 * For parseable URLs the key is the lowercased hostname without a
 * leading "www." followed by the pathname without trailing slashes;
 * scheme, port, query string and fragment are not part of the key.
 * Input that `new URL()` rejects goes through an equivalent text-based
 * clean-up (which lowercases the whole string).
 *
 * @param {string} rawUrl - URL text. Missing (null/undefined) values are
 *   treated as an empty string and other non-strings are stringified,
 *   so a result without a usable `url` no longer throws from the fallback.
 * @returns {string} Normalized key; may be empty.
 */
function normalizeUrlForDedupe(rawUrl) {
    // The fallback below calls string methods, so coerce once up front.
    const input = typeof rawUrl === 'string' ? rawUrl : String(rawUrl ?? '');
    try {
        const { hostname, pathname } = new URL(input);
        const host = hostname.toLowerCase().replace(/^www\./, '');
        const path = (pathname || '/').replace(/\/+$/, '');
        return `${host}${path}`;
    }
    catch {
        // Not an absolute URL (e.g. scheme-less "example.com/page").
        return input
            .trim()
            .toLowerCase()
            .replace(/^https?:\/\//, '')
            .replace(/^www\./, '')
            .replace(/[?#].*$/, '')
            .replace(/\/+$/, '');
    }
}
|
|
207
|
+
/**
 * Combine search results into a single list without duplicates.
 *
 * Two results are duplicates when normalizeUrlForDedupe() yields the
 * same key for their `url`; the earliest occurrence is kept and input
 * order is preserved. Collection stops once `maxCount` unique results
 * have been gathered.
 *
 * @param {Array<{url: string}>} results - Results in priority order.
 * @param {number} maxCount - Upper bound on the number of results returned.
 * @returns {Array} Unique results, at most `maxCount` long.
 */
export function mergeSearchResults(results, maxCount) {
    // Map keeps insertion order, so values() comes back in first-seen order.
    const firstByKey = new Map();
    for (const candidate of results) {
        if (firstByKey.size >= maxCount) {
            break;
        }
        const key = normalizeUrlForDedupe(candidate.url);
        if (!firstByKey.has(key)) {
            firstByKey.set(key, candidate);
        }
    }
    return [...firstByKey.values()];
}
|
|
225
|
+
// ============================================================
|
|
226
|
+
// Result Relevance Filtering
|
|
227
|
+
// Lightweight keyword-overlap scoring — no external deps.
|
|
228
|
+
// Applied after fetching raw results to remove completely off-
|
|
229
|
+
// topic hits (e.g., a grammar article returned for "used cars").
|
|
230
|
+
// ============================================================
|
|
231
|
+
// Lowercase words that carry no topical meaning in a search query.
// Looked up when extracting keywords and when condensing long queries.
const STOP_WORDS = new Set(`
    the a an is are was were in on at to for
    of with how what where when why best top most
    and or but not do does did be been have has
    buy get find about from by its it this that
    much very can will would could should per than
    some just also more like make any each all my
    your our their me us them so if then here
`.trim().split(/\s+/));
|
|
240
|
+
/**
 * Pull the meaningful search terms out of a query.
 *
 * The query is lowercased, every character other than a–z, 0–9 and
 * whitespace becomes a space, and the text is split on whitespace.
 * Tokens shorter than two characters and tokens listed in STOP_WORDS
 * are dropped; repeated tokens keep only their first occurrence.
 *
 * @param {string} query - Raw search query.
 * @returns {string[]} Unique lowercase keywords in order of appearance.
 */
function extractKeywords(query) {
    const tokens = query
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, ' ')
        .split(/\s+/);
    // A Set remembers first-insertion order, which doubles as the de-dupe step.
    const unique = new Set();
    for (const token of tokens) {
        if (token.length < 2 || STOP_WORDS.has(token)) {
            continue;
        }
        unique.add(token);
    }
    return [...unique];
}
|
|
258
|
+
/**
 * Score one search result against a keyword list.
 *
 * For each of title, URL and snippet, the fraction of keywords found as
 * a case-insensitive substring is computed; the fractions are combined
 * with weights 0.5 (title), 0.3 (URL) and 0.2 (snippet).
 *
 * @param {{title?: string, url?: string, snippet?: string}} result
 * @param {string[]} keywords - Lowercase keywords.
 * @returns {number} Weighted score; 1 when `keywords` is empty.
 */
function scoreResult(result, keywords) {
    const total = keywords.length;
    if (total === 0) {
        return 1;
    }
    // Fraction of keywords contained in one (possibly missing) field.
    const hitRatio = (field) => {
        const haystack = (field || '').toLowerCase();
        const hits = keywords.filter((kw) => haystack.includes(kw)).length;
        return hits / total;
    };
    return hitRatio(result.title) * 0.5 + hitRatio(result.url) * 0.3 + hitRatio(result.snippet) * 0.2;
}
|
|
284
|
+
/**
 * Drop off-topic results and order the rest by relevance to `query`.
 *
 * Steps:
 *   1. Derive keywords from the query via extractKeywords().
 *   2. Score every result with scoreResult().
 *   3. Keep results scoring at least 0.15 when the query has three or
 *      more keywords, otherwise at least 0.01.
 *   4. Sort by score, highest first; equal scores keep input order.
 *   5. Return copies of the survivors with `relevanceScore` (capped at 1).
 *
 * When the query yields no keywords, the input array itself is returned
 * untouched (no `relevanceScore` is attached).
 *
 * @param {Array<object>} results - Candidate results.
 * @param {string} query - Original search query.
 * @returns {Array<object>}
 */
export function filterRelevantResults(results, query) {
    const keywords = extractKeywords(query);
    if (keywords.length === 0) {
        return results; // nothing to filter on
    }
    // Longer queries need a real overlap; short ones only need any match.
    const threshold = keywords.length >= 3 ? 0.15 : 0.01;
    const ranked = [];
    results.forEach((result, idx) => {
        const score = scoreResult(result, keywords);
        if (score >= threshold) {
            ranked.push({ result, score, idx });
        }
    });
    ranked.sort((left, right) => {
        if (left.score === right.score) {
            return left.idx - right.idx; // tie: original position decides
        }
        return right.score - left.score;
    });
    return ranked.map(({ result, score }) => ({
        ...result,
        relevanceScore: Math.min(1, score),
    }));
}
|
|
315
|
+
/**
 * Load one search-engine results page in a fresh stealth-browser context
 * and return its HTML.
 *
 * Shared by the three StealthSearchProvider scrapers. Navigation uses a
 * 12 s Playwright timeout, guarded by an outer 15 s race that rejects with
 * "<engine> stealth timeout". The guard timer is always cleared and the
 * browser context always closed, whether the load succeeds or throws.
 *
 * @param {object} cfg
 * @param {string} cfg.engine - Label used in the timeout error message.
 * @param {string} cfg.url - Fully built search URL.
 * @param {string} cfg.locale - Locale for the browser context.
 * @param {string} cfg.resultSelector - Selector awaited after navigation.
 * @param {number} cfg.selectorTimeoutMs - Max wait for the selector; a
 *   miss is ignored and whatever HTML is present is returned.
 * @returns {Promise<string>} Page HTML, or '' when the page has none.
 */
async function fetchStealthSerpHtml({ engine, url, locale, resultSelector, selectorTimeoutMs }) {
    let ctx;
    let guardTimer;
    try {
        const browser = await getStealthBrowser();
        const proxy = getWebshareProxy();
        ctx = await browser.newContext({
            userAgent: getRandomUserAgent(),
            locale,
            timezoneId: 'America/New_York',
            ...(proxy ? { proxy: { server: proxy.server, username: proxy.username, password: proxy.password } } : {}),
        });
        const page = await ctx.newPage();
        await applyStealthScripts(page);
        await Promise.race([
            page.goto(url, { waitUntil: 'domcontentloaded', timeout: 12_000 }),
            new Promise((_, reject) => {
                guardTimer = setTimeout(() => reject(new Error(`${engine} stealth timeout`)), 15_000);
            }),
        ]);
        // Resolve as soon as results are present; the timeout is only a safety net.
        await page.waitForSelector(resultSelector, { timeout: selectorTimeoutMs }).catch(() => { });
        return (await page.content()) || '';
    }
    finally {
        // Without this the 15 s timer stays armed after a successful load and
        // keeps the event loop alive.
        clearTimeout(guardTimer);
        await ctx?.close().catch(() => { });
    }
}
/**
 * StealthSearchProvider — self-hosted multi-engine search.
 *
 * Scrapes DuckDuckGo, Bing and Ecosia in parallel through the shared
 * stealth browser (one new context per engine call); no API key is
 * required. Results from all engines are de-duplicated by normalized URL,
 * relevance-filtered against the query, and then limited to the requested
 * count. Successful result sets are cached in memory for a short time.
 */
export class StealthSearchProvider {
    id = 'stealth';
    requiresApiKey = false;
    /**
     * Short-TTL in-memory cache shared by all instances.
     * Key: "<lowercased trimmed query>::<count>", value: { results, ts }.
     * Entries older than SEARCH_CACHE_TTL_MS are discarded on lookup, which
     * avoids repeating a browser scrape for the same query in quick succession.
     */
    static SEARCH_CACHE_TTL_MS = 90_000; // 90 seconds
    static SEARCH_CACHE_MAX = 50;
    static searchCache = new Map();
    /**
     * Look up cached results.
     * @returns {Array|null} The cached array, or null when absent or expired
     *   (expired entries are removed).
     */
    getCachedSearch(query, count) {
        const key = `${query.toLowerCase().trim()}::${count}`;
        const entry = StealthSearchProvider.searchCache.get(key);
        if (!entry)
            return null;
        if (Date.now() - entry.ts > StealthSearchProvider.SEARCH_CACHE_TTL_MS) {
            StealthSearchProvider.searchCache.delete(key);
            return null;
        }
        return entry.results;
    }
    /** Store results, evicting the oldest-inserted entry first when the cache is full. */
    setCachedSearch(query, count, results) {
        const key = `${query.toLowerCase().trim()}::${count}`;
        if (StealthSearchProvider.searchCache.size >= StealthSearchProvider.SEARCH_CACHE_MAX) {
            // Map iterates in insertion order, so the first key is the oldest.
            const oldest = StealthSearchProvider.searchCache.keys().next().value;
            if (oldest)
                StealthSearchProvider.searchCache.delete(oldest);
        }
        StealthSearchProvider.searchCache.set(key, { results, ts: Date.now() });
    }
    /**
     * Validate and normalize a result URL.
     * @returns {string|null} The parsed `href`, or null when the input is not
     *   a valid http(s) URL, points at host "duckduckgo.com", or carries an
     *   ad tracking parameter (ad_domain / ad_provider / ad_type).
     */
    validateUrl(rawUrl) {
        let parsed;
        try {
            parsed = new URL(rawUrl);
        }
        catch {
            return null;
        }
        if (!['http:', 'https:'].includes(parsed.protocol))
            return null;
        // DuckDuckGo's own links (internal pages, ad redirects) are never results.
        if (parsed.hostname === 'duckduckgo.com')
            return null;
        const adParams = ['ad_domain', 'ad_provider', 'ad_type'];
        if (adParams.some((name) => parsed.searchParams.has(name)))
            return null;
        return parsed.href;
    }
    /**
     * Scrape the DuckDuckGo HTML endpoint with the stealth browser.
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     *   Up to `count` results; [] on any failure.
     */
    async scrapeDDG(query, count, options) {
        try {
            const params = new URLSearchParams({ q: query });
            const html = await fetchStealthSerpHtml({
                engine: 'DDG',
                url: `https://html.duckduckgo.com/html/?${params.toString()}`,
                locale: options?.locale ?? 'en-US',
                resultSelector: '.result',
                selectorTimeoutMs: 3000,
            });
            if (!html)
                return [];
            const $ = load(html);
            const results = [];
            const seen = new Set();
            $('.result').each((_i, elem) => {
                if (results.length >= count)
                    return;
                const $r = $(elem);
                const titleRaw = $r.find('.result__title').text() || $r.find('.result__a').text();
                const rawUrl = $r.find('.result__a').attr('href') || '';
                const snippetRaw = $r.find('.result__snippet').text();
                const title = cleanText(titleRaw, { maxLen: 200 });
                const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
                if (!title || !rawUrl)
                    return;
                // Filter ad snippets
                if (isDdgAdSnippet(snippet))
                    return;
                // Extract real URL from DDG redirect param
                const finalUrl = decodeDdgUrl(rawUrl);
                if (!finalUrl)
                    return; // filtered out (DDG internal link)
                // Filter ad URLs
                if (isDdgAdUrl(finalUrl))
                    return;
                const validated = this.validateUrl(finalUrl);
                if (!validated)
                    return;
                const key = normalizeUrlForDedupe(validated);
                if (seen.has(key))
                    return;
                seen.add(key);
                results.push({ title, url: validated, snippet });
            });
            return results;
        }
        catch (err) {
            // Best-effort engine: report the failure and contribute nothing.
            log.debug('Stealth DDG scrape failed:', err instanceof Error ? err.message : err);
            return [];
        }
    }
    /**
     * Scrape Bing web search with the stealth browser.
     * Result containers are `li.b_algo`.
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     *   Up to `count` results; [] on any failure.
     */
    async scrapeBing(query, count, options) {
        try {
            const params = new URLSearchParams({ q: query });
            const html = await fetchStealthSerpHtml({
                engine: 'Bing',
                url: `https://www.bing.com/search?${params.toString()}`,
                locale: options?.locale ?? 'en-US',
                resultSelector: 'li.b_algo',
                selectorTimeoutMs: 2000,
            });
            if (!html)
                return [];
            const $ = load(html);
            const results = [];
            const seen = new Set();
            $('li.b_algo').each((_i, elem) => {
                if (results.length >= count)
                    return;
                const $r = $(elem);
                // Title + URL from h2 > a
                const $a = $r.find('h2 > a');
                const title = cleanText($a.text(), { maxLen: 200 });
                const rawUrl = $a.attr('href') || '';
                if (!title || !rawUrl)
                    return;
                // Decode Bing redirect URLs: https://www.bing.com/ck/a?...&u=a1<base64url>&ntb=1
                // The `u` param is a base64url-encoded real URL prefixed with "a1"
                let finalUrl = rawUrl;
                try {
                    const bingUrl = new URL(rawUrl);
                    // Exact host or true subdomain only, so e.g. "notbing.com" is not
                    // mistaken for a Bing redirect.
                    const isBingHost = bingUrl.hostname === 'bing.com' || bingUrl.hostname.endsWith('.bing.com');
                    if (isBingHost && bingUrl.pathname.startsWith('/ck/')) {
                        const u = bingUrl.searchParams.get('u');
                        if (u && u.startsWith('a1')) {
                            const decoded = Buffer.from(u.slice(2), 'base64url').toString('utf-8');
                            if (decoded.startsWith('http'))
                                finalUrl = decoded;
                        }
                    }
                }
                catch { /* not an absolute URL: validate the raw value below */ }
                const validated = this.validateUrl(finalUrl);
                if (!validated)
                    return;
                // Snippet: prefer .b_lineclamp2 p, then div.b_caption > p, then .b_caption
                const snippetRaw = $r.find('.b_lineclamp2 p').first().text() ||
                    $r.find('div.b_caption > p').first().text() ||
                    $r.find('.b_caption').text();
                const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
                const key = normalizeUrlForDedupe(validated);
                if (seen.has(key))
                    return;
                seen.add(key);
                results.push({ title, url: validated, snippet });
            });
            return results;
        }
        catch (err) {
            log.debug('Stealth Bing scrape failed:', err instanceof Error ? err.message : err);
            return [];
        }
    }
    /**
     * Scrape Ecosia web search with the stealth browser.
     * Several container/title/snippet selector patterns are tried, in order.
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     *   Up to `count` results; [] on any failure.
     */
    async scrapeEcosia(query, count, options) {
        const containerSelector = 'article.result, .result, [data-test-id="result"]';
        try {
            const params = new URLSearchParams({ q: query });
            const html = await fetchStealthSerpHtml({
                engine: 'Ecosia',
                url: `https://www.ecosia.org/search?${params.toString()}`,
                locale: options?.locale ?? 'en-US',
                resultSelector: containerSelector,
                selectorTimeoutMs: 2000,
            });
            if (!html)
                return [];
            const $ = load(html);
            const results = [];
            const seen = new Set();
            $(containerSelector).each((_i, elem) => {
                if (results.length >= count)
                    return;
                const $r = $(elem);
                // Title + URL: try multiple patterns
                let $a = $r.find('a.result-title').first();
                if (!$a.length)
                    $a = $r.find('h2 > a').first();
                if (!$a.length)
                    $a = $r.find('a[href]').first();
                const title = cleanText($a.text(), { maxLen: 200 });
                const rawUrl = $a.attr('href') || '';
                if (!title || !rawUrl)
                    return;
                const validated = this.validateUrl(rawUrl);
                if (!validated)
                    return;
                // Snippet: try multiple patterns
                const snippetRaw = $r.find('p.result-snippet').first().text() ||
                    $r.find('.snippet').first().text() ||
                    $r.find('p').first().text();
                const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
                const key = normalizeUrlForDedupe(validated);
                if (seen.has(key))
                    return;
                seen.add(key);
                results.push({ title, url: validated, snippet });
            });
            return results;
        }
        catch (err) {
            log.debug('Stealth Ecosia scrape failed:', err instanceof Error ? err.message : err);
            return [];
        }
    }
    /**
     * Run the query on all three engines and return the merged results.
     *
     * @param {string} query
     * @param {{count: number, locale?: string}} options
     * @returns {Promise<Array>} At most `options.count` results, de-duplicated
     *   and relevance-filtered. Non-empty result sets are cached.
     */
    async searchWeb(query, options) {
        const { count } = options;
        // Check in-memory short-TTL cache first
        const cached = this.getCachedSearch(query, count);
        if (cached) {
            log.info(`Stealth search cache HIT for "${query.substring(0, 40)}…" (${cached.length} results)`);
            return cached;
        }
        // Launch all three engines in parallel; ignore individual engine failures
        const outcomes = await Promise.allSettled([
            this.scrapeDDG(query, count, options),
            this.scrapeBing(query, count, options),
            this.scrapeEcosia(query, count, options),
        ]);
        // Deduplicate across engines by normalized URL. The full candidate pool is
        // kept here: cutting to `count` before relevance filtering would discard
        // relevant hits from later engines whenever earlier hits get filtered out.
        const seen = new Set();
        const deduped = [];
        for (const outcome of outcomes) {
            if (outcome.status !== 'fulfilled')
                continue;
            for (const r of outcome.value) {
                const key = normalizeUrlForDedupe(r.url);
                if (seen.has(key))
                    continue;
                seen.add(key);
                deduped.push(r);
            }
        }
        // Relevance filtering first, then the count limit.
        const finalResults = filterRelevantResults(deduped, query).slice(0, count);
        // Store in short-TTL cache
        if (finalResults.length > 0) {
            this.setCachedSearch(query, count, finalResults);
        }
        return finalResults;
    }
}
|
|
646
|
+
export class DuckDuckGoProvider {
|
|
647
|
+
id = 'duckduckgo';
|
|
648
|
+
requiresApiKey = false;
|
|
649
|
+
buildQueryAttempts(originalQuery) {
|
|
650
|
+
const q = originalQuery.trim();
|
|
651
|
+
if (!q)
|
|
652
|
+
return [];
|
|
653
|
+
const attempts = [];
|
|
654
|
+
// Required retry strategy order:
|
|
655
|
+
// 1) original query
|
|
656
|
+
// 2) keywords-only (strip question words, articles, prepositions)
|
|
657
|
+
// 3) quoted query
|
|
658
|
+
// 4) query site:*
|
|
659
|
+
attempts.push(q);
|
|
660
|
+
// For long queries (>5 words), extract just the meaningful keywords
|
|
661
|
+
// "how much does a used 2023 Tesla Model 3 cost per month" → "2023 Tesla Model 3 cost month"
|
|
662
|
+
const words = q.split(/\s+/);
|
|
663
|
+
if (words.length > 5) {
|
|
664
|
+
const keywordsOnly = words
|
|
665
|
+
.filter(w => !STOP_WORDS.has(w.toLowerCase()) && w.length >= 2)
|
|
666
|
+
.join(' ');
|
|
667
|
+
if (keywordsOnly && keywordsOnly !== q) {
|
|
668
|
+
attempts.push(keywordsOnly);
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
if (!/^".*"$/.test(q))
|
|
672
|
+
attempts.push(`"${q}"`);
|
|
673
|
+
attempts.push(`${q} site:*`);
|
|
674
|
+
// Single-word queries are disproportionately likely to return 0 results on
|
|
675
|
+
// the DDG HTML endpoint (e.g. "openai" vs "open ai"). When the first three
|
|
676
|
+
// attempts fail, try a few light-touch strategies that tend to coax the
|
|
677
|
+
// parser into returning web results.
|
|
678
|
+
const isSingleWord = !/\s/.test(q);
|
|
679
|
+
const looksLikeUrlOrDomain = /[./]/.test(q) || /^https?:/i.test(q);
|
|
680
|
+
if (isSingleWord && !looksLikeUrlOrDomain) {
|
|
681
|
+
// Try splitting a common suffix (e.g. openai -> open ai)
|
|
682
|
+
if (/^[a-z]{5,}ai$/i.test(q)) {
|
|
683
|
+
attempts.push(`${q.slice(0, -2)} ai`);
|
|
684
|
+
}
|
|
685
|
+
// Common suffixes that often return at least the official domain
|
|
686
|
+
attempts.push(`${q}.com`);
|
|
687
|
+
attempts.push(`site:${q}.com`);
|
|
688
|
+
attempts.push(`${q} website`);
|
|
689
|
+
}
|
|
690
|
+
// De-dupe attempts (case-insensitive)
|
|
691
|
+
const seen = new Set();
|
|
692
|
+
return attempts
|
|
693
|
+
.map((s) => s.trim())
|
|
694
|
+
.filter((s) => s.length > 0)
|
|
695
|
+
.filter((s) => {
|
|
696
|
+
const key = s.toLowerCase();
|
|
697
|
+
if (seen.has(key))
|
|
698
|
+
return false;
|
|
699
|
+
seen.add(key);
|
|
700
|
+
return true;
|
|
701
|
+
});
|
|
702
|
+
}
|
|
703
|
+
buildSearchUrl(query, options) {
|
|
704
|
+
const { tbs, country, location, locale } = options;
|
|
705
|
+
const params = new URLSearchParams();
|
|
706
|
+
params.set('q', query);
|
|
707
|
+
// DuckDuckGo HTML endpoint supports some filtering
|
|
708
|
+
if (tbs) {
|
|
709
|
+
// DDG uses `df` for time filtering on html endpoint
|
|
710
|
+
params.set('df', tbs);
|
|
711
|
+
}
|
|
712
|
+
// DuckDuckGo locale (kl parameter): locale takes priority, then country/location
|
|
713
|
+
if (locale) {
|
|
714
|
+
params.set('kl', locale.toLowerCase());
|
|
715
|
+
}
|
|
716
|
+
else if (country || location) {
|
|
717
|
+
const region = (country || location || '').toLowerCase();
|
|
718
|
+
if (region)
|
|
719
|
+
params.set('kl', region);
|
|
720
|
+
}
|
|
721
|
+
return `https://html.duckduckgo.com/html/?${params.toString()}`;
|
|
722
|
+
}
|
|
723
|
+
async searchOnce(query, options) {
|
|
724
|
+
const { count, signal } = options;
|
|
725
|
+
const searchUrl = this.buildSearchUrl(query, options);
|
|
726
|
+
// Use realistic browser headers to avoid DDG bot detection on datacenter IPs
|
|
727
|
+
// Route through residential proxy when available (datacenter IPs are blocked)
|
|
728
|
+
const proxyUrl = getWebshareProxyUrl();
|
|
729
|
+
const baseHeaders = {
|
|
730
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
731
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
732
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
733
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
734
|
+
'Cache-Control': 'no-cache',
|
|
735
|
+
'Sec-Fetch-Dest': 'document',
|
|
736
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
737
|
+
'Sec-Fetch-Site': 'none',
|
|
738
|
+
'Sec-Fetch-User': '?1',
|
|
739
|
+
'Upgrade-Insecure-Requests': '1',
|
|
740
|
+
'Referer': 'https://duckduckgo.com/',
|
|
741
|
+
};
|
|
742
|
+
// Try direct first, then proxy as fallback.
|
|
743
|
+
// Webshare backbone IPs are blocked by DDG (returns empty results).
|
|
744
|
+
// Render datacenter IPs work intermittently — direct has better odds.
|
|
745
|
+
let response;
|
|
746
|
+
let html;
|
|
747
|
+
// let usedProxy = false;
|
|
748
|
+
// Attempt 1: Direct fetch (no proxy)
|
|
749
|
+
try {
|
|
750
|
+
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
751
|
+
html = response.ok ? await response.text() : '';
|
|
752
|
+
}
|
|
753
|
+
catch (directErr) {
|
|
754
|
+
log.debug('DDG direct fetch failed:', directErr instanceof Error ? directErr.message : directErr);
|
|
755
|
+
html = '';
|
|
756
|
+
}
|
|
757
|
+
// Check if direct returned actual results (not empty/CAPTCHA)
|
|
758
|
+
const hasResults = html.includes('class="result"') || html.includes('class="result ');
|
|
759
|
+
if (!hasResults && proxyUrl) {
|
|
760
|
+
// Attempt 2: Proxy fallback
|
|
761
|
+
log.debug('DDG direct returned no results, trying proxy...');
|
|
762
|
+
try {
|
|
763
|
+
// usedProxy = true;
|
|
764
|
+
const dispatcher = new ProxyAgent(proxyUrl);
|
|
765
|
+
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal, dispatcher });
|
|
766
|
+
if (response.ok)
|
|
767
|
+
html = await response.text();
|
|
768
|
+
}
|
|
769
|
+
catch (proxyErr) {
|
|
770
|
+
log.debug('DDG proxy also failed:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
const $ = load(html);
|
|
774
|
+
const results = [];
|
|
775
|
+
const seen = new Set();
|
|
776
|
+
$('.result').each((_i, elem) => {
|
|
777
|
+
if (results.length >= count)
|
|
778
|
+
return;
|
|
779
|
+
const $result = $(elem);
|
|
780
|
+
// Be resilient to markup variations: title can be in .result__title or
|
|
781
|
+
// directly on the anchor.
|
|
782
|
+
const titleRaw = $result.find('.result__title').text() || $result.find('.result__a').text();
|
|
783
|
+
const rawUrl = $result.find('.result__a').attr('href') || '';
|
|
784
|
+
const snippetRaw = $result.find('.result__snippet').text();
|
|
785
|
+
let title = cleanText(titleRaw, { maxLen: 200 });
|
|
786
|
+
let snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
|
|
787
|
+
if (!title || !rawUrl)
|
|
788
|
+
return;
|
|
789
|
+
// Filter ad snippets (DuckDuckGo injects ad labels into snippets)
|
|
790
|
+
if (isDdgAdSnippet(snippet))
|
|
791
|
+
return;
|
|
792
|
+
// Extract actual URL from DuckDuckGo redirect; filter DDG internal/ad URLs
|
|
793
|
+
const decoded = decodeDdgUrl(rawUrl);
|
|
794
|
+
if (!decoded)
|
|
795
|
+
return; // filtered out (DDG internal link or ad redirect)
|
|
796
|
+
// Filter ad URLs
|
|
797
|
+
if (isDdgAdUrl(decoded))
|
|
798
|
+
return;
|
|
799
|
+
// SECURITY: Validate and sanitize results — only allow HTTP/HTTPS URLs
|
|
800
|
+
let url;
|
|
801
|
+
try {
|
|
802
|
+
const parsed = new URL(decoded);
|
|
803
|
+
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
804
|
+
return;
|
|
805
|
+
}
|
|
806
|
+
url = parsed.href;
|
|
807
|
+
}
|
|
808
|
+
catch {
|
|
809
|
+
return;
|
|
810
|
+
}
|
|
811
|
+
// Deduplicate by normalized URL (strip query params, www, trailing slash)
|
|
812
|
+
const dedupeKey = normalizeUrlForDedupe(url);
|
|
813
|
+
if (seen.has(dedupeKey))
|
|
814
|
+
return;
|
|
815
|
+
seen.add(dedupeKey);
|
|
816
|
+
results.push({ title, url, snippet });
|
|
817
|
+
});
|
|
818
|
+
return results;
|
|
819
|
+
}
|
|
820
|
+
/**
|
|
821
|
+
* Fallback: DuckDuckGo Lite endpoint. Different HTML structure, sometimes
|
|
822
|
+
* works when the main HTML endpoint is temporarily blocked on datacenter IPs.
|
|
823
|
+
*/
|
|
824
|
+
async searchLite(query, options) {
|
|
825
|
+
const { count, signal } = options;
|
|
826
|
+
const params = new URLSearchParams();
|
|
827
|
+
params.set('q', query);
|
|
828
|
+
const liteProxyUrl = getWebshareProxyUrl();
|
|
829
|
+
const liteHeaders = {
|
|
830
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
831
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
832
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
833
|
+
'Referer': 'https://lite.duckduckgo.com/',
|
|
834
|
+
};
|
|
835
|
+
const liteUrl = `https://lite.duckduckgo.com/lite/?${params.toString()}`;
|
|
836
|
+
// Direct first, proxy fallback (same reasoning as searchOnce — Webshare IPs blocked by DDG)
|
|
837
|
+
let html = '';
|
|
838
|
+
try {
|
|
839
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
840
|
+
if (resp.ok)
|
|
841
|
+
html = await resp.text();
|
|
842
|
+
}
|
|
843
|
+
catch { /* direct failed */ }
|
|
844
|
+
if (!html.includes('result-link') && liteProxyUrl) {
|
|
845
|
+
try {
|
|
846
|
+
const dispatcher = new ProxyAgent(liteProxyUrl);
|
|
847
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal, dispatcher });
|
|
848
|
+
if (resp.ok)
|
|
849
|
+
html = await resp.text();
|
|
850
|
+
}
|
|
851
|
+
catch { /* proxy also failed */ }
|
|
852
|
+
}
|
|
853
|
+
if (!html)
|
|
854
|
+
return [];
|
|
855
|
+
const $ = load(html);
|
|
856
|
+
const results = [];
|
|
857
|
+
const seen = new Set();
|
|
858
|
+
// DDG Lite uses a table-based layout with class="result-link" for links
|
|
859
|
+
// and class="result-snippet" for snippets
|
|
860
|
+
$('a.result-link').each((_i, elem) => {
|
|
861
|
+
if (results.length >= count)
|
|
862
|
+
return;
|
|
863
|
+
const $a = $(elem);
|
|
864
|
+
const title = cleanText($a.text(), { maxLen: 200 });
|
|
865
|
+
let url = $a.attr('href') || '';
|
|
866
|
+
if (!title || !url)
|
|
867
|
+
return;
|
|
868
|
+
// Extract actual URL from DDG redirect; filter DDG internal/ad URLs
|
|
869
|
+
const decoded = decodeDdgUrl(url);
|
|
870
|
+
if (!decoded)
|
|
871
|
+
return; // filtered out (DDG internal link or ad redirect)
|
|
872
|
+
// Validate URL
|
|
873
|
+
try {
|
|
874
|
+
const parsed = new URL(decoded);
|
|
875
|
+
if (!['http:', 'https:'].includes(parsed.protocol))
|
|
876
|
+
return;
|
|
877
|
+
url = parsed.href;
|
|
878
|
+
}
|
|
879
|
+
catch {
|
|
880
|
+
return;
|
|
881
|
+
}
|
|
882
|
+
const dedupeKey = normalizeUrlForDedupe(url);
|
|
883
|
+
if (seen.has(dedupeKey))
|
|
884
|
+
return;
|
|
885
|
+
seen.add(dedupeKey);
|
|
886
|
+
// Lite snippets are in the next <td> with class result-snippet
|
|
887
|
+
const snippet = cleanText($a.closest('tr').next('tr').find('.result-snippet').text(), { maxLen: 500, stripEllipsisPadding: true });
|
|
888
|
+
results.push({ title, url, snippet });
|
|
889
|
+
});
|
|
890
|
+
return results;
|
|
891
|
+
}
|
|
892
|
+
/**
|
|
893
|
+
* HTTP-only Bing scraping via undici + cheerio. No browser required.
|
|
894
|
+
* Routes through Webshare proxy (proxy first, direct fallback).
|
|
895
|
+
* Tracks stats via providerStats('bing-http').
|
|
896
|
+
*/
|
|
897
|
+
// @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
|
|
898
|
+
async _searchBingHttp(query, options) {
|
|
899
|
+
const { count, signal } = options;
|
|
900
|
+
const bingRate = providerStats.getFailureRate('bing-http');
|
|
901
|
+
const timeoutMs = bingRate > 0.5 ? 3_000 : 8_000;
|
|
902
|
+
const bingSignal = createTimeoutSignal(timeoutMs, signal);
|
|
903
|
+
const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&count=10`;
|
|
904
|
+
const headers = {
|
|
905
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
906
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
907
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
908
|
+
'Sec-Fetch-Dest': 'document',
|
|
909
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
910
|
+
'Sec-Fetch-Site': 'none',
|
|
911
|
+
'Sec-Fetch-User': '?1',
|
|
912
|
+
'Upgrade-Insecure-Requests': '1',
|
|
913
|
+
};
|
|
914
|
+
const proxyUrl = getWebshareProxyUrl();
|
|
915
|
+
let response;
|
|
916
|
+
try {
|
|
917
|
+
if (proxyUrl) {
|
|
918
|
+
try {
|
|
919
|
+
const dispatcher = new ProxyAgent(proxyUrl);
|
|
920
|
+
response = await undiciFetch(url, { headers, signal: bingSignal, dispatcher });
|
|
921
|
+
}
|
|
922
|
+
catch (proxyErr) {
|
|
923
|
+
log.debug('Bing HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
|
|
924
|
+
response = await undiciFetch(url, { headers, signal: bingSignal });
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
else {
|
|
928
|
+
response = await undiciFetch(url, { headers, signal: bingSignal });
|
|
929
|
+
}
|
|
930
|
+
if (!response.ok) {
|
|
931
|
+
providerStats.record('bing-http', false);
|
|
932
|
+
return [];
|
|
933
|
+
}
|
|
934
|
+
const html = await response.text();
|
|
935
|
+
const $ = load(html);
|
|
936
|
+
const results = [];
|
|
937
|
+
const seen = new Set();
|
|
938
|
+
// Parse Bing organic results; skip ad containers
|
|
939
|
+
$('li.b_algo').each((_i, elem) => {
|
|
940
|
+
if (results.length >= count)
|
|
941
|
+
return;
|
|
942
|
+
const $r = $(elem);
|
|
943
|
+
// Skip if inside a .b_ad block or is itself an ad container
|
|
944
|
+
if ($r.hasClass('b_ad') || $r.closest('.b_ad').length > 0)
|
|
945
|
+
return;
|
|
946
|
+
const $a = $r.find('h2 > a').first();
|
|
947
|
+
const title = cleanText($a.text(), { maxLen: 200 });
|
|
948
|
+
const rawUrl = $a.attr('href') || '';
|
|
949
|
+
if (!title || !rawUrl)
|
|
950
|
+
return;
|
|
951
|
+
// Decode Bing redirect URLs:
|
|
952
|
+
// Relative: /ck/a?!&&p=...&u=a1<base64url>&ntb=1
|
|
953
|
+
// Absolute: https://www.bing.com/ck/a?...&u=a1<base64url>&ntb=1
|
|
954
|
+
let finalUrl = rawUrl;
|
|
955
|
+
try {
|
|
956
|
+
const base = rawUrl.startsWith('/') ? `https://www.bing.com${rawUrl}` : rawUrl;
|
|
957
|
+
const ckUrl = new URL(base);
|
|
958
|
+
if (ckUrl.hostname.endsWith('bing.com') && ckUrl.pathname.startsWith('/ck/')) {
|
|
959
|
+
const u = ckUrl.searchParams.get('u');
|
|
960
|
+
if (u && u.startsWith('a1')) {
|
|
961
|
+
const decoded = Buffer.from(u.slice(2), 'base64url').toString('utf-8');
|
|
962
|
+
if (decoded.startsWith('http'))
|
|
963
|
+
finalUrl = decoded;
|
|
964
|
+
}
|
|
965
|
+
}
|
|
966
|
+
}
|
|
967
|
+
catch { /* use rawUrl as-is */ }
|
|
968
|
+
// Validate: HTTP/HTTPS only
|
|
969
|
+
try {
|
|
970
|
+
const parsed = new URL(finalUrl);
|
|
971
|
+
if (!['http:', 'https:'].includes(parsed.protocol))
|
|
972
|
+
return;
|
|
973
|
+
finalUrl = parsed.href;
|
|
974
|
+
}
|
|
975
|
+
catch {
|
|
976
|
+
return;
|
|
977
|
+
}
|
|
978
|
+
const key = normalizeUrlForDedupe(finalUrl);
|
|
979
|
+
if (seen.has(key))
|
|
980
|
+
return;
|
|
981
|
+
seen.add(key);
|
|
982
|
+
const snippetRaw = $r.find('.b_caption p').first().text() ||
|
|
983
|
+
$r.find('.b_caption').first().text();
|
|
984
|
+
const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
|
|
985
|
+
results.push({ title, url: finalUrl, snippet });
|
|
986
|
+
});
|
|
987
|
+
providerStats.record('bing-http', results.length > 0);
|
|
988
|
+
return results;
|
|
989
|
+
}
|
|
990
|
+
catch (e) {
|
|
991
|
+
log.debug('Bing HTTP search failed:', e instanceof Error ? e.message : e);
|
|
992
|
+
providerStats.record('bing-http', false);
|
|
993
|
+
return [];
|
|
994
|
+
}
|
|
995
|
+
}
|
|
996
|
+
/**
|
|
997
|
+
* HTTP-only Google scraping via undici + cheerio. No browser required.
|
|
998
|
+
* Routes through Webshare proxy (proxy first, direct fallback).
|
|
999
|
+
* Sends CONSENT cookie to bypass Google consent page.
|
|
1000
|
+
* Tracks stats via providerStats('google-http').
|
|
1001
|
+
*/
|
|
1002
|
+
// @ts-expect-error Disabled Stage 3.5 — kept for future re-enablement
|
|
1003
|
+
async _searchGoogleHttp(query, options) {
|
|
1004
|
+
const { count, signal } = options;
|
|
1005
|
+
const googleRate = providerStats.getFailureRate('google-http');
|
|
1006
|
+
const timeoutMs = googleRate > 0.5 ? 3_000 : 8_000;
|
|
1007
|
+
const googleSignal = createTimeoutSignal(timeoutMs, signal);
|
|
1008
|
+
const url = `https://www.google.com/search?q=${encodeURIComponent(query)}&num=10&hl=en`;
|
|
1009
|
+
const headers = {
|
|
1010
|
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
1011
|
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
|
1012
|
+
'Accept-Language': 'en-US,en;q=0.9',
|
|
1013
|
+
// Skip Google consent/cookie wall
|
|
1014
|
+
'Cookie': 'CONSENT=YES+; SOCS=CAESEwgDEgk0OTg3ODQ2NzMaAmVuIAEaBgiA0LqmBg',
|
|
1015
|
+
'Sec-Fetch-Dest': 'document',
|
|
1016
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
1017
|
+
'Sec-Fetch-Site': 'none',
|
|
1018
|
+
'Sec-Fetch-User': '?1',
|
|
1019
|
+
'Upgrade-Insecure-Requests': '1',
|
|
1020
|
+
};
|
|
1021
|
+
const proxyUrl = getWebshareProxyUrl();
|
|
1022
|
+
let response;
|
|
1023
|
+
try {
|
|
1024
|
+
if (proxyUrl) {
|
|
1025
|
+
try {
|
|
1026
|
+
const dispatcher = new ProxyAgent(proxyUrl);
|
|
1027
|
+
response = await undiciFetch(url, { headers, signal: googleSignal, dispatcher });
|
|
1028
|
+
}
|
|
1029
|
+
catch (proxyErr) {
|
|
1030
|
+
log.debug('Google HTTP proxy failed, falling back to direct:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
|
|
1031
|
+
response = await undiciFetch(url, { headers, signal: googleSignal });
|
|
1032
|
+
}
|
|
1033
|
+
}
|
|
1034
|
+
else {
|
|
1035
|
+
response = await undiciFetch(url, { headers, signal: googleSignal });
|
|
1036
|
+
}
|
|
1037
|
+
if (!response.ok) {
|
|
1038
|
+
providerStats.record('google-http', false);
|
|
1039
|
+
return [];
|
|
1040
|
+
}
|
|
1041
|
+
const html = await response.text();
|
|
1042
|
+
const $ = load(html);
|
|
1043
|
+
const results = [];
|
|
1044
|
+
const seen = new Set();
|
|
1045
|
+
// Google organic results live in div.g blocks.
|
|
1046
|
+
// Skip ad blocks (data-text-ad attr), People Also Ask, and related searches.
|
|
1047
|
+
$('div.g').each((_i, elem) => {
|
|
1048
|
+
if (results.length >= count)
|
|
1049
|
+
return;
|
|
1050
|
+
const $r = $(elem);
|
|
1051
|
+
// Skip ad containers (data-text-ad may be on div.g itself or on a descendant)
|
|
1052
|
+
if ($r.attr('data-text-ad') !== undefined || $r.find('[data-text-ad]').length > 0)
|
|
1053
|
+
return;
|
|
1054
|
+
if ($r.closest('.commercial-unit-desktop-top, .ads-ad').length > 0)
|
|
1055
|
+
return;
|
|
1056
|
+
const $h3 = $r.find('h3').first();
|
|
1057
|
+
if (!$h3.length)
|
|
1058
|
+
return;
|
|
1059
|
+
// Find a valid external link (starts with http, not a Google domain)
|
|
1060
|
+
const $a = $r.find('a[href]').filter((_j, el) => {
|
|
1061
|
+
const href = $(el).attr('href') || '';
|
|
1062
|
+
return href.startsWith('http') && !href.includes('google.com/');
|
|
1063
|
+
}).first();
|
|
1064
|
+
if (!$a.length)
|
|
1065
|
+
return;
|
|
1066
|
+
const href = $a.attr('href') || '';
|
|
1067
|
+
// Validate URL
|
|
1068
|
+
let finalUrl;
|
|
1069
|
+
try {
|
|
1070
|
+
const parsed = new URL(href);
|
|
1071
|
+
if (!['http:', 'https:'].includes(parsed.protocol))
|
|
1072
|
+
return;
|
|
1073
|
+
if (parsed.hostname.includes('google.com'))
|
|
1074
|
+
return;
|
|
1075
|
+
finalUrl = parsed.href;
|
|
1076
|
+
}
|
|
1077
|
+
catch {
|
|
1078
|
+
return;
|
|
1079
|
+
}
|
|
1080
|
+
const key = normalizeUrlForDedupe(finalUrl);
|
|
1081
|
+
if (seen.has(key))
|
|
1082
|
+
return;
|
|
1083
|
+
seen.add(key);
|
|
1084
|
+
const title = cleanText($h3.text(), { maxLen: 200 });
|
|
1085
|
+
if (!title)
|
|
1086
|
+
return;
|
|
1087
|
+
// Snippet: try multiple known Google snippet CSS classes/attrs
|
|
1088
|
+
const snippetRaw = $r.find('.VwiC3b').first().text() ||
|
|
1089
|
+
$r.find('[data-sncf]').first().text() ||
|
|
1090
|
+
$r.find('[style*="-webkit-line-clamp"]').first().text() ||
|
|
1091
|
+
$r.find('.st').first().text() ||
|
|
1092
|
+
'';
|
|
1093
|
+
const snippet = cleanText(snippetRaw, { maxLen: 500, stripEllipsisPadding: true });
|
|
1094
|
+
results.push({ title, url: finalUrl, snippet });
|
|
1095
|
+
});
|
|
1096
|
+
providerStats.record('google-http', results.length > 0);
|
|
1097
|
+
return results;
|
|
1098
|
+
}
|
|
1099
|
+
catch (e) {
|
|
1100
|
+
log.debug('Google HTTP search failed:', e instanceof Error ? e.message : e);
|
|
1101
|
+
providerStats.record('google-http', false);
|
|
1102
|
+
return [];
|
|
1103
|
+
}
|
|
1104
|
+
}
|
|
1105
|
+
async searchWeb(query, options) {
|
|
1106
|
+
const attempts = this.buildQueryAttempts(query);
|
|
1107
|
+
// -----------------------------------------------------------
|
|
1108
|
+
// Stage 0: SearXNG (self-hosted, residential IP — highest reliability)
|
|
1109
|
+
// Uses Mac Mini running SearXNG exposed via Cloudflare Tunnel.
|
|
1110
|
+
// Aggregates Google, Bing, Brave, Startpage — 30-40 results typical.
|
|
1111
|
+
// Env: SEARXNG_URL=https://search.webpeel.dev
|
|
1112
|
+
// -----------------------------------------------------------
|
|
1113
|
+
if (process.env.SEARXNG_URL) {
|
|
1114
|
+
try {
|
|
1115
|
+
const searxResults = await searchViaSearXNG(query, {
|
|
1116
|
+
count: options.count ?? 10,
|
|
1117
|
+
signal: options.signal,
|
|
1118
|
+
timeoutMs: 5000,
|
|
1119
|
+
...(options.locale ? { language: options.locale } : {}),
|
|
1120
|
+
});
|
|
1121
|
+
if (searxResults.length > 0) {
|
|
1122
|
+
providerStats.record('searxng', true);
|
|
1123
|
+
log.debug(`source=searxng returned ${searxResults.length} results`);
|
|
1124
|
+
// Map SearXNG results to WebSearchResult (description → snippet, imageUrl passthrough)
|
|
1125
|
+
const mapped = searxResults.map(r => ({
|
|
1126
|
+
title: r.title,
|
|
1127
|
+
url: r.url,
|
|
1128
|
+
snippet: r.description ?? '',
|
|
1129
|
+
imageUrl: r.imageUrl,
|
|
1130
|
+
}));
|
|
1131
|
+
const filtered = filterRelevantResults(mapped, query);
|
|
1132
|
+
return filtered.length > 0 ? filtered : mapped;
|
|
1133
|
+
}
|
|
1134
|
+
providerStats.record('searxng', false);
|
|
1135
|
+
log.debug('SearXNG returned 0 results, falling through to DDG');
|
|
1136
|
+
}
|
|
1137
|
+
catch (e) {
|
|
1138
|
+
providerStats.record('searxng', false);
|
|
1139
|
+
log.debug('SearXNG failed:', e instanceof Error ? e.message : e);
|
|
1140
|
+
}
|
|
1141
|
+
}
|
|
1142
|
+
// -----------------------------------------------------------
|
|
1143
|
+
// Stage 1: DDG HTTP
|
|
1144
|
+
// Skip entirely if the source has a ≥80% failure rate over the
|
|
1145
|
+
// last 10 attempts. When elevated-but-not-skipped, cap the per-
|
|
1146
|
+
// request timeout at 2 s instead of the default 8 s so we fail
|
|
1147
|
+
// fast and get to a working fallback sooner.
|
|
1148
|
+
// -----------------------------------------------------------
|
|
1149
|
+
const ddgHttpRate = providerStats.getFailureRate('ddg-http');
|
|
1150
|
+
const skipDdgHttp = providerStats.shouldSkip('ddg-http');
|
|
1151
|
+
if (skipDdgHttp) {
|
|
1152
|
+
log.debug(`DDG HTTP skipped (failure rate ${Math.round(ddgHttpRate * 100)}% ≥ 80%)`);
|
|
1153
|
+
}
|
|
1154
|
+
else {
|
|
1155
|
+
const ddgTimeoutMs = ddgHttpRate > 0.5 ? 2_000 : 8_000;
|
|
1156
|
+
const ddgSignal = createTimeoutSignal(ddgTimeoutMs, options.signal);
|
|
1157
|
+
const ddgOptions = { ...options, signal: ddgSignal };
|
|
1158
|
+
let ddgSucceeded = false;
|
|
1159
|
+
for (const q of attempts) {
|
|
1160
|
+
try {
|
|
1161
|
+
const results = await this.searchOnce(q, ddgOptions);
|
|
1162
|
+
if (results.length > 0) {
|
|
1163
|
+
providerStats.record('ddg-http', true);
|
|
1164
|
+
log.debug(`source=ddg-http returned ${results.length} results` +
|
|
1165
|
+
(ddgTimeoutMs < 8_000 ? ` (fast-timeout ${ddgTimeoutMs}ms)` : ''));
|
|
1166
|
+
// Apply relevance filtering before returning
|
|
1167
|
+
const filtered = filterRelevantResults(results, query);
|
|
1168
|
+
return filtered.length > 0 ? filtered : results; // fallback to unfiltered if all removed
|
|
1169
|
+
}
|
|
1170
|
+
ddgSucceeded = true; // connected OK, just 0 results
|
|
1171
|
+
}
|
|
1172
|
+
catch (e) {
|
|
1173
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1174
|
+
log.debug('DDG HTTP failed:', msg);
|
|
1175
|
+
break;
|
|
1176
|
+
}
|
|
1177
|
+
}
|
|
1178
|
+
// Record outcome: connected but empty = failure for our purposes
|
|
1179
|
+
providerStats.record('ddg-http', ddgSucceeded ? false : false);
|
|
1180
|
+
// (both paths are failures — we only record true above on a live hit)
|
|
1181
|
+
}
|
|
1182
|
+
// -----------------------------------------------------------
|
|
1183
|
+
// Stage 2: DDG Lite
|
|
1184
|
+
// Same skip/fast-timeout logic as DDG HTTP.
|
|
1185
|
+
// -----------------------------------------------------------
|
|
1186
|
+
const ddgLiteRate = providerStats.getFailureRate('ddg-lite');
|
|
1187
|
+
const skipDdgLite = providerStats.shouldSkip('ddg-lite');
|
|
1188
|
+
if (skipDdgLite) {
|
|
1189
|
+
log.debug(`DDG Lite skipped (failure rate ${Math.round(ddgLiteRate * 100)}% ≥ 80%)`);
|
|
1190
|
+
}
|
|
1191
|
+
else {
|
|
1192
|
+
log.debug('DDG returned 0 results, trying DDG Lite...');
|
|
1193
|
+
const liteTimeoutMs = ddgLiteRate > 0.5 ? 2_000 : 8_000;
|
|
1194
|
+
const liteSignal = createTimeoutSignal(liteTimeoutMs, options.signal);
|
|
1195
|
+
try {
|
|
1196
|
+
const liteResults = await this.searchLite(query, { ...options, signal: liteSignal });
|
|
1197
|
+
if (liteResults.length > 0) {
|
|
1198
|
+
providerStats.record('ddg-lite', true);
|
|
1199
|
+
log.debug(`source=ddg-lite returned ${liteResults.length} results` +
|
|
1200
|
+
(liteTimeoutMs < 8_000 ? ` (fast-timeout ${liteTimeoutMs}ms)` : ''));
|
|
1201
|
+
// Apply relevance filtering before returning
|
|
1202
|
+
const filteredLite = filterRelevantResults(liteResults, query);
|
|
1203
|
+
return filteredLite.length > 0 ? filteredLite : liteResults;
|
|
1204
|
+
}
|
|
1205
|
+
providerStats.record('ddg-lite', false);
|
|
1206
|
+
log.debug('DDG Lite also returned 0 results');
|
|
1207
|
+
}
|
|
1208
|
+
catch (e) {
|
|
1209
|
+
providerStats.record('ddg-lite', false);
|
|
1210
|
+
log.debug('DDG Lite failed:', e instanceof Error ? e.message : e);
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
// -----------------------------------------------------------
|
|
1214
|
+
// Stage 3: Brave Search API (BYOK — instant if key configured)
|
|
1215
|
+
// -----------------------------------------------------------
|
|
1216
|
+
const braveKey = process.env.BRAVE_SEARCH_KEY || process.env.BRAVE_API_KEY;
|
|
1217
|
+
if (braveKey) {
|
|
1218
|
+
try {
|
|
1219
|
+
const braveProvider = new BraveSearchProvider();
|
|
1220
|
+
const braveResults = await braveProvider.searchWeb(query, { ...options, apiKey: braveKey });
|
|
1221
|
+
if (braveResults.length > 0) {
|
|
1222
|
+
log.debug(`source=brave returned ${braveResults.length} results`);
|
|
1223
|
+
return braveResults;
|
|
1224
|
+
}
|
|
1225
|
+
}
|
|
1226
|
+
catch (e) {
|
|
1227
|
+
log.debug('Brave search failed:', e instanceof Error ? e.message : e);
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
// -----------------------------------------------------------
|
|
1231
|
+
// Stage 3.5: HTTP-based Bing + Google (no browser, no API key)
|
|
1232
|
+
// DISABLED: Both Bing and Google detect non-browser HTTP clients and
|
|
1233
|
+
// serve different/irrelevant content (dictionary pages, random sites).
|
|
1234
|
+
// The scrapers are built (searchBingHttp, searchGoogleHttp) but need
|
|
1235
|
+
// further work on request fingerprinting to get real results.
|
|
1236
|
+
// TODO: Re-enable when fingerprinting is improved.
|
|
1237
|
+
// -----------------------------------------------------------
|
|
1238
|
+
// const skipBingHttp = providerStats.shouldSkip('bing-http');
|
|
1239
|
+
// const skipGoogleHttp = providerStats.shouldSkip('google-http');
|
|
1240
|
+
// if (!skipBingHttp || !skipGoogleHttp) { ... }
|
|
1241
|
+
// -----------------------------------------------------------
|
|
1242
|
+
// Stage 4: Stealth multi-engine (DDG + Bing + Ecosia in parallel)
|
|
1243
|
+
// Bypasses bot-detection on datacenter IPs. This is the reliable
|
|
1244
|
+
// last resort — but it spins up a browser so it takes a few seconds.
|
|
1245
|
+
// DISABLED on memory-constrained servers (512MB) — Playwright OOM kills.
|
|
1246
|
+
// Set NO_BROWSER_SEARCH=1 to skip this stage entirely.
|
|
1247
|
+
// -----------------------------------------------------------
|
|
1248
|
+
if (!process.env.NO_BROWSER_SEARCH) {
|
|
1249
|
+
log.debug('Trying stealth browser search (DDG + Bing + Ecosia)...');
|
|
1250
|
+
try {
|
|
1251
|
+
const stealthProvider = new StealthSearchProvider();
|
|
1252
|
+
// StealthSearchProvider already applies filterRelevantResults internally.
|
|
1253
|
+
const stealthResults = await stealthProvider.searchWeb(query, options);
|
|
1254
|
+
if (stealthResults.length > 0) {
|
|
1255
|
+
log.debug(`source=stealth returned ${stealthResults.length} results`);
|
|
1256
|
+
return stealthResults;
|
|
1257
|
+
}
|
|
1258
|
+
log.debug('Stealth search returned 0 results');
|
|
1259
|
+
}
|
|
1260
|
+
catch (e) {
|
|
1261
|
+
log.debug('Stealth search failed:', e instanceof Error ? e.message : e);
|
|
1262
|
+
}
|
|
1263
|
+
}
|
|
1264
|
+
else {
|
|
1265
|
+
log.debug('Stealth browser search skipped (NO_BROWSER_SEARCH=1)');
|
|
1266
|
+
}
|
|
1267
|
+
return [];
|
|
1268
|
+
}
|
|
1269
|
+
/**
|
|
1270
|
+
* Exposed for testing: score and filter a pre-fetched result list against a query.
|
|
1271
|
+
* Equivalent to calling filterRelevantResults() directly.
|
|
1272
|
+
*/
|
|
1273
|
+
filterResults(results, query) {
|
|
1274
|
+
return filterRelevantResults(results, query);
|
|
1275
|
+
}
|
|
1276
|
+
}
|
|
1277
|
+
/**
 * Search provider backed by the Brave Search web API
 * (https://api.search.brave.com/res/v1/web/search). Requires an API key.
 */
export class BraveSearchProvider {
    id = 'brave';
    requiresApiKey = true;
    /**
     * Query the Brave web search API.
     *
     * @param {string} query - Search terms.
     * @param {object} options
     * @param {number} [options.count] - Maximum results to return. Defaults to 10
     *   when missing or not numeric; the value sent to the API is clamped to 1..10.
     * @param {string} options.apiKey - Brave subscription token (required).
     * @param {AbortSignal} [options.signal] - Cancels the HTTP request.
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     * @throws {Error} When the API key is missing/blank or the API answers non-2xx.
     */
    async searchWeb(query, options) {
        const { count, apiKey, signal } = options;
        if (!apiKey || apiKey.trim().length === 0) {
            throw new Error('Brave Search requires an API key');
        }
        // Callers may omit `count`; without a default the clamp below evaluates
        // to NaN and the request would be sent with "count=NaN".
        const numericCount = count == null ? Number.NaN : Number(count);
        const requested = Number.isFinite(numericCount) ? numericCount : 10;
        const url = new URL('https://api.search.brave.com/res/v1/web/search');
        url.searchParams.set('q', query);
        url.searchParams.set('count', String(Math.min(Math.max(requested, 1), 10)));
        const response = await undiciFetch(url.toString(), {
            headers: {
                'Accept': 'application/json',
                'X-Subscription-Token': apiKey,
            },
            signal,
        });
        if (!response.ok) {
            // Reading the error body is best-effort; the status alone is enough to report.
            const text = await response.text().catch(() => '');
            throw new Error(`Brave Search failed: HTTP ${response.status}${text ? ` - ${text}` : ''}`);
        }
        const data = await response.json();
        const resultsArray = data?.web?.results;
        if (!Array.isArray(resultsArray)) {
            return [];
        }
        const results = [];
        for (const r of resultsArray) {
            if (results.length >= requested)
                break;
            const title = typeof r?.title === 'string' ? r.title.trim() : '';
            const rawUrl = typeof r?.url === 'string' ? r.url.trim() : '';
            // Prefer `description`; fall back to `snippet` when it is not a string.
            const snippet = typeof r?.description === 'string'
                ? r.description.trim()
                : typeof r?.snippet === 'string'
                    ? r.snippet.trim()
                    : '';
            if (!title || !rawUrl)
                continue;
            // SECURITY: Validate URL protocol — only absolute http(s) links are returned
            try {
                const parsed = new URL(rawUrl);
                if (!['http:', 'https:'].includes(parsed.protocol))
                    continue;
            }
            catch {
                continue;
            }
            results.push({
                title: title.slice(0, 200),
                url: rawUrl,
                snippet: snippet.slice(0, 500),
            });
        }
        return results;
    }
}
|
|
1335
|
+
/**
|
|
1336
|
+
* GoogleSearchProvider — Google Search via stealth browser or Custom Search JSON API
|
|
1337
|
+
*
|
|
1338
|
+
* Two modes:
|
|
1339
|
+
* 1. Custom Search JSON API (BYOK): set GOOGLE_SEARCH_KEY + GOOGLE_SEARCH_CX env vars.
|
|
1340
|
+
* Reliable, structured, 100 free queries/day. Works from any IP.
|
|
1341
|
+
* 2. Stealth browser scraping (no API key): uses playwright-extra stealth plugin to
|
|
1342
|
+
* scrape google.com/search directly. Works from datacenter IPs where DDG/Bing/Ecosia
|
|
1343
|
+
* are blocked. Gracefully returns [] if Playwright is unavailable.
|
|
1344
|
+
*
|
|
1345
|
+
* Docs: https://developers.google.com/custom-search/v1/overview
|
|
1346
|
+
*/
|
|
1347
|
+
export class GoogleSearchProvider {
    id = 'google';
    /**
     * requiresApiKey is false: works without API keys via stealth browser fallback.
     */
    requiresApiKey = false;
    /**
     * Map standard freshness values to Google's dateRestrict format.
     * Google dateRestrict: d[n]=past n days, w[n]=past n weeks,
     * m[n]=past n months, y[n]=past n years.
     *
     * @param {string} [tbs] - One of 'pd', 'pw', 'pm', 'py'.
     * @returns {string|undefined} The dateRestrict value, or undefined for a
     *   missing or unrecognised input.
     */
    mapFreshnessToDateRestrict(tbs) {
        if (!tbs)
            return undefined;
        const map = {
            pd: 'd1',
            pw: 'w1',
            pm: 'm1',
            py: 'y1',
        };
        // Own-key lookup only, so inherited names like "constructor" are not matched.
        return Object.hasOwn(map, tbs) ? map[tbs] : undefined;
    }
    /**
     * Validate a result URL.
     *
     * @param {string} rawUrl
     * @returns {string|null} The normalised href, or null when the URL does not
     *   parse, is not http(s), has the exact host "duckduckgo.com", or carries
     *   an ad-tracking query parameter (ad_domain / ad_provider / ad_type).
     */
    validateUrl(rawUrl) {
        try {
            const parsed = new URL(rawUrl);
            if (!['http:', 'https:'].includes(parsed.protocol))
                return null;
            // Filter duckduckgo.com URLs (internal links, ad redirects, etc.)
            if (parsed.hostname === 'duckduckgo.com')
                return null;
            // Filter URLs with ad tracking query params
            const adParams = ['ad_domain', 'ad_provider', 'ad_type'];
            if (adParams.some((name) => parsed.searchParams.has(name)))
                return null;
            return parsed.href;
        }
        catch {
            return null;
        }
    }
    /**
     * Stealth browser scrape of google.com/search.
     * Used when no Custom Search API key is configured.
     * Strategy A: peel() with stealth rendering (consistent with StealthSearchProvider).
     * Strategy B: direct playwright-extra launch (if peel returns no results).
     *
     * Never throws: errors from either strategy are logged at debug level and
     * the method resolves to [] when both come up empty.
     *
     * @param {string} query
     * @param {number} count - Maximum number of results to return.
     * @param {object} [options] - Reads `locale` (Strategy B browser locale,
     *   default 'en-US') and `structured` (Strategy A only: attach parsed SERP
     *   data to the first result as `serp`).
     * @returns {Promise<Array<{title: string, url: string, snippet: string}>>}
     */
    async scrapeGoogleStealth(query, count, options) {
        const locale = options?.locale ?? 'en-US';
        // Both strategies request the same page; ask for up to 2x the wanted
        // results (capped at 20).
        const params = new URLSearchParams({
            q: query,
            num: String(Math.min(count * 2, 20)),
            hl: 'en',
            gl: 'us',
        });
        const url = `https://www.google.com/search?${params.toString()}`;
        // Strategy A: peel() + cheerio parse
        let peelTimer;
        try {
            const { peel } = await import('../index.js');
            const peelTimeout = new Promise((_, reject) => {
                peelTimer = setTimeout(() => reject(new Error('Google stealth peel timeout')), 20_000);
            });
            const result = await Promise.race([
                peel(url, { render: true, stealth: true, format: 'html', wait: 3000 }),
                peelTimeout,
            ]);
            const html = result.content || '';
            if (html) {
                const results = this._parseGoogleHtml(html, count);
                if (results.length > 0) {
                    // Attach structured SERP data to the first result when structured=true
                    if (options?.structured) {
                        const { parseGoogleSerp } = await import('./google-serp-parser.js');
                        const serp = parseGoogleSerp(html);
                        results[0] = { ...results[0], serp };
                    }
                    return results;
                }
            }
        }
        catch (e) {
            log.debug('Google stealth (peel) error:', e instanceof Error ? e.message : e);
        }
        finally {
            // Always cancel the race timer so it cannot keep the process alive
            // for 20 s after peel() has already settled.
            clearTimeout(peelTimer);
        }
        // Strategy B: direct playwright-extra + stealth plugin
        let browser;
        let context;
        let page;
        try {
            const pwExtra = await import('playwright-extra');
            const StealthPlugin = (await import('puppeteer-extra-plugin-stealth')).default;
            const stealthChromium = pwExtra.chromium;
            stealthChromium.use(StealthPlugin());
            browser = await stealthChromium.launch({
                headless: true,
                args: [
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-gpu',
                ],
            });
            context = await browser.newContext({
                userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                viewport: { width: 1280, height: 720 },
                locale,
            });
            page = await context.newPage();
            await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15_000 });
            // Use page.content() + cheerio to avoid needing DOM lib types in tsconfig
            const html = await page.content();
            return this._parseGoogleHtml(html, count);
        }
        catch (e) {
            log.debug('Google stealth (playwright) error:', e instanceof Error ? e.message : e);
            return [];
        }
        finally {
            // Tear down innermost-first; each close is best-effort.
            await page?.close().catch(() => { });
            await context?.close().catch(() => { });
            await browser?.close().catch(() => { });
        }
    }
    /**
     * Parse Google search result HTML using cheerio. No DOM lib types required.
     *
     * @param {string} html - Raw results page markup.
     * @param {number} count - Maximum number of results to return.
     * @returns {Array<{title: string, url: string, snippet: string}>}
     */
    _parseGoogleHtml(html, count) {
        const $ = load(html);
        const results = [];
        const seen = new Set();
        // Multiple selector patterns for resilience across Google HTML variants
        const resultBlocks = $('#search .g, #rso .g, [data-hveid] .g');
        resultBlocks.each((_i, elem) => {
            // Returning false stops cheerio's iteration once enough results are collected.
            if (results.length >= count)
                return false;
            const $r = $(elem);
            const $a = $r.find('a[href^="http"]').first();
            const $h3 = $r.find('h3').first();
            if (!$a.length || !$h3.length)
                return;
            const href = $a.attr('href') || '';
            // Drop Google-internal links, account links, ad clicks and fragments.
            if (href.includes('google.com/') ||
                href.includes('accounts.google') ||
                href.includes('/aclk') ||
                href.startsWith('#'))
                return;
            const validated = this.validateUrl(href);
            if (!validated)
                return;
            const key = normalizeUrlForDedupe(validated);
            if (seen.has(key))
                return;
            seen.add(key);
            const title = cleanText($h3.text(), { maxLen: 200 });
            if (!title)
                return;
            // Snippet: try multiple known Google snippet attrs/classes in order
            const snippetText = $r.find('[data-sncf]').first().text() ||
                $r.find('.VwiC3b').first().text() ||
                $r.find('[style*="-webkit-line-clamp"]').first().text() ||
                $r.find('.st').first().text() ||
                '';
            const snippet = cleanText(snippetText, { maxLen: 500, stripEllipsisPadding: true });
            results.push({ title, url: validated, snippet });
        });
        return results.slice(0, count);
    }
    /**
     * Search Google.
     *
     * Uses the Custom Search JSON API when both an API key (options.apiKey,
     * GOOGLE_SEARCH_KEY or GOOGLE_API_KEY) and GOOGLE_SEARCH_CX are available;
     * otherwise falls back to scrapeGoogleStealth().
     *
     * @param {string} query
     * @param {object} options
     * @param {number} [options.count] - Maximum results; defaults to 10 when
     *   missing or not numeric. The API request is clamped to 1..10.
     * @param {string} [options.apiKey]
     * @param {string} [options.tbs] - Freshness ('pd' | 'pw' | 'pm' | 'py').
     * @param {AbortSignal} [options.signal] - Cancels the API request.
     * @returns {Promise<Array<{url: string, title: string, snippet: string}>>}
     * @throws {Error} When the Custom Search API answers non-2xx.
     */
    async searchWeb(query, options) {
        const { count, apiKey: optApiKey, tbs } = options;
        const apiKey = optApiKey || process.env.GOOGLE_SEARCH_KEY || process.env.GOOGLE_API_KEY;
        const cx = process.env.GOOGLE_SEARCH_CX;
        // A missing count would otherwise turn into "num=NaN" in the request.
        const numericCount = count == null ? Number.NaN : Number(count);
        const requested = Number.isFinite(numericCount) ? numericCount : 10;
        // No API key — fall back to stealth browser scraping
        if (!apiKey || !cx) {
            return this.scrapeGoogleStealth(query, requested, options);
        }
        // Custom Search JSON API path
        const params = new URLSearchParams({
            key: apiKey,
            cx: cx,
            q: query,
            num: String(Math.min(Math.max(Math.trunc(requested), 1), 10)), // Google CSE accepts 1..10 per request
        });
        const dateRestrict = this.mapFreshnessToDateRestrict(tbs);
        if (dateRestrict)
            params.set('dateRestrict', dateRestrict);
        // Always enforce the 10 s timeout; also honour the caller's signal when
        // the runtime can combine signals.
        const timeoutSignal = AbortSignal.timeout(10000);
        const signal = options.signal && typeof AbortSignal.any === 'function'
            ? AbortSignal.any([options.signal, timeoutSignal])
            : timeoutSignal;
        const response = await fetch(`https://www.googleapis.com/customsearch/v1?${params}`, {
            signal,
        });
        if (!response.ok) {
            const text = await response.text();
            throw new Error(`Google search failed (${response.status}): ${text.substring(0, 200)}`);
        }
        const data = await response.json();
        return (data.items || []).map((item) => ({
            url: item.link,
            title: item.title,
            snippet: item.snippet || '',
        }));
    }
}
|
|
1587
|
+
/**
 * Create the search provider registered under `id`.
 *
 * Recognized ids: 'brave', 'stealth', 'google'. A missing id, 'duckduckgo',
 * and any unrecognized id all yield a DuckDuckGoProvider.
 *
 * @param {string} [id] - Provider identifier.
 * @returns {object} A new provider instance.
 */
export function getSearchProvider(id) {
    switch (id) {
        case 'brave':
            return new BraveSearchProvider();
        case 'stealth':
            return new StealthSearchProvider();
        case 'google':
            return new GoogleSearchProvider();
        // 'baidu' and 'yandex' are handled by BaiduSearchProvider / YandexSearchProvider
        // from './search-engines.js'. They cannot be imported here (circular dependency).
        // Use search-engines.ts directly for these providers.
        default:
            // Covers a missing id, 'duckduckgo', and any other value.
            return new DuckDuckGoProvider();
    }
}
|
|
1602
|
+
/**
 * Get the best available search provider based on configured API keys.
 *
 * Priority:
 *   1. Google Custom Search JSON API — when GOOGLE_SEARCH_KEY (or GOOGLE_API_KEY)
 *      and GOOGLE_SEARCH_CX are both set
 *   2. Brave Search — when BRAVE_SEARCH_KEY (or BRAVE_API_KEY) is set
 *   3. DuckDuckGo, which carries its own fallback chain
 *      (DDG HTTP → DDG Lite → stealth multi-engine (Bing + Ecosia))
 *
 * @returns {{ provider: object, apiKey?: string }} The selected provider and,
 *   for the key-based providers, the API key read from the environment.
 *   `apiKey` is absent for DuckDuckGo.
 */
export function getBestSearchProvider() {
    // 1. Google Custom Search JSON API (BYOK) — works from any IP
    const googleKey = process.env.GOOGLE_SEARCH_KEY || process.env.GOOGLE_API_KEY;
    const googleCx = process.env.GOOGLE_SEARCH_CX;
    if (googleKey && googleCx) {
        return { provider: new GoogleSearchProvider(), apiKey: googleKey };
    }
    // 2. Brave Search (BYOK)
    const braveKey = process.env.BRAVE_SEARCH_KEY || process.env.BRAVE_API_KEY;
    if (braveKey) {
        return { provider: new BraveSearchProvider(), apiKey: braveKey };
    }
    // 3. DuckDuckGo with full internal fallback chain
    //    (DDG HTTP → DDG Lite → stealth multi-engine (Bing + Ecosia))
    return { provider: new DuckDuckGoProvider() };
}
|