@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawl checkpoint system for resume capability.
|
|
3
|
+
* Saves progress to a JSON file so interrupted crawls can continue.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Snapshot of a crawl's progress, persisted so that an interrupted crawl
 * can be resumed later.
 */
export interface CrawlCheckpoint {
    /** Unique identifier of the crawl job (a hash of the start URL and options). */
    jobId: string;

    /** URL the crawl started from. */
    startUrl: string;

    /** Already-crawled URLs, keyed by URL, each with a small result summary. */
    completed: Map<
        string,
        {
            status: number;
            contentLength: number;
            timestamp: number;
        }
    >;

    /** URLs that are queued but have not been crawled yet. */
    pending: string[];

    /** URLs that were discovered but have not been queued yet. */
    discovered: string[];

    /** Serialized crawl options. */
    options: Record<string, any>;

    /** Time at which the crawl started. */
    startedAt: number;

    /** Time at which the most recent checkpoint was taken. */
    lastCheckpoint: number;

    /** Target total number of pages for the crawl. */
    maxPages: number;
}
|
|
29
|
+
/**
 * Derive a deterministic job ID from a start URL and crawl options.
 *
 * @param url - Start URL of the crawl.
 * @param options - Crawl options that contribute to the ID (optional).
 * @returns The job ID string.
 */
export declare function generateJobId(
    url: string,
    options?: Record<string, any>,
): string;
|
|
33
|
+
/**
 * Write a checkpoint to disk.
 *
 * @param checkpoint - The crawl state to persist.
 */
export declare function saveCheckpoint(
    checkpoint: CrawlCheckpoint,
): void;
|
|
37
|
+
/**
 * Read a previously saved checkpoint from disk.
 *
 * @param jobId - ID of the crawl job whose checkpoint should be loaded.
 * @returns The checkpoint, or `null` when none could be loaded.
 */
export declare function loadCheckpoint(
    jobId: string,
): CrawlCheckpoint | null;
|
|
41
|
+
/**
 * Remove a checkpoint, e.g. once the crawl has completed or been abandoned.
 *
 * @param jobId - ID of the crawl job whose checkpoint should be removed.
 */
export declare function deleteCheckpoint(
    jobId: string,
): void;
|
|
45
|
+
/**
 * Enumerate the checkpoints that currently exist.
 *
 * @returns One summary per checkpoint; `completed` and `pending` are counts.
 */
export declare function listCheckpoints(): {
    jobId: string;
    startUrl: string;
    completed: number;
    pending: number;
    lastCheckpoint: number;
}[];
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawl checkpoint system for resume capability.
|
|
3
|
+
* Saves progress to a JSON file so interrupted crawls can continue.
|
|
4
|
+
*/
|
|
5
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, renameSync, unlinkSync, writeFileSync } from 'fs';
|
|
6
|
+
import { join } from 'path';
|
|
7
|
+
import { createHash } from 'crypto';
|
|
8
|
+
// Checkpoints live under "<home>/.webpeel/checkpoints"; when the HOME
// environment variable is unset (or empty) the base falls back to "/tmp".
const checkpointBaseDir = process.env.HOME || '/tmp';
const CHECKPOINT_DIR = join(checkpointBaseDir, '.webpeel', 'checkpoints');
|
|
9
|
+
/**
 * Build a deterministic job ID for a crawl.
 *
 * The ID is the first 16 hex characters of the SHA-256 digest of a JSON
 * document containing the URL and a fixed subset of the options
 * (`maxPages`, `maxDepth`, `includes`, `excludes`). Options that are
 * `undefined` are dropped by `JSON.stringify` and so do not affect the ID.
 *
 * NOTE(review): this reads `options.includes` / `options.excludes`, whereas
 * `CrawlOptions` declares `includePatterns` / `excludePatterns` — confirm
 * that callers pass the names expected here.
 *
 * @param url - Start URL of the crawl.
 * @param options - Crawl options; only the fields listed above are used.
 * @returns A 16-character lowercase hex string.
 */
export function generateJobId(url, options = {}) {
    // Property order is part of the hashed payload — keep it stable.
    const fingerprint = {
        url,
        maxPages: options.maxPages,
        maxDepth: options.maxDepth,
        includes: options.includes,
        excludes: options.excludes,
    };
    const hasher = createHash('sha256');
    hasher.update(JSON.stringify(fingerprint));
    const fullDigest = hasher.digest('hex');
    return fullDigest.slice(0, 16);
}
|
|
22
|
+
/**
 * Resolve the on-disk location of a job's checkpoint file:
 * `<CHECKPOINT_DIR>/<jobId>.json`.
 *
 * @param jobId - ID of the crawl job.
 * @returns Path of the checkpoint file.
 */
function getCheckpointPath(jobId) {
    const fileName = `${jobId}.json`;
    return join(CHECKPOINT_DIR, fileName);
}
|
|
28
|
+
/**
 * Save a checkpoint to disk (best-effort).
 *
 * The checkpoint directory is created if needed. The `completed` Map is
 * converted to a plain object so it survives JSON serialization, and
 * `lastCheckpoint` is overwritten with the current time.
 *
 * The JSON is first written to a temporary sibling file and then renamed
 * over the real checkpoint, so an interruption in the middle of a write
 * cannot leave a truncated checkpoint behind (which would make the crawl
 * impossible to resume).
 *
 * Failures never propagate to the caller; they are reported through
 * `console.debug` only when the `DEBUG` environment variable is set.
 *
 * @param checkpoint - Crawl state to persist; `checkpoint.completed` must be
 *   iterable as key/value entries (e.g. a Map).
 */
export function saveCheckpoint(checkpoint) {
    try {
        mkdirSync(CHECKPOINT_DIR, { recursive: true });
        const data = {
            ...checkpoint,
            // A Map serializes to "{}" in JSON, so flatten it to an object.
            completed: Object.fromEntries(checkpoint.completed),
            lastCheckpoint: Date.now(),
        };
        const finalPath = getCheckpointPath(checkpoint.jobId);
        // The ".tmp" suffix keeps the temp file out of listCheckpoints(),
        // which only considers names ending in ".json".
        const tempPath = `${finalPath}.${process.pid}.tmp`;
        try {
            writeFileSync(tempPath, JSON.stringify(data, null, 2));
            renameSync(tempPath, finalPath);
        }
        catch (writeError) {
            // Do not leave a stray temp file behind; the previous checkpoint
            // (if any) is still intact at finalPath.
            try {
                unlinkSync(tempPath);
            }
            catch { /* temp file was never created or is already gone */ }
            throw writeError;
        }
    }
    catch (e) {
        if (process.env.DEBUG) {
            console.debug('[webpeel]', 'Failed to save checkpoint:', e instanceof Error ? e.message : e);
        }
    }
}
|
|
47
|
+
/**
 * Read a job's checkpoint back from disk.
 *
 * The `completed` field, stored as a plain object, is turned back into a
 * Map (an absent field becomes an empty Map).
 *
 * @param jobId - ID of the crawl job.
 * @returns The checkpoint, or `null` if the file does not exist or cannot
 *   be read or parsed.
 */
export function loadCheckpoint(jobId) {
    const checkpointFile = getCheckpointPath(jobId);
    if (existsSync(checkpointFile)) {
        try {
            const fileText = readFileSync(checkpointFile, 'utf-8');
            const stored = JSON.parse(fileText);
            const completedEntries = Object.entries(stored.completed || {});
            return { ...stored, completed: new Map(completedEntries) };
        }
        catch {
            // Unreadable or malformed checkpoint: treat as missing.
            return null;
        }
    }
    return null;
}
|
|
65
|
+
/**
 * Remove a job's checkpoint file, e.g. after the crawl completed or was
 * abandoned. A missing file is not an error, and file-system failures are
 * deliberately ignored.
 *
 * @param jobId - ID of the crawl job.
 */
export function deleteCheckpoint(jobId) {
    const checkpointFile = getCheckpointPath(jobId);
    try {
        if (!existsSync(checkpointFile)) {
            return;
        }
        unlinkSync(checkpointFile);
    }
    catch {
        /* ignore */
    }
}
|
|
77
|
+
/**
 * Summarize every checkpoint file in the checkpoint directory.
 *
 * Only files whose names end in `.json` are considered. Files that cannot
 * be read or parsed are skipped. If the directory is missing or cannot be
 * listed, an empty array is returned.
 *
 * @returns One entry per readable checkpoint with its job ID, start URL,
 *   number of completed URLs, number of pending URLs and last checkpoint
 *   time.
 */
export function listCheckpoints() {
    try {
        if (!existsSync(CHECKPOINT_DIR)) {
            return [];
        }
        const summaries = [];
        for (const fileName of readdirSync(CHECKPOINT_DIR)) {
            if (!fileName.endsWith('.json')) {
                continue;
            }
            try {
                const fileText = readFileSync(join(CHECKPOINT_DIR, fileName), 'utf-8');
                const stored = JSON.parse(fileText);
                summaries.push({
                    jobId: stored.jobId,
                    startUrl: stored.startUrl,
                    completed: Object.keys(stored.completed || {}).length,
                    pending: (stored.pending || []).length,
                    lastCheckpoint: stored.lastCheckpoint,
                });
            }
            catch {
                // Skip checkpoint files that are unreadable or malformed.
            }
        }
        return summaries;
    }
    catch {
        return [];
    }
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler functionality
|
|
3
|
+
* Crawls a starting URL and follows links matching specified patterns
|
|
4
|
+
*/
|
|
5
|
+
import type { PeelOptions } from '../types.js';
|
|
6
|
+
/**
 * Options accepted by `crawl`. Every `PeelOptions` field except `format` is
 * also accepted.
 */
export interface CrawlOptions extends Omit<PeelOptions, 'format'> {
    /** Maximum number of pages to crawl (default: 10, max: tier-dependent) */
    maxPages?: number;
    /** Tier for determining the max pages cap (default: 'free') */
    tier?: string;
    /** Maximum depth to crawl (default: 2, min: 1, max: 5) */
    maxDepth?: number;
    /** Only crawl URLs from these domains (default: same domain as starting URL) */
    allowedDomains?: string[];
    /** Exclude URLs matching these patterns (regex strings) */
    excludePatterns?: string[];
    /** Respect robots.txt (default: true) */
    respectRobotsTxt?: boolean;
    /**
     * Rate limit between requests in milliseconds (default: 500, minimum: 100).
     * A Crawl-delay found in robots.txt may override this value.
     */
    rateLimitMs?: number;
    /** Try sitemap.xml first to discover URLs (default: false) */
    sitemapFirst?: boolean;
    /** Crawl strategy: breadth-first or depth-first (default: 'bfs') */
    strategy?: 'bfs' | 'dfs';
    /** Skip duplicate content using fingerprinting (default: true) */
    deduplication?: boolean;
    /** Only crawl URLs matching these regex patterns */
    includePatterns?: string[];
    /** Progress callback called after each page */
    onProgress?: (status: CrawlProgress) => void;
    /** Per-page callback — receives the full result as soon as a page completes */
    onPage?: (result: CrawlResult) => void;
    /** Resume an interrupted crawl from its last checkpoint */
    resume?: boolean;
}
|
|
36
|
+
/** Snapshot of crawl state passed to the `onProgress` callback. */
export interface CrawlProgress {
    /** Number of results recorded so far */
    crawled: number;
    /** Number of URLs currently waiting in the queue */
    queued: number;
    /** Number of pages that failed to fetch */
    failed: number;
    /** URL that was just processed */
    currentUrl: string;
    /** Time since the crawl started (ms) */
    elapsed: number;
}
|
|
43
|
+
/** Outcome of crawling a single page, whether it succeeded or failed. */
export interface CrawlResult {
    /** Address of the page that was crawled */
    url: string;
    /** Title of the page */
    title: string;
    /** Page content rendered as Markdown */
    markdown: string;
    /** Number of tokens in this page's content */
    tokens: number;
    /** Absolute URLs of every link found on the page */
    links: Array<string>;
    /** Crawl depth of the page (0 = starting URL) */
    depth: number;
    /** URL of the page that linked here (null for the starting URL) */
    parent: string | null;
    /** Time spent fetching the page (ms) */
    elapsed: number;
    /** Error message, present only when the page failed to fetch */
    error?: string;
    /** Content fingerprint used for deduplication */
    fingerprint?: string;
}
|
|
65
|
+
/**
 * Crawl a website, beginning at `startUrl` and following the links it finds.
 *
 * @param startUrl - URL at which the crawl begins
 * @param options - Optional crawl settings (limits, filters, callbacks)
 * @returns A promise resolving to one result per crawled page
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export declare function crawl(startUrl: string, options?: CrawlOptions): Promise<CrawlResult[]>;
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web crawler functionality
|
|
3
|
+
* Crawls a starting URL and follows links matching specified patterns
|
|
4
|
+
*/
|
|
5
|
+
import { peel } from '../index.js';
|
|
6
|
+
import { fetch as undiciFetch } from 'undici';
|
|
7
|
+
import { createHash } from 'crypto';
|
|
8
|
+
import { discoverSitemap } from './sitemap.js';
|
|
9
|
+
import { generateJobId, loadCheckpoint, saveCheckpoint, deleteCheckpoint, } from './crawl-checkpoint.js';
|
|
10
|
+
import { createLogger } from './logger.js';
|
|
11
|
+
const log = createLogger('crawler');
|
|
12
|
+
/**
 * Compile a user-supplied regex pattern defensively.
 *
 * Patterns longer than 200 characters are rejected up front, and a pattern
 * that fails to compile is reported with a uniform error message.
 *
 * @param pattern - Regex source string supplied by the caller
 * @returns The compiled regular expression
 * @throws Error when the pattern is too long or is not a valid regex
 */
function safeRegex(pattern) {
    const patternLength = pattern.length;
    if (patternLength > 200) {
        throw new Error(`Regex pattern too long (${patternLength} chars, max 200)`);
    }
    let compiled;
    try {
        compiled = new RegExp(pattern);
    }
    catch {
        throw new Error(`Invalid regex pattern: ${pattern}`);
    }
    return compiled;
}
|
|
25
|
+
/**
 * Upper bound on the number of pages a single crawl may fetch, keyed by tier
 * name.
 */
const TIER_MAX_PAGES = {
    'free': 10,
    'starter': 25,
    'pro': 50,
    'enterprise': 100,
    'max': 100,
    'admin': 10000,
};
|
|
34
|
+
/**
 * Extract the rules that apply to `User-agent: *` from robots.txt content.
 *
 * - Everything after `#` on a line is a comment and is discarded.
 * - Consecutive `User-agent` lines form one group, so the group applies when
 *   ANY of them is `*`, not only the last one.
 * - `Crawl-delay` is given in seconds and may be fractional; negative or
 *   non-numeric values are ignored.
 *
 * @param text - Raw robots.txt body
 * @returns `{ disallowedPaths, crawlDelay }` with `crawlDelay` in milliseconds
 *          (undefined when no usable directive was found)
 */
function parseRobotsTxt(text) {
    const disallowedPaths = [];
    let crawlDelay;
    let relevantSection = false;
    let previousWasUserAgent = false;
    for (const rawLine of text.split('\n')) {
        const commentStart = rawLine.indexOf('#');
        const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
        if (!line)
            continue; // blank and comment-only lines do not affect grouping
        const separator = line.indexOf(':');
        if (separator === -1)
            continue; // not a "field: value" record
        const field = line.slice(0, separator).trim().toLowerCase();
        const value = line.slice(separator + 1).trim();
        if (field === 'user-agent') {
            const isWildcard = value === '*';
            // A run of User-agent lines shares the rules that follow it.
            relevantSection = previousWasUserAgent ? relevantSection || isWildcard : isWildcard;
            previousWasUserAgent = true;
            continue;
        }
        previousWasUserAgent = false;
        if (!relevantSection)
            continue;
        if (field === 'disallow') {
            // An empty Disallow means "allow everything" and adds no rule.
            if (value) {
                disallowedPaths.push(value);
            }
        }
        else if (field === 'crawl-delay') {
            const seconds = Number.parseFloat(value);
            if (Number.isFinite(seconds) && seconds >= 0) {
                crawlDelay = Math.round(seconds * 1000); // Convert to milliseconds
            }
        }
    }
    return { disallowedPaths, crawlDelay };
}
/**
 * Fetch and parse robots.txt, returning the rules for `User-agent: *`.
 *
 * The file is requested over HTTPS with a 5 second timeout. A missing file,
 * a non-2xx response, or any network/parse failure is treated as
 * "everything allowed".
 *
 * @param domain - Host name whose robots.txt should be fetched
 * @returns `{ disallowedPaths, crawlDelay? }` where `crawlDelay` is in milliseconds
 */
async function fetchRobotsTxt(domain) {
    const robotsUrl = `https://${domain}/robots.txt`;
    try {
        const response = await undiciFetch(robotsUrl, {
            headers: {
                'User-Agent': 'WebPeel/0.3.1 (+https://webpeel.dev)',
            },
            signal: AbortSignal.timeout(5000), // 5 second timeout
        });
        if (!response.ok) {
            // If robots.txt doesn't exist, allow everything
            return { disallowedPaths: [] };
        }
        return parseRobotsTxt(await response.text());
    }
    catch {
        // If we can't fetch robots.txt, allow everything
        return { disallowedPaths: [] };
    }
}
|
|
87
|
+
/**
 * Decide whether robots.txt rules permit fetching a URL.
 *
 * Only plain prefix matching against the URL's path is performed; wildcard
 * syntax in robots.txt rules is not interpreted.
 *
 * @param url - Absolute URL to check
 * @param rules - Parsed rules object exposing a `disallowedPaths` array
 * @returns `false` when the path starts with any disallowed prefix, otherwise `true`
 */
function isAllowedByRobots(url, rules) {
    const { pathname } = new URL(url);
    const blocked = rules.disallowedPaths.some((prefix) => pathname.startsWith(prefix));
    return !blocked;
}
|
|
101
|
+
/**
 * Crawl a website starting from a URL.
 *
 * Pages are fetched one at a time through `peel` (always as markdown) and
 * filtered by allowed domain, include/exclude patterns, depth and, unless
 * disabled, robots.txt. A checkpoint is written every 5 recorded pages and
 * removed once the crawl finishes.
 *
 * @param startUrl - Starting URL to crawl from
 * @param options - Crawl options
 * @returns Array of crawl results (failed pages are included with `error` set)
 * @throws If `startUrl` is not a valid absolute URL, or an include/exclude
 *         pattern is longer than 200 characters or not a valid regex
 *
 * @example
 * ```typescript
 * import { crawl } from 'webpeel';
 *
 * const results = await crawl('https://example.com', {
 *   maxPages: 20,
 *   maxDepth: 2,
 * });
 *
 * console.log(`Crawled ${results.length} pages`);
 * ```
 */
export async function crawl(startUrl, options = {}) {
    const { maxPages = 10, tier, maxDepth = 2, allowedDomains, excludePatterns = [], respectRobotsTxt = true, rateLimitMs = 500, sitemapFirst = false, strategy = 'bfs', deduplication = true, includePatterns = [], resume = false, onProgress, onPage, ...peelOptions } = options;
    const crawlStartTime = Date.now();
    const describeError = (error) => (error instanceof Error ? error.message : 'Unknown error');
    // Validate limits. Own-property lookup only, so tier names that collide with
    // Object.prototype members (e.g. "constructor") fall back to the free tier
    // instead of producing a NaN page cap.
    const tierKey = tier || 'free';
    const tierMaxPages = Object.prototype.hasOwnProperty.call(TIER_MAX_PAGES, tierKey)
        ? TIER_MAX_PAGES[tierKey]
        : TIER_MAX_PAGES.free;
    const validatedMaxPages = Math.min(Math.max(maxPages, 1), tierMaxPages);
    const validatedMaxDepth = Math.min(Math.max(maxDepth, 1), 5);
    const validatedRateLimit = Math.max(rateLimitMs, 100); // Min 100ms between requests
    // Parse starting URL
    const startUrlObj = new URL(startUrl);
    const startDomain = startUrlObj.hostname;
    // Default: only crawl same domain as starting URL
    const validatedAllowedDomains = allowedDomains && allowedDomains.length > 0
        ? allowedDomains
        : [startDomain];
    // Compile include/exclude patterns. safeRegex caps pattern length and rejects
    // invalid patterns; it does not bound matching time.
    const excludeRegexes = excludePatterns.map(pattern => safeRegex(pattern));
    const includeRegexes = includePatterns.map(pattern => safeRegex(pattern));
    // Fetch robots.txt if needed (only the starting domain's file is consulted)
    let robotsRules = { disallowedPaths: [] };
    if (respectRobotsTxt) {
        robotsRules = await fetchRobotsTxt(startDomain);
        if (robotsRules.crawlDelay && robotsRules.crawlDelay > validatedRateLimit) {
            log.info(`Using Crawl-delay from robots.txt: ${robotsRules.crawlDelay}ms`);
        }
    }
    // Crawl-delay may only slow the crawl down; it never undercuts our own limit.
    const effectiveRateLimit = Math.max(robotsRules.crawlDelay || 0, validatedRateLimit);
    // Checkpoint: generate a deterministic job ID for this crawl
    const crawlOptionsForCheckpoint = {
        maxPages: validatedMaxPages,
        maxDepth: validatedMaxDepth,
        includes: includePatterns,
        excludes: excludePatterns,
    };
    const jobId = generateJobId(startUrl, crawlOptionsForCheckpoint);
    // Load existing checkpoint if resume is requested
    const checkpoint = resume ? loadCheckpoint(jobId) : null;
    if (checkpoint) {
        log.info(`Resuming crawl from checkpoint: ${checkpoint.completed.size} pages already crawled`);
    }
    // State tracking
    const results = [];
    const visited = new Set();
    const contentFingerprints = new Set();
    let failedCount = 0;
    let fetchAttempts = 0;
    // Pages completed by an earlier run are carried forward so that checkpoints
    // written by this run do not forget them.
    const completedPages = new Map(checkpoint ? checkpoint.completed : []);
    for (const completedUrl of completedPages.keys()) {
        visited.add(completedUrl);
    }
    const queue = [];
    // If resuming with pending URLs, restore queue; otherwise start from scratch.
    // Checkpoints store bare URLs, so restored items are treated as depth 1.
    const pendingUrls = checkpoint && Array.isArray(checkpoint.pending) ? checkpoint.pending : [];
    if (pendingUrls.length > 0) {
        for (const pendingUrl of pendingUrls) {
            queue.push({ url: pendingUrl, depth: 1, parent: startUrl });
        }
    }
    else {
        queue.push({ url: startUrl, depth: 0, parent: null });
    }
    // Sitemap-first: Discover URLs from sitemap before crawling
    if (sitemapFirst) {
        try {
            const sitemap = await discoverSitemap(startDomain, { timeout: 10000, maxUrls: validatedMaxPages });
            for (const entry of sitemap.urls) {
                const entryUrl = entry.url;
                try {
                    const entryUrlObj = new URL(entryUrl);
                    if (validatedAllowedDomains.includes(entryUrlObj.hostname)) {
                        queue.push({ url: entryUrl, depth: 1, parent: startUrl });
                    }
                }
                catch { /* skip invalid URLs */ }
            }
        }
        catch { /* skip sitemap errors */ }
    }
    // Invoke the user callbacks. A throwing callback is logged and must neither
    // abort the crawl nor be recorded as a page failure.
    const reportPage = (pageResult, currentUrl) => {
        if (onPage) {
            try {
                onPage(pageResult);
            }
            catch (callbackError) {
                log.error(`onPage callback failed for ${currentUrl}: ${describeError(callbackError)}`);
            }
        }
        if (onProgress) {
            try {
                onProgress({
                    crawled: results.length,
                    queued: queue.length,
                    failed: failedCount,
                    currentUrl,
                    elapsed: Date.now() - crawlStartTime,
                });
            }
            catch (callbackError) {
                log.error(`onProgress callback failed for ${currentUrl}: ${describeError(callbackError)}`);
            }
        }
    };
    while (queue.length > 0 && results.length < validatedMaxPages) {
        // Use DFS (stack) or BFS (queue) strategy
        const item = strategy === 'dfs' ? queue.pop() : queue.shift();
        const { url, depth, parent } = item;
        // Skip if already visited
        if (visited.has(url))
            continue;
        visited.add(url);
        // Skip if depth exceeded
        if (depth > validatedMaxDepth)
            continue;
        // Validate URL
        let urlObj;
        try {
            urlObj = new URL(url);
        }
        catch {
            continue; // Skip invalid URLs
        }
        // Check if domain is allowed
        if (!validatedAllowedDomains.includes(urlObj.hostname)) {
            continue;
        }
        // Check exclude patterns
        if (excludeRegexes.some(regex => regex.test(url))) {
            continue;
        }
        // Check include patterns
        if (includeRegexes.length > 0 && !includeRegexes.some(regex => regex.test(url))) {
            continue;
        }
        // Check robots.txt
        if (respectRobotsTxt && !isAllowedByRobots(url, robotsRules)) {
            log.debug(`Skipping ${url} (disallowed by robots.txt)`);
            continue;
        }
        // Rate limiting: wait before every page fetch except the first, so failed
        // and duplicate pages are throttled too and no delay is spent after the
        // final fetch.
        if (fetchAttempts > 0) {
            await new Promise(resolve => setTimeout(resolve, effectiveRateLimit));
        }
        fetchAttempts++;
        // Fetch the page. Only fetching and result construction are guarded, so a
        // later failure cannot record the same URL a second time as an error.
        let crawlResult;
        let discoveredLinks;
        try {
            const result = await peel(url, {
                ...peelOptions,
                format: 'markdown',
            });
            // Deduplication: compute content fingerprint
            let fingerprint;
            if (deduplication) {
                fingerprint = createHash('sha256').update(result.content).digest('hex');
                if (contentFingerprints.has(fingerprint)) {
                    // Skip duplicate content
                    continue;
                }
                contentFingerprints.add(fingerprint);
            }
            crawlResult = {
                url: result.url,
                title: result.title,
                markdown: result.content,
                tokens: result.tokens ?? 0,
                links: result.links,
                depth,
                parent,
                elapsed: result.elapsed,
            };
            if (fingerprint) {
                crawlResult.fingerprint = fingerprint;
            }
            discoveredLinks = Array.isArray(result.links) ? result.links : [];
        }
        catch (error) {
            // Log error and continue
            failedCount++;
            const errorMessage = describeError(error);
            log.error(`Failed to fetch ${url}: ${errorMessage}`);
            const errorResult = {
                url,
                title: '',
                markdown: '',
                tokens: 0,
                links: [],
                depth,
                parent,
                elapsed: 0,
                error: errorMessage,
            };
            results.push(errorResult);
            reportPage(errorResult, url);
            continue;
        }
        results.push(crawlResult);
        completedPages.set(crawlResult.url, {
            status: 200,
            contentLength: crawlResult.markdown.length,
            timestamp: Date.now(),
        });
        // Per-page and progress callbacks (the queue size reported here does not
        // yet include this page's links)
        reportPage(crawlResult, url);
        // Add discovered links to queue
        if (depth < validatedMaxDepth) {
            for (const link of discoveredLinks) {
                if (!visited.has(link)) {
                    queue.push({
                        url: link,
                        depth: depth + 1,
                        parent: url,
                    });
                }
            }
        }
        // Save checkpoint every 5 pages. Done after enqueueing so the links found
        // on this page are part of `pending`; persistence is best-effort.
        if (results.length % 5 === 0) {
            try {
                saveCheckpoint({
                    jobId,
                    startUrl,
                    completed: new Map(completedPages),
                    pending: queue.map(q => q.url),
                    discovered: [],
                    options: crawlOptionsForCheckpoint,
                    startedAt: crawlStartTime,
                    lastCheckpoint: Date.now(),
                    maxPages: validatedMaxPages,
                });
            }
            catch (checkpointError) {
                log.error(`Failed to save checkpoint for ${jobId}: ${describeError(checkpointError)}`);
            }
        }
    }
    // Crawl complete — clean up checkpoint
    deleteCheckpoint(jobId);
    return results;
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-source verification — search multiple engines, compare results,
|
|
3
|
+
* compute consensus/confidence scores.
|
|
4
|
+
*/
|
|
5
|
+
import type { WebSearchResult } from './search-provider.js';
|
|
6
|
+
/**
 * Result of a cross-source verification run.
 * NOTE(review): the field notes below are inferred from names and types
 * only — confirm against the implementation.
 */
export interface CrossVerifyResult {
    /** The search query */
    query: string;
    /** One entry per search engine */
    sources: {
        engine: string;
        resultCount: number;
        topResults: WebSearchResult[];
    }[];
    /** URLs together with the engines they appear in and their scores */
    consensus: {
        url: string;
        title: string;
        appearsIn: string[];
        agreementScore: number;
        averagePosition: number;
    }[];
    confidence: number;
    totalSources: number;
    /** Presumably elapsed time in ms — TODO confirm */
    elapsed: number;
}
|
|
24
|
+
/**
 * Search multiple engines for a query and compare their results.
 *
 * @param query - Search query
 * @param options - Optional settings: `engines` (names of engines to use) and
 *                  `count` (NOTE(review): presumably results per engine — confirm)
 * @returns A promise resolving to the cross-verification result
 */
export declare function crossVerifySearch(
    query: string,
    options?: {
        engines?: string[];
        count?: number;
    },
): Promise<CrossVerifyResult>;
|