@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local-first content change tracking
|
|
3
|
+
* Stores snapshots in ~/.webpeel/snapshots/ and provides diffing
|
|
4
|
+
*/
|
|
5
|
+
import { createHash } from 'crypto';
|
|
6
|
+
import { promises as fs } from 'fs';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
import { homedir } from 'os';
|
|
9
|
+
// Folder that holds one JSON snapshot file per tracked URL: <home>/.webpeel/snapshots
const SNAPSHOTS_DIR = join(homedir(), '.webpeel', 'snapshots');
|
|
11
|
+
/**
 * Map a URL to the file that stores its snapshot.
 *
 * The file name is the hex SHA-256 digest of the URL with a `.json`
 * extension, located inside SNAPSHOTS_DIR.
 *
 * @param url - URL whose snapshot file is wanted
 * @returns Path of the snapshot file for that URL
 */
function getSnapshotPath(url) {
    const hasher = createHash('sha256');
    hasher.update(url);
    const fileName = hasher.digest('hex') + '.json';
    return join(SNAPSHOTS_DIR, fileName);
}
|
|
18
|
+
/**
 * Make sure the snapshots directory exists, creating missing parents as needed.
 *
 * `fs.mkdir` is called with `recursive: true`, which already resolves quietly
 * when the directory is present. Only an `EEXIST` error code is tolerated
 * here; any other failure (for example a permissions error) is rethrown so
 * the caller sees the real cause instead of a later, less specific write
 * error.
 *
 * @returns Promise that resolves once the mkdir attempt has completed
 * @throws The underlying filesystem error when its code is not `EEXIST`
 */
async function ensureSnapshotsDir() {
    try {
        await fs.mkdir(SNAPSHOTS_DIR, { recursive: true });
    }
    catch (error) {
        const alreadyExists = typeof error === 'object'
            && error !== null
            && 'code' in error
            && error.code === 'EEXIST';
        if (!alreadyExists) {
            throw error;
        }
    }
}
|
|
29
|
+
/**
 * Load the stored snapshot for a URL.
 *
 * Any failure while locating, reading or parsing the snapshot file
 * (missing file, unreadable file, invalid JSON, ...) is reported as `null`
 * rather than thrown.
 *
 * @param url - URL to look up
 * @returns The parsed snapshot, or null when none could be loaded
 *
 * @example
 * ```typescript
 * const snapshot = await getSnapshot('https://example.com');
 * if (snapshot) {
 *   console.log('Last scraped:', new Date(snapshot.timestamp));
 * }
 * ```
 */
export async function getSnapshot(url) {
    let loaded = null;
    try {
        const raw = await fs.readFile(getSnapshotPath(url), 'utf-8');
        loaded = JSON.parse(raw);
    }
    catch (error) {
        // Best-effort read: every failure maps to "no snapshot".
        loaded = null;
    }
    return loaded;
}
|
|
53
|
+
/**
 * Persist a snapshot to disk, replacing any earlier one for the same URL.
 *
 * The snapshot is written as pretty-printed (2-space) JSON to the file
 * derived from `snapshot.url`.
 *
 * @param snapshot - Snapshot object; its `url` property selects the target file
 * @returns Promise that resolves once the file has been written
 */
async function saveSnapshot(snapshot) {
    await ensureSnapshotsDir();
    const target = getSnapshotPath(snapshot.url);
    const serialized = JSON.stringify(snapshot, null, 2);
    await fs.writeFile(target, serialized, 'utf-8');
}
|
|
61
|
+
/**
|
|
62
|
+
* Simple LCS-based unified diff implementation
|
|
63
|
+
* Returns unified diff format and change statistics
|
|
64
|
+
*/
|
|
65
|
+
function computeDiff(oldContent, newContent) {
|
|
66
|
+
const oldLines = oldContent.split('\n');
|
|
67
|
+
const newLines = newContent.split('\n');
|
|
68
|
+
// Compute LCS (Longest Common Subsequence) using dynamic programming
|
|
69
|
+
const m = oldLines.length;
|
|
70
|
+
const n = newLines.length;
|
|
71
|
+
const lcs = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0));
|
|
72
|
+
for (let i = 1; i <= m; i++) {
|
|
73
|
+
for (let j = 1; j <= n; j++) {
|
|
74
|
+
if (oldLines[i - 1] === newLines[j - 1]) {
|
|
75
|
+
lcs[i][j] = lcs[i - 1][j - 1] + 1;
|
|
76
|
+
}
|
|
77
|
+
else {
|
|
78
|
+
lcs[i][j] = Math.max(lcs[i - 1][j], lcs[i][j - 1]);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
// Backtrack to build diff
|
|
83
|
+
const changes = [];
|
|
84
|
+
let i = m;
|
|
85
|
+
let j = n;
|
|
86
|
+
while (i > 0 || j > 0) {
|
|
87
|
+
if (i > 0 && j > 0 && oldLines[i - 1] === newLines[j - 1]) {
|
|
88
|
+
changes.unshift({ type: 'normal', line: j, content: newLines[j - 1] });
|
|
89
|
+
i--;
|
|
90
|
+
j--;
|
|
91
|
+
}
|
|
92
|
+
else if (j > 0 && (i === 0 || lcs[i][j - 1] >= lcs[i - 1][j])) {
|
|
93
|
+
changes.unshift({ type: 'add', line: j, content: newLines[j - 1] });
|
|
94
|
+
j--;
|
|
95
|
+
}
|
|
96
|
+
else if (i > 0) {
|
|
97
|
+
changes.unshift({ type: 'del', line: i, content: oldLines[i - 1] });
|
|
98
|
+
i--;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
// Count additions and deletions
|
|
102
|
+
let additions = 0;
|
|
103
|
+
let deletions = 0;
|
|
104
|
+
for (const change of changes) {
|
|
105
|
+
if (change.type === 'add')
|
|
106
|
+
additions++;
|
|
107
|
+
if (change.type === 'del')
|
|
108
|
+
deletions++;
|
|
109
|
+
}
|
|
110
|
+
// Build unified diff text
|
|
111
|
+
const diffLines = [];
|
|
112
|
+
let contextStart = 0;
|
|
113
|
+
for (let idx = 0; idx < changes.length; idx++) {
|
|
114
|
+
const change = changes[idx];
|
|
115
|
+
// Find chunks of changes
|
|
116
|
+
if (change.type !== 'normal') {
|
|
117
|
+
// Add context header
|
|
118
|
+
const chunkStart = Math.max(0, idx - 3);
|
|
119
|
+
const chunkEnd = Math.min(changes.length, idx + 10);
|
|
120
|
+
// Skip if we're continuing from previous chunk
|
|
121
|
+
if (idx > contextStart) {
|
|
122
|
+
diffLines.push(`@@ -${chunkStart + 1},${chunkEnd - chunkStart} +${chunkStart + 1},${chunkEnd - chunkStart} @@`);
|
|
123
|
+
}
|
|
124
|
+
// Add changes
|
|
125
|
+
for (let k = chunkStart; k < chunkEnd; k++) {
|
|
126
|
+
const c = changes[k];
|
|
127
|
+
const prefix = c.type === 'add' ? '+' : c.type === 'del' ? '-' : ' ';
|
|
128
|
+
diffLines.push(`${prefix}${c.content}`);
|
|
129
|
+
}
|
|
130
|
+
contextStart = chunkEnd;
|
|
131
|
+
idx = chunkEnd - 1;
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
return {
|
|
135
|
+
text: diffLines.join('\n'),
|
|
136
|
+
additions,
|
|
137
|
+
deletions,
|
|
138
|
+
changes,
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
/**
 * Compare fresh content for a URL against its stored snapshot, then store
 * the fresh state.
 *
 * Outcomes:
 *   - no usable stored snapshot: the content is saved and the status is 'new'
 *   - stored fingerprint equals `fingerprint`: the stored snapshot is re-saved
 *     with a refreshed timestamp and the status is 'same'
 *   - otherwise: a line diff against the stored content is computed, a new
 *     snapshot (recording the previous fingerprint and timestamp under
 *     `metadata`) is saved, and the status is 'changed'
 *
 * Any error raised along the way is logged with `console.error` and reported
 * as status 'new' with no previous scrape time; nothing is thrown.
 *
 * @param url - URL being tracked
 * @param content - Current content
 * @param fingerprint - Fingerprint of the current content, compared with the stored one
 * @returns `{ changeStatus, previousScrapeAt }`, plus `diff` when the status is 'changed'
 *
 * @example
 * ```typescript
 * const result = await trackChange('https://example.com', content, fingerprint);
 * if (result.changeStatus === 'changed') {
 *   console.log('Content changed!');
 *   console.log(`+${result.diff.additions} -${result.diff.deletions}`);
 * }
 * ```
 */
export async function trackChange(url, content, fingerprint) {
    try {
        const prior = await getSnapshot(url);
        if (prior && prior.fingerprint === fingerprint) {
            // Unchanged content: keep the stored snapshot, only bump its timestamp.
            const refreshed = { ...prior, timestamp: Date.now() };
            await saveSnapshot(refreshed);
            return {
                changeStatus: 'same',
                previousScrapeAt: new Date(prior.timestamp).toISOString(),
            };
        }
        if (prior) {
            // Fingerprints differ: diff first, then replace the stored snapshot.
            const diff = computeDiff(prior.content, content);
            await saveSnapshot({
                url,
                fingerprint,
                content,
                timestamp: Date.now(),
                metadata: {
                    previousFingerprint: prior.fingerprint,
                    previousTimestamp: prior.timestamp,
                },
            });
            return {
                changeStatus: 'changed',
                previousScrapeAt: new Date(prior.timestamp).toISOString(),
                diff,
            };
        }
        // Nothing stored yet for this URL.
        await saveSnapshot({
            url,
            fingerprint,
            content,
            timestamp: Date.now(),
        });
        return {
            changeStatus: 'new',
            previousScrapeAt: null,
        };
    }
    catch (error) {
        console.error('Change tracking error:', error);
        // Failures are reported the same way as a first sighting.
        return {
            changeStatus: 'new',
            previousScrapeAt: null,
        };
    }
}
|
|
215
|
+
/**
 * Delete stored snapshot files, optionally restricted to snapshots whose
 * `url` field matches a regular expression.
 *
 * Only files ending in `.json` inside the snapshots directory are considered.
 * With a pattern, each file is read and parsed; files that cannot be read or
 * parsed are left in place (the reason is printed when `DEBUG` is set).
 *
 * Errors do not propagate: a pattern longer than 200 characters, an invalid
 * regular expression, or a filesystem failure is logged with `console.error`
 * and the function resolves to `0`.
 *
 * @param urlPattern - Optional regex pattern to match URLs (if not provided, clears all)
 * @returns Number of snapshots cleared
 *
 * @example
 * ```typescript
 * // Clear all snapshots
 * const count = await clearSnapshots();
 *
 * // Clear specific domain
 * const count = await clearSnapshots('example\\.com');
 * ```
 */
export async function clearSnapshots(urlPattern) {
    try {
        await ensureSnapshotsDir();
        const entries = await fs.readdir(SNAPSHOTS_DIR);
        // Compile the optional filter; validation failures are handled by the outer catch.
        let matcher = null;
        if (urlPattern) {
            if (urlPattern.length > 200) {
                throw new Error('URL pattern too long (max 200 chars)');
            }
            try {
                matcher = new RegExp(urlPattern);
            }
            catch {
                throw new Error(`Invalid regex: ${urlPattern}`);
            }
        }
        let removed = 0;
        for (const entry of entries) {
            if (!entry.endsWith('.json')) {
                continue;
            }
            const target = join(SNAPSHOTS_DIR, entry);
            if (!matcher) {
                // No filter: every snapshot file goes.
                await fs.unlink(target);
                removed += 1;
                continue;
            }
            try {
                // Filtered mode: the URL lives inside the snapshot file itself.
                const raw = await fs.readFile(target, 'utf-8');
                const parsed = JSON.parse(raw);
                if (!matcher.test(parsed.url)) {
                    continue;
                }
                await fs.unlink(target);
                removed += 1;
            }
            catch (e) {
                if (process.env.DEBUG)
                    console.debug('[webpeel]', 'snapshot parse failed:', e instanceof Error ? e.message : e);
            }
        }
        return removed;
    }
    catch (error) {
        console.error('Clear snapshots error:', error);
        return 0;
    }
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content chunker for RAG pipelines.
|
|
3
|
+
* Splits markdown content into overlapping chunks with rich metadata.
|
|
4
|
+
*/
|
|
5
|
+
/** Optional settings accepted by `chunkContent`; every field may be omitted. */
export interface ChunkOptions {
    /** Upper bound on chunk size in tokens (estimated at roughly 4 characters per token). */
    maxTokens?: number;
    /** Number of tokens shared between one chunk and the next. */
    overlap?: number;
    /** How the content is divided: by markdown section, by paragraph, or by fixed-size windows. */
    strategy?: 'section' | 'paragraph' | 'fixed';
}
|
|
13
|
+
/** One piece of the original content together with its descriptive metadata. */
export interface ContentChunk {
    /** Zero-based position of this chunk in the result. */
    index: number;
    /** Text carried by the chunk. */
    text: string;
    /** Estimated number of tokens, assuming about 4 characters per token. */
    tokenCount: number;
    /** Number of whitespace-separated words in the text. */
    wordCount: number;
    /** Heading of the section the chunk belongs to, or `null` when there is none. */
    section: string | null;
    /** Heading level of that section (1 for h1, 2 for h2, ...), or `null` when there is none. */
    sectionDepth: number | null;
    /** Character offset in the original content where the chunk starts. */
    startOffset: number;
    /** Character offset in the original content where the chunk ends. */
    endOffset: number;
}
|
|
31
|
+
/** Everything `chunkContent` returns: the chunks plus a summary of the run. */
export interface ChunkResult {
    /** The chunks, in document order. */
    chunks: ContentChunk[];
    /** Number of entries in `chunks`. */
    totalChunks: number;
    /** Length of the original content, in characters. */
    originalLength: number;
    /** Name of the chunking strategy that was requested. */
    strategy: string;
    /** Effective options after defaults were filled in. */
    options: Required<ChunkOptions>;
}
|
|
43
|
+
/**
 * Break content into chunks suitable for retrieval-augmented generation,
 * each annotated with size, section and offset metadata.
 *
 * @param content - Text (typically markdown) to split
 * @param options - Optional chunk size, overlap and strategy settings
 * @returns The chunks together with a summary of the options that were used
 */
export declare function chunkContent(content: string, options?: ChunkOptions): ChunkResult;
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content chunker for RAG pipelines.
|
|
3
|
+
* Splits markdown content into overlapping chunks with rich metadata.
|
|
4
|
+
*/
|
|
5
|
+
/** Fallback chunk size, in estimated tokens. */
const DEFAULT_MAX_TOKENS = 512;
/** Fallback overlap between consecutive chunks, in estimated tokens. */
const DEFAULT_OVERLAP = 50;
/** Characters assumed per token when converting token counts to character counts (rough approximation). */
const CHARS_PER_TOKEN = 4;
|
|
8
|
+
/**
 * Split content into RAG-ready chunks with metadata.
 *
 * Missing options fall back to `DEFAULT_MAX_TOKENS`, `DEFAULT_OVERLAP` and the
 * `'section'` strategy. An explicit `overlap: 0` is honoured (it used to be
 * replaced by the default because `0` is falsy). A negative or non-finite
 * overlap is treated as "not provided".
 *
 * @param content - Text (typically markdown) to split
 * @param options - Optional `maxTokens`, `overlap` and `strategy` overrides
 * @returns The chunks, their count, the original length, the requested
 *          strategy name and the effective options
 */
export function chunkContent(content, options = {}) {
    // A chunk size of 0 cannot make progress, so any falsy value falls back to the default.
    const maxTokens = options.maxTokens || DEFAULT_MAX_TOKENS;
    // 0 is a legitimate request for "no overlap"; only missing/invalid values get the default.
    const overlap = Number.isFinite(options.overlap) && options.overlap >= 0
        ? options.overlap
        : DEFAULT_OVERLAP;
    const strategy = options.strategy || 'section';
    let chunks;
    switch (strategy) {
        case 'paragraph':
            chunks = chunkByParagraph(content, maxTokens, overlap);
            break;
        case 'fixed':
            chunks = chunkByFixed(content, maxTokens, overlap);
            break;
        case 'section':
        default:
            // Unknown strategy names are chunked by section as well.
            chunks = chunkBySection(content, maxTokens, overlap);
    }
    return {
        chunks,
        totalChunks: chunks.length,
        originalLength: content.length,
        strategy,
        options: { maxTokens, overlap, strategy },
    };
}
|
|
38
|
+
/**
 * Section-based chunking (recommended for RAG).
 *
 * The content is split on markdown headings. A section whose body fits in
 * `maxTokens` becomes one chunk; a larger body is split on blank lines into
 * paragraphs that are packed into chunks, each new chunk starting with the
 * last `overlap` tokens of the previous one. Every chunk's text is prefixed
 * with its section heading. Sections with an empty body are skipped, and a
 * single paragraph longer than the limit is emitted as one oversized chunk.
 *
 * Offsets refer to the original content: the first chunk of a section starts
 * at the heading line, later chunks start at their first new paragraph, and
 * `endOffset` is the end of the last paragraph contained in the chunk.
 *
 * @param content - Markdown text to split
 * @param maxTokens - Maximum body size per chunk, in estimated tokens
 * @param overlap - Overlap carried into the next chunk, in estimated tokens
 * @returns Chunks in document order
 */
function chunkBySection(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    const chunks = [];
    // Single place where a chunk record is built.
    const pushChunk = (heading, depth, rawText, startOffset, endOffset) => {
        const trimmed = rawText.trim();
        const text = heading ? `${heading}\n\n${trimmed}` : trimmed;
        chunks.push({
            index: chunks.length,
            text,
            tokenCount: Math.ceil(text.length / CHARS_PER_TOKEN),
            wordCount: text.split(/\s+/).filter(Boolean).length,
            section: heading ? heading.replace(/^#+\s*/, '') : null,
            sectionDepth: depth,
            startOffset,
            endOffset,
        });
    };
    for (const { heading, depth, body, startOffset } of splitByHeadings(content)) {
        if (!body.trim())
            continue;
        // `startOffset` points at the heading line; the body begins on the line after it.
        const bodyStart = heading ? startOffset + heading.length + 1 : startOffset;
        if (body.length <= maxChars) {
            pushChunk(heading, depth, body, startOffset, bodyStart + body.length);
            continue;
        }
        // Large section: pack paragraphs into chunks, tracking real positions.
        let cursor = 0; // search position inside `body`, only moves forward
        let currentText = '';
        let currentStart = startOffset;
        let currentEnd = bodyStart;
        for (const para of body.split(/\n\n+/)) {
            // Searching from the cursor keeps repeated paragraphs from resolving to an earlier copy.
            const paraIndex = body.indexOf(para, cursor);
            cursor = paraIndex + para.length;
            if (!para.trim())
                continue;
            const candidate = currentText ? `${currentText}\n\n${para}` : para;
            if (candidate.length > maxChars && currentText) {
                pushChunk(heading, depth, currentText, currentStart, currentEnd);
                // Start the next chunk with the tail of the previous one as overlap.
                currentText = overlapChars > 0 && currentText.length > overlapChars
                    ? currentText.slice(-overlapChars) + '\n\n' + para
                    : para;
                currentStart = bodyStart + paraIndex;
            }
            else {
                currentText = candidate;
            }
            currentEnd = bodyStart + cursor;
        }
        if (currentText.trim()) {
            pushChunk(heading, depth, currentText, currentStart, currentEnd);
        }
    }
    return chunks;
}
|
|
119
|
+
/**
 * Paragraph-based chunking.
 *
 * Paragraphs (separated by blank lines) are packed together until adding the
 * next one would exceed `maxTokens`; the next chunk then starts with the last
 * `overlap` tokens of the previous chunk followed by that paragraph. A single
 * paragraph longer than the limit is emitted as one oversized chunk.
 *
 * `section`/`sectionDepth` hold the most recent heading paragraph seen up to
 * and including the chunk's own paragraphs. A heading that opens the next
 * chunk no longer relabels the chunk emitted before it.
 *
 * `startOffset` is the position in `content` of the chunk's first new
 * paragraph (overlap text excluded) and `endOffset` is the end of its last
 * paragraph.
 *
 * @param content - Text to split
 * @param maxTokens - Maximum chunk size, in estimated tokens
 * @param overlap - Overlap carried into the next chunk, in estimated tokens
 * @returns Chunks in document order
 */
function chunkByParagraph(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    const chunks = [];
    let currentText = '';
    let currentStart = 0;
    let currentEnd = 0;
    // Heading in effect for the chunk currently being assembled.
    let currentHeading = null;
    let currentDepth = null;
    // Search position inside `content`; only moves forward so repeated paragraphs resolve correctly.
    let cursor = 0;
    const emit = () => {
        const text = currentText.trim();
        chunks.push({
            index: chunks.length,
            text,
            tokenCount: Math.ceil(text.length / CHARS_PER_TOKEN),
            wordCount: text.split(/\s+/).filter(Boolean).length,
            section: currentHeading,
            sectionDepth: currentDepth,
            startOffset: currentStart,
            endOffset: currentEnd,
        });
    };
    for (const para of content.split(/\n\n+/)) {
        const paraStart = content.indexOf(para, cursor);
        cursor = paraStart + para.length;
        if (!para.trim())
            continue;
        const headingMatch = para.match(/^(#{1,6})\s+(.+)/);
        if (!currentText) {
            currentStart = paraStart;
        }
        const candidate = currentText ? `${currentText}\n\n${para}` : para;
        if (candidate.length > maxChars && currentText) {
            // Emit before touching the heading state: `para` belongs to the next chunk.
            emit();
            currentText = overlapChars > 0 && currentText.length > overlapChars
                ? currentText.slice(-overlapChars) + '\n\n' + para
                : para;
            currentStart = paraStart;
        }
        else {
            currentText = candidate;
        }
        currentEnd = cursor;
        if (headingMatch) {
            currentHeading = headingMatch[2];
            currentDepth = headingMatch[1].length;
        }
    }
    if (currentText.trim()) {
        emit();
    }
    return chunks;
}
|
|
179
|
+
/**
 * Fixed-size chunking with overlap.
 *
 * Slides a window of `maxTokens` (converted to characters) over the content.
 * The window normally advances by the window size minus the overlap, but by
 * at least 100 characters so a large overlap cannot produce a flood of
 * near-identical chunks. The advance never exceeds the window size, so
 * windows shorter than 100 characters no longer skip content between chunks;
 * such windows are laid end to end without overlap.
 *
 * Each chunk's text is trimmed, and whitespace-only windows are dropped.
 * `section`/`sectionDepth` come from the first markdown heading line found
 * inside the chunk, if any.
 *
 * @param content - Text to split
 * @param maxTokens - Window size, in estimated tokens
 * @param overlap - Overlap between consecutive windows, in estimated tokens
 * @returns Chunks in document order
 */
function chunkByFixed(content, maxTokens, overlap) {
    const maxChars = maxTokens * CHARS_PER_TOKEN;
    const overlapChars = overlap * CHARS_PER_TOKEN;
    // Advance at least 100 chars, never more than one window, and always at least 1.
    const step = Math.max(1, Math.min(maxChars, Math.max(maxChars - overlapChars, 100)));
    const chunks = [];
    for (let i = 0; i < content.length; i += step) {
        const text = content.slice(i, i + maxChars).trim();
        if (!text)
            continue;
        // Try to find a section heading within this chunk
        const headingMatch = text.match(/^(#{1,6})\s+(.+)/m);
        chunks.push({
            index: chunks.length,
            text,
            tokenCount: Math.ceil(text.length / CHARS_PER_TOKEN),
            wordCount: text.split(/\s+/).filter(Boolean).length,
            section: headingMatch ? headingMatch[2] : null,
            sectionDepth: headingMatch ? headingMatch[1].length : null,
            startOffset: i,
            endOffset: Math.min(i + maxChars, content.length),
        });
    }
    return chunks;
}
|
|
208
|
+
/**
 * Split content into sections based on markdown headings.
 *
 * Each ATX heading line (`#` to `######` followed by whitespace and text)
 * opens a new section. Text before the first heading forms a section with
 * `heading` and `depth` set to `null`. Lines inside fenced code blocks
 * (``` or ~~~ fences) are never treated as headings, so `# comment` lines in
 * code samples stay in the body of the section that contains them. An
 * unclosed fence extends to the end of the content.
 *
 * @param content - Markdown text
 * @returns Sections in document order. `heading` is the full heading line
 *          (or `null`), `depth` the number of `#` characters (or `null`),
 *          `body` the lines after the heading joined with `\n`, and
 *          `startOffset` the character offset of the heading line (0 for
 *          leading text without a heading).
 */
function splitByHeadings(content) {
    const sections = [];
    let currentHeading = null;
    let currentDepth = null;
    let currentBody = [];
    let currentStart = 0;
    let offset = 0;
    // Marker (e.g. "```" or "~~~~") of the code fence we are inside, or null.
    let openFence = null;
    const flush = () => {
        if (currentBody.length > 0 || currentHeading) {
            sections.push({
                heading: currentHeading,
                depth: currentDepth,
                body: currentBody.join('\n'),
                startOffset: currentStart,
            });
        }
    };
    for (const line of content.split('\n')) {
        let headingMatch = null;
        const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (fenceMatch) {
            const marker = fenceMatch[1];
            const rest = line.slice(fenceMatch[0].length);
            if (openFence === null) {
                // A backtick fence whose info string contains a backtick is inline code, not a fence.
                if (marker[0] !== '`' || !rest.includes('`')) {
                    openFence = marker;
                }
            }
            else if (marker[0] === openFence[0] && marker.length >= openFence.length && rest.trim() === '') {
                // Closing fence: same character, at least as long, nothing else on the line.
                openFence = null;
            }
        }
        else if (openFence === null) {
            headingMatch = line.match(/^(#{1,6})\s+(.+)/);
        }
        if (headingMatch) {
            // Save previous section
            flush();
            currentHeading = line;
            currentDepth = headingMatch[1].length;
            currentBody = [];
            currentStart = offset;
        }
        else {
            currentBody.push(line);
        }
        offset += line.length + 1; // +1 for newline
    }
    // Don't forget last section
    flush();
    return sections;
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart content chunking for LLM processing.
|
|
3
|
+
*
|
|
4
|
+
* Splits content into manageable pieces with configurable overlap and strategy.
|
|
5
|
+
* Zero external dependencies; target <1ms for typical page content.
|
|
6
|
+
*/
|
|
7
|
+
/** Optional settings accepted by `chunkContent`; every field may be omitted. */
export interface ChunkOptions {
    /** Desired chunk size in tokens. Default: 4000 */
    chunkSize?: number;
    /** Number of tokens shared between one chunk and the next. Default: 200 */
    overlap?: number;
    /** How the content is divided into chunks. Default: 'semantic' */
    strategy?: 'fixed' | 'semantic' | 'paragraph';
}
|
|
15
|
+
/** One piece of the original content produced by `chunkContent`. */
export interface Chunk {
    /** Zero-based position of this chunk in the result. */
    index: number;
    /** Text carried by the chunk. */
    content: string;
    /** Estimated number of tokens in `content`. */
    tokens: number;
    /** Character offset in the original content where the chunk starts. */
    startOffset: number;
    /** `true` only for the final chunk of the result. */
    isLast: boolean;
}
|
|
27
|
+
/** Everything `chunkContent` returns: the chunks plus token totals. */
export interface ChunkResult {
    /** The chunks, in document order. */
    chunks: Chunk[];
    /** Number of entries in `chunks`. */
    totalChunks: number;
    /** Sum of the token estimates of all chunks. */
    totalTokens: number;
    /** Token estimate of the original, unsplit content. */
    originalTokens: number;
}
|
|
37
|
+
/**
 * Approximate the number of tokens in a piece of text with the
 * characters-divided-by-four heuristic (documented as accurate within ±10%).
 *
 * @param text - Text to measure
 * @returns Estimated token count
 */
export declare function estimateTokens(text: string): number;
|
|
39
|
+
/**
 * Break content into pieces sized for LLM processing.
 *
 * @param content - Text to split
 * @param options - Optional chunk size, overlap and strategy settings
 * @returns The chunks together with chunk and token totals
 */
export declare function chunkContent(content: string, options?: ChunkOptions): ChunkResult;
|