@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-process BM25 quickAnswer passages to extract specific values.
|
|
3
|
+
*
|
|
4
|
+
* BM25 finds relevant passages but can't extract values. This module
|
|
5
|
+
* applies field-type-aware regex extraction to pull the actual value
|
|
6
|
+
* from the passage.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Per-field extraction rules, keyed by lower-cased field name.
 *
 * Each entry has:
 *   - patterns:  regexes tried in order; the first one that matches wins. The
 *                consumer uses capture group 1 when present, else the full match.
 *   - trimMatch: (optional) post-processing applied to the matched text.
 *   - fallback:  called with the whole passage when no pattern matches.
 *
 * The helpers live inside the initializer so they add no module-level names.
 */
const FIELD_EXTRACTORS = (() => {
    // One capitalized word: an uppercase ASCII letter followed by at least one ASCII letter.
    const CAP_WORD = '[A-Z][a-zA-Z]+';
    // Regex source for one capitalized word followed by minExtra..maxExtra more.
    const properName = (minExtra, maxExtra) => `${CAP_WORD}(?:\\s+${CAP_WORD}){${minExtra},${maxExtra}}`;
    // Regex source for an alternation of keywords that matches any letter case
    // WITHOUT the `i` flag. The `i` flag would also make [A-Z] in the name part
    // match lowercase letters, turning ordinary words into "names".
    const anyCase = (keywords) => keywords
        .map((kw) => kw.replace(/[a-z]/g, (ch) => `[${ch}${ch.toUpperCase()}]`))
        .join('|');
    // Keep only the leading run of capitalized words; if there is none, return the input unchanged.
    const leadingCapitalizedWords = (s) => {
        const words = s.split(/\s+/);
        const firstOther = words.findIndex((w) => !/^[A-Z]/.test(w));
        const kept = firstOther === -1 ? words : words.slice(0, firstOther);
        return kept.join(' ') || s;
    };
    // Fallback factory: text before the first '.' or newline, trimmed and capped at `limit` chars.
    const firstClause = (limit) => (p) => p.split(/[.\n]/)[0].trim().slice(0, limit);
    // First line (heading markers stripped) longer than `minLength` that matches
    // none of the `skip` regexes; otherwise the first raw line. Capped at `maxLength`.
    const firstContentLine = (p, { skip, minLength, maxLength }) => {
        for (const line of p.split('\n')) {
            const clean = line.replace(/^#+\s*/, '').trim();
            if (clean.length > minLength && !skip.some((re) => re.test(clean))) {
                return clean.slice(0, maxLength);
            }
        }
        return p.split('\n')[0].trim().slice(0, maxLength);
    };
    const ISO_DATE_PREFIX = /^\d{4}-\d{2}-\d{2}/;
    const READ_TIME_PREFIX = /^\d+\s*min\s*read/i;
    // URLs must not end in sentence punctuation ("see https://x.com/page." -> no trailing dot).
    const URL_PATTERN = /https?:\/\/[^\s"'<>]*[^\s"'<>.,;:!?]/;

    return {
        // Price: find currency patterns. Amounts must start and end with a digit so a
        // bare or trailing comma ("EUR, USD", "$5, and") is never taken as part of a number.
        price: {
            patterns: [
                /\$\d(?:[\d,]*\d)?(?:\.\d{2})?/, // $999.99 or $1,299
                /USD\s*\d(?:[\d,]*\d)?(?:\.\d{2})?/, // USD 999.99
                /€\d(?:[\d,]*\d)?(?:\.\d{2})?/, // €999.99
                /£\d(?:[\d,]*\d)?(?:\.\d{2})?/, // £999.99
                /¥\d(?:[\d,]*\d)?/, // ¥9999
                /\d(?:[\d,]*\d)?(?:\.\d{2})?\s*(?:USD|EUR|GBP|JPY)/, // 999.99 USD
                /(?:price|costs?)\s*(?:is|:|\s)\s*\$?\d(?:[\d,]*\d)?(?:\.\d{2})?/i, // "price is $999"
                /(?:starting\s+(?:at|from)|from)\s+\$?\d(?:[\d,]*\d)?(?:\.\d{2})?/i, // "starting at $99"
            ],
            fallback: firstClause(60),
        },
        // Date: find date patterns
        date: {
            patterns: [
                /\d{4}-\d{2}-\d{2}/, // 2023-11-21
                /(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}/i, // November 21, 2023
                /\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}/i, // 21 November 2023
                /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}/i, // Nov 21, 2023
                /\d{1,2}\/\d{1,2}\/\d{2,4}/, // 11/21/2023
                /\d{1,2}\.\d{1,2}\.\d{2,4}/, // 21.11.2023
            ],
            fallback: firstClause(40),
        },
        // Author: keyword (any case, on a word boundary) followed by capitalized words,
        // or capitalized words followed by a reporting verb.
        author: {
            patterns: [
                new RegExp(`\\b(?:${anyCase(['by', 'author', 'written by', 'posted by'])})\\s+(${properName(0, 3)})`), // "by John Smith"
                new RegExp(`\\b(${properName(1, 3)})\\s+(?:${anyCase(['wrote', 'writes', 'reports', 'published'])})\\b`), // "John Smith wrote"
            ],
            // Defensive: the patterns already capture capitalized words only.
            trimMatch: leadingCapitalizedWords,
            fallback: (p) => {
                // Try to find a capitalized name
                const nameMatch = p.match(/([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+){1,2})/);
                return nameMatch?.[1] || p.split(/[.\n]/)[0].trim().slice(0, 50);
            },
        },
        // Title: extract from headings or first meaningful text
        title: {
            patterns: [
                /^#\s+(.+)$/m, // # Heading
                /^##\s+(.+)$/m, // ## Heading
            ],
            // First line that is not a date, read-time marker, or byline-style metadata.
            fallback: (p) => firstContentLine(p, {
                skip: [ISO_DATE_PREFIX, READ_TIME_PREFIX, /^(by|author|posted|published|updated)/i],
                minLength: 10,
                maxLength: 120,
            }),
        },
        // Name (product, event, recipe): similar to title
        name: {
            patterns: [
                /^#\s+(.+)$/m,
                /^##\s+(.+)$/m,
            ],
            fallback: (p) => firstContentLine(p, {
                skip: [ISO_DATE_PREFIX, READ_TIME_PREFIX],
                minLength: 5,
                maxLength: 100,
            }),
        },
        // Brand: keyword on a word boundary, then a colon or whitespace, then capitalized
        // words. The boundary stops "by" inside "Baby" / "nearby" from matching.
        brand: {
            patterns: [
                new RegExp(`\\b(?:${anyCase(['brand', 'manufacturer', 'made by', 'by'])})(?:\\s*:\\s*|\\s+)(${properName(0, 2)})`),
            ],
            // Defensive: the pattern already captures capitalized words only.
            trimMatch: leadingCapitalizedWords,
            fallback: (p) => {
                // Find the first capitalized word that looks like a brand
                const brandMatch = p.match(/([A-Z][a-zA-Z]{2,})/);
                return brandMatch?.[1] || p.split(/[.\n]/)[0].trim().slice(0, 40);
            },
        },
        // Rating: extract numeric ratings
        rating: {
            patterns: [
                /(\d+(?:\.\d+)?)\s*(?:\/\s*\d+|out of \d+|stars?)/i, // 4.5/5, 4.5 out of 5, 4.5 stars
                /(?:rating|rated|score)\s*:?\s*(\d+(?:\.\d+)?)/i, // rating: 4.5
                /(\d+(?:\.\d+)?)\s*%/, // 95%
            ],
            fallback: firstClause(50),
        },
        // Email
        email: {
            patterns: [/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/],
            fallback: firstClause(80),
        },
        // Phone
        phone: {
            patterns: [
                /(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/, // +1 (555) 123-4567
                // International: 7-15 digits/spaces/hyphens that start AND end with a digit,
                // so a run of dashes or spaces alone is not a phone number.
                /(?:\+\d{1,3}[-.\s]?)?\d[\d\s-]{5,13}\d/,
            ],
            fallback: firstClause(40),
        },
        // URL / image / website
        url: {
            patterns: [URL_PATTERN],
            fallback: firstClause(120),
        },
        image: {
            patterns: [
                // Prefer URLs with an image extension; an optional tail (query string etc.)
                // is accepted as long as it does not end in sentence punctuation.
                /https?:\/\/[^\s"'<>]+\.(?:jpg|jpeg|png|gif|webp|svg|avif)(?:[^\s"'<>]*[^\s"'<>.,;:!?])?/i,
                URL_PATTERN,
            ],
            fallback: firstClause(120),
        },
        website: {
            patterns: [URL_PATTERN],
            fallback: firstClause(120),
        },
    };
})();
|
|
165
|
+
// Extractor used for field names without a dedicated entry: no regex
// patterns, only a "first meaningful sentence" fallback.
const DEFAULT_EXTRACTOR = {
    patterns: [],
    /**
     * Return the first sentence whose trimmed length exceeds 10 characters,
     * capped at 150 characters. When no sentence qualifies, return the
     * trimmed passage itself, also capped at 150 characters.
     */
    fallback(passage) {
        // Sentences are delimited by whitespace that follows '.', '!' or '?'.
        const firstSentence = passage
            .split(/(?<=[.!?])\s+/)
            .find((candidate) => candidate.trim().length > 10);
        if (firstSentence !== undefined) {
            return firstSentence.trim().slice(0, 150);
        }
        return passage.trim().slice(0, 150);
    },
};
|
|
174
|
+
/**
 * Post-process a BM25 passage to extract the actual value for a given field name.
 *
 * The field name is lower-cased and trimmed, then used to pick an extractor
 * from FIELD_EXTRACTORS (DEFAULT_EXTRACTOR when there is no entry). The
 * extractor's patterns are tried in order; the first match wins.
 *
 * @param {string} passage - Text passage to extract from.
 * @param {string} fieldName - Schema field name (e.g. "email", "price").
 * @returns {string} The extracted value, or '' when the passage is empty,
 *   whitespace-only, or not a string.
 */
export function extractValueFromPassage(passage, fieldName) {
    if (typeof passage !== 'string' || !passage.trim()) {
        return '';
    }
    const normalizedField = String(fieldName ?? '').toLowerCase().trim();
    // Own-property lookup only: a field called "constructor", "toString" or
    // "__proto__" would otherwise resolve to an inherited Object.prototype
    // member that has no `patterns`, and iterating it would throw.
    const extractor = Object.prototype.hasOwnProperty.call(FIELD_EXTRACTORS, normalizedField)
        ? FIELD_EXTRACTORS[normalizedField]
        : DEFAULT_EXTRACTOR;
    // Try each pattern
    for (const pattern of extractor.patterns) {
        const match = passage.match(pattern);
        if (match) {
            // If there's a capture group, use it; otherwise use the full match
            const raw = (match[1] || match[0]).trim();
            return extractor.trimMatch ? extractor.trimMatch(raw) : raw;
        }
    }
    // No pattern matched — use fallback
    if (extractor.fallback) {
        return extractor.fallback(passage);
    }
    // Last resort: first sentence/line, capped at 100 characters
    return passage.split(/[.\n]/)[0].trim().slice(0, 100);
}
|
|
198
|
+
/**
 * Smart schema extraction that uses structural signals before falling back to BM25.
 *
 * Field names are compared lower-cased and trimmed:
 *   - title / name: page title with its site-name suffix stripped, else the
 *     first `#` heading, else the first `##` heading
 *   - author: byline patterns in the first 1500 chars, else `metadata.author`
 *   - date: date patterns in the first 1500 chars, else `metadata.date`,
 *     else the date part of `metadata.publishedTime`
 *   - price / email / phone / url / website / image / rating: regex scan of
 *     the full content
 *   - brand: "by X"-style phrases in the first 1500 chars, else
 *     `metadata.brand`, else the first word of the page title
 *   - source: page-title suffix, else a name derived from the `pageUrl` host
 *   - summary / description: first substantive lines of the content
 *   - body: first 2000 chars of the content
 *   - tags: headings after the first one (needs at least two)
 *   - any other field, or any field still empty: `quickAnswerFn` followed by
 *     `extractValueFromPassage`
 *
 * @param {string} content - Page content; markdown-style `#` headings are recognised.
 * @param {Object<string, string>} templateFields - Field name -> question passed to `quickAnswerFn`.
 * @param {Function} quickAnswerFn - Called with `{ content, question, url }`; its `answer` property is read.
 * @param {{pageTitle?: string, pageUrl?: string, metadata?: Object}} [options]
 * @returns {Object<string, string>} One entry per template field; '' when nothing was found.
 */
export function smartExtractSchemaFields(content, templateFields, quickAnswerFn, options) {
    const { pageTitle, pageUrl, metadata } = options || {};
    // Tolerate missing or non-string content instead of throwing on .slice()/.match().
    const text = typeof content === 'string' ? content : String(content ?? '');
    const extracted = {};
    const topContent = text.slice(0, 1500); // First 1500 chars for structural extraction

    // " - Site Name" / " | Site Name" suffix of a page title. Dashes must be
    // surrounded by whitespace so hyphenated words ("Self-Driving") are not
    // mistaken for a separator; a pipe counts with or without spaces.
    const titleSuffix = /(?:\s+[-–—]\s+|\s*\|\s*)(.+)$/;

    // Keep the leading run of capitalised words, stopping at the first word
    // that is not capitalised or that matches `excluded`.
    const takeCapitalizedWords = (phrase, excluded) => {
        const kept = [];
        for (const word of phrase.split(/\s+/)) {
            if (!/^[A-Z]/.test(word) || (excluded && excluded.test(word))) {
                break;
            }
            kept.push(word);
        }
        return kept.join(' ');
    };

    for (const [field, question] of Object.entries(templateFields || {})) {
        const normalizedField = field.toLowerCase().trim();
        let value = '';
        // === STRUCTURAL EXTRACTION (try first) ===
        if (normalizedField === 'title' || normalizedField === 'name') {
            // 1. Use page title if available
            if (pageTitle && pageTitle.length > 3) {
                value = pageTitle.replace(titleSuffix, '').trim(); // Strip " - Site Name" suffix
            }
            // 2. Try first heading in content
            if (!value) {
                const headingMatch = text.match(/^#\s+(.+)$/m);
                if (headingMatch) {
                    value = headingMatch[1].trim();
                }
            }
            // 3. Try ## heading
            if (!value) {
                const h2Match = text.match(/^##\s+(.+)$/m);
                if (h2Match) {
                    value = h2Match[1].trim();
                }
            }
        }
        else if (normalizedField === 'author') {
            // Scan top of page for author patterns. Name words are joined by
            // horizontal whitespace only ([^\S\r\n]) so a byline cannot absorb
            // the start of the next line (e.g. "By Jane Doe\nMarch 3, 2024").
            const authorPatterns = [
                /(?:^|\n)\s*(?:by|author|written by|posted by)[:\s]+([A-Z][a-zA-Z]+(?:[^\S\r\n]+[A-Z][a-zA-Z]+){0,3})/im,
                /(?:^|\n)\s*([A-Z][a-zA-Z]+(?:[^\S\r\n]+[A-Z][a-zA-Z]+){1,2})\s*[|·•]\s*(?:\d|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/im,
            ];
            for (const pat of authorPatterns) {
                const match = topContent.match(pat);
                if (match?.[1]) {
                    // The /i flag lets [A-Z] match lower-case letters too, so
                    // trim the capture back to genuinely capitalised words.
                    const authorName = takeCapitalizedWords(match[1]);
                    if (authorName) {
                        value = authorName;
                        break;
                    }
                }
            }
            // Also check metadata
            if (!value && metadata?.author) {
                value = String(metadata.author);
            }
        }
        else if (normalizedField === 'date') {
            // Scan top of page for date patterns
            const datePatterns = [
                /\d{4}-\d{2}-\d{2}/,
                /(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}/i,
                /\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}/i,
                /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}/i,
            ];
            for (const pat of datePatterns) {
                const match = topContent.match(pat);
                if (match) {
                    value = match[0].trim();
                    break;
                }
            }
            // Also check metadata
            if (!value && metadata?.date) {
                value = String(metadata.date);
            }
            if (!value && metadata?.publishedTime) {
                value = String(metadata.publishedTime).split('T')[0];
            }
        }
        else if (normalizedField === 'price') {
            // Scan full content for currency patterns.
            // NOTE(review): the labelled "price: $X" pattern can never be
            // reached, because any text it matches is already matched by the
            // bare "$" pattern tried first. Order kept as-is to preserve results.
            const pricePatterns = [
                /\$[\d,]+(?:\.\d{2})?/,
                /€[\d,]+(?:\.\d{2})?/,
                /£[\d,]+(?:\.\d{2})?/,
                /(?:price|cost|starting at|from)\s*:?\s*\$[\d,]+(?:\.\d{2})?/i,
            ];
            for (const pat of pricePatterns) {
                const match = text.match(pat);
                if (match) {
                    // Extract just the currency amount from the match
                    const currMatch = match[0].match(/[$€£¥][\d,]+(?:\.\d{2})?/);
                    value = currMatch ? currMatch[0] : match[0];
                    break;
                }
            }
        }
        else if (normalizedField === 'email') {
            const emailMatch = text.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/);
            if (emailMatch) {
                value = emailMatch[0];
            }
        }
        else if (normalizedField === 'phone') {
            const phoneMatch = text.match(/(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/);
            if (phoneMatch) {
                value = phoneMatch[0];
            }
        }
        else if (normalizedField === 'url' || normalizedField === 'website' || normalizedField === 'image') {
            if (normalizedField === 'image') {
                // Prefer a URL with an image file extension
                const imgMatch = text.match(/https?:\/\/[^\s"'<>]+\.(?:jpg|jpeg|png|gif|webp|svg|avif)[^\s"'<>]*/i);
                if (imgMatch) {
                    value = imgMatch[0];
                }
            }
            if (!value) {
                const urlMatch = text.match(/https?:\/\/[^\s"'<>]+/);
                if (urlMatch) {
                    value = urlMatch[0];
                }
            }
        }
        else if (normalizedField === 'rating') {
            const ratingPatterns = [
                /(\d+(?:\.\d+)?)\s*(?:\/\s*\d+|out of \d+|stars?)/i,
                /(?:rating|rated|score)\s*:?\s*(\d+(?:\.\d+)?)/i,
            ];
            for (const pat of ratingPatterns) {
                const match = text.match(pat);
                if (match) {
                    value = match[1] || match[0];
                    break;
                }
            }
        }
        else if (normalizedField === 'brand') {
            // 1. Look for "by Brand" or "developed by Brand" etc. in content (highest priority).
            //    \b keeps "by"/"from" from matching inside words such as "Nearby";
            //    the optional second word must be on the same line.
            const brandByPatterns = [
                /\b(?:by|from|developed by|manufactured by|made by|produced by|created by)\s+([A-Z][a-zA-Z]+(?:[^\S\r\n]+[A-Z][a-zA-Z]*)?)/,
            ];
            for (const pat of brandByPatterns) {
                const match = topContent.match(pat);
                if (match?.[1]) {
                    // Trim to just the brand name (first 1-2 capitalized words)
                    const brandName = takeCapitalizedWords(match[1], /^(The|This|That|And|For|With|From)$/);
                    if (brandName) {
                        value = brandName;
                        break;
                    }
                }
            }
            // 2. Check metadata
            if (!value && metadata?.brand) {
                value = String(metadata.brand);
            }
            // 3. Fallback: first word of page title (lower priority than content patterns)
            if (!value && pageTitle) {
                const brandMatch = pageTitle.match(/^([A-Z][a-zA-Z]+)/);
                if (brandMatch) {
                    value = brandMatch[1];
                }
            }
            // BM25 fallback will handle the rest
        }
        else if (normalizedField === 'source') {
            // 1. Try title suffix first "Article Title - Site Name" or "Article Title | Site Name"
            //    (more human-readable, more specific than domain)
            if (pageTitle) {
                const suffixMatch = pageTitle.match(titleSuffix);
                if (suffixMatch?.[1] && suffixMatch[1].length < 40) {
                    value = suffixMatch[1].trim();
                }
            }
            // 2. Extract from URL domain
            if (!value && pageUrl) {
                try {
                    const parsed = new URL(pageUrl);
                    const host = parsed.hostname.replace(/^www\./, '');
                    const parts = host.split('.');
                    const siteName = parts.length >= 2 ? parts[parts.length - 2] : parts[0];
                    // Handle subdomains like blog.cloudflare.com
                    const subdomain = parts[0];
                    if (subdomain && !['www', 'en', 'm', 'mobile', 'api', 'app'].includes(subdomain) && subdomain !== siteName) {
                        value = `${subdomain.charAt(0).toUpperCase() + subdomain.slice(1)} ${siteName.charAt(0).toUpperCase() + siteName.slice(1)}`;
                    }
                    else {
                        value = siteName.charAt(0).toUpperCase() + siteName.slice(1);
                    }
                }
                catch {
                    // ignore malformed URLs
                }
            }
        }
        else if (normalizedField === 'summary' || normalizedField === 'description') {
            // Find the first substantive paragraph (skip headings, dates, metadata)
            const summaryParts = [];
            let charCount = 0;
            for (const line of text.split('\n')) {
                const trimmed = line.trim();
                if (!trimmed) {
                    continue;
                }
                if (trimmed.startsWith('#')) {
                    continue; // skip headings
                }
                if (/^\d{4}-\d{2}-\d{2}/.test(trimmed)) {
                    continue; // skip dates
                }
                if (/^\d+\s*min\s*read/i.test(trimmed)) {
                    continue; // skip "5 min read"
                }
                if (/^(by|author|posted|published|updated|written)/i.test(trimmed)) {
                    continue; // skip byline-style lines
                }
                if (/^\*[^*]+\*$/.test(trimmed)) {
                    continue; // skip italic-only lines
                }
                if (trimmed.length > 30) { // substantive line
                    summaryParts.push(trimmed);
                    charCount += trimmed.length;
                    if (charCount > 300) {
                        break; // ~2-3 sentences
                    }
                }
            }
            if (summaryParts.length > 0) {
                value = summaryParts.join(' ').slice(0, 400);
            }
        }
        else if (normalizedField === 'body') {
            // Body IS the content — return it directly (truncated for JSON output)
            value = text.slice(0, 2000);
        }
        else if (normalizedField === 'tags') {
            // Extract topic keywords from headings (skip the first one which is the title)
            const headings = text.match(/^#{1,3}\s+(.+)$/gm) || [];
            const topics = [];
            for (const h of headings.slice(1, 6)) { // skip title, take up to 5
                const clean = h.replace(/^#+\s*/, '').replace(/[*\[\](){}]/g, '').trim();
                if (clean.length > 3 && clean.length < 60) {
                    topics.push(clean);
                }
            }
            if (topics.length >= 2) {
                value = topics.join(', ');
            }
            // If fewer than 2 headings, fall back to BM25
        }
        // === BM25 FALLBACK (only for fields without structural signal) ===
        if (!value) {
            try {
                const qa = quickAnswerFn({
                    content: text,
                    question: typeof question === 'string' ? question : field,
                    url: pageUrl || '',
                });
                value = qa.answer ? extractValueFromPassage(qa.answer, field) : '';
            }
            catch {
                // Deliberate best-effort: a failing answer function leaves the field empty.
                value = '';
            }
        }
        extracted[field] = value;
    }
    return extracted;
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-built extraction schema templates for common use cases.
|
|
3
|
+
* Used with quickAnswer BM25 extraction (no LLM needed).
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Shape of one pre-built extraction schema template.
 */
export interface SchemaTemplate {
|
|
6
|
+
/** Display name of the template, e.g. "Product". */
name: string;
|
|
7
|
+
/** Short description of what the template extracts. */
description: string;
|
|
8
|
+
/** Maps each field name to the question or phrase describing the value to extract. */
fields: Record<string, string>;
|
|
9
|
+
}
|
|
10
|
+
/** Built-in schema templates, keyed by lower-case template name (e.g. "product", "article"). */
export declare const SCHEMA_TEMPLATES: Record<string, SchemaTemplate>;
|
|
11
|
+
/**
 * Get a schema template by name, or return null if it's not a known template.
 * If the input looks like JSON (starts with "{" or "["), return null so the
 * caller can parse it as custom JSON.
 *
 * @param nameOrJson - Template name (matched lower-cased and trimmed) or a JSON string.
 * @returns The matching template, or null.
 */
|
|
15
|
+
export declare function getSchemaTemplate(nameOrJson: string): SchemaTemplate | null;
|
|
16
|
+
/**
 * List all available schema template names.
 *
 * @returns The keys of SCHEMA_TEMPLATES.
 */
|
|
19
|
+
export declare function listSchemaTemplates(): string[];
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-built extraction schema templates for common use cases.
|
|
3
|
+
* Used with quickAnswer BM25 extraction (no LLM needed).
|
|
4
|
+
*/
|
|
5
|
+
// Builds one template record; keeps the name/description/fields key order.
const defineTemplate = (name, description, fields) => ({ name, description, fields });

// Built-in templates, keyed by lower-case template name. Each `fields` entry
// maps a field name to the question or phrase describing the value to extract.
export const SCHEMA_TEMPLATES = {
    product: defineTemplate('Product', 'Extract product information from e-commerce pages', {
        name: 'What is the product name?',
        price: 'What is the price in dollars, euros, or other currency?',
        description: 'What are the main features and specifications of this product?',
        brand: 'What brand or company makes this product?',
        rating: 'What is the customer rating or review score?',
        availability: 'Is this product in stock or available for purchase?',
        image: 'What is the URL of the product image?',
        sku: 'What is the SKU, model number, or product identifier?',
    }),
    article: defineTemplate('Article', 'Extract article/blog post information', {
        title: 'What is the title or headline of this article?',
        author: 'Who is the author or writer of this article?',
        date: 'When was this article published?',
        summary: 'What is the main point or summary of this article in one paragraph?',
        body: 'What is the full text of the article body?',
        tags: 'What topics, tags, or categories does this article cover?',
        source: 'What publication, website, or news source published this article?',
    }),
    listing: defineTemplate('Listing', 'Extract listing/directory items', {
        items: 'list of items with name, price, and description',
        totalCount: 'total number of items or results',
        category: 'listing category or type',
        sortOrder: 'how items are sorted',
    }),
    contact: defineTemplate('Contact', 'Extract contact information', {
        name: 'person or company name',
        email: 'email address',
        phone: 'phone number',
        address: 'physical address',
        website: 'website URL',
        company: 'company or organization name',
        social: 'social media links or handles',
    }),
    event: defineTemplate('Event', 'Extract event information', {
        name: 'What is the name of this event?',
        date: 'When does this event take place?',
        time: 'What time does this event start?',
        location: 'Where is this event held?',
        price: 'How much does this event cost?',
        description: 'What is this event about?',
        organizer: 'Who is organizing this event?',
    }),
    recipe: defineTemplate('Recipe', 'Extract recipe information from cooking sites', {
        name: 'What is the name of this recipe?',
        ingredients: 'What ingredients are needed? List all.',
        steps: 'What are the cooking steps or instructions?',
        prepTime: 'How long does preparation take?',
        cookTime: 'How long does cooking take?',
        servings: 'How many servings does this recipe make?',
        calories: 'How many calories per serving?',
        rating: 'What is the recipe rating?',
    }),
    job: defineTemplate('Job', 'Extract job posting information', {
        title: 'What is the job title?',
        company: 'What company is hiring?',
        location: 'Where is the job located?',
        salary: 'What is the salary or compensation range?',
        type: 'Is this full-time, part-time, contract, or remote?',
        requirements: 'What are the key requirements or qualifications?',
        description: 'What is the job description?',
        applyUrl: 'What is the URL or method to apply?',
    }),
    business: defineTemplate('Business', 'Extract business/company information', {
        name: 'What is the business name?',
        address: 'What is the full address?',
        phone: 'What is the phone number?',
        hours: 'What are the business hours?',
        rating: 'What is the business rating?',
        reviewCount: 'How many reviews does this business have?',
        website: 'What is the business website URL?',
        categories: 'What type of business is this?',
    }),
    review: defineTemplate('Review', 'Extract review information', {
        title: 'review title',
        rating: 'rating or score',
        author: 'reviewer name',
        date: 'review date',
        body: 'review text or content',
        pros: 'positive points',
        cons: 'negative points',
        product: 'product or service being reviewed',
    }),
};
|
|
126
|
+
/**
 * Get a schema template by name, or return null if it's not a known template.
 * If the input looks like JSON, return null (let caller parse it as custom JSON).
 *
 * @param {string} nameOrJson - Template name (matched lower-cased and trimmed) or a JSON string.
 * @returns {{name: string, description: string, fields: Object<string, string>}|null}
 *   The matching template, or null for unknown names, JSON-looking input, or non-string input.
 */
export function getSchemaTemplate(nameOrJson) {
    if (typeof nameOrJson !== 'string') {
        return null;
    }
    const trimmed = nameOrJson.trim();
    // If it starts with { or [, it's custom JSON, not a template name
    if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
        return null;
    }
    const key = trimmed.toLowerCase();
    // Own-property check: names such as "constructor" or "__proto__" must not
    // resolve to inherited Object.prototype members.
    if (!Object.prototype.hasOwnProperty.call(SCHEMA_TEMPLATES, key)) {
        return null;
    }
    return SCHEMA_TEMPLATES[key] || null;
}
|
|
138
|
+
/**
 * Return the name of every built-in schema template, in definition order.
 *
 * @returns {string[]} The keys of SCHEMA_TEMPLATES.
 */
export function listSchemaTemplates() {
    return Object.entries(SCHEMA_TEMPLATES).map(([templateName]) => templateName);
}
|