@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,550 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structured JSON Extraction Engine
|
|
3
|
+
*
|
|
4
|
+
* Extracts structured data from markdown/text content using either:
|
|
5
|
+
* 1. LLM (via callLLM from llm-provider.ts) when an LLM config is provided
|
|
6
|
+
* 2. Heuristic regex/BM25-style extraction as a zero-key fallback
|
|
7
|
+
*
|
|
8
|
+
* Firecrawl-compatible: accepts a JSON schema, returns typed structured data.
|
|
9
|
+
*/
|
|
10
|
+
import { callLLM } from './llm-provider.js';
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// System prompt
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
const SYSTEM_PROMPT = 'Extract the following fields from the content. Return valid JSON matching the schema. Only use information present in the content. If a field is not found in the content, set it to null.';
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Schema validation / type coercion
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Coerce a raw extracted value to the schema type declared for a field.
 *
 * @param value - Raw value (e.g. from parsed LLM JSON or a heuristic extractor).
 * @param expectedType - Schema `type` string. 'string', 'boolean', 'number',
 *   'array' and 'object' are coerced; any other type returns the value unchanged.
 * @returns The coerced value, or null when the value is null/undefined or
 *   cannot be represented as the expected type.
 */
function coerceValue(value, expectedType) {
    if (value === null || value === undefined)
        return null;
    switch (expectedType) {
        case 'string': {
            if (typeof value === 'string')
                return value;
            if (typeof value === 'object' && !Array.isArray(value)) {
                // String(obj) would yield the useless "[object Object]";
                // serialise plain objects so the information survives.
                try {
                    return JSON.stringify(value);
                }
                catch {
                    return null;
                }
            }
            return String(value);
        }
        case 'boolean': {
            if (typeof value === 'boolean')
                return value;
            const str = String(value).toLowerCase().trim();
            if (['true', 'yes', '1', 'open', 'enabled'].includes(str))
                return true;
            if (['false', 'no', '0', 'closed', 'disabled'].includes(str))
                return false;
            // Anything else is ambiguous: report "not found" rather than guess.
            return null;
        }
        case 'number': {
            if (typeof value === 'number')
                return isNaN(value) ? null : value;
            // Drop thousands separators ("1,234.5") before parsing.
            const num = parseFloat(String(value).replace(/,/g, ''));
            return isNaN(num) ? null : num;
        }
        case 'array':
            // A scalar is wrapped so callers always receive an array.
            return Array.isArray(value) ? value : [value];
        case 'object':
            // typeof [] === 'object', but an array is not a schema "object".
            return typeof value === 'object' && !Array.isArray(value) ? value : null;
        default:
            return value;
    }
}
|
|
48
|
+
/**
 * Build a result object containing every property declared in the schema,
 * coercing each raw value to its declared type via coerceValue.
 *
 * @param raw - Parsed extraction result. Anything that is not a plain
 *   (non-array) object is treated as "no fields found".
 * @param schema - Object with a `properties` map (field -> `{ type }`) and an
 *   optional `required` array of field names.
 * @returns `{ data, missingRequired }`, where `data` has one key per schema
 *   property (null when absent or uncoercible) and `missingRequired` lists the
 *   required fields whose coerced value is null.
 */
function validateAndCoerce(raw, schema) {
    const data = {};
    const missingRequired = [];
    // Parsed LLM output can legally be null, a string, or an array; none of
    // those carry named fields, so read from an empty record instead of throwing.
    const source = raw !== null && typeof raw === 'object' && !Array.isArray(raw) ? raw : {};
    const properties = schema?.properties ?? {};
    const required = Array.isArray(schema?.required) ? schema.required : [];
    for (const [field, fieldDef] of Object.entries(properties)) {
        const coerced = coerceValue(source[field], fieldDef?.type);
        data[field] = coerced;
        if ((coerced === null || coerced === undefined) && required.includes(field)) {
            missingRequired.push(field);
        }
    }
    return { data, missingRequired };
}
|
|
60
|
+
// ---------------------------------------------------------------------------
|
|
61
|
+
// Parse JSON out of LLM text (handles code fences + raw JSON)
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
/**
 * Parse a JSON value out of free-form LLM output.
 *
 * Candidates are tried in order and the first one that parses wins:
 *   1. the body of the first ``` / ```json code fence,
 *   2. the whole (trimmed) response,
 *   3. the span from the first `{` to the last `}`.
 *
 * @param text - Raw LLM response text.
 * @returns The parsed JSON value.
 * @throws {Error} When no candidate is valid JSON; the message includes the
 *   first 200 characters of the response.
 */
function parseLLMJson(text) {
    const stripped = String(text ?? '').trim();
    const candidates = [];
    // Extract from ```json ... ``` or ``` ... ``` code fences
    const fenceMatch = stripped.match(/```(?:json)?\s*\n?([\s\S]+?)\n?```/);
    if (fenceMatch?.[1])
        candidates.push(fenceMatch[1].trim());
    candidates.push(stripped);
    // Greedy match from the first "{" to the last "}"; `*` so that "{}" qualifies.
    const objMatch = stripped.match(/\{[\s\S]*\}/);
    if (objMatch)
        candidates.push(objMatch[0]);
    for (const candidate of candidates) {
        try {
            return JSON.parse(candidate);
        }
        catch {
            // Not valid JSON — fall through to the next candidate.
        }
    }
    throw new Error(`Could not parse JSON from LLM response: ${stripped.slice(0, 200)}`);
}
|
|
83
|
+
// ---------------------------------------------------------------------------
|
|
84
|
+
// Heuristic extraction helpers (no LLM key needed)
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
/**
|
|
87
|
+
* For string fields: search for field name in content, extract surrounding text.
|
|
88
|
+
*/
|
|
89
|
+
/**
 * Extract the text of the first level-1 ATX heading (`# Title`) in markdown content.
 *
 * Emphasis and code markers (`*`, `_`, backtick) are removed from the heading text.
 *
 * @param content - Markdown text.
 * @returns The cleaned heading text, or null when there is no H1 or the
 *   heading is empty once markup is removed.
 */
function extractPageTitle(content) {
    if (typeof content !== 'string')
        return null;
    // Only spaces/tabs may separate "#" from the text: `\s+` would cross the
    // newline and treat the line after an empty "#" as the title.
    const h1 = content.match(/^#[ \t]+(.+)$/m);
    if (!h1?.[1])
        return null;
    const cleaned = h1[1].replace(/[*_`]/g, '').trim();
    // Return null rather than '' so callers' `??` fallbacks still trigger.
    return cleaned || null;
}
|
|
96
|
+
/**
 * Extract a short description: the first substantial line that follows a heading.
 *
 * Blank lines, heading lines (any line starting with `#`) and byline-style
 * lines wrapped in asterisks (e.g. `*5 min read*`) are skipped. The first
 * remaining line longer than 30 characters that appears after a heading is
 * returned with `*`, `_` and backtick markers removed, capped at 300 characters.
 *
 * @param content - Markdown text.
 * @returns The description text, or null when no qualifying line exists.
 */
function extractDescription(content) {
    if (typeof content !== 'string')
        return null;
    let seenHeading = false;
    for (const rawLine of content.split('\n')) {
        // Trim first: a trailing "\r" (CRLF input) or indentation would
        // otherwise defeat the startsWith/endsWith checks below.
        const line = rawLine.trim();
        if (!line)
            continue;
        if (line.startsWith('#')) {
            seenHeading = true;
            continue;
        }
        if (line.startsWith('*') && line.endsWith('*'))
            continue; // byline
        if (seenHeading && line.length > 30)
            return line.replace(/[*_`]/g, '').trim().slice(0, 300);
    }
    return null;
}
|
|
113
|
+
/**
 * Extract a company/brand name from a page title: the text before the first
 * title separator.
 *
 * Separators are `|`, `·` and `—` anywhere, plus `-` and `–` only when
 * surrounded by whitespace, so hyphenated names such as "Coca-Cola" stay intact.
 *
 * @param title - Page title text.
 * @returns The trimmed text before the first separator, or the trimmed title
 *   capped at 60 characters when there is no separator (or nothing precedes it).
 */
function extractCompanyFromTitle(title) {
    const text = String(title ?? '');
    const sepIndex = text.search(/[|·—]|\s[-–]\s/);
    if (sepIndex > 0) {
        const head = text.slice(0, sepIndex).trim();
        if (head)
            return head;
    }
    return text.trim().slice(0, 60);
}
|
|
120
|
+
/**
 * Field-name-aware string extractor.
 *
 * The lower-cased field name selects a concept-specific strategy (company,
 * title, description, URL, creator, director, version, author, date, email,
 * price, language, stars, license). A strategy that finds nothing falls through
 * to the later strategies and finally to generic "<field name>: value" patterns.
 *
 * @param fieldName Schema field name; underscores are treated as spaces for the
 *                  generic label patterns
 * @param content   Markdown/text content to search
 * @param pageUrl   Optional page URL, returned for url/website/link/homepage/site fields
 * @returns The extracted string, or null when nothing matched
 */
function heuristicExtractString(fieldName, content, pageUrl) {
    const lf = fieldName.toLowerCase();
    // Escape regex metacharacters so field names such as "c++" or "size(mb)" are
    // matched literally by the generic patterns instead of throwing a SyntaxError
    // or silently changing the pattern's meaning.
    const escapeRe = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const humanName = escapeRe(fieldName.replace(/_/g, ' '));
    const jsonKey = escapeRe(fieldName);
    const title = extractPageTitle(content);
    // --- Concept-aware extraction ---
    // Company/brand/organization name
    if (/company|brand|organization|org_name/.test(lf)) {
        if (title)
            return extractCompanyFromTitle(title);
        // Fallback: extract from first heading of any level (up to ###)
        const anyHeading = content.match(/^#{1,3}\s+(.+)$/m);
        if (anyHeading?.[1])
            return anyHeading[1].replace(/[*_`[\]]/g, '').trim().slice(0, 60);
    }
    // Title/name/product → first H1 or any heading, stripped of markdown
    if (/^(title|name|product_name|product|heading)$/.test(lf)) {
        const rawTitle = title ?? content.match(/^#{1,3}\s+(.+)$/m)?.[1];
        if (rawTitle) {
            // Strip markdown images/badges, links ([text](url) → text), leftover markers, etc.
            return rawTitle
                .replace(/!\[[^\]]*\]\([^)]*\)/g, '') // remove images
                .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1') // [text](url) → text
                .replace(/\(https?:\/\/[^)]+\)/g, '') // remove bare URLs in parens
                .replace(/[*_`[\]]/g, '')
                .replace(/&[a-z]+;/g, '') // HTML entities
                // Strip leading emoji (📦🎬🎵🎮 etc.) that domain extractors add as decoration
                .replace(/^[\p{Emoji_Presentation}\p{Extended_Pictographic}\uFE0F]+\s*/u, '')
                .replace(/\s+/g, ' ')
                .trim().slice(0, 150);
        }
    }
    // Description/summary/about → first paragraph
    if (/description|summary|about|overview/.test(lf)) {
        return extractDescription(content) ?? null;
    }
    // URL/website/link → use the URL if we have it
    if (/^(url|website|link|homepage|site)$/.test(lf)) {
        if (pageUrl)
            return pageUrl;
    }
    // Creator / designer / founder / inventor
    if (/creator|designer|founder|inventor|invented_by|created_by/.test(lf)) {
        const m = content.match(/(?:created?|designed?|founded?|invented?)\s+by\s+([A-Z][^\n,·|–—]+?)(?:\s*[,·|–—]|\s+in\s+\d{4}|\.)/i)
            ?? content.match(/(?:creator|designer|founder|inventor)[:\s]+([A-Z][^\n,·|]+?)(?:\s*[,·|–—]|\.)/i);
        if (m?.[1])
            return m[1].replace(/[*_`[\]]/g, '').trim().slice(0, 80);
    }
    // Director (for movies/films)
    if (/director/.test(lf)) {
        const m = content.match(/Director[:\s*]+([^\n|,]+)/i) ?? content.match(/Directed by[:\s]+([^\n|,]+)/i);
        if (m?.[1])
            return m[1].replace(/[*_`]/g, '').trim().slice(0, 100);
    }
    // Version (semver: x.y.z or x.y.z.w)
    if (/^version$/.test(lf)) {
        const m = content.match(/\*\*Version:\*\*\s*([\d]+\.[\d]+[\.\d]*)/i)
            ?? content.match(/version[:\s]+v?([\d]+\.[\d]+[\.\d]*)/i)
            ?? content.match(/v?([\d]+\.[\d]+\.[\d]+)/);
        if (m?.[1])
            return m[1];
    }
    // Author/writer/by
    // NOTE(review): "by" is a plain substring test, so unrelated names that merely
    // contain "by" (e.g. "hobby") also take this branch — confirm whether intended.
    if (/author|writer|by/.test(lf)) {
        const m = content.match(/\*By\s+([^·\n*]+)/i) ?? content.match(/Author[:\s]+([^\n,]+)/i);
        if (m?.[1])
            return m[1].trim().slice(0, 100);
    }
    // Date/published/updated
    if (/date|published|updated|modified/.test(lf)) {
        const m = content.match(/(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b)/i)
            ?? content.match(/(\d{4}-\d{2}-\d{2})/);
        if (m?.[1])
            return m[1];
    }
    // Email
    if (/email|contact/.test(lf)) {
        const m = content.match(/[\w.+-]+@[\w-]+\.[a-z]{2,}/i);
        if (m?.[0])
            return m[0];
    }
    // Price/cost/pricing → extract value near $
    if (/price|cost|pricing|fee/.test(lf)) {
        const m = content.match(/\$\s*[\d,]+(?:\.\d{2})?(?:\s*\/\s*\w+)?/)
            ?? content.match(/(free|no cost|no charge)/i);
        if (m?.[0])
            return m[0].trim();
    }
    // Language (for GitHub repos)
    if (/language|lang|tech/.test(lf)) {
        const m = content.match(/💻\s*(\w[\w#+.-]+)/) ?? content.match(/Language[:\s]+(\w[\w#+.-]+)/i);
        if (m?.[1])
            return m[1];
    }
    // Stars (for GitHub) — returned as a digit string with thousands separators removed
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)\s*stars?/i) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return m[1].replace(/,/g, '');
    }
    // License
    if (/license/.test(lf)) {
        const m = content.match(/📜\s*(\w+)/) ?? content.match(/License[:\s]+(MIT|Apache|GPL|BSD|ISC|AGPL|MPL)[^\s]*/i);
        if (m?.[1])
            return m[1];
    }
    // --- Generic patterns (exact-ish match) ---
    const patterns = [
        // "Field name: value" at the start of a line
        new RegExp(`(?:^|\\n)[ \\t]*${humanName}[:\\s]+([^\\n]{5,200})`, 'i'),
        // JSON-style "field_name": "value"
        new RegExp(`"${jsonKey}"\\s*:\\s*"([^"]{1,300})"`, 'i'),
        // Bold/italic label: **Field name** value
        new RegExp(`\\*{1,2}${humanName}\\*{0,2}[:\\s]+([^\\n]{5,200})`, 'i'),
        // Heading named after the field, followed by its first line
        new RegExp(`#+\\s*${humanName}\\s*\\n+([^\\n]{5,300})`, 'i'),
    ];
    for (const pattern of patterns) {
        const match = content.match(pattern);
        if (match?.[1])
            return match[1].trim().replace(/[|*_`]/g, '').slice(0, 300);
    }
    return null;
}
|
|
240
|
+
/**
 * For boolean fields: look for positive/negative indicators in the content.
 *
 * Known concepts (free tier, open source, API availability, authentication)
 * are decided by searching the ENTIRE lower-cased content. Any other field is
 * decided by the indicator word closest to where the field name appears:
 * the first indicator in the text following the name (up to 200 characters
 * from the start of the name), otherwise the last indicator within the 80
 * characters preceding it.
 *
 * Indicators are matched as whole words, so "no" does not match inside "know",
 * and negated forms ("not supported", "unavailable", "no free plan",
 * "not open source") are not mistaken for their positive counterparts.
 *
 * @param fieldName Schema field name (underscores are also tried as spaces)
 * @param content   Markdown/text content to search
 * @returns true/false when an indicator is found, otherwise null
 */
function heuristicExtractBoolean(fieldName, content) {
    const lf = fieldName.toLowerCase();
    const ctx = content.toLowerCase();
    // Concept-aware boolean extraction — search entire content, not just near field name
    // Free tier / free plan
    if (/free_tier|has_free|is_free/.test(lf)) {
        // The lookbehind keeps "no free tier/plan" from counting as a positive hit.
        if (/(?<!\bno\s)free (?:tier|plan|forever)|\$0|no cost|no charge/.test(ctx))
            return true;
        if (/no free|paid only|subscription required/.test(ctx))
            return false;
    }
    // Open source
    if (/open_source|is_open|oss/.test(lf)) {
        // "not open source" / "non-open-source" must not count as a positive hit.
        if (/(?<!\b(?:not|non)[\s-])open[- ]source|mit license|apache license|gpl|bsd license|📜\s*mit|📜\s*apache/.test(ctx))
            return true;
        if (/closed[- ]source|proprietary|commercial license|\b(?:not|non)[\s-]open[- ]source/.test(ctx))
            return false;
    }
    // API availability
    if (/has_api|api_available|has_rest/.test(lf)) {
        if (/rest api|graphql api|api endpoint|api key|\/v1\/|\/api\//.test(ctx))
            return true;
    }
    // Authentication
    if (/requires_auth|has_auth|is_authenticated/.test(lf)) {
        if (/login|sign in|authentication|api key|bearer token/.test(ctx))
            return true;
    }
    // General approach: search near field name concept
    const humanName = lf.replace(/_/g, ' ');
    let fieldIdx = ctx.indexOf(lf);
    if (fieldIdx === -1)
        fieldIdx = ctx.indexOf(humanName);
    if (fieldIdx === -1)
        return null;
    // Group 1 = negative indicator, group 2 = positive indicator. Word boundaries
    // stop "available" from matching inside "unavailable" and "no" inside "know".
    const indicatorSource = '\\b(?:(not supported|unavailable|proprietary|disabled|excluded|closed|false|no)|(yes|true|open[- ]source|available|enabled|supported|free|included))\\b';
    // Start after the field name itself (lf and humanName have equal length) so a
    // name like "is available" cannot act as its own indicator.
    const after = ctx.slice(fieldIdx + lf.length, fieldIdx + 200);
    const firstAfter = new RegExp(indicatorSource).exec(after);
    if (firstAfter)
        return firstAfter[2] !== undefined;
    // Nothing after the name: fall back to the closest indicator before it.
    const before = ctx.slice(Math.max(0, fieldIdx - 80), fieldIdx);
    const lastBefore = [...before.matchAll(new RegExp(indicatorSource, 'g'))].pop();
    if (lastBefore)
        return lastBefore[2] !== undefined;
    return null;
}
|
|
291
|
+
/**
 * For number fields: find digits near the field name.
 *
 * Field names mentioning stars, forks, rating/score, year, downloads or
 * population use dedicated patterns; everything else uses a generic
 * "<field name> ... <number>" pattern in which underscores in the name match
 * any run of whitespace, underscores or hyphens. A dedicated pattern that does
 * not match falls through to the later checks.
 *
 * @param fieldName Schema field name
 * @param content   Markdown/text content to search
 * @returns The parsed number, or null when nothing numeric was found
 */
function heuristicExtractNumber(fieldName, content) {
    const lf = fieldName.toLowerCase();
    // Parse a captured digit string (thousands separators allowed); null if not numeric.
    const toNumber = (raw) => {
        const n = parseFloat(raw.replace(/,/g, ''));
        return isNaN(n) ? null : n;
    };
    // Stars (GitHub)
    if (/stars?/.test(lf)) {
        const m = content.match(/⭐\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*stars?/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Forks
    if (/forks?/.test(lf)) {
        const m = content.match(/🍴\s*([\d,]+)/) ?? content.match(/([\d,]+)\s*forks?/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Rating/score
    if (/rating|score/.test(lf)) {
        const m = content.match(/⭐\s*([\d.]+)\//) ?? content.match(/([\d.]+)\s*\/\s*10/) ?? content.match(/([\d.]+)\s*\/\s*5/);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Year
    if (/year/.test(lf)) {
        // Explicit "Year: YYYY" label first
        const explicit = content.match(/\bYear[:\s]+(\d{4})\b/i);
        if (explicit?.[1])
            return toNumber(explicit[1]);
        // For "created_year" / "founded_year" / "released_year" — look for context
        if (/creat|found|release|launch|start|born|inception/.test(lf)) {
            const ctxMatch = content.match(/(?:created?|founded?|released?|launched?|started?|born|inception)[^\d]*(\b(?:19|20)\d{2}\b)/i)
                ?? content.match(/\b(?:in|year)\s+(\b(?:19|20)\d{2}\b)/i)
                ?? content.match(/(\b(?:19|20)\d{2}\b)/);
            if (ctxMatch?.[1])
                return toNumber(ctxMatch[1]);
        }
        // Fallback: first year found
        const m = content.match(/\b((?:19|20)\d{2})\b/);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Downloads / weekly_downloads (npm, pypi)
    if (/downloads?/.test(lf)) {
        const m = content.match(/weekly\s+downloads[^\d]*([\d,]+)/i)
            ?? content.match(/downloads?[^\d]*([\d,]+)/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Population (Wikipedia infoboxes)
    if (/population/.test(lf)) {
        const m = content.match(/population[^\d]*([\d,]+)/i);
        if (m?.[1])
            return toNumber(m[1]);
    }
    // Generic: find number near field name — use [^\d]* to skip non-digit separators.
    // Each underscore-separated part is regex-escaped so names such as "c++" or
    // "size(mb)" match literally instead of throwing or altering the pattern.
    const escapeRe = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const namePattern = fieldName.split('_').map(escapeRe).join('[\\s_-]*');
    const pattern = new RegExp(`${namePattern}[^\\d]*(\\d[\\d,]*\\.?\\d*)`, 'i');
    const match = content.match(pattern);
    if (match?.[1])
        return toNumber(match[1]);
    return null;
}
|
|
372
|
+
/**
 * Fill every schema property using the type-specific heuristic extractors and
 * derive a confidence score from how many properties received a value.
 *
 * Only 'string', 'boolean' and 'number' properties are attempted; any other
 * type is stored as null.
 *
 * Confidence: 0.1 when nothing was found, 0.7 when every field was found,
 * otherwise 0.3 + fillRate * 0.2.
 *
 * @param content Markdown/text content to extract from
 * @param schema  Schema whose `properties` map field names to `{ type }` definitions
 * @returns `{ data, confidence, tokensUsed }` with tokensUsed always 0
 */
async function heuristicExtract(content, schema) {
    const fieldEntries = Object.entries(schema.properties);
    const totalFields = fieldEntries.length;
    const data = {};
    let fieldsFound = 0;
    for (const [field, fieldDef] of fieldEntries) {
        let value;
        switch (fieldDef.type) {
            case 'string':
                value = heuristicExtractString(field, content);
                break;
            case 'boolean':
                value = heuristicExtractBoolean(field, content);
                break;
            case 'number':
                value = heuristicExtractNumber(field, content);
                break;
            default:
                // array/object (and any other) types: not enough context for a heuristic
                value = null;
        }
        if (value != null) {
            fieldsFound += 1;
        }
        data[field] = value;
    }
    const fillRate = totalFields > 0 ? fieldsFound / totalFields : 0;
    const nothingFound = fieldsFound === 0;
    const everythingFound = fieldsFound === totalFields;
    // Heuristic results never score above 0.7 — values may still be imprecise.
    const confidence = nothingFound
        ? 0.1
        : everythingFound
            ? 0.65 + fillRate * 0.05
            : 0.3 + fillRate * 0.2;
    return {
        data,
        confidence: parseFloat(confidence.toFixed(2)),
        tokensUsed: 0,
    };
}
|
|
414
|
+
// ---------------------------------------------------------------------------
|
|
415
|
+
// Main extraction function
|
|
416
|
+
// ---------------------------------------------------------------------------
|
|
417
|
+
/**
 * Extract structured data from markdown content using an LLM or heuristics.
 *
 * With `llmConfig`, the LLM is asked for JSON matching the schema; a malformed
 * response or a non-auth failure falls back to heuristic extraction, while
 * auth/quota-looking errors are re-thrown. Without `llmConfig`, heuristics are
 * used and `domainHints` (if any) are overlaid on the result.
 *
 * @param content     Markdown/text content to extract from
 * @param schema      JSON schema describing what to extract
 * @param llmConfig   Optional LLM config (if omitted, uses heuristic fallback)
 * @param prompt      Optional user guidance added to the LLM prompt
 * @param domainHints Optional pre-extracted field values; only applied on the
 *                    heuristic (no `llmConfig`) path
 * @returns `{ data, confidence, tokensUsed }`; empty content yields
 *          `{ data: {}, confidence: 0, tokensUsed: 0 }`
 * @throws Error when the schema is not `{ type: "object", properties: { ... } }`
 */
export async function extractStructured(content, schema, llmConfig, prompt, domainHints) {
    // Guard: empty content
    if (!content || content.trim().length === 0) {
        return { data: {}, confidence: 0, tokensUsed: 0 };
    }
    // Guard: invalid schema. `typeof null` is 'object', so null properties must be
    // rejected explicitly; otherwise Object.keys(null) throws a TypeError later.
    if (!schema ||
        schema.type !== 'object' ||
        typeof schema.properties !== 'object' ||
        schema.properties === null) {
        throw new Error('Invalid schema: must be { type: "object", properties: { ... } }');
    }
    // ── LLM extraction ──────────────────────────────────────────────────────
    if (llmConfig) {
        const schemaStr = JSON.stringify(schema, null, 2);
        const userContent = [
            `Schema:\n${schemaStr}`,
            prompt ? `\nInstructions: ${prompt}` : '',
            // Only the first 12000 characters of the content are sent
            `\nContent:\n${content.slice(0, 12000)}`,
        ]
            .filter(Boolean)
            .join('');
        const messages = [
            { role: 'system', content: SYSTEM_PROMPT },
            { role: 'user', content: userContent },
        ];
        try {
            const llmResult = await callLLM(llmConfig, { messages, maxTokens: 2048, temperature: 0.1 });
            const tokensUsed = llmResult.usage.input + llmResult.usage.output;
            let parsed;
            try {
                parsed = parseLLMJson(llmResult.text);
            }
            catch {
                // Malformed LLM response — fall back to heuristic
                const heuristic = await heuristicExtract(content, schema);
                return heuristic;
            }
            const { data, missingRequired } = validateAndCoerce(parsed, schema);
            // Confidence for LLM extraction:
            // - ALL fields null → 0.1 (LLM couldn't extract anything)
            // - Otherwise       → 0.85 + fillRate * 0.08, minus 0.05 per missing
            //                     required field, capped at 0.98
            const filledCount = Object.values(data).filter((v) => v !== null && v !== undefined).length;
            const totalCount = Object.keys(schema.properties).length;
            const fillRate = totalCount > 0 ? filledCount / totalCount : 0;
            const penalty = missingRequired.length * 0.05;
            let confidence;
            if (filledCount === 0) {
                confidence = 0.1; // LLM returned all nulls — extraction failed
            }
            else {
                const fillBonus = fillRate * 0.08; // Up to +0.08 for fully populated
                confidence = Math.min(0.98, 0.85 + fillBonus - penalty);
            }
            return {
                data,
                confidence: parseFloat(confidence.toFixed(2)),
                tokensUsed,
            };
        }
        catch (err) {
            // Re-throw errors whose message looks like an auth/quota problem;
            // fall back to heuristics on anything else (network, parse, ...)
            const msg = String(err instanceof Error ? err.message : err);
            if (msg.includes('free_tier_limit') ||
                msg.includes('API key') ||
                msg.includes('Unauthorized') ||
                msg.includes('401') ||
                msg.includes('403')) {
                throw err;
            }
            // Network / parse failure → heuristic fallback
            return heuristicExtract(content, schema);
        }
    }
    // ── Heuristic extraction ─────────────────────────────────────────────────
    const heuristic = await heuristicExtract(content, schema);
    // ── Domain hints overlay ─────────────────────────────────────────────────
    // If domain-api pre-extracted fields (e.g. GitHub stars/language), merge them
    // into the result. Domain-api data is authoritative — prefer over heuristic.
    if (domainHints && Object.keys(domainHints).length > 0) {
        const props = schema.properties;
        let hintMerged = 0;
        for (const [field, hintValue] of Object.entries(domainHints)) {
            if (field in props && hintValue !== null && hintValue !== undefined) {
                const expected = props[field].type;
                const actual = typeof hintValue;
                // Only merge if type matches (or number vs string coercion)
                if (actual === expected ||
                    (expected === 'number' && actual === 'string' && !isNaN(Number(hintValue))) ||
                    (expected === 'string' && actual !== 'object')) {
                    heuristic.data[field] =
                        expected === 'number' ? Number(hintValue) : hintValue;
                    hintMerged++;
                }
            }
        }
        if (hintMerged > 0) {
            // Boost confidence since we have authoritative domain-api data:
            // 0.65 + fillRate * 0.25, capped at 0.90
            const filled = Object.values(heuristic.data).filter(v => v !== null && v !== undefined).length;
            const total = Object.keys(props).length;
            heuristic.confidence = parseFloat(Math.min(0.90, 0.65 + (filled / total) * 0.25).toFixed(2));
        }
    }
    return heuristic;
}
|
|
528
|
+
// ---------------------------------------------------------------------------
|
|
529
|
+
// Helper: convert simple { field: "type" } map → ExtractionSchema
|
|
530
|
+
// ---------------------------------------------------------------------------
|
|
531
|
+
/**
 * Expand a shorthand schema such as `{ field: "string", active: "boolean" }`
 * into a full ExtractionSchema (`{ type: 'object', properties: { field: { type } } }`).
 * Useful for the CLI --extract flag.
 */
export function simpleToExtractionSchema(simple) {
    const properties = {};
    Object.keys(simple).forEach((field) => {
        properties[field] = { type: simple[field] };
    });
    return { type: 'object', properties };
}
|
|
542
|
+
/**
 * Tell whether a JSON object looks like a simple type-schema rather than a map
 * of CSS selectors: it must be non-empty and every value must be one of the
 * type names "string", "boolean", "number", "array", "object" or "integer".
 */
export function isTypeSchema(obj) {
    const typeNames = new Set(['string', 'boolean', 'number', 'array', 'object', 'integer']);
    const values = Object.values(obj);
    if (values.length === 0) {
        return false;
    }
    for (const value of values) {
        const isTypeName = typeof value === 'string' && typeNames.has(value);
        if (!isTypeName) {
            return false;
        }
    }
    return true;
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AI-powered content summarization using OpenAI-compatible APIs
|
|
3
|
+
*/
|
|
4
|
+
/** Options accepted by `summarizeContent`. */
export interface SummarizeOptions {
    /** OpenAI-compatible API base URL (default: https://api.openai.com/v1) */
    apiBase?: string;
    /** API key for the LLM (required; must not be empty or whitespace-only) */
    apiKey: string;
    /** Model to use (default: gpt-4o-mini) */
    model?: string;
    /** Max length of summary in words (default: 150) */
    maxWords?: number;
}
|
|
14
|
+
/**
 * Summarize content using an OpenAI-compatible LLM API.
 *
 * @param content Text to summarize; must not be empty or whitespace-only
 * @param options API connection settings and summary length (see SummarizeOptions)
 * @returns The summary text returned by the model
 */
export declare function summarizeContent(content: string, options: SummarizeOptions): Promise<string>;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AI-powered content summarization using OpenAI-compatible APIs
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Cap content at roughly 4000 tokens (~16000 characters) so there is room left
 * for the prompt and the response. Content within the limit is returned as-is;
 * longer content is cut at the limit and a truncation notice is appended.
 */
function truncateContent(content) {
    const MAX_CHARS = 16000; // ~4000 tokens
    const fitsWithinLimit = content.length <= MAX_CHARS;
    return fitsWithinLimit
        ? content
        : `${content.slice(0, MAX_CHARS)}\n\n[Content truncated for summarization...]`;
}
|
|
16
|
+
/**
 * Summarize content by POSTing a single user message to the
 * `/chat/completions` endpoint of an OpenAI-compatible API.
 *
 * Defaults: apiBase 'https://api.openai.com/v1', model 'gpt-4o-mini',
 * maxWords 150. Throws when the API key or content is empty; any failure of
 * the request itself is re-thrown as "Summarization failed: <reason>".
 */
export async function summarizeContent(content, options) {
    const orDefault = (value, fallback) => (value === undefined ? fallback : value);
    const apiBase = orDefault(options.apiBase, 'https://api.openai.com/v1');
    const apiKey = options.apiKey;
    const model = orDefault(options.model, 'gpt-4o-mini');
    const maxWords = orDefault(options.maxWords, 150);
    // Input validation
    const isBlank = (text) => !text || text.trim().length === 0;
    if (isBlank(apiKey)) {
        throw new Error('API key is required for summarization');
    }
    if (isBlank(content)) {
        throw new Error('Content is required for summarization');
    }
    // Keep the content within the size limit before embedding it in the prompt
    const truncatedContent = truncateContent(content);
    const prompt = [
        `Summarize the following web page content concisely in ${maxWords} words or fewer. Focus on the key information.`,
        '',
        'Content:',
        truncatedContent,
    ].join('\n');
    // Drop one trailing slash from the base URL before appending the endpoint path
    const apiUrl = `${apiBase.replace(/\/$/, '')}/chat/completions`;
    try {
        const headers = {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiKey}`,
        };
        const body = JSON.stringify({
            model,
            messages: [{ role: 'user', content: prompt }],
            temperature: 0.3, // Lower temperature for more focused summaries
            max_tokens: maxWords * 2, // Rough estimate: 1 word ≈ 1.5-2 tokens
        });
        const response = await fetch(apiUrl, { method: 'POST', headers, body });
        if (!response.ok) {
            const errorText = await response.text();
            throw new Error(`LLM API error: HTTP ${response.status} - ${errorText}`);
        }
        const result = await response.json();
        // A successful HTTP status can still carry an error payload
        if (result.error) {
            throw new Error(`LLM API error: ${result.error.message}`);
        }
        const summary = result.choices?.[0]?.message?.content?.trim();
        if (!summary) {
            throw new Error('LLM API returned empty response');
        }
        return summary;
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : 'Unknown error';
        throw new Error(`Summarization failed: ${reason}`);
    }
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synonym expansion for query broadening.
|
|
3
|
+
*
|
|
4
|
+
* Provides stemmed synonym groups and a function to expand a set of stemmed
|
|
5
|
+
* query tokens with related synonyms (at a lower weight).
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* const queryTerms = tokenizeQuestion(question); // already stemmed
|
|
9
|
+
* const expanded = expandWithSynonyms(queryTerms);
|
|
10
|
+
* // expanded includes originals (weight=1.0) + synonyms (weight=0.5)
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Raw synonym groups. Each group is a list of words with equivalent or
 * near-equivalent meaning in the context of software/web documentation queries.
 *
 * These are stored in unstemmed form for readability; the build process stems
 * them into STEMMED_SYNONYM_GROUPS and builds an index.
 */
export declare const SYNONYM_GROUPS: string[][];
|
|
20
|
+
/**
 * Stemmed synonym groups.
 * Each word in each group has been run through the Porter stemmer;
 * duplicate stems within a group are removed.
 */
export declare const STEMMED_SYNONYM_GROUPS: string[][];
|
|
26
|
+
/** A query term produced by synonym expansion, with its relevance weight. */
export interface ExpandedTerm {
    /** The stemmed term */
    term: string;
    /** 1.0 for original query terms, 0.5 for synonym expansions */
    weight: number;
    /** True if this term came from the original query */
    isOriginal: boolean;
}
|
|
34
|
+
/**
 * Expand a list of stemmed query tokens with their synonyms.
 *
 * @param terms - Already-stemmed tokens from the query
 * @returns Array of ExpandedTerm objects. Original terms have weight=1.0,
 *          synonym expansions have weight=0.5.
 *          The returned array lists the originals first, then the synonyms.
 */
export declare function expandWithSynonyms(terms: string[]): ExpandedTerm[];
|