@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema-based extraction using CSS selectors.
|
|
3
|
+
*
|
|
4
|
+
* Each schema defines how to extract listings from a specific domain,
|
|
5
|
+
* inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
|
|
6
|
+
* auto-detection, schemas provide exact selectors for each site's DOM.
|
|
7
|
+
*
|
|
8
|
+
* @module schema-extraction
|
|
9
|
+
*/
|
|
10
|
+
/** Describes one value to pull out of each listing item matched by a schema's baseSelector. */
export interface SchemaField {
|
|
11
|
+
/** Field name in output (e.g., "title", "price", "rating"); used as the key on the extracted item */
|
|
12
|
+
name: string;
|
|
13
|
+
/** CSS selector relative to baseSelector. Empty string selects the base element itself. */
|
|
14
|
+
selector: string;
|
|
15
|
+
/** What to extract. 'attribute' reads the attribute named by `attribute`; how 'text', 'html' and 'exists' are computed is decided by the implementation, not by this declaration. */
|
|
16
|
+
type: 'text' | 'attribute' | 'html' | 'exists';
|
|
17
|
+
/** For type='attribute', which attribute to read */
|
|
18
|
+
attribute?: string;
|
|
19
|
+
/** Extract all matches (returns array instead of first match) */
|
|
20
|
+
multiple?: boolean;
|
|
21
|
+
/** Optional transform to apply after extraction */
|
|
22
|
+
transform?: 'trim' | 'number' | 'stripCurrency';
|
|
23
|
+
}
|
|
24
|
+
/** Declarative description of how to extract listing items from the pages of one site. */
export interface ExtractionSchema {
|
|
25
|
+
/** Human-readable schema name (e.g., "Booking.com Hotel Search") */
|
|
26
|
+
name: string;
|
|
27
|
+
/** Schema version string */
|
|
28
|
+
version: string;
|
|
29
|
+
/** Matching domains (e.g., ["booking.com", "www.booking.com"]) */
|
|
30
|
+
domains: string[];
|
|
31
|
+
/** Optional URL path patterns (regex strings) for more specific matching. NOTE(review): whether they are tested against the path only or the full URL is not visible from this declaration — confirm. */
|
|
32
|
+
urlPatterns?: string[];
|
|
33
|
+
/** CSS selector for each listing item */
|
|
34
|
+
baseSelector: string;
|
|
35
|
+
/** Fields to extract from each item */
|
|
36
|
+
fields: SchemaField[];
|
|
37
|
+
/** Optional pagination config. Presumably nextSelector locates the next-page control and pageParam names the page query parameter — TODO confirm against the implementation. */
|
|
38
|
+
pagination?: {
|
|
39
|
+
nextSelector?: string;
|
|
40
|
+
pageParam?: string;
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
/** A single extracted item — field names map to extracted values, each of which is a string, a string array, a boolean, a number, or undefined */
|
|
44
|
+
export interface ExtractedItem {
|
|
45
|
+
[key: string]: string | string[] | boolean | number | undefined;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Load all bundled schemas. Takes no arguments. @returns Array of the bundled extraction schemas.
|
|
49
|
+
*/
|
|
50
|
+
export declare function loadBundledSchemas(): ExtractionSchema[];
|
|
51
|
+
/**
|
|
52
|
+
* Find a matching schema for a given URL.
|
|
53
|
+
*
|
|
54
|
+
* Matches by domain first, then optionally by URL patterns (regex).
|
|
55
|
+
* Returns the first matching schema or null. @param url - URL to look up. @returns The first matching schema, or null when none matches. NOTE(review): behavior for a malformed URL is not visible from this declaration — confirm.
|
|
56
|
+
*/
|
|
57
|
+
export declare function findSchemaForUrl(url: string): ExtractionSchema | null;
|
|
58
|
+
/**
|
|
59
|
+
* Extract listings from HTML using a schema's CSS selectors. Per the schema docs, listing elements are located with schema.baseSelector and each schema field's selector is relative to them.
|
|
60
|
+
*
|
|
61
|
+
* @param html - Raw HTML string to parse
|
|
62
|
+
* @param schema - Extraction schema to use
|
|
63
|
+
* @param baseUrl - Optional base URL for resolving relative links
|
|
64
|
+
* @returns Array of extracted items (may be empty)
|
|
65
|
+
*/
|
|
66
|
+
export declare function extractWithSchema(html: string, schema: ExtractionSchema, baseUrl?: string): ExtractedItem[];
|
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Schema-based extraction using CSS selectors.
|
|
3
|
+
*
|
|
4
|
+
* Each schema defines how to extract listings from a specific domain,
|
|
5
|
+
* inspired by Crawl4AI's JsonCssExtractionStrategy. Unlike generic
|
|
6
|
+
* auto-detection, schemas provide exact selectors for each site's DOM.
|
|
7
|
+
*
|
|
8
|
+
* @module schema-extraction
|
|
9
|
+
*/
|
|
10
|
+
import { load } from 'cheerio';
|
|
11
|
+
/* ------------------------------------------------------------------ */
|
|
12
|
+
/* Bundled schemas (hardcoded to avoid JSON import complications) */
|
|
13
|
+
/* ------------------------------------------------------------------ */
|
|
14
|
+
/** Bundled schema for Booking.com hotel search result pages; each `[data-testid='property-card']` element is treated as one listing. */
const BOOKING_COM_SCHEMA = {
|
|
15
|
+
name: 'Booking.com Hotel Search',
|
|
16
|
+
version: '1.0',
|
|
17
|
+
domains: ['booking.com', 'www.booking.com'],
|
|
18
|
+
urlPatterns: ['searchresults'],
|
|
19
|
+
baseSelector: "[data-testid='property-card']",
|
|
20
|
+
// Most selectors below list several comma-separated alternatives; presumably these cover
// different generations of the site's markup — TODO confirm.
fields: [
|
|
21
|
+
{ name: 'title', selector: "[data-testid='title'], .sr-hotel__name, h3 a", type: 'text' },
|
|
22
|
+
{ name: 'price', selector: "[data-testid='price-and-discounted-price'], .bui-price-display__value, [data-testid='price-for-x-nights']", type: 'text', transform: 'trim' },
|
|
23
|
+
{ name: 'rating', selector: "[data-testid='review-score'] div:first-child, .bui-review-score__badge", type: 'text' },
|
|
24
|
+
{ name: 'reviewCount', selector: "[data-testid='review-score'] div:nth-child(2) div:nth-child(2), .bui-review-score__text", type: 'text' },
|
|
25
|
+
{ name: 'location', selector: "[data-testid='address'], .sr_card_address_line", type: 'text' },
|
|
26
|
+
{ name: 'link', selector: "a[data-testid='title-link'], h3 a, a.hotel_name_link", type: 'attribute', attribute: 'href' },
|
|
27
|
+
{ name: 'image', selector: "img[data-testid='image'], img.hotel_image", type: 'attribute', attribute: 'src' },
|
|
28
|
+
{ name: 'stars', selector: "[data-testid='rating-stars'] span, .bui-star-rating .bui-star-rating__star", type: 'text' },
|
|
29
|
+
],
|
|
30
|
+
};
|
|
31
|
+
/** Bundled schema for Amazon product search result pages; each `[data-component-type='s-search-result']` element is treated as one listing. */
const AMAZON_COM_SCHEMA = {
|
|
32
|
+
name: 'Amazon Product Search',
|
|
33
|
+
version: '1.0',
|
|
34
|
+
// NOTE(review): only amazon.com is listed with a 'www.' variant; whether domain matching
// tolerates a 'www.' prefix for the other hosts is not visible here — confirm.
domains: ['amazon.com', 'www.amazon.com', 'amazon.co.uk', 'amazon.de', 'amazon.fr', 'amazon.ca'],
|
|
35
|
+
// Regex strings: the escaped '?' in the first pattern matches a literal "/s?",
// the second matches "/s/".
urlPatterns: ['/s\\?', '/s/'],
|
|
36
|
+
baseSelector: "[data-component-type='s-search-result']",
|
|
37
|
+
fields: [
|
|
38
|
+
{ name: 'title', selector: 'h2 a span, h2 span a span', type: 'text' },
|
|
39
|
+
// NOTE(review): '.a-price .a-offscreen' also matches elements selected by the
// originalPrice selector below (those carry both a-price and a-text-price), so which
// one `price` picks up depends on document order — verify.
{ name: 'price', selector: '.a-price .a-offscreen', type: 'text' },
|
|
40
|
+
{ name: 'originalPrice', selector: '.a-price.a-text-price .a-offscreen', type: 'text' },
|
|
41
|
+
{ name: 'rating', selector: '.a-icon-star-small .a-icon-alt, .a-icon-star-mini .a-icon-alt', type: 'text' },
|
|
42
|
+
{ name: 'reviewCount', selector: "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span, .a-size-base.s-underline-text", type: 'text' },
|
|
43
|
+
{ name: 'link', selector: 'h2 a', type: 'attribute', attribute: 'href' },
|
|
44
|
+
{ name: 'image', selector: '.s-image', type: 'attribute', attribute: 'src' },
|
|
45
|
+
{ name: 'sponsored', selector: '.puis-sponsored-label-text', type: 'exists' },
|
|
46
|
+
// Empty selector: data-asin is read from the listing element itself.
{ name: 'asin', selector: '', type: 'attribute', attribute: 'data-asin' },
|
|
47
|
+
],
|
|
48
|
+
};
|
|
49
|
+
/**
 * Bundled listing schema for eBay search-result pages (URLs containing "/sch/").
 * Every field selector is evaluated relative to one `baseSelector` match.
 */
const EBAY_COM_SCHEMA = {
    name: 'eBay Search Results',
    version: '1.0',
    domains: [
        'ebay.com',
        'www.ebay.com',
    ],
    urlPatterns: [
        '/sch/',
    ],
    baseSelector: '.s-item, [data-viewport]',
    fields: [
        {
            name: 'title',
            selector: '.s-item__title span, .s-item__title',
            type: 'text',
        },
        {
            name: 'price',
            selector: '.s-item__price',
            type: 'text',
        },
        {
            name: 'link',
            selector: '.s-item__link, a.s-item__link',
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: '.s-item__image-wrapper img, .s-item__image img',
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'condition',
            selector: '.SECONDARY_INFO',
            type: 'text',
        },
        {
            name: 'shipping',
            selector: '.s-item__shipping, .s-item__freeXDays',
            type: 'text',
        },
        {
            name: 'seller',
            selector: '.s-item__seller-info-text',
            type: 'text',
        },
    ],
};
|
|
65
|
+
/**
 * Bundled listing schema for Yelp business-search pages (URLs containing "/search").
 * Every field selector is evaluated relative to one `baseSelector` match.
 */
const YELP_COM_SCHEMA = {
    name: 'Yelp Business Search',
    version: '1.0',
    domains: [
        'yelp.com',
        'www.yelp.com',
    ],
    urlPatterns: [
        '/search',
    ],
    baseSelector: "[data-testid='serp-ia-card'], li.border-color--default",
    fields: [
        {
            name: 'title',
            selector: "a[href*='/biz/'] span, h3 a span",
            type: 'text',
        },
        // The rating is read from the aria-label attribute, not from element text.
        {
            name: 'rating',
            selector: "[aria-label*='star rating'], .i-stars",
            type: 'attribute',
            attribute: 'aria-label',
        },
        {
            name: 'reviewCount',
            selector: ".reviewCount, span[class*='css-']",
            type: 'text',
        },
        {
            name: 'price',
            selector: '.priceRange, span.priceRange',
            type: 'text',
        },
        {
            name: 'category',
            selector: ".priceCategory span, p[class*='css-'] a",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[href*='/biz/']",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'address',
            selector: "address, span[class*='css-']",
            type: 'text',
        },
    ],
};
|
|
81
|
+
/**
 * Bundled listing schema for Walmart product-search pages (URLs containing "/search").
 * Every field selector is evaluated relative to one `baseSelector` match.
 */
const WALMART_COM_SCHEMA = {
    name: 'Walmart Product Search',
    version: '1.0',
    domains: [
        'walmart.com',
        'www.walmart.com',
    ],
    urlPatterns: [
        '/search',
    ],
    baseSelector: "[data-testid='list-view'] > div, [data-item-id]",
    fields: [
        {
            name: 'title',
            selector: "a[link-identifier] span, [data-automation-id='product-title']",
            type: 'text',
        },
        {
            name: 'price',
            selector: "[data-automation-id='product-price'] .f2, [itemprop='price']",
            type: 'text',
        },
        {
            name: 'rating',
            selector: "[data-testid='product-ratings'] .w_iUH7, .stars-reviews-count",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[link-identifier], a[href*='/ip/']",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: "img[data-testid='productTileImage'], img[loading]",
            type: 'attribute',
            attribute: 'src',
        },
        {
            name: 'seller',
            selector: "[data-automation-id='fulfillment-badge']",
            type: 'text',
        },
    ],
};
|
|
96
|
+
/**
 * Bundled listing schema for Hacker News story rows.
 * It declares no `urlPatterns`, so a domain match alone selects it.
 */
const HACKERNEWS_SCHEMA = {
    name: 'Hacker News',
    version: '1.0',
    domains: [
        'news.ycombinator.com',
    ],
    baseSelector: 'tr.athing',
    fields: [
        {
            name: 'title',
            selector: '.titleline a',
            type: 'text',
        },
        {
            name: 'link',
            selector: '.titleline a',
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'rank',
            selector: '.rank',
            type: 'text',
        },
        {
            name: 'site',
            selector: '.sitestr',
            type: 'text',
        },
    ],
};
|
|
108
|
+
/**
 * Bundled listing schema for Expedia hotel-search pages.
 * Every field selector is evaluated relative to one `baseSelector` match.
 */
const EXPEDIA_COM_SCHEMA = {
    name: 'Expedia Hotel Search',
    version: '1.0',
    domains: [
        'expedia.com',
        'www.expedia.com',
    ],
    // Both casings are listed because patterns are compiled without the "i" flag.
    urlPatterns: [
        'Hotel-Search',
        'hotel-search',
    ],
    baseSelector: "[data-stid='property-listing'], li.uitk-spacing[class*='uitk-spacing'], [data-stid='lodging-card-responsive']",
    fields: [
        {
            name: 'title',
            selector: "[data-stid='content-hotel-title'], .uitk-heading-5, .uitk-heading-6, h3[class*='uitk-heading']",
            type: 'text',
        },
        {
            name: 'price',
            selector: "[data-stid='price-summary'] .uitk-type-500, [data-stid='price-summary-message-total'], .uitk-type-500",
            type: 'text',
            transform: 'trim',
        },
        {
            name: 'rating',
            selector: "[data-stid='star-rating-msg'], .uitk-badge-base, [aria-label*='out of']",
            type: 'text',
        },
        {
            name: 'reviewCount',
            selector: "[data-stid='review-info-text'], .uitk-type-200",
            type: 'text',
        },
        {
            name: 'location',
            selector: "[data-stid='location-info'], [data-stid='neighborhood-name']",
            type: 'text',
        },
        {
            name: 'link',
            selector: "a[data-stid='open-hotel-information'], a[href*='/h/'], a.uitk-card-link",
            type: 'attribute',
            attribute: 'href',
        },
        {
            name: 'image',
            selector: "img[data-stid='image'], .uitk-image-media img",
            type: 'attribute',
            attribute: 'src',
        },
    ],
};
|
|
124
|
+
/**
 * Every schema shipped with the package, in priority order: lookups walk this
 * list front to back and stop at the first schema that matches.
 */
const BUNDLED_SCHEMAS = [
    // Travel
    BOOKING_COM_SCHEMA,
    // Shopping, local business and news
    AMAZON_COM_SCHEMA, EBAY_COM_SCHEMA, YELP_COM_SCHEMA, WALMART_COM_SCHEMA, HACKERNEWS_SCHEMA,
    // Travel
    EXPEDIA_COM_SCHEMA,
];
|
|
134
|
+
/* ------------------------------------------------------------------ */
|
|
135
|
+
/* Helpers */
|
|
136
|
+
/* ------------------------------------------------------------------ */
|
|
137
|
+
/**
 * Apply a named transform to an extracted string value.
 *
 * Supported transforms:
 *  - `'trim'`          : the value with surrounding whitespace removed.
 *  - `'number'`        : the FIRST numeric token in the value, parsed as a number
 *                        (thousands-separator commas are dropped). If the value
 *                        contains no digits the original string is returned.
 *  - `'stripCurrency'` : the value with everything except digits, '.' and ','
 *                        removed.
 * A missing or unrecognised transform returns the value unchanged.
 *
 * @param value - Extracted string to transform
 * @param transform - Transform name, or a falsy value for "no transform"
 * @returns The transformed value (a number only for a successful `'number'`)
 */
function applyTransform(value, transform) {
    switch (transform) {
        case 'trim':
            return value.trim();
        case 'number': {
            // Take only the first numeric token. Stripping every non-digit and
            // concatenating what is left would turn "4.5 out of 5" into 4.55.
            const token = /\d[\d,]*(?:\.\d+)?|\.\d+/.exec(value);
            if (token === null)
                return value;
            const num = parseFloat(token[0].replace(/,/g, ''));
            return Number.isNaN(num) ? value : num;
        }
        case 'stripCurrency':
            // Whitespace is already removed by the character class, so no trim is needed.
            return value.replace(/[^\d.,]/g, '');
        default:
            // Covers undefined/empty transforms as well as unknown names.
            return value;
    }
}
|
|
156
|
+
/**
 * Resolve a potentially relative URL against a base URL.
 *
 * `data:` and `javascript:` URLs are rejected. The scheme check ignores case,
 * leading whitespace/control characters and embedded tab/newline characters,
 * because URL parsers ignore them too (so "JavaScript:" or " javascript:" would
 * otherwise slip through).
 *
 * @param href - Raw attribute value (may be empty or undefined)
 * @param baseUrl - Optional base URL used to resolve relative references
 * @returns The absolute URL; the original `href` when there is no base URL or
 *          resolution fails; `undefined` for empty or rejected values
 */
function resolveUrl(href, baseUrl) {
    if (!href)
        return undefined;
    // Normalise the way a URL parser would before looking at the scheme.
    const normalized = href.replace(/[\t\n\r]/g, '').replace(/^[\u0000-\u0020]+/, '');
    if (/^(?:data|javascript):/i.test(normalized))
        return undefined;
    if (!baseUrl)
        return href;
    try {
        return new URL(href, baseUrl).href;
    }
    catch {
        // Unparsable base or href: best effort, hand back the raw value.
        return href;
    }
}
|
|
173
|
+
/**
 * Extract a single field value from a cheerio element.
 *
 * A field with an empty (or missing) selector is read from the base element
 * itself. `href` and `src` attributes are resolved against `baseUrl`.
 * `field.transform` is applied to text and attribute values in both single and
 * `multiple` mode; HTML values are never transformed.
 *
 * @param $ - Loaded cheerio root, used to wrap raw nodes
 * @param $el - Base element the field selector is evaluated against
 * @param field - Field definition (`name`, `selector`, `type`, optional
 *                `attribute`, `multiple`, `transform`)
 * @param baseUrl - Optional base URL for resolving relative links
 * @returns The extracted value, an array of values in `multiple` mode, a
 *          boolean for `exists` fields, or `undefined` when nothing was found
 */
function extractFieldValue($, $el, field, baseUrl) {
    // A missing selector is treated like an empty one instead of throwing on `.trim()`.
    const useBaseEl = typeof field.selector !== 'string' || field.selector.trim() === '';
    const isUrlAttribute = field.attribute === 'href' || field.attribute === 'src';
    if (field.multiple && !useBaseEl) {
        // Collect every match; empty and missing values are dropped.
        const results = [];
        $el.find(field.selector).each((_, el) => {
            const $match = $(el);
            let raw;
            switch (field.type) {
                case 'text':
                    raw = $match.text().trim();
                    break;
                case 'attribute':
                    raw = field.attribute ? ($match.attr(field.attribute) ?? undefined) : undefined;
                    if (isUrlAttribute) {
                        raw = resolveUrl(raw, baseUrl);
                    }
                    break;
                case 'html':
                    raw = $match.html() ?? undefined;
                    break;
                default:
                    // 'exists' is not meaningful for multiple matches.
                    break;
            }
            if (raw === undefined || raw === '')
                return;
            // Apply the transform here too, so `multiple` fields match single mode.
            results.push(field.type === 'html' ? raw : applyTransform(raw, field.transform));
        });
        return results.length > 0 ? results : undefined;
    }
    // Single match mode: first match, or the base element itself.
    const $target = useBaseEl ? $el : $el.find(field.selector).first();
    switch (field.type) {
        case 'exists':
            // The base element always exists; otherwise reuse the lookup above.
            return useBaseEl ? true : $target.length > 0;
        case 'text': {
            if (!useBaseEl && $target.length === 0)
                return undefined;
            const text = $target.text().trim();
            if (text === '')
                return undefined;
            return applyTransform(text, field.transform);
        }
        case 'attribute': {
            if (!field.attribute)
                return undefined;
            const attrVal = $target.attr(field.attribute) ?? undefined;
            if (attrVal === undefined)
                return undefined;
            if (isUrlAttribute) {
                const resolved = resolveUrl(attrVal, baseUrl);
                if (!resolved)
                    return undefined;
                return applyTransform(resolved, field.transform);
            }
            return applyTransform(attrVal, field.transform);
        }
        case 'html': {
            if (!useBaseEl && $target.length === 0)
                return undefined;
            return $target.html() ?? undefined;
        }
        default:
            return undefined;
    }
}
|
|
244
|
+
/* ------------------------------------------------------------------ */
|
|
245
|
+
/* Public API */
|
|
246
|
+
/* ------------------------------------------------------------------ */
|
|
247
|
+
/**
 * Return the bundled schemas as a fresh array.
 *
 * Only the array is copied; the schema objects inside it are the shared
 * module-level instances.
 */
export function loadBundledSchemas() {
    return BUNDLED_SCHEMAS.slice();
}
|
|
253
|
+
/**
 * Find a matching bundled schema for a given URL.
 *
 * A schema matches when the URL's hostname equals one of its domains, is a
 * subdomain of one, or is the apex of a `www.` domain. If the schema declares
 * `urlPatterns`, at least one of them (compiled as a regex) must also match
 * the full URL string. Schemas are tried in bundled order.
 *
 * @param url - Absolute URL to look up
 * @returns The first matching schema, or `null` when the URL cannot be parsed
 *          or no schema matches
 */
export function findSchemaForUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return null;
    }
    const hostname = parsed.hostname.toLowerCase();
    const matchesDomain = (domain) => {
        const d = domain.toLowerCase();
        // The reverse direction is limited to the "www." alias. A generic
        // `d.endsWith('.' + hostname)` would let hosts such as "com" or
        // "co.uk" match every schema registered under that suffix.
        return hostname === d || hostname.endsWith('.' + d) || d === 'www.' + hostname;
    };
    const matchesPattern = (pattern) => {
        try {
            return new RegExp(pattern).test(url);
        }
        catch {
            // An invalid pattern never matches instead of aborting the lookup.
            return false;
        }
    };
    for (const schema of BUNDLED_SCHEMAS) {
        if (!schema.domains.some(matchesDomain))
            continue;
        // Without urlPatterns a domain match is sufficient.
        if (!schema.urlPatterns || schema.urlPatterns.length === 0)
            return schema;
        if (schema.urlPatterns.some(matchesPattern))
            return schema;
    }
    return null;
}
|
|
295
|
+
/**
 * Extract listings from HTML using a schema's CSS selectors.
 *
 * Each element matching `schema.baseSelector` yields one item. If the schema
 * has a `title` or `name` field, that value is cleaned of common junk (an
 * "Opens in new window" suffix and a "New Listing" / "Sponsored" / "Ad" label
 * prefix), items without it are dropped, and items sharing the same title and
 * price are de-duplicated, keeping the first.
 *
 * @param html - Raw HTML string to parse
 * @param schema - Extraction schema to use
 * @param baseUrl - Optional base URL for resolving relative links
 * @returns Array of extracted items (may be empty)
 */
export function extractWithSchema(html, schema, baseUrl) {
    if (!html || html.trim().length === 0)
        return [];
    const $ = load(html);
    // Field used for cleaning, filtering and de-duplicating items.
    const titleFieldName = schema.fields.find(f => f.name === 'title' || f.name === 'name')?.name;
    // "Opens in (a) new window/tab" variants.
    const newWindowNote = /\s*Opens?\s+in\s+(?:a\s+)?new\s+(?:window|tab)(?:\s+or\s+(?:window|tab))?/gi;
    // Label prefixes. "Ad" needs a word boundary so real titles such as
    // "Adidas ..." or "Adapter ..." are not truncated. The longer labels stay
    // unbounded because sites often glue them straight onto the title text.
    const labelPrefix = /^(?:New\s+Listing|Sponsored|Ad\b)\s*[-–—:·]?\s*/i;
    const items = [];
    $(schema.baseSelector).each((_, el) => {
        const $el = $(el);
        const item = {};
        for (const field of schema.fields) {
            const value = extractFieldValue($, $el, field, baseUrl);
            if (value !== undefined) {
                item[field.name] = value;
            }
        }
        if (titleFieldName !== undefined) {
            const rawTitle = item[titleFieldName];
            if (typeof rawTitle === 'string') {
                item[titleFieldName] = rawTitle.replace(newWindowNote, '').replace(labelPrefix, '').trim();
            }
            // No title/name after cleaning: likely an empty or phantom element.
            if (!item[titleFieldName])
                return;
        }
        // Skip completely empty items.
        if (Object.keys(item).length === 0)
            return;
        items.push(item);
    });
    if (!titleFieldName)
        return items;
    // De-duplicate on title + price (duplicates are common with nested selectors).
    const seen = new Set();
    return items.filter(item => {
        const key = `${String(item[titleFieldName] ?? '')}|${String(item.price ?? '')}`;
        if (seen.has(key))
            return false;
        seen.add(key);
        return true;
    });
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Post-process BM25 quickAnswer passages to extract specific values.
|
|
3
|
+
*
|
|
4
|
+
* BM25 finds relevant passages but can't extract values. This module
|
|
5
|
+
* applies field-type-aware regex extraction to pull the actual value
|
|
6
|
+
* from the passage.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Post-process a BM25 passage to extract the actual value for a given field name.
 *
 * @param passage - Passage text (a BM25 quickAnswer result) to pull a value from
 * @param fieldName - Name of the field whose value is wanted
 * @returns The extracted value as a string. What is returned when nothing can
 *          be extracted is not visible from this declaration; check the
 *          implementation.
 */
export declare function extractValueFromPassage(passage: string, fieldName: string): string;
|
|
12
|
+
/**
 * Smart schema extraction that uses structural signals before falling back to BM25.
 *
 * For title/name: uses the page title or first heading
 * For author: scans first 1000 chars for "by X" patterns
 * For date: scans first 1000 chars for date patterns
 * For price/email/phone/url: regex scan of full content
 * For everything else: BM25 quickAnswer + post-processing
 *
 * @param content - Page content to extract values from
 * @param templateFields - Map keyed by field name. The meaning of the string
 *                         values is not visible from this declaration
 *                         (presumably a description or type hint; confirm
 *                         against the implementation)
 * @param quickAnswerFn - Synchronous BM25 quick-answer callback returning an
 *                        answer passage and a confidence score
 * @param options - Optional page context: title, URL and metadata
 * @returns Map of field name to extracted string value
 */
export declare function smartExtractSchemaFields(content: string, templateFields: Record<string, string>, quickAnswerFn: (opts: {
    content: string;
    question: string;
    url?: string;
}) => {
    answer: string;
    confidence: number;
}, options?: {
    pageTitle?: string;
    pageUrl?: string;
    metadata?: Record<string, any>;
}): Record<string, string>;
|