@iflow-mcp/jakeliume-webpeel 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +313 -0
- package/dist/cache.d.ts +30 -0
- package/dist/cache.js +139 -0
- package/dist/cli/commands/auth.d.ts +5 -0
- package/dist/cli/commands/auth.js +411 -0
- package/dist/cli/commands/doctor.d.ts +37 -0
- package/dist/cli/commands/doctor.js +371 -0
- package/dist/cli/commands/fetch.d.ts +6 -0
- package/dist/cli/commands/fetch.js +1345 -0
- package/dist/cli/commands/guide.d.ts +2 -0
- package/dist/cli/commands/guide.js +183 -0
- package/dist/cli/commands/interact.d.ts +5 -0
- package/dist/cli/commands/interact.js +840 -0
- package/dist/cli/commands/jobs.d.ts +5 -0
- package/dist/cli/commands/jobs.js +997 -0
- package/dist/cli/commands/monitor.d.ts +12 -0
- package/dist/cli/commands/monitor.js +197 -0
- package/dist/cli/commands/observe.d.ts +12 -0
- package/dist/cli/commands/observe.js +158 -0
- package/dist/cli/commands/screenshot.d.ts +5 -0
- package/dist/cli/commands/screenshot.js +282 -0
- package/dist/cli/commands/search.d.ts +5 -0
- package/dist/cli/commands/search.js +1021 -0
- package/dist/cli/commands/setup.d.ts +13 -0
- package/dist/cli/commands/setup.js +244 -0
- package/dist/cli/commands/skill.d.ts +15 -0
- package/dist/cli/commands/skill.js +195 -0
- package/dist/cli/utils.d.ts +84 -0
- package/dist/cli/utils.js +806 -0
- package/dist/cli-auth.d.ts +75 -0
- package/dist/cli-auth.js +369 -0
- package/dist/cli.d.ts +17 -0
- package/dist/cli.js +99 -0
- package/dist/core/actions.d.ts +69 -0
- package/dist/core/actions.js +495 -0
- package/dist/core/agent.d.ts +98 -0
- package/dist/core/agent.js +558 -0
- package/dist/core/answer.d.ts +42 -0
- package/dist/core/answer.js +395 -0
- package/dist/core/application-tracker.d.ts +84 -0
- package/dist/core/application-tracker.js +184 -0
- package/dist/core/apply.d.ts +162 -0
- package/dist/core/apply.js +816 -0
- package/dist/core/auth-detection.d.ts +35 -0
- package/dist/core/auth-detection.js +358 -0
- package/dist/core/auto-extract.d.ts +82 -0
- package/dist/core/auto-extract.js +604 -0
- package/dist/core/auto-interact.d.ts +23 -0
- package/dist/core/auto-interact.js +246 -0
- package/dist/core/bm25-filter.d.ts +66 -0
- package/dist/core/bm25-filter.js +288 -0
- package/dist/core/branding.d.ts +54 -0
- package/dist/core/branding.js +234 -0
- package/dist/core/browser-fetch.d.ts +323 -0
- package/dist/core/browser-fetch.js +1600 -0
- package/dist/core/browser-pool.d.ts +91 -0
- package/dist/core/browser-pool.js +550 -0
- package/dist/core/budget.d.ts +42 -0
- package/dist/core/budget.js +324 -0
- package/dist/core/business-intel.d.ts +47 -0
- package/dist/core/business-intel.js +279 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +121 -0
- package/dist/core/cf-worker-proxy.d.ts +32 -0
- package/dist/core/cf-worker-proxy.js +87 -0
- package/dist/core/challenge-detection.d.ts +26 -0
- package/dist/core/challenge-detection.js +468 -0
- package/dist/core/change-tracking.d.ts +75 -0
- package/dist/core/change-tracking.js +276 -0
- package/dist/core/chunker.d.ts +46 -0
- package/dist/core/chunker.js +249 -0
- package/dist/core/chunking.d.ts +42 -0
- package/dist/core/chunking.js +181 -0
- package/dist/core/circuit-breaker.d.ts +44 -0
- package/dist/core/circuit-breaker.js +85 -0
- package/dist/core/content-pruner.d.ts +47 -0
- package/dist/core/content-pruner.js +425 -0
- package/dist/core/cookie-cache.d.ts +60 -0
- package/dist/core/cookie-cache.js +163 -0
- package/dist/core/crawl-checkpoint.d.ts +54 -0
- package/dist/core/crawl-checkpoint.js +104 -0
- package/dist/core/crawler.d.ts +84 -0
- package/dist/core/crawler.js +349 -0
- package/dist/core/cross-verify.d.ts +27 -0
- package/dist/core/cross-verify.js +93 -0
- package/dist/core/deep-fetch.d.ts +74 -0
- package/dist/core/deep-fetch.js +405 -0
- package/dist/core/deep-research.d.ts +141 -0
- package/dist/core/deep-research.js +972 -0
- package/dist/core/design-analysis.d.ts +70 -0
- package/dist/core/design-analysis.js +490 -0
- package/dist/core/design-compare.d.ts +38 -0
- package/dist/core/design-compare.js +264 -0
- package/dist/core/diff.d.ts +61 -0
- package/dist/core/diff.js +289 -0
- package/dist/core/dns-cache.d.ts +20 -0
- package/dist/core/dns-cache.js +198 -0
- package/dist/core/documents.d.ts +23 -0
- package/dist/core/documents.js +123 -0
- package/dist/core/domain-memory.d.ts +66 -0
- package/dist/core/domain-memory.js +163 -0
- package/dist/core/domain-verify.d.ts +40 -0
- package/dist/core/domain-verify.js +379 -0
- package/dist/core/engine-ranker.d.ts +112 -0
- package/dist/core/engine-ranker.js +395 -0
- package/dist/core/extract-inline.d.ts +38 -0
- package/dist/core/extract-inline.js +215 -0
- package/dist/core/extract-listings.d.ts +38 -0
- package/dist/core/extract-listings.js +461 -0
- package/dist/core/extract.d.ts +9 -0
- package/dist/core/extract.js +139 -0
- package/dist/core/fetch-cache.d.ts +57 -0
- package/dist/core/fetch-cache.js +95 -0
- package/dist/core/fetcher.d.ts +13 -0
- package/dist/core/fetcher.js +12 -0
- package/dist/core/google-cache.d.ts +29 -0
- package/dist/core/google-cache.js +180 -0
- package/dist/core/google-serp-parser.d.ts +82 -0
- package/dist/core/google-serp-parser.js +287 -0
- package/dist/core/hotel-search.d.ts +122 -0
- package/dist/core/hotel-search.js +382 -0
- package/dist/core/http-fetch.d.ts +72 -0
- package/dist/core/http-fetch.js +820 -0
- package/dist/core/human.d.ts +175 -0
- package/dist/core/human.js +680 -0
- package/dist/core/image-caption.d.ts +44 -0
- package/dist/core/image-caption.js +271 -0
- package/dist/core/jobs.d.ts +75 -0
- package/dist/core/jobs.js +634 -0
- package/dist/core/json-ld.d.ts +15 -0
- package/dist/core/json-ld.js +617 -0
- package/dist/core/language-detect.d.ts +18 -0
- package/dist/core/language-detect.js +135 -0
- package/dist/core/links.d.ts +10 -0
- package/dist/core/links.js +44 -0
- package/dist/core/llm-extract.d.ts +71 -0
- package/dist/core/llm-extract.js +507 -0
- package/dist/core/llm-provider.d.ts +100 -0
- package/dist/core/llm-provider.js +702 -0
- package/dist/core/local-search.d.ts +60 -0
- package/dist/core/local-search.js +308 -0
- package/dist/core/logger.d.ts +28 -0
- package/dist/core/logger.js +104 -0
- package/dist/core/map.d.ts +33 -0
- package/dist/core/map.js +127 -0
- package/dist/core/markdown.d.ts +92 -0
- package/dist/core/markdown.js +809 -0
- package/dist/core/metadata.d.ts +34 -0
- package/dist/core/metadata.js +422 -0
- package/dist/core/observe.d.ts +113 -0
- package/dist/core/observe.js +395 -0
- package/dist/core/ocr.d.ts +12 -0
- package/dist/core/ocr.js +33 -0
- package/dist/core/paginate.d.ts +31 -0
- package/dist/core/paginate.js +106 -0
- package/dist/core/pdf.d.ts +8 -0
- package/dist/core/pdf.js +25 -0
- package/dist/core/peel-tls.d.ts +25 -0
- package/dist/core/peel-tls.js +220 -0
- package/dist/core/pipeline.d.ts +132 -0
- package/dist/core/pipeline.js +1666 -0
- package/dist/core/profiles.d.ts +61 -0
- package/dist/core/profiles.js +350 -0
- package/dist/core/prompt-guard.d.ts +30 -0
- package/dist/core/prompt-guard.js +119 -0
- package/dist/core/proxy-config.d.ts +90 -0
- package/dist/core/proxy-config.js +172 -0
- package/dist/core/quick-answer.d.ts +53 -0
- package/dist/core/quick-answer.js +833 -0
- package/dist/core/rate-governor.d.ts +80 -0
- package/dist/core/rate-governor.js +238 -0
- package/dist/core/readability.d.ts +57 -0
- package/dist/core/readability.js +533 -0
- package/dist/core/research.d.ts +66 -0
- package/dist/core/research.js +270 -0
- package/dist/core/retry.d.ts +60 -0
- package/dist/core/retry.js +119 -0
- package/dist/core/safe-browsing.d.ts +30 -0
- package/dist/core/safe-browsing.js +206 -0
- package/dist/core/schema-extraction.d.ts +66 -0
- package/dist/core/schema-extraction.js +352 -0
- package/dist/core/schema-postprocess.d.ts +32 -0
- package/dist/core/schema-postprocess.js +469 -0
- package/dist/core/schema-templates.d.ts +19 -0
- package/dist/core/schema-templates.js +143 -0
- package/dist/core/screenshot.d.ts +224 -0
- package/dist/core/screenshot.js +207 -0
- package/dist/core/search-engines.d.ts +25 -0
- package/dist/core/search-engines.js +182 -0
- package/dist/core/search-provider.d.ts +243 -0
- package/dist/core/search-provider.js +1629 -0
- package/dist/core/searxng-provider.d.ts +35 -0
- package/dist/core/searxng-provider.js +105 -0
- package/dist/core/selective-evidence.d.ts +151 -0
- package/dist/core/selective-evidence.js +389 -0
- package/dist/core/site-search.d.ts +44 -0
- package/dist/core/site-search.js +252 -0
- package/dist/core/sitemap.d.ts +23 -0
- package/dist/core/sitemap.js +105 -0
- package/dist/core/source-credibility.d.ts +29 -0
- package/dist/core/source-credibility.js +584 -0
- package/dist/core/source-scoring.d.ts +166 -0
- package/dist/core/source-scoring.js +396 -0
- package/dist/core/stemmer.d.ts +38 -0
- package/dist/core/stemmer.js +509 -0
- package/dist/core/strategies.d.ts +104 -0
- package/dist/core/strategies.js +1044 -0
- package/dist/core/strategy-hooks.d.ts +145 -0
- package/dist/core/strategy-hooks.js +74 -0
- package/dist/core/structured-extract.d.ts +43 -0
- package/dist/core/structured-extract.js +550 -0
- package/dist/core/summarize.d.ts +17 -0
- package/dist/core/summarize.js +78 -0
- package/dist/core/synonyms.d.ts +42 -0
- package/dist/core/synonyms.js +184 -0
- package/dist/core/system-monitor.d.ts +61 -0
- package/dist/core/system-monitor.js +133 -0
- package/dist/core/table-format.d.ts +30 -0
- package/dist/core/table-format.js +146 -0
- package/dist/core/threat-feeds.d.ts +23 -0
- package/dist/core/threat-feeds.js +104 -0
- package/dist/core/timing.d.ts +21 -0
- package/dist/core/timing.js +33 -0
- package/dist/core/transcript-export.d.ts +47 -0
- package/dist/core/transcript-export.js +107 -0
- package/dist/core/user-agents.d.ts +82 -0
- package/dist/core/user-agents.js +239 -0
- package/dist/core/vertical-search.d.ts +54 -0
- package/dist/core/vertical-search.js +158 -0
- package/dist/core/watch-manager.d.ts +175 -0
- package/dist/core/watch-manager.js +416 -0
- package/dist/core/watch.d.ts +101 -0
- package/dist/core/watch.js +389 -0
- package/dist/core/youtube.d.ts +130 -0
- package/dist/core/youtube.js +1175 -0
- package/dist/ee/challenge-re-export.d.ts +1 -0
- package/dist/ee/challenge-re-export.js +1 -0
- package/dist/ee/challenge-solver.d.ts +72 -0
- package/dist/ee/challenge-solver.js +720 -0
- package/dist/ee/domain-extractors.d.ts +8 -0
- package/dist/ee/domain-extractors.js +8 -0
- package/dist/ee/domain-intel.d.ts +16 -0
- package/dist/ee/domain-intel.js +133 -0
- package/dist/ee/extractors/allrecipes.d.ts +2 -0
- package/dist/ee/extractors/allrecipes.js +120 -0
- package/dist/ee/extractors/amazon.d.ts +2 -0
- package/dist/ee/extractors/amazon.js +78 -0
- package/dist/ee/extractors/arxiv.d.ts +2 -0
- package/dist/ee/extractors/arxiv.js +137 -0
- package/dist/ee/extractors/bestbuy.d.ts +2 -0
- package/dist/ee/extractors/bestbuy.js +78 -0
- package/dist/ee/extractors/carscom.d.ts +2 -0
- package/dist/ee/extractors/carscom.js +121 -0
- package/dist/ee/extractors/coingecko.d.ts +2 -0
- package/dist/ee/extractors/coingecko.js +134 -0
- package/dist/ee/extractors/craigslist.d.ts +2 -0
- package/dist/ee/extractors/craigslist.js +92 -0
- package/dist/ee/extractors/devto.d.ts +2 -0
- package/dist/ee/extractors/devto.js +135 -0
- package/dist/ee/extractors/ebay.d.ts +2 -0
- package/dist/ee/extractors/ebay.js +90 -0
- package/dist/ee/extractors/espn.d.ts +2 -0
- package/dist/ee/extractors/espn.js +260 -0
- package/dist/ee/extractors/etsy.d.ts +2 -0
- package/dist/ee/extractors/etsy.js +52 -0
- package/dist/ee/extractors/facebook.d.ts +2 -0
- package/dist/ee/extractors/facebook.js +46 -0
- package/dist/ee/extractors/github.d.ts +2 -0
- package/dist/ee/extractors/github.js +196 -0
- package/dist/ee/extractors/google-flights.d.ts +2 -0
- package/dist/ee/extractors/google-flights.js +176 -0
- package/dist/ee/extractors/hackernews.d.ts +2 -0
- package/dist/ee/extractors/hackernews.js +147 -0
- package/dist/ee/extractors/imdb.d.ts +2 -0
- package/dist/ee/extractors/imdb.js +172 -0
- package/dist/ee/extractors/index.d.ts +26 -0
- package/dist/ee/extractors/index.js +247 -0
- package/dist/ee/extractors/instagram.d.ts +2 -0
- package/dist/ee/extractors/instagram.js +102 -0
- package/dist/ee/extractors/kalshi.d.ts +2 -0
- package/dist/ee/extractors/kalshi.js +121 -0
- package/dist/ee/extractors/kayak-cars.d.ts +2 -0
- package/dist/ee/extractors/kayak-cars.js +270 -0
- package/dist/ee/extractors/linkedin.d.ts +2 -0
- package/dist/ee/extractors/linkedin.js +113 -0
- package/dist/ee/extractors/medium.d.ts +2 -0
- package/dist/ee/extractors/medium.js +130 -0
- package/dist/ee/extractors/news.d.ts +4 -0
- package/dist/ee/extractors/news.js +173 -0
- package/dist/ee/extractors/npm.d.ts +2 -0
- package/dist/ee/extractors/npm.js +86 -0
- package/dist/ee/extractors/pdf.d.ts +2 -0
- package/dist/ee/extractors/pdf.js +108 -0
- package/dist/ee/extractors/pinterest.d.ts +2 -0
- package/dist/ee/extractors/pinterest.js +34 -0
- package/dist/ee/extractors/polymarket.d.ts +2 -0
- package/dist/ee/extractors/polymarket.js +358 -0
- package/dist/ee/extractors/producthunt.d.ts +2 -0
- package/dist/ee/extractors/producthunt.js +88 -0
- package/dist/ee/extractors/pubmed.d.ts +2 -0
- package/dist/ee/extractors/pubmed.js +162 -0
- package/dist/ee/extractors/pypi.d.ts +2 -0
- package/dist/ee/extractors/pypi.js +80 -0
- package/dist/ee/extractors/reddit.d.ts +2 -0
- package/dist/ee/extractors/reddit.js +438 -0
- package/dist/ee/extractors/redfin.d.ts +2 -0
- package/dist/ee/extractors/redfin.js +156 -0
- package/dist/ee/extractors/semanticscholar.d.ts +2 -0
- package/dist/ee/extractors/semanticscholar.js +131 -0
- package/dist/ee/extractors/shared.d.ts +12 -0
- package/dist/ee/extractors/shared.js +76 -0
- package/dist/ee/extractors/soundcloud.d.ts +2 -0
- package/dist/ee/extractors/soundcloud.js +34 -0
- package/dist/ee/extractors/sportsbetting.d.ts +2 -0
- package/dist/ee/extractors/sportsbetting.js +37 -0
- package/dist/ee/extractors/spotify.d.ts +2 -0
- package/dist/ee/extractors/spotify.js +34 -0
- package/dist/ee/extractors/stackoverflow.d.ts +2 -0
- package/dist/ee/extractors/stackoverflow.js +61 -0
- package/dist/ee/extractors/substack.d.ts +2 -0
- package/dist/ee/extractors/substack.js +115 -0
- package/dist/ee/extractors/substackroot.d.ts +2 -0
- package/dist/ee/extractors/substackroot.js +46 -0
- package/dist/ee/extractors/tiktok.d.ts +2 -0
- package/dist/ee/extractors/tiktok.js +29 -0
- package/dist/ee/extractors/tradingview.d.ts +2 -0
- package/dist/ee/extractors/tradingview.js +182 -0
- package/dist/ee/extractors/twitch.d.ts +2 -0
- package/dist/ee/extractors/twitch.js +36 -0
- package/dist/ee/extractors/twitter.d.ts +2 -0
- package/dist/ee/extractors/twitter.js +327 -0
- package/dist/ee/extractors/types.d.ts +14 -0
- package/dist/ee/extractors/types.js +1 -0
- package/dist/ee/extractors/walmart.d.ts +2 -0
- package/dist/ee/extractors/walmart.js +50 -0
- package/dist/ee/extractors/weather.d.ts +2 -0
- package/dist/ee/extractors/weather.js +133 -0
- package/dist/ee/extractors/wikipedia.d.ts +4 -0
- package/dist/ee/extractors/wikipedia.js +235 -0
- package/dist/ee/extractors/yelp.d.ts +2 -0
- package/dist/ee/extractors/yelp.js +216 -0
- package/dist/ee/extractors/youtube.d.ts +2 -0
- package/dist/ee/extractors/youtube.js +189 -0
- package/dist/ee/extractors/zillow.d.ts +54 -0
- package/dist/ee/extractors/zillow.js +247 -0
- package/dist/ee/extractors-re-export.d.ts +1 -0
- package/dist/ee/extractors-re-export.js +1 -0
- package/dist/ee/premium-hooks.d.ts +20 -0
- package/dist/ee/premium-hooks.js +50 -0
- package/dist/ee/spa-detection.d.ts +2 -0
- package/dist/ee/spa-detection.js +2 -0
- package/dist/ee/stability.d.ts +4 -0
- package/dist/ee/stability.js +29 -0
- package/dist/ee/swr-cache.d.ts +14 -0
- package/dist/ee/swr-cache.js +34 -0
- package/dist/index.d.ts +143 -0
- package/dist/index.js +291 -0
- package/dist/integrations/index.d.ts +2 -0
- package/dist/integrations/index.js +2 -0
- package/dist/integrations/langchain.d.ts +64 -0
- package/dist/integrations/langchain.js +115 -0
- package/dist/integrations/llamaindex.d.ts +50 -0
- package/dist/integrations/llamaindex.js +91 -0
- package/dist/mcp/handlers/act.d.ts +5 -0
- package/dist/mcp/handlers/act.js +34 -0
- package/dist/mcp/handlers/definitions.d.ts +6 -0
- package/dist/mcp/handlers/definitions.js +395 -0
- package/dist/mcp/handlers/extract.d.ts +7 -0
- package/dist/mcp/handlers/extract.js +135 -0
- package/dist/mcp/handlers/fetch.d.ts +6 -0
- package/dist/mcp/handlers/fetch.js +98 -0
- package/dist/mcp/handlers/find.d.ts +5 -0
- package/dist/mcp/handlers/find.js +137 -0
- package/dist/mcp/handlers/index.d.ts +13 -0
- package/dist/mcp/handlers/index.js +63 -0
- package/dist/mcp/handlers/legacy.d.ts +25 -0
- package/dist/mcp/handlers/legacy.js +450 -0
- package/dist/mcp/handlers/meta.d.ts +6 -0
- package/dist/mcp/handlers/meta.js +40 -0
- package/dist/mcp/handlers/monitor.d.ts +5 -0
- package/dist/mcp/handlers/monitor.js +41 -0
- package/dist/mcp/handlers/observe.d.ts +8 -0
- package/dist/mcp/handlers/observe.js +37 -0
- package/dist/mcp/handlers/read.d.ts +6 -0
- package/dist/mcp/handlers/read.js +78 -0
- package/dist/mcp/handlers/see.d.ts +5 -0
- package/dist/mcp/handlers/see.js +75 -0
- package/dist/mcp/handlers/types.d.ts +29 -0
- package/dist/mcp/handlers/types.js +28 -0
- package/dist/mcp/server.d.ts +7 -0
- package/dist/mcp/server.js +108 -0
- package/dist/mcp/smart-router.d.ts +23 -0
- package/dist/mcp/smart-router.js +178 -0
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +632 -0
- package/dist/server/auth-store.d.ts +28 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/bull-queues.d.ts +60 -0
- package/dist/server/bull-queues.js +90 -0
- package/dist/server/email-service.d.ts +55 -0
- package/dist/server/email-service.js +291 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/audit-log.d.ts +14 -0
- package/dist/server/middleware/audit-log.js +73 -0
- package/dist/server/middleware/auth.d.ts +35 -0
- package/dist/server/middleware/auth.js +225 -0
- package/dist/server/middleware/rate-limit.d.ts +50 -0
- package/dist/server/middleware/rate-limit.js +270 -0
- package/dist/server/middleware/scope-guard.d.ts +25 -0
- package/dist/server/middleware/scope-guard.js +45 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +201 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +146 -0
- package/dist/server/pg-auth-store.js +576 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +79 -0
- package/dist/server/routes/admin-active.d.ts +7 -0
- package/dist/server/routes/admin-active.js +120 -0
- package/dist/server/routes/admin-stats.d.ts +7 -0
- package/dist/server/routes/admin-stats.js +176 -0
- package/dist/server/routes/agent.d.ts +24 -0
- package/dist/server/routes/agent.js +480 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +295 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cache-warm.d.ts +25 -0
- package/dist/server/routes/cache-warm.js +212 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/crawl.d.ts +13 -0
- package/dist/server/routes/crawl.js +287 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/deep-research.d.ts +11 -0
- package/dist/server/routes/deep-research.js +232 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +14 -0
- package/dist/server/routes/extract.js +325 -0
- package/dist/server/routes/feed.d.ts +15 -0
- package/dist/server/routes/feed.js +311 -0
- package/dist/server/routes/fetch-queue.d.ts +13 -0
- package/dist/server/routes/fetch-queue.js +357 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +1274 -0
- package/dist/server/routes/go.d.ts +14 -0
- package/dist/server/routes/go.js +81 -0
- package/dist/server/routes/health.d.ts +11 -0
- package/dist/server/routes/health.js +141 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +574 -0
- package/dist/server/routes/map.d.ts +11 -0
- package/dist/server/routes/map.js +116 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +197 -0
- package/dist/server/routes/metrics.d.ts +37 -0
- package/dist/server/routes/metrics.js +149 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/reader.d.ts +18 -0
- package/dist/server/routes/reader.js +192 -0
- package/dist/server/routes/research.d.ts +14 -0
- package/dist/server/routes/research.js +482 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +820 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +874 -0
- package/dist/server/routes/session.d.ts +17 -0
- package/dist/server/routes/session.js +548 -0
- package/dist/server/routes/share.d.ts +18 -0
- package/dist/server/routes/share.js +462 -0
- package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/cars.js +102 -0
- package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/flights.js +72 -0
- package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
- package/dist/server/routes/smart-search/handlers/general.js +717 -0
- package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
- package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/products.js +1309 -0
- package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/rental.js +154 -0
- package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
- package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
- package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
- package/dist/server/routes/smart-search/index.d.ts +19 -0
- package/dist/server/routes/smart-search/index.js +546 -0
- package/dist/server/routes/smart-search/intent.d.ts +3 -0
- package/dist/server/routes/smart-search/intent.js +264 -0
- package/dist/server/routes/smart-search/llm.d.ts +16 -0
- package/dist/server/routes/smart-search/llm.js +70 -0
- package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
- package/dist/server/routes/smart-search/sources/reddit.js +34 -0
- package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
- package/dist/server/routes/smart-search/sources/yelp.js +171 -0
- package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
- package/dist/server/routes/smart-search/sources/youtube.js +9 -0
- package/dist/server/routes/smart-search/types.d.ts +81 -0
- package/dist/server/routes/smart-search/types.js +1 -0
- package/dist/server/routes/smart-search/utils.d.ts +20 -0
- package/dist/server/routes/smart-search/utils.js +146 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +296 -0
- package/dist/server/routes/transcript-export.d.ts +10 -0
- package/dist/server/routes/transcript-export.js +178 -0
- package/dist/server/routes/usage.d.ts +9 -0
- package/dist/server/routes/usage.js +279 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1867 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +14 -0
- package/dist/server/sentry.js +104 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/dist/types.d.ts +552 -0
- package/dist/types.js +39 -0
- package/llms.txt +105 -0
- package/package.json +189 -0
|
@@ -0,0 +1,1175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YouTube transcript extraction — no API key required.
|
|
3
|
+
*
|
|
4
|
+
* YouTube embeds caption/transcript data directly in the page HTML as JSON
|
|
5
|
+
* (inside ytInitialPlayerResponse). We parse that JSON, extract caption
|
|
6
|
+
* track URLs, fetch the timedtext XML, and return structured transcript data.
|
|
7
|
+
*/
|
|
8
|
+
import { execFile } from 'node:child_process';
|
|
9
|
+
import * as http from 'node:http';
|
|
10
|
+
import * as https from 'node:https';
|
|
11
|
+
import * as tls from 'node:tls';
|
|
12
|
+
import { readFile, unlink } from 'node:fs/promises';
|
|
13
|
+
import { tmpdir } from 'node:os';
|
|
14
|
+
import { join } from 'node:path';
|
|
15
|
+
import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
|
|
16
|
+
import { simpleFetch } from './fetcher.js';
|
|
17
|
+
import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
|
|
18
|
+
import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
|
|
19
|
+
import { createLogger } from './logger.js';
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// yt-dlp startup diagnostics
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
const _ytLog = createLogger('youtube');
|
|
24
|
+
// One-time probe, run when this module is first loaded, for a usable `yt-dlp`
// binary. The outcome is recorded in `ytdlpAvailable`.
//
// - Skipped entirely when process.env.VITEST is set, so test runs with mocked
//   paths are not disturbed.
// - Reports through logger.debug (→ stderr) so stdout JSON output stays clean
//   when the process is piped.
// - A few common binary directories are prepended to PATH for the probe only.
let ytdlpAvailable = false;
if (!process.env.VITEST) {
    import('node:child_process')
        .then(({ execFileSync }) => {
            const probePath = `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}`;
            const rawOutput = execFileSync('yt-dlp', ['--version'], {
                timeout: 5000,
                env: { ...process.env, PATH: probePath },
            });
            const version = rawOutput.toString().trim();
            ytdlpAvailable = true;
            _ytLog.debug(`yt-dlp available: v${version}`);
        })
        .catch(() => {
            // Any failure (import, spawn, non-zero exit, timeout) lands here.
            _ytLog.debug('yt-dlp NOT available — falling back to HTTP extraction');
        });
}
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
// URL parsing
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
/**
 * Extract the video ID from any common YouTube URL format.
 * Returns null if the URL is not a recognisable YouTube URL.
 *
 * Supported formats:
 *   https://www.youtube.com/watch?v=VIDEO_ID
 *   https://youtu.be/VIDEO_ID
 *   https://www.youtube.com/embed/VIDEO_ID
 *   https://www.youtube.com/shorts/VIDEO_ID
 *   https://www.youtube.com/live/VIDEO_ID
 *   https://www.youtube.com/v/VIDEO_ID   (old embed format)
 *   https://m.youtube.com/watch?v=VIDEO_ID
 *   URLs with extra params (&t=120, &list=PLxxx, etc.)
 *
 * @param {string} url - Candidate URL; surrounding whitespace is ignored.
 * @returns {string|null} The 11-character video ID, or null when the input is
 *   not a string, cannot be parsed as a URL, is not on a YouTube host, or
 *   carries no well-formed ID.
 */
export function parseYouTubeUrl(url) {
    if (!url || typeof url !== 'string')
        return null;
    let parsed;
    try {
        parsed = new URL(url.trim());
    }
    catch {
        return null;
    }
    // Normalise "www." / "m." prefixes so desktop and mobile hosts compare equal.
    const host = parsed.hostname.toLowerCase().replace(/^www\./, '').replace(/^m\./, '');
    let candidate = null;
    if (host === 'youtu.be') {
        // https://youtu.be/VIDEO_ID — the ID is the first path segment.
        candidate = parsed.pathname.slice(1).split('/')[0];
    }
    else if (host === 'youtube.com') {
        if (parsed.pathname === '/watch' || parsed.pathname === '/watch/') {
            // /watch?v=VIDEO_ID
            candidate = parsed.searchParams.get('v');
        }
        else {
            // /embed/ID, /shorts/ID, /live/ID and /v/ID all keep the ID in the
            // second path segment.
            const pathIdSections = ['embed', 'shorts', 'live', 'v'];
            const [, section, pathId] = parsed.pathname.split('/');
            if (pathIdSections.includes(section))
                candidate = pathId;
        }
    }
    return isValidVideoId(candidate) ? candidate : null;
}
/**
 * Check that a value has the shape of a YouTube video ID: a string of exactly
 * 11 characters drawn from A-Z, a-z, 0-9, "_" and "-".
 *
 * @param {unknown} id - Value to check.
 * @returns {boolean} True when `id` is a well-formed video ID.
 */
function isValidVideoId(id) {
    return typeof id === 'string' && /^[A-Za-z0-9_-]{11}$/.test(id);
}
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
// Video info extraction
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
/**
 * Extract video metadata from YouTube page HTML.
 * Parses ytInitialPlayerResponse JSON embedded in the page.
 *
 * Each field is taken from `videoDetails` first, then from
 * `microformat.playerMicroformatRenderer`, then (title and description only)
 * from the matching Open Graph meta tag, and finally from a default.
 *
 * @param {string} html - Raw HTML of a YouTube watch page.
 * @returns {{videoId: string, title: string, channel: string, description: string,
 *   duration: *, publishDate: string, viewCount: string, likeCount: string,
 *   thumbnail: string}} Metadata record. `duration` is whatever
 *   `formatDuration` returns for the length in seconds. Fields with no source
 *   default to '' (length defaults to 0 seconds).
 */
export function extractVideoInfo(html) {
    const playerResponse = extractPlayerResponse(html);
    const videoDetails = playerResponse?.videoDetails ?? {};
    const microformat = playerResponse?.microformat?.playerMicroformatRenderer ?? {};
    const videoId = videoDetails.videoId ?? '';
    const title = videoDetails.title ??
        microformat.title?.simpleText ??
        extractMetaTag(html, 'og:title') ??
        '';
    const channel = videoDetails.author ?? microformat.ownerChannelName ?? '';
    // A non-numeric length would parse to NaN; treat it as "unknown" (0) so
    // formatDuration never receives NaN.
    const parsedLength = Number.parseInt(videoDetails.lengthSeconds ?? microformat.lengthSeconds ?? '0', 10);
    const lengthSeconds = Number.isNaN(parsedLength) ? 0 : parsedLength;
    const viewCount = videoDetails.viewCount ?? microformat.viewCount ?? '';
    const publishDate = microformat.publishDate ?? microformat.uploadDate ?? '';
    const description = videoDetails.shortDescription ??
        microformat.description?.simpleText ??
        extractMetaTag(html, 'og:description') ??
        '';
    // Use the last entry of the thumbnails list; otherwise build the
    // conventional maxresdefault URL from the video ID.
    // NOTE(review): when videoId is '' that fallback URL has an empty ID segment.
    const thumbnail = videoDetails.thumbnail?.thumbnails?.slice(-1)[0]?.url ??
        microformat.thumbnail?.thumbnails?.slice(-1)[0]?.url ??
        `https://img.youtube.com/vi/${videoId}/maxresdefault.jpg`;
    // likeCount is often not available without auth
    const likeCount = videoDetails.likeCount ?? '';
    return {
        videoId,
        title,
        channel,
        description,
        duration: formatDuration(lengthSeconds),
        publishDate,
        viewCount,
        likeCount,
        thumbnail,
    };
}
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
// Structured content helpers
|
|
144
|
+
// ---------------------------------------------------------------------------
|
|
145
|
+
/**
 * Pull chapter markers out of a video description.
 *
 * A chapter is a line that begins with a timestamp ("0:00", "1:23" or
 * "1:23:45") followed by whitespace and a title, e.g.
 * "0:00 Intro\n2:34 Main topic\n5:12 Conclusion".
 *
 * @param {string} description - Raw description text; falsy values yield [].
 * @returns {{time: string, title: string}[]} Chapters in order of appearance,
 *   or [] when fewer than two are found (a lone timestamp is probably not a
 *   chapter list).
 */
export function parseChaptersFromDescription(description) {
    if (!description) {
        return [];
    }
    // Line-anchored: timestamp, whitespace, then the title up to end of line.
    const timestampLine = /^(\d+:\d{2}(?::\d{2})?)\s+(.+)$/gm;
    const found = [];
    for (const [, rawTime, rawTitle] of String(description).matchAll(timestampLine)) {
        const title = rawTitle.trim();
        if (title.length === 0) {
            continue;
        }
        found.push({ time: rawTime.trim(), title });
    }
    return found.length < 2 ? [] : found;
}
|
|
165
|
+
/**
 * Turn a colon-separated time string into a number of seconds.
 *
 * @param {string} timeStr - "M:SS" (e.g. "1:23") or "H:MM:SS" (e.g. "1:23:45").
 * @returns {number} Total seconds; 0 when the string does not have exactly
 *   two or three colon-separated fields.
 */
function timeStringToSeconds(timeStr) {
    const fields = timeStr.split(':').map(Number);
    switch (fields.length) {
        case 3: {
            const [hours, minutes, seconds] = fields;
            return hours * 3600 + minutes * 60 + seconds;
        }
        case 2: {
            const [minutes, seconds] = fields;
            return minutes * 60 + seconds;
        }
        default:
            return 0;
    }
}
|
|
176
|
+
/**
 * Break text into sentences. Deliberately simple: a boundary is any run of
 * whitespace that directly follows ".", "!" or "?".
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Trimmed, non-empty sentences in their original order.
 */
function splitSentences(text) {
    const pieces = text.split(/(?<=[.!?])\s+/);
    const sentences = [];
    for (const piece of pieces) {
        const trimmed = piece.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
|
|
183
|
+
/**
 * Pick one representative sentence per time block of a transcript.
 *
 * Time blocks come from the chapter list when it has at least two entries;
 * otherwise the video is cut into consecutive 2-minute blocks. Within each
 * block the first sentence of five or more words is kept.
 *
 * @param {{text: string, start: number, duration: number}[]} segments - Transcript segments (times in seconds).
 * @param {{time: string, title: string}[]} chapters - Chapter markers; `time` is "M:SS" or "H:MM:SS".
 * @param {number} durationSeconds - Video length; when falsy, the end of the
 *   last segment is used instead.
 * @returns {string[]} At most 12 key points, in block order.
 */
export function extractKeyPoints(segments, chapters, durationSeconds) {
    if (segments.length === 0) {
        return [];
    }
    const MAX_KEY_POINTS = 12;
    const MIN_WORDS = 5;
    const AUTO_BLOCK_SECONDS = 120;

    const lastSegment = segments[segments.length - 1];
    const totalDuration = durationSeconds || (lastSegment.start + lastSegment.duration);

    // Build time blocks
    let blocks;
    if (chapters.length >= 2) {
        const chapterStarts = chapters.map((chapter) => timeStringToSeconds(chapter.time));
        blocks = chapterStarts.map((start, index) => {
            const isLastChapter = index + 1 >= chapterStarts.length;
            // The final chapter runs to the end of the video (open-ended if unknown).
            const end = isLastChapter ? (totalDuration || Infinity) : chapterStarts[index + 1];
            return { start, end };
        });
    }
    else {
        // Auto-segment every 2 minutes
        blocks = [];
        const limit = totalDuration || 600;
        for (let start = 0; start < limit; start += AUTO_BLOCK_SECONDS) {
            blocks.push({ start, end: start + AUTO_BLOCK_SECONDS });
        }
        if (blocks.length === 0) {
            blocks = [{ start: 0, end: Infinity }];
        }
    }

    const keyPoints = blocks.flatMap(({ start, end }) => {
        const texts = segments
            .filter((segment) => segment.start >= start && segment.start < end)
            .map((segment) => segment.text);
        if (texts.length === 0) {
            return [];
        }
        const blockText = texts.join(' ').replace(/\s+/g, ' ').trim();
        // Find first sentence with at least 5 words
        const firstSubstantive = splitSentences(blockText)
            .find((sentence) => sentence.split(/\s+/).length >= MIN_WORDS);
        return firstSubstantive ? [firstSubstantive.trim()] : [];
    });
    return keyPoints.slice(0, MAX_KEY_POINTS);
}
|
|
229
|
+
/**
 * Produce a short summary: the first 200 whitespace-separated words of the
 * transcript, followed by "..." when the text was cut.
 *
 * @param {string} fullText - Full transcript text; falsy values yield ''.
 * @returns {string} The text unchanged when it has 200 words or fewer,
 *   otherwise its first 200 words joined by single spaces plus "...".
 */
export function extractSummary(fullText) {
    const WORD_LIMIT = 200;
    if (!fullText) {
        return '';
    }
    const tokens = fullText.split(/\s+/);
    return tokens.length > WORD_LIMIT
        ? `${tokens.slice(0, WORD_LIMIT).join(' ')}...`
        : fullText;
}
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
// Proxy-based InnerTube transcript extraction
|
|
242
|
+
// ---------------------------------------------------------------------------
|
|
243
|
+
// Webshare residential proxy settings, read from WEBSHARE_PROXY_* env vars.
// Locally, falls back to direct fetch (residential IP already works).
// These constants are kept for use in proxyRequestSlotted() which does
// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
/**
 * Read an integer-valued environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, empty,
 *   or does not start with a parseable integer.
 * @returns {number} The parsed integer, or `fallback`. Never NaN, so a typo
 *   in the environment cannot leak NaN into port/slot arithmetic.
 */
function readProxyIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] || '', 10);
    return Number.isNaN(parsed) ? fallback : parsed;
}
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
const PROXY_BASE_PORT = readProxyIntEnv('WEBSHARE_PROXY_PORT', 10000);
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
// With paid Webshare backbone plan, each US slot has its own port:
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
const PROXY_MAX_US_SLOTS = readProxyIntEnv('WEBSHARE_PROXY_SLOTS', 44744);
|
|
254
|
+
/**
 * Report whether the Webshare proxy is configured.
 *
 * Thin wrapper: the answer comes entirely from the shared
 * _hasWebshareProxy() helper, so this module stays consistent with the rest
 * of the proxy configuration.
 *
 * @returns {*} Whatever _hasWebshareProxy() returns (used as a boolean by callers).
 */
function isProxyConfigured() {
    const configured = _hasWebshareProxy();
    return configured;
}
|
|
258
|
+
/**
 * Make an HTTPS request through the Webshare CONNECT proxy with a specific
 * slotted username (e.g. "argtnlhz-5"). This ensures both the /player call
 * and the caption XML fetch go through the same residential IP.
 *
 * Flow: HTTP CONNECT to PROXY_HOST:proxyPort → TLS handshake with the target
 * over the tunnelled socket → HTTPS request over that TLS socket.
 *
 * @param {string} slottedUser - Proxy username including the slot suffix.
 * @param {number} proxyPort - Proxy port for that slot.
 * @param {string} targetUrl - Absolute URL to request; an explicit port in
 *   the URL is honoured, otherwise 443 is used.
 * @param {object} [opts]
 * @param {string} [opts.method='GET'] - HTTP method.
 * @param {object} [opts.headers] - Extra headers; they override the defaults.
 * @param {string} [opts.body] - Request body, written as-is.
 * @param {number} [opts.timeoutMs=20000] - Overall deadline for the whole exchange.
 * @returns {Promise<{status: number, body: string}>} Status code (0 if absent)
 *   and the response body decoded as UTF-8. Rejects on timeout, on a
 *   non-200 CONNECT response, or on any socket/request error.
 */
function proxyRequestSlotted(slottedUser, proxyPort, targetUrl, opts = {}) {
    const url = new URL(targetUrl);
    const timeout = opts.timeoutMs ?? 20000;
    // url.port is '' for the scheme's default port.
    const targetPort = url.port || '443';
    return new Promise((resolve, reject) => {
        let tunnel = null; // TLS socket, once the CONNECT tunnel is up
        let settled = false;
        let timer = null;
        // Settle the promise exactly once and always stop the deadline timer.
        const finish = (settle, value) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            settle(value);
        };
        const proxyAuth = Buffer.from(`${slottedUser}:${PROXY_PASS}`).toString('base64');
        const proxyReq = http.request({
            host: PROXY_HOST,
            port: proxyPort,
            method: 'CONNECT',
            path: `${url.hostname}:${targetPort}`,
            headers: { 'Proxy-Authorization': `Basic ${proxyAuth}` },
        });
        timer = setTimeout(() => {
            finish(reject, new Error('Proxy request timed out'));
            // Tear down both layers so a stalled tunnel does not linger.
            proxyReq.destroy();
            tunnel?.destroy();
        }, timeout);
        proxyReq.on('connect', (res, socket) => {
            if (res.statusCode !== 200) {
                socket.destroy();
                finish(reject, new Error(`Proxy CONNECT failed: ${res.statusCode}`));
                return;
            }
            const tlsSocket = tls.connect({ host: url.hostname, socket, servername: url.hostname }, () => {
                const reqHeaders = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Cookie': 'CONSENT=YES+; SOCS=CAI',
                    ...(opts.headers ?? {}),
                };
                const req = https.request({
                    hostname: url.hostname,
                    port: targetPort,
                    path: url.pathname + url.search,
                    method: opts.method ?? 'GET',
                    createConnection: () => tlsSocket,
                    headers: reqHeaders,
                }, (response) => {
                    // Decode as a stream so multi-byte UTF-8 characters that
                    // straddle a chunk boundary are not corrupted.
                    response.setEncoding('utf8');
                    let data = '';
                    response.on('data', (chunk) => {
                        data += chunk;
                    });
                    response.on('end', () => {
                        finish(resolve, { status: response.statusCode ?? 0, body: data });
                    });
                    response.on('error', (e) => {
                        finish(reject, e);
                    });
                });
                req.on('error', (e) => {
                    finish(reject, e);
                });
                if (opts.body) {
                    req.write(opts.body);
                }
                req.end();
            });
            tunnel = tlsSocket;
            tlsSocket.on('error', (e) => {
                finish(reject, e);
            });
        });
        proxyReq.on('error', (e) => {
            finish(reject, e);
        });
        proxyReq.end();
    });
}
|
|
329
|
+
/**
 * Fetch YouTube transcript via InnerTube /player API through Webshare proxy.
 *
 * This replicates the approach used by the Python `youtube-transcript-api` library:
 * 1. POST to /youtubei/v1/player with ANDROID client context
 * 2. Get caption track URLs WITHOUT the `exp=xpe` parameter
 * 3. Fetch caption XML from those clean URLs (returns actual data, not 0 bytes)
 *
 * All requests go through the residential proxy to bypass YouTube's cloud IP blocking.
 * Up to five randomly chosen proxy slots are tried; any failure on a slot
 * (bad status, no tracks, empty XML, thrown error) moves on to the next one.
 *
 * @param {string} videoId - YouTube video ID.
 * @param {string} preferredLang - Preferred caption language code; falls
 *   back to "en", then to the first available track.
 * @returns {Promise<object|null>} Transcript record (segments, fullText,
 *   metadata, chapters, key points, summary) or null when every slot failed.
 */
async function getTranscriptViaProxy(videoId, preferredLang) {
    // Try multiple proxy slots from the 44K+ US residential pool.
    // Pick random slots across the pool for even distribution and to avoid
    // rate-limited IPs. Try up to MAX_RETRIES different slots.
    const MAX_RETRIES = 5;
    const usedSlots = new Set();
    // Public YouTube web-client InnerTube key embedded in their shipped client, not a WebPeel secret.
    const INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8';
    // Convert a caption timing attribute to seconds. <text start dur> carries
    // seconds (possibly without a decimal point); <p t d> carries integer
    // milliseconds. Unparseable values become 0 instead of NaN.
    const toSeconds = (tag, raw) => {
        const value = parseFloat(raw);
        if (!Number.isFinite(value)) {
            return 0;
        }
        return tag === 'p' && !raw.includes('.') ? value / 1000 : value;
    };
    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        // Pick a random US slot we haven't tried yet
        let slot;
        do {
            slot = Math.floor(Math.random() * PROXY_MAX_US_SLOTS) + 1;
        } while (usedSlots.has(slot) && usedSlots.size < PROXY_MAX_US_SLOTS);
        usedSlots.add(slot);
        const proxyUser = `${PROXY_USER}-US-${slot}`;
        const proxyPort = PROXY_BASE_PORT + slot - 1;
        const doProxyRequest = (url, opts = {}) => proxyRequestSlotted(proxyUser, proxyPort, url, opts);
        try {
            // Step 1: Call InnerTube /player with ANDROID client
            // ANDROID client returns caption URLs WITHOUT exp=xpe (avoids 0-byte responses).
            const playerResp = await doProxyRequest(`https://www.youtube.com/youtubei/v1/player?key=${INNERTUBE_API_KEY}`, {
                method: 'POST',
                body: JSON.stringify({
                    context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
                    videoId,
                }),
                headers: { 'Content-Type': 'application/json' },
            });
            if (playerResp.status !== 200) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): /player returned ${playerResp.status}`);
                continue;
            }
            const playerData = JSON.parse(playerResp.body);
            const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
            if (!captionTracks || captionTracks.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no caption tracks`);
                continue;
            }
            // Pick best matching language track
            let track = captionTracks.find((t) => t.languageCode === preferredLang);
            if (!track) {
                track = captionTracks.find((t) => t.languageCode === 'en') ?? captionTracks[0];
            }
            const captionUrl = track.baseUrl;
            if (captionUrl.includes('exp=xpe')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption URL has exp=xpe, skipping`);
                continue;
            }
            // Step 2: Fetch caption XML through the SAME proxy slot (same residential IP)
            const capResp = await doProxyRequest(captionUrl);
            if (!capResp.body ||
                capResp.body.length === 0 ||
                capResp.status === 429 ||
                capResp.body.includes('<title>Sorry...</title>')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption XML failed (status=${capResp.status}, bytes=${capResp.body?.length ?? 0})`);
                continue; // Try next slot
            }
            // Parse XML segments — handles both <text start="" dur=""> and <p t="" d=""> formats.
            // Group 1 is the tag name, which decides the time unit (see toSeconds).
            const xmlSegments = [
                ...capResp.body.matchAll(/<(text|p)\s[^>]*?(?:start|t)="([^"]*)"[^>]*?(?:dur|d)="([^"]*)"[^>]*>([\s\S]*?)<\/(?:text|p)>/g),
            ];
            if (xmlSegments.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no segments parsed from XML`);
                continue;
            }
            const segments = xmlSegments
                .map(([, tag, rawStart, rawDuration, inner]) => ({
                    text: decodeHtmlEntities(inner.replace(/<[^>]+>/g, '').replace(/\n/g, ' ').trim()),
                    start: toSeconds(tag, rawStart),
                    duration: toSeconds(tag, rawDuration),
                }))
                .filter((s) => s.text.length > 0);
            if (segments.length === 0)
                continue;
            // Extract metadata from player response
            const vd = playerData.videoDetails ?? {};
            const mf = playerData.microformat?.playerMicroformatRenderer ?? {};
            const title = vd.title ?? '';
            const channel = vd.author ?? '';
            const lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
            const description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
            const publishDate = mf.publishDate ?? mf.uploadDate ?? '';
            const availableLanguages = captionTracks.map((t) => t.languageCode);
            const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
            const wordCount = fullText.split(/\s+/).filter(Boolean).length;
            const chapters = parseChaptersFromDescription(description);
            const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
            const summary = extractSummary(fullText);
            const viewCount = vd.viewCount ?? mf.viewCount ?? '';
            const likeCount = vd.likeCount ?? '';
            console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
            return {
                videoId,
                title,
                channel,
                duration: formatDuration(lengthSeconds),
                language: track.languageCode ?? preferredLang,
                segments,
                fullText,
                availableLanguages,
                description,
                publishDate,
                chapters: chapters.length > 0 ? chapters : undefined,
                keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                summary,
                wordCount,
                viewCount: viewCount || undefined,
                likeCount: likeCount || undefined,
            };
        }
        catch (err) {
            console.log(`[webpeel] [youtube] Proxy slot ${slot} error:`, err?.message);
            continue;
        }
    }
    // All slots exhausted
    console.log('[webpeel] [youtube] All proxy slots exhausted');
    return null;
}
|
|
459
|
+
// ---------------------------------------------------------------------------
|
|
460
|
+
// Transcript extraction
|
|
461
|
+
// ---------------------------------------------------------------------------
|
|
462
|
+
/**
 * Fetch and return the transcript for a YouTube video.
 *
 * Strategies are tried in order until one yields segments:
 *   P. InnerTube through the residential proxy (only when configured)
 *   0. youtube-transcript-plus (InnerTube, direct)
 *   1. yt-dlp (only when available)
 *   2. Plain HTTP fetch of the watch page + caption XML
 *   3. Browser network interception
 * Paths P and 0 are skipped when process.env.VITEST is set, because they
 * make real network calls that test mocks do not cover.
 *
 * @param url - Any YouTube URL format
 * @param options.language - Preferred language code (default: "en")
 * @returns Transcript record: videoId, title, channel, duration, language,
 *   segments, fullText, availableLanguages, description, publishDate,
 *   chapters, keyPoints, summary, wordCount, viewCount, likeCount.
 * @throws {Error} When the URL is not a YouTube URL, when the page reports no
 *   captions (path 2), or whatever the browser-intercept path throws.
 */
export async function getYouTubeTranscript(url, options = {}) {
    const videoId = parseYouTubeUrl(url);
    if (!videoId) {
        throw new Error(`Not a valid YouTube URL: ${url}`);
    }
    const preferredLang = options.language ?? 'en';
    const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
    // Browser identity and consent cookie shared by every direct (non-proxy)
    // request below, so path 0 and path 2 cannot drift apart.
    const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
    const ytHeaders = {
        'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
        'Accept-Language': 'en-US,en;q=0.9',
    };
    // --- Path P: Proxy-based InnerTube (primary for cloud servers) ---
    // Uses Webshare residential proxy + ANDROID InnerTube /player API.
    // This is the approach used by every major YouTube transcript service
    // (youtubetotranscript.com, youtube-transcript.io, etc.)
    if (!process.env.VITEST && isProxyConfigured()) {
        console.log('[webpeel] [youtube] Trying path P: proxy-based InnerTube (residential proxy)');
        try {
            const proxyResult = await getTranscriptViaProxy(videoId, preferredLang);
            if (proxyResult && proxyResult.segments.length > 0) {
                console.log(`[webpeel] [youtube] Path P success: ${proxyResult.segments.length} segments, ${proxyResult.wordCount} words`);
                return proxyResult;
            }
            console.log('[webpeel] [youtube] Path P returned empty/null, falling through');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path P failed:', err?.message);
        }
    }
    // --- Path 0: youtube-transcript-plus (fastest — uses InnerTube API, ~1s) ---
    // This library calls YouTube's internal InnerTube API directly via POST request,
    // bypassing the IP-locked timedtext XML URLs. Works reliably from cloud servers.
    // Skip in test mode — tests use mocked HTTP, but this path makes real InnerTube calls.
    if (!process.env.VITEST) {
        console.log('[webpeel] [youtube] Trying path 0: youtube-transcript-plus (InnerTube API)');
        try {
            const ytpSegments = await ytpFetchTranscript(videoId, { lang: preferredLang });
            // Convert youtube-transcript-plus format to our format
            const segments = (Array.isArray(ytpSegments) ? ytpSegments : [])
                .map(s => ({
                    text: decodeHtmlEntities((s.text ?? '').replace(/\n/g, ' ').trim()),
                    start: (s.offset ?? 0) / 1000, // offset is in ms
                    duration: (s.duration ?? 0) / 1000,
                }))
                .filter(s => s.text.length > 0);
            // Judge success on the cleaned segments: a response whose entries
            // are all blank must fall through to the next path.
            if (segments.length > 0) {
                // We have transcript segments — now fetch page metadata (title, channel, etc.)
                let title = '';
                let channel = '';
                let lengthSeconds = 0;
                let description = '';
                let publishDate = '';
                let availableLanguages = [preferredLang];
                try {
                    const metaResp = await fetch(videoUrl, {
                        headers: {
                            'User-Agent': ytUserAgent,
                            'Cookie': ytHeaders.Cookie,
                        },
                        signal: AbortSignal.timeout(8000),
                    });
                    const html = await metaResp.text();
                    const pr = extractPlayerResponse(html);
                    if (pr) {
                        const vd = pr.videoDetails ?? {};
                        const mf = pr.microformat?.playerMicroformatRenderer ?? {};
                        title = vd.title ?? '';
                        channel = vd.author ?? '';
                        lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
                        description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
                        publishDate = mf.publishDate ?? mf.uploadDate ?? '';
                        const tracks = extractCaptionTracks(pr);
                        if (tracks.length > 0)
                            availableLanguages = tracks.map(t => t.languageCode);
                    }
                }
                catch { /* metadata fetch failed — segments are enough */ }
                const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
                const wordCount = fullText.split(/\s+/).filter(Boolean).length;
                const chapters = parseChaptersFromDescription(description);
                const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
                const summary = extractSummary(fullText);
                console.log(`[webpeel] [youtube] Path 0 success: ${segments.length} segments, ${wordCount} words`);
                return {
                    videoId,
                    title,
                    channel,
                    duration: formatDuration(lengthSeconds),
                    language: ytpSegments[0]?.lang ?? preferredLang,
                    segments,
                    fullText,
                    availableLanguages,
                    description,
                    publishDate,
                    chapters: chapters.length > 0 ? chapters : undefined,
                    keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                    summary,
                    wordCount,
                    viewCount: undefined, // not available in this path without extra fetch
                    likeCount: undefined,
                };
            }
            console.log('[webpeel] [youtube] Path 0 returned empty segments');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path 0 failed:', err?.message);
        }
    } // end VITEST guard
    // --- Path 1: yt-dlp approach (most reliable on cloud servers — handles signature challenges internally) ---
    if (ytdlpAvailable) {
        console.log('[webpeel] [youtube] Trying path 1: yt-dlp');
        try {
            const ytdlpResult = await getTranscriptViaYtDlp(videoId, preferredLang);
            if (ytdlpResult && ytdlpResult.segments.length > 0) {
                return ytdlpResult;
            }
            console.log('[webpeel] [youtube] Path 1 failed: yt-dlp returned no segments');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path 1 failed:', err?.message);
        }
    }
    else {
        console.log('[webpeel] [youtube] Skipping path 1: yt-dlp not available');
    }
    // --- Path 2: HTTP fetch (simpleFetch first; if our challenge detection fires, fall back to native fetch) ---
    // YouTube serves consent/challenge pages to server IPs without cookies.
    // Setting SOCS consent cookie bypasses this — same approach as youtube-transcript npm.
    // On cloud servers, simpleFetch may throw BlockedError due to our own challenge detection;
    // in that case we retry with native fetch() which bypasses that guard.
    console.log('[webpeel] [youtube] Trying path 2: native fetch');
    try {
        let html;
        try {
            const fetchResult = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders);
            html = fetchResult.html;
        }
        catch (simpleFetchErr) {
            // If our own challenge detection threw BlockedError, retry with raw native fetch
            const errMsg = (simpleFetchErr?.message ?? '').toLowerCase();
            const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
                errMsg.includes('blocked') ||
                errMsg.includes('challenge') ||
                errMsg.includes('cloudflare');
            if (!isBlocked)
                throw simpleFetchErr;
            console.log('[webpeel] [youtube] simpleFetch BlockedError — retrying with native fetch');
            const fetchResponse = await fetch(videoUrl, {
                headers: {
                    'User-Agent': ytUserAgent,
                    ...ytHeaders,
                },
                redirect: 'follow',
                signal: AbortSignal.timeout(15000),
            });
            html = await fetchResponse.text();
        }
        if (!html.includes('ytInitialPlayerResponse') && !html.includes('ytInitialData')) {
            throw new Error('YouTube served non-video page (likely challenge/consent)');
        }
        const playerResponse = extractPlayerResponse(html);
        if (!playerResponse)
            throw new Error('Could not parse player response');
        const videoDetails = playerResponse.videoDetails ?? {};
        const microformat = playerResponse.microformat?.playerMicroformatRenderer ?? {};
        const title = videoDetails.title ?? '';
        const channel = videoDetails.author ?? '';
        const lengthSeconds = parseInt(videoDetails.lengthSeconds ?? microformat.lengthSeconds ?? '0', 10);
        const description = (videoDetails.shortDescription ?? microformat.description?.simpleText ?? '').trim();
        const publishDate = microformat.publishDate ?? microformat.uploadDate ?? '';
        const captionTracks = extractCaptionTracks(playerResponse);
        if (captionTracks.length === 0)
            throw new Error('No captions available');
        const availableLanguages = captionTracks.map(t => t.languageCode);
        const selectedTrack = selectBestTrack(captionTracks, preferredLang);
        // Pass same cookies + user-agent to caption fetch — URL is session-locked
        const captionXml = await fetchCaptionXml(selectedTrack.baseUrl, ytUserAgent, ytHeaders);
        const segments = parseCaptionXml(captionXml);
        if (segments.length === 0) {
            // Caption URL returned empty content (common when ip=0.0.0.0 in signature)
            // Fall through to browser intercept path
            throw new Error('Caption XML returned empty — session-locked URL');
        }
        const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
        const wordCount = fullText.split(/\s+/).filter(Boolean).length;
        const chapters = parseChaptersFromDescription(description);
        const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
        const summary = extractSummary(fullText);
        return {
            videoId,
            title,
            channel,
            duration: formatDuration(lengthSeconds),
            language: selectedTrack.languageCode,
            segments,
            fullText,
            availableLanguages,
            description,
            publishDate,
            chapters: chapters.length > 0 ? chapters : undefined,
            keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
            summary,
            wordCount,
            viewCount: (videoDetails.viewCount ?? microformat.viewCount ?? '') || undefined,
            likeCount: (videoDetails.likeCount ?? '') || undefined,
        };
    }
    catch (err) {
        // Re-throw definitive failures (browser path won't help)
        const msg = err?.message ?? '';
        if (msg.includes('No captions available') || msg.includes('Not a valid YouTube URL')) {
            throw err;
        }
        console.log('[webpeel] [youtube] Path 2 failed:', msg);
        // Network/parsing failures — fall through to browser intercept approach
    }
    // --- Path 3: Browser intercept approach ---
    // YouTube's caption URLs are session-specific (they return empty when fetched
    // from a different HTTP client). We intercept the timedtext network request
    // that the YouTube player makes automatically when loading the page.
    console.log('[webpeel] [youtube] Trying path 3: browser intercept');
    return getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang);
}
|
|
686
|
+
/**
 * Use yt-dlp to extract YouTube transcripts. yt-dlp handles all the
 * signature challenges (player JS deciphering, multiple API endpoints)
 * that defeat server-side HTTP fetch approaches.
 *
 * yt-dlp is asked to write the auto-generated subtitles (json3 format) and
 * the info JSON to temp files, which are then read, parsed and removed.
 * The info JSON is optional: when it is missing, the transcript is still
 * returned with empty metadata.
 *
 * @param {string} videoId - YouTube video ID.
 * @param {string} preferredLang - Subtitle language requested from yt-dlp.
 * @returns {Promise<object|null>} Transcript record, or null when yt-dlp
 *   fails, times out (60 s), or produces no usable subtitle file. Never rejects.
 */
async function getTranscriptViaYtDlp(videoId, preferredLang) {
    const outPath = join(tmpdir(), `webpeel_yt_${videoId}_${Date.now()}`);
    const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
    // Files yt-dlp may leave behind for this run.
    const subFiles = [`${outPath}.${preferredLang}.json3`, `${outPath}.en.json3`];
    const infoFile = `${outPath}.info.json`;
    return new Promise((resolve) => {
        const args = [
            '--skip-download',
            '--write-auto-sub',
            '--sub-lang', preferredLang,
            '--sub-format', 'json3',
            '--write-info-json',
            '--output', outPath,
            '--no-warnings',
            '--quiet',
            videoUrl,
        ];
        // Pass explicit PATH so yt-dlp is found in Docker containers
        // pip3 installs to /usr/local/bin which may not be in Node's process.env.PATH
        const execEnv = {
            ...process.env,
            PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}`,
        };
        const proc = execFile('yt-dlp', args, { timeout: 60000, env: execEnv }, async (err) => {
            try {
                if (err) {
                    // yt-dlp not installed, timed out, or failed
                    console.error('[webpeel] yt-dlp error:', err.message);
                    resolve(null);
                    return;
                }
                // Read subtitle file
                let subData = null;
                for (const sf of subFiles) {
                    try {
                        const raw = await readFile(sf, 'utf-8');
                        subData = JSON.parse(raw);
                        break;
                    }
                    catch { /* try next */ }
                }
                // Read info JSON for metadata
                let infoData = null;
                try {
                    const infoRaw = await readFile(infoFile, 'utf-8');
                    infoData = JSON.parse(infoRaw);
                }
                catch { /* metadata is optional */ }
                if (!subData || !subData.events) {
                    resolve(null);
                    return;
                }
                const events = subData.events || [];
                const segments = events
                    .filter((e) => e.segs)
                    .map((e) => ({
                        text: e.segs.map((s) => s.utf8 || '').join('').trim(),
                        start: (e.tStartMs || 0) / 1000,
                        duration: (e.dDurationMs || 0) / 1000,
                    }))
                    .filter((s) => s.text.length > 0);
                const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
                const wordCount = fullText.split(/\s+/).filter(Boolean).length;
                const title = infoData?.title || '';
                const channel = infoData?.uploader || infoData?.channel || '';
                const lengthSeconds = infoData?.duration || 0;
                const description = infoData?.description || '';
                // upload_date is sliced as YYYYMMDD and reformatted to YYYY-MM-DD.
                const publishDate = infoData?.upload_date
                    ? `${infoData.upload_date.slice(0, 4)}-${infoData.upload_date.slice(4, 6)}-${infoData.upload_date.slice(6, 8)}`
                    : '';
                const chapters = parseChaptersFromDescription(description);
                const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
                const summary = extractSummary(fullText);
                resolve({
                    videoId,
                    title,
                    channel,
                    duration: formatDuration(lengthSeconds),
                    language: preferredLang,
                    segments,
                    fullText,
                    availableLanguages: [preferredLang],
                    description,
                    publishDate,
                    chapters: chapters.length > 0 ? chapters : undefined,
                    keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                    summary,
                    wordCount,
                    // infoData is optional, so guard the dereference: a missing
                    // info JSON must not discard a successfully parsed transcript.
                    viewCount: (infoData?.view_count?.toString() ?? '') || undefined,
                    likeCount: (infoData?.like_count?.toString() ?? '') || undefined,
                });
            }
            catch {
                resolve(null);
            }
            finally {
                // Best-effort removal of every temp file, on success and on failure.
                await Promise.all([...subFiles, infoFile].map((file) => unlink(file).catch(() => { })));
            }
        });
        // Safety: if the process cannot be spawned or errors out, resolve null
        proc.on('error', () => resolve(null));
    });
}
|
|
792
|
+
/**
 * Load the watch page in a pooled browser and capture the caption JSON that
 * the YouTube player requests from the timedtext API while the page loads.
 *
 * Video metadata (title, channel, duration, description, publish date and the
 * caption languages) is read from the player response embedded in the page
 * HTML. When no usable caption payload is captured, a description longer than
 * 50 characters is returned as the transcript text instead.
 *
 * @param {string} videoId - Video identifier, echoed in the result and in error messages.
 * @param {string} videoUrl - URL the browser page navigates to.
 * @param {string} preferredLang - Language reported when the intercepted request URL carries no `lang` parameter.
 * @returns {Promise<object>} Transcript result: segments, full text and metadata.
 * @throws {Error} When no captions are captured (or they contain no segments) and the description is too short to stand in.
 */
async function getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang) {
    const browser = await getBrowser();
    const context = await browser.newContext({ userAgent: getRandomUserAgent() });
    let page = null;
    // Everything after the context exists runs inside try/finally so that a
    // failure during page setup cannot leak the context.
    try {
        page = await context.newPage();
        await applyStealthScripts(page);
        let capturedJson = null;
        let capturedLang = preferredLang;
        // Intercept YouTube's timedtext API requests (the player fetches these automatically)
        await page.route('**/api/timedtext**', async (route) => {
            try {
                const response = await route.fetch();
                const text = await response.text();
                if (text && text.length > 100 && (text.includes('events') || text.includes('segs'))) {
                    try {
                        capturedJson = JSON.parse(text);
                        // Try to extract language from URL
                        const requestUrl = new URL(route.request().url());
                        capturedLang = requestUrl.searchParams.get('lang') || preferredLang;
                    }
                    catch { /* keep trying */ }
                }
                await route.fulfill({ response });
            }
            catch {
                // The route may already be handled or the page may be closing;
                // a rejection here must not surface as an unhandled rejection.
                await route.continue().catch(() => { });
            }
        });
        await page.goto(videoUrl, { waitUntil: 'domcontentloaded', timeout: 35000 });
        // Wait for timedtext request to be intercepted (player auto-fetches captions)
        const startWait = Date.now();
        while (!capturedJson && Date.now() - startWait < 12000) {
            await page.waitForTimeout(200);
        }
        // Also grab page HTML for video metadata
        const html = await page.content();
        const playerResponse = extractPlayerResponse(html);
        const videoDetails = playerResponse?.videoDetails ?? {};
        const microformat = playerResponse?.microformat?.playerMicroformatRenderer ?? {};
        const title = videoDetails.title ?? '';
        const channel = videoDetails.author ?? '';
        const lengthSeconds = parseInt(videoDetails.lengthSeconds ?? microformat.lengthSeconds ?? '0', 10);
        const description = (videoDetails.shortDescription ?? microformat.description?.simpleText ?? '').trim();
        const publishDate = microformat.publishDate ?? microformat.uploadDate ?? '';
        const captionTracks = playerResponse ? extractCaptionTracks(playerResponse) : [];
        const availableLanguages = captionTracks.map(t => t.languageCode);
        const descriptionChapters = parseChaptersFromDescription(description);
        // Shared fallback: return the description as transcript content
        // (better than nothing), or fail with the given message when the
        // description is too short to be useful.
        const descriptionOrThrow = (message) => {
            if (description.length > 50) {
                return {
                    videoId,
                    title,
                    channel,
                    duration: formatDuration(lengthSeconds),
                    language: 'en',
                    segments: [],
                    fullText: description,
                    availableLanguages,
                    description,
                    publishDate: publishDate || undefined,
                    chapters: descriptionChapters.length > 0 ? descriptionChapters : undefined,
                    wordCount: description.split(/\s+/).filter(Boolean).length,
                };
            }
            throw new Error(message);
        };
        if (!capturedJson) {
            return descriptionOrThrow(`No captions available for video ${videoId} — captions may be disabled`);
        }
        // Parse the JSON3 format (YouTube's native caption format)
        const segments = parseJson3Events(capturedJson);
        if (segments.length === 0) {
            return descriptionOrThrow(`Captured caption response had no segments for video ${videoId}`);
        }
        const fullText = segments.map(s => s.text).join(' ').replace(/\s+/g, ' ').trim();
        const wordCount = fullText.split(/\s+/).filter(Boolean).length;
        const chapters = descriptionChapters;
        const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
        const summary = extractSummary(fullText);
        return {
            videoId,
            title,
            channel,
            duration: formatDuration(lengthSeconds),
            language: capturedLang,
            segments,
            fullText,
            availableLanguages,
            description,
            publishDate: publishDate || undefined,
            chapters: chapters.length > 0 ? chapters : undefined,
            keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
            summary,
            wordCount,
            viewCount: undefined, // browser path doesn't reliably get this
            likeCount: undefined,
        };
    }
    finally {
        if (page) {
            await page.close().catch(() => { });
        }
        await context.close().catch(() => { });
        // Note: browser itself is pooled — don't close it
    }
}
|
|
918
|
+
/**
 * Parse YouTube's JSON3 caption format (from intercepted timedtext requests).
 * Format: { events: [{ tStartMs, dDurationMs, segs: [{ utf8: "text" } or { u: "text" }] }] }
 *
 * Events without any non-empty segment text, and events whose text is empty
 * after newline replacement and entity decoding, are dropped. Missing or
 * malformed `events` / `segs` values are treated as empty rather than throwing.
 *
 * @param {object} data - Parsed JSON3 payload.
 * @returns {Array<{text: string, start: number, duration: number}>} Segments with times in seconds.
 */
function parseJson3Events(data) {
    const events = Array.isArray(data?.events) ? data.events : [];
    const segments = [];
    for (const event of events) {
        const segs = Array.isArray(event?.segs) ? event.segs : [];
        if (!segs.some((s) => s?.utf8 || s?.u)) {
            continue;
        }
        // YouTube uses 'utf8' key in modern responses, 'u' in some older ones
        const raw = segs.map((s) => s?.utf8 ?? s?.u ?? '').join('');
        const text = decodeHtmlEntities(raw.replace(/\n/g, ' ').trim());
        if (text.length === 0) {
            continue;
        }
        segments.push({
            text,
            start: (event.tStartMs || 0) / 1000,
            duration: (event.dDurationMs || 0) / 1000,
        });
    }
    return segments;
}
|
|
934
|
+
/**
 * Extract the ytInitialPlayerResponse JSON object from page HTML.
 *
 * Strategy, in order:
 *   1. Match the assignment with a lazy regex and JSON.parse the capture.
 *   2. If that capture does not parse (the lazy match can stop at a `};`
 *      inside a string value), brace-walk from the `{` of the matched
 *      assignment, then from the first `{` after the first mention of the
 *      identifier.
 *   3. As a last resort, parse the object that directly encloses the first
 *      `"captionTracks"` key.
 *
 * @param {string} html - Page HTML.
 * @returns {object|null} The parsed object, or null when nothing parseable is found.
 */
export function extractPlayerResponse(html) {
    // Try a few patterns YouTube uses to embed this data
    const patterns = [
        // Modern: var ytInitialPlayerResponse = {...};
        /var ytInitialPlayerResponse\s*=\s*(\{.+?\});\s*(?:var|<\/script>)/s,
        // Also try without trailing var (some pages end differently)
        /ytInitialPlayerResponse\s*=\s*(\{.+?\})(?:;|\s*<\/script>)/s,
    ];
    // Offsets already brace-walked, so a failed candidate is not retried.
    const attempted = new Set();
    // Brace-walk from `braceStart` and parse; undefined means "no result".
    const parseObjectAt = (braceStart) => {
        if (braceStart < 0 || attempted.has(braceStart)) {
            return undefined;
        }
        attempted.add(braceStart);
        const jsonStr = extractJsonObject(html, braceStart);
        if (!jsonStr) {
            return undefined;
        }
        try {
            return JSON.parse(jsonStr);
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'player response parse failed:', e instanceof Error ? e.message : e);
            return undefined;
        }
    };
    const firstMention = html.indexOf('ytInitialPlayerResponse');
    const firstMentionBrace = firstMention === -1 ? -1 : html.indexOf('{', firstMention);
    for (const pattern of patterns) {
        const match = pattern.exec(html);
        if (!match) {
            continue;
        }
        try {
            return JSON.parse(match[1]);
        }
        catch {
            // The lazy capture was cut short; fall through to brace walking.
        }
        // Nothing before the capture group contains a brace, so the first `{`
        // of the whole match is the start of the assigned object.
        const assignedBrace = match.index + match[0].indexOf('{');
        const parsed = parseObjectAt(assignedBrace) ?? parseObjectAt(firstMentionBrace);
        if (parsed !== undefined) {
            return parsed;
        }
    }
    // Fallback: search for captionTracks directly
    const captionIdx = html.indexOf('"captionTracks"');
    if (captionIdx !== -1) {
        // Walk back to find the enclosing object
        const parsed = parseObjectAt(html.lastIndexOf('{', captionIdx));
        if (parsed !== undefined) {
            return parsed;
        }
    }
    return null;
}
|
|
989
|
+
/**
 * Extract a complete JSON object (or array) starting at position `start` in `str`.
 * Handles nested objects/arrays and string literals, including escaped quotes.
 *
 * Only bracket depth is tracked; bracket kinds are not matched against each
 * other, so the result is a candidate that the caller still has to JSON.parse.
 *
 * @param {string} str - Text containing the JSON value.
 * @param {number} start - Index of the opening `{` or `[`.
 * @returns {string|null} The balanced slice, or null when `start` is not on an
 *   opening bracket or the value is never closed.
 */
function extractJsonObject(str, start) {
    const opener = str[start];
    // Without this guard a walk that begins elsewhere could return a slice
    // with leading garbage, or balance against an unrelated bracket.
    if (opener !== '{' && opener !== '[') {
        return null;
    }
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < str.length; i++) {
        const ch = str[i];
        if (escaped) {
            // Character after a backslash inside a string: always skipped.
            escaped = false;
            continue;
        }
        if (inString) {
            if (ch === '\\') {
                escaped = true;
            }
            else if (ch === '"') {
                inString = false;
            }
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === '{' || ch === '[') {
            depth++;
        }
        else if (ch === '}' || ch === ']') {
            depth--;
            if (depth === 0) {
                return str.slice(start, i + 1);
            }
        }
    }
    return null;
}
|
|
1024
|
+
/**
 * Extract caption tracks from the player response.
 *
 * Reads `captions.playerCaptionsTracklistRenderer.captionTracks` and maps each
 * entry to `{ baseUrl, languageCode, name, isAutoGenerated }`. Entries that
 * are not objects or have no `baseUrl` are skipped individually, so one
 * malformed entry does not discard the remaining tracks.
 *
 * @param {object} playerResponse - Parsed player response.
 * @returns {Array<{baseUrl: string, languageCode: string, name: string, isAutoGenerated: boolean}>}
 *   Usable tracks; an empty array when none are present.
 */
function extractCaptionTracks(playerResponse) {
    try {
        const rawTracks = playerResponse?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
        if (!Array.isArray(rawTracks)) {
            return [];
        }
        const tracks = [];
        for (const t of rawTracks) {
            if (t === null || typeof t !== 'object') {
                continue;
            }
            const baseUrl = t.baseUrl ?? '';
            if (!baseUrl) {
                continue;
            }
            const simpleName = String(t.name?.simpleText ?? '').toLowerCase();
            tracks.push({
                baseUrl,
                languageCode: String(t.languageCode ?? 'unknown').toLowerCase(),
                name: t.name?.simpleText ?? t.name?.runs?.[0]?.text ?? t.languageCode ?? '',
                // A track counts as auto-generated when any of these signals is present.
                isAutoGenerated: t.kind === 'asr' ||
                    (typeof t.vssId === 'string' && t.vssId.startsWith('a.')) ||
                    simpleName.includes('auto'),
            });
        }
        return tracks;
    }
    catch {
        return [];
    }
}
|
|
1046
|
+
/**
 * Pick the best caption track for the requested language.
 * Priority: manual track in preferred language > auto-generated in preferred language > any manual > any
 *
 * Languages are compared by primary subtag ("en-US" and "en-GB" both count as
 * "en"). The comparison is an exact subtag match, not a prefix match, so a
 * request for "fi" does not select a "fil" track.
 *
 * @param {Array<{languageCode: string, isAutoGenerated: boolean}>} tracks - Candidate tracks.
 * @param {string} preferredLang - Requested language tag, e.g. "en" or "en-US".
 * @returns {object|undefined} The chosen track, or undefined when `tracks` is empty.
 */
function selectBestTrack(tracks, preferredLang) {
    const primarySubtag = (tag) => String(tag ?? '').toLowerCase().split('-')[0]; // "en-US" → "en"
    const wanted = primarySubtag(preferredLang);
    const isPreferred = (track) => primarySubtag(track.languageCode) === wanted;
    return (
        // 1. Manual in preferred language
        tracks.find((t) => !t.isAutoGenerated && isPreferred(t)) ??
        // 2. Auto-generated in preferred language
        tracks.find((t) => t.isAutoGenerated && isPreferred(t)) ??
        // 3. Any manual track
        tracks.find((t) => !t.isAutoGenerated) ??
        // 4. Fall back to first available
        tracks[0]
    );
}
|
|
1067
|
+
/**
 * Fetch the caption XML from YouTube's timedtext API.
 * Must use same cookies/UA as the page fetch — URLs are session-locked.
 * Tries simpleFetch first; falls back to native fetch() if BlockedError is thrown
 * (our own challenge detection fires on cloud server IPs).
 *
 * @param {string} baseUrl - Caption track URL.
 * @param {string} [userAgent] - User-Agent to send with the request.
 * @param {Record<string, string>} [headers] - Extra request headers; these override the User-Agent in the fallback request.
 * @returns {Promise<string>} Response body text.
 * @throws Rethrows any simpleFetch error that does not look like a block, and
 *   throws an Error when the fallback request returns a non-2xx status, so an
 *   error page is never handed back as caption XML.
 */
async function fetchCaptionXml(baseUrl, userAgent, headers) {
    try {
        const result = await simpleFetch(baseUrl, userAgent, 10000, headers);
        return result.html;
    }
    catch (simpleFetchErr) {
        const errMsg = String(simpleFetchErr?.message ?? '').toLowerCase();
        const isBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
            ['blocked', 'challenge', 'cloudflare'].some((marker) => errMsg.includes(marker));
        if (!isBlocked) {
            throw simpleFetchErr;
        }
        // BlockedError: retry with native fetch
        const fetchHeaders = {
            ...(userAgent ? { 'User-Agent': userAgent } : {}),
            ...headers,
        };
        const response = await fetch(baseUrl, {
            headers: fetchHeaders,
            redirect: 'follow',
            signal: AbortSignal.timeout(10000),
        });
        if (!response.ok) {
            throw new Error(`Caption fetch failed with HTTP ${response.status}`, { cause: simpleFetchErr });
        }
        return response.text();
    }
}
|
|
1100
|
+
/**
 * Parse YouTube caption XML into transcript segments.
 *
 * Format: <transcript><text start="0.5" dur="2.1">Hello &amp; world</text>...</transcript>
 *
 * Each `<text>` element becomes one segment. Its body is entity-decoded and
 * elements whose decoded text is empty are dropped. A missing, empty or
 * non-numeric `start` / `dur` attribute is reported as 0, never NaN.
 *
 * @param {string} xml - Caption XML document.
 * @returns {Array<{text: string, start: number, duration: number}>} Segments with times in seconds.
 */
export function parseCaptionXml(xml) {
    const toSeconds = (value) => {
        const parsed = Number.parseFloat(value ?? '0');
        return Number.isFinite(parsed) ? parsed : 0;
    };
    const segments = [];
    // `\b` keeps look-alike tag names out while still accepting a bare <text>.
    for (const [, attrs, rawText] of xml.matchAll(/<text\b([^>]*)>([\s\S]*?)<\/text>/g)) {
        const text = decodeHtmlEntities(rawText.trim());
        if (!text) {
            continue;
        }
        segments.push({
            text,
            start: toSeconds(extractAttr(attrs, 'start')),
            duration: toSeconds(extractAttr(attrs, 'dur')),
        });
    }
    return segments;
}
|
|
1122
|
+
/**
 * Extract an attribute value from an HTML/XML attribute string.
 *
 * The name is matched case-insensitively as a whole attribute name (at the
 * start of the string or after whitespace), so looking up `start` does not
 * match inside `tstart`. Both double- and single-quoted values are accepted.
 *
 * @param {string} attrs - Raw attribute text, e.g. `start="0.5" dur="2.1"`.
 * @param {string} name - Attribute name to look up.
 * @returns {string|null} The attribute value (possibly empty), or null when absent.
 */
function extractAttr(attrs, name) {
    // Escape regex metacharacters so the name is always matched literally.
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`(?:^|\\s)${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i');
    const found = attrs.match(pattern);
    if (!found) {
        return null;
    }
    return found[1] ?? found[2];
}
|
|
1130
|
+
/**
 * Decode common HTML entities found in YouTube caption XML.
 *
 * Order of operations:
 *   1. Strip real HTML tags (e.g. <font color="...">) — these appear literally in the XML
 *   2. Decode named entities in a fixed order (including &lt; → < which represents literal angle brackets)
 *   3. Decode decimal (&#NNN;) and hexadecimal (&#xHH;) numeric entities
 *
 * Because &amp; is decoded before the entities that follow it, doubly-encoded
 * input such as &amp;#39; is decoded all the way to the character. Numeric
 * entities are decoded as full Unicode code points, so characters outside the
 * Basic Multilingual Plane (e.g. emoji) are preserved. Numeric values beyond
 * U+10FFFF are left untouched.
 *
 * @param {string} text - Text that may contain tags and entities.
 * @returns {string} Decoded, trimmed text.
 */
export function decodeHtmlEntities(text) {
    // Entity name (without the surrounding "&" and ";") → replacement, in decode order.
    const namedEntities = [
        ['lt', '<'],
        ['gt', '>'],
        ['amp', '&'],
        ['quot', '"'],
        ['#39', "'"],
        ['#x27', "'"],
        ['apos', "'"],
        ['#x2F', '/'],
    ];
    const decodeNumeric = (entity, digits, radix) => {
        const codePoint = Number.parseInt(digits, radix);
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    };
    // Step 1: strip real inline HTML tags (literal <...> in the text, not entities)
    let decoded = text.replace(/<[^>]+>/g, '');
    // Step 2: decode named entities
    for (const [name, char] of namedEntities) {
        decoded = decoded.replaceAll(`&${name};`, char);
    }
    // Step 3: decode numeric entities
    return decoded
        .replace(/&#(\d+);/g, (entity, digits) => decodeNumeric(entity, digits, 10))
        .replace(/&#x([0-9A-Fa-f]+);/g, (entity, digits) => decodeNumeric(entity, digits, 16))
        .trim();
}
|
|
1154
|
+
/**
 * Format seconds into M:SS or H:MM:SS.
 *
 * Fractional seconds are truncated. Missing, non-numeric, non-finite, zero
 * and negative inputs all format as '0:00'.
 *
 * @param {number} seconds - Duration in seconds (numeric strings are accepted).
 * @returns {string} Formatted duration, e.g. '2:05' or '1:02:05'.
 */
export function formatDuration(seconds) {
    const total = Number(seconds);
    if (!Number.isFinite(total) || total <= 0) {
        return '0:00';
    }
    const pad = (value) => String(value).padStart(2, '0');
    const hours = Math.floor(total / 3600);
    const minutes = Math.floor((total % 3600) / 60);
    const secs = Math.floor(total % 60);
    return hours > 0
        ? `${hours}:${pad(minutes)}:${pad(secs)}`
        : `${minutes}:${pad(secs)}`;
}
|
|
1168
|
+
/**
 * Extract a meta tag value from HTML (og:title, og:description, etc.)
 *
 * Matches `<meta>` tags whose `property` or `name` equals `property`
 * (case-insensitive), with the `content` attribute either after or before it.
 * The content value ends at the same quote character that opened it, so an
 * apostrophe inside a double-quoted value does not truncate the result.
 *
 * @param {string} html - Page HTML.
 * @param {string} property - Meta property/name to look up, matched literally.
 * @returns {string|null} Entity-decoded content value, or null when no tag matches.
 */
function extractMetaTag(html, property) {
    // Escape regex metacharacters so the property is always matched literally.
    const escapedProperty = property.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const keyAttr = `(?:property|name)=["']${escapedProperty}["']`;
    // Group 1 is the opening quote, group 2 the non-empty value up to the matching quote.
    const contentAttr = `content=(["'])((?:(?!\\1)[\\s\\S])+)\\1`;
    const keyFirst = new RegExp(`<meta[^>]+${keyAttr}[^>]+${contentAttr}`, 'i');
    const contentFirst = new RegExp(`<meta[^>]+${contentAttr}[^>]+${keyAttr}`, 'i');
    const found = html.match(keyFirst) ?? html.match(contentFirst);
    return found ? decodeHtmlEntities(found[2]) : null;
}
|