@iflow-mcp/jakeliume-webpeel 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (547) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +313 -0
  3. package/dist/cache.d.ts +30 -0
  4. package/dist/cache.js +139 -0
  5. package/dist/cli/commands/auth.d.ts +5 -0
  6. package/dist/cli/commands/auth.js +411 -0
  7. package/dist/cli/commands/doctor.d.ts +37 -0
  8. package/dist/cli/commands/doctor.js +371 -0
  9. package/dist/cli/commands/fetch.d.ts +6 -0
  10. package/dist/cli/commands/fetch.js +1345 -0
  11. package/dist/cli/commands/guide.d.ts +2 -0
  12. package/dist/cli/commands/guide.js +183 -0
  13. package/dist/cli/commands/interact.d.ts +5 -0
  14. package/dist/cli/commands/interact.js +840 -0
  15. package/dist/cli/commands/jobs.d.ts +5 -0
  16. package/dist/cli/commands/jobs.js +997 -0
  17. package/dist/cli/commands/monitor.d.ts +12 -0
  18. package/dist/cli/commands/monitor.js +197 -0
  19. package/dist/cli/commands/observe.d.ts +12 -0
  20. package/dist/cli/commands/observe.js +158 -0
  21. package/dist/cli/commands/screenshot.d.ts +5 -0
  22. package/dist/cli/commands/screenshot.js +282 -0
  23. package/dist/cli/commands/search.d.ts +5 -0
  24. package/dist/cli/commands/search.js +1021 -0
  25. package/dist/cli/commands/setup.d.ts +13 -0
  26. package/dist/cli/commands/setup.js +244 -0
  27. package/dist/cli/commands/skill.d.ts +15 -0
  28. package/dist/cli/commands/skill.js +195 -0
  29. package/dist/cli/utils.d.ts +84 -0
  30. package/dist/cli/utils.js +806 -0
  31. package/dist/cli-auth.d.ts +75 -0
  32. package/dist/cli-auth.js +369 -0
  33. package/dist/cli.d.ts +17 -0
  34. package/dist/cli.js +99 -0
  35. package/dist/core/actions.d.ts +69 -0
  36. package/dist/core/actions.js +495 -0
  37. package/dist/core/agent.d.ts +98 -0
  38. package/dist/core/agent.js +558 -0
  39. package/dist/core/answer.d.ts +42 -0
  40. package/dist/core/answer.js +395 -0
  41. package/dist/core/application-tracker.d.ts +84 -0
  42. package/dist/core/application-tracker.js +184 -0
  43. package/dist/core/apply.d.ts +162 -0
  44. package/dist/core/apply.js +816 -0
  45. package/dist/core/auth-detection.d.ts +35 -0
  46. package/dist/core/auth-detection.js +358 -0
  47. package/dist/core/auto-extract.d.ts +82 -0
  48. package/dist/core/auto-extract.js +604 -0
  49. package/dist/core/auto-interact.d.ts +23 -0
  50. package/dist/core/auto-interact.js +246 -0
  51. package/dist/core/bm25-filter.d.ts +66 -0
  52. package/dist/core/bm25-filter.js +288 -0
  53. package/dist/core/branding.d.ts +54 -0
  54. package/dist/core/branding.js +234 -0
  55. package/dist/core/browser-fetch.d.ts +323 -0
  56. package/dist/core/browser-fetch.js +1600 -0
  57. package/dist/core/browser-pool.d.ts +91 -0
  58. package/dist/core/browser-pool.js +550 -0
  59. package/dist/core/budget.d.ts +42 -0
  60. package/dist/core/budget.js +324 -0
  61. package/dist/core/business-intel.d.ts +47 -0
  62. package/dist/core/business-intel.js +279 -0
  63. package/dist/core/cache.d.ts +13 -0
  64. package/dist/core/cache.js +121 -0
  65. package/dist/core/cf-worker-proxy.d.ts +32 -0
  66. package/dist/core/cf-worker-proxy.js +87 -0
  67. package/dist/core/challenge-detection.d.ts +26 -0
  68. package/dist/core/challenge-detection.js +468 -0
  69. package/dist/core/change-tracking.d.ts +75 -0
  70. package/dist/core/change-tracking.js +276 -0
  71. package/dist/core/chunker.d.ts +46 -0
  72. package/dist/core/chunker.js +249 -0
  73. package/dist/core/chunking.d.ts +42 -0
  74. package/dist/core/chunking.js +181 -0
  75. package/dist/core/circuit-breaker.d.ts +44 -0
  76. package/dist/core/circuit-breaker.js +85 -0
  77. package/dist/core/content-pruner.d.ts +47 -0
  78. package/dist/core/content-pruner.js +425 -0
  79. package/dist/core/cookie-cache.d.ts +60 -0
  80. package/dist/core/cookie-cache.js +163 -0
  81. package/dist/core/crawl-checkpoint.d.ts +54 -0
  82. package/dist/core/crawl-checkpoint.js +104 -0
  83. package/dist/core/crawler.d.ts +84 -0
  84. package/dist/core/crawler.js +349 -0
  85. package/dist/core/cross-verify.d.ts +27 -0
  86. package/dist/core/cross-verify.js +93 -0
  87. package/dist/core/deep-fetch.d.ts +74 -0
  88. package/dist/core/deep-fetch.js +405 -0
  89. package/dist/core/deep-research.d.ts +141 -0
  90. package/dist/core/deep-research.js +972 -0
  91. package/dist/core/design-analysis.d.ts +70 -0
  92. package/dist/core/design-analysis.js +490 -0
  93. package/dist/core/design-compare.d.ts +38 -0
  94. package/dist/core/design-compare.js +264 -0
  95. package/dist/core/diff.d.ts +61 -0
  96. package/dist/core/diff.js +289 -0
  97. package/dist/core/dns-cache.d.ts +20 -0
  98. package/dist/core/dns-cache.js +198 -0
  99. package/dist/core/documents.d.ts +23 -0
  100. package/dist/core/documents.js +123 -0
  101. package/dist/core/domain-memory.d.ts +66 -0
  102. package/dist/core/domain-memory.js +163 -0
  103. package/dist/core/domain-verify.d.ts +40 -0
  104. package/dist/core/domain-verify.js +379 -0
  105. package/dist/core/engine-ranker.d.ts +112 -0
  106. package/dist/core/engine-ranker.js +395 -0
  107. package/dist/core/extract-inline.d.ts +38 -0
  108. package/dist/core/extract-inline.js +215 -0
  109. package/dist/core/extract-listings.d.ts +38 -0
  110. package/dist/core/extract-listings.js +461 -0
  111. package/dist/core/extract.d.ts +9 -0
  112. package/dist/core/extract.js +139 -0
  113. package/dist/core/fetch-cache.d.ts +57 -0
  114. package/dist/core/fetch-cache.js +95 -0
  115. package/dist/core/fetcher.d.ts +13 -0
  116. package/dist/core/fetcher.js +12 -0
  117. package/dist/core/google-cache.d.ts +29 -0
  118. package/dist/core/google-cache.js +180 -0
  119. package/dist/core/google-serp-parser.d.ts +82 -0
  120. package/dist/core/google-serp-parser.js +287 -0
  121. package/dist/core/hotel-search.d.ts +122 -0
  122. package/dist/core/hotel-search.js +382 -0
  123. package/dist/core/http-fetch.d.ts +72 -0
  124. package/dist/core/http-fetch.js +820 -0
  125. package/dist/core/human.d.ts +175 -0
  126. package/dist/core/human.js +680 -0
  127. package/dist/core/image-caption.d.ts +44 -0
  128. package/dist/core/image-caption.js +271 -0
  129. package/dist/core/jobs.d.ts +75 -0
  130. package/dist/core/jobs.js +634 -0
  131. package/dist/core/json-ld.d.ts +15 -0
  132. package/dist/core/json-ld.js +617 -0
  133. package/dist/core/language-detect.d.ts +18 -0
  134. package/dist/core/language-detect.js +135 -0
  135. package/dist/core/links.d.ts +10 -0
  136. package/dist/core/links.js +44 -0
  137. package/dist/core/llm-extract.d.ts +71 -0
  138. package/dist/core/llm-extract.js +507 -0
  139. package/dist/core/llm-provider.d.ts +100 -0
  140. package/dist/core/llm-provider.js +702 -0
  141. package/dist/core/local-search.d.ts +60 -0
  142. package/dist/core/local-search.js +308 -0
  143. package/dist/core/logger.d.ts +28 -0
  144. package/dist/core/logger.js +104 -0
  145. package/dist/core/map.d.ts +33 -0
  146. package/dist/core/map.js +127 -0
  147. package/dist/core/markdown.d.ts +92 -0
  148. package/dist/core/markdown.js +809 -0
  149. package/dist/core/metadata.d.ts +34 -0
  150. package/dist/core/metadata.js +422 -0
  151. package/dist/core/observe.d.ts +113 -0
  152. package/dist/core/observe.js +395 -0
  153. package/dist/core/ocr.d.ts +12 -0
  154. package/dist/core/ocr.js +33 -0
  155. package/dist/core/paginate.d.ts +31 -0
  156. package/dist/core/paginate.js +106 -0
  157. package/dist/core/pdf.d.ts +8 -0
  158. package/dist/core/pdf.js +25 -0
  159. package/dist/core/peel-tls.d.ts +25 -0
  160. package/dist/core/peel-tls.js +220 -0
  161. package/dist/core/pipeline.d.ts +132 -0
  162. package/dist/core/pipeline.js +1666 -0
  163. package/dist/core/profiles.d.ts +61 -0
  164. package/dist/core/profiles.js +350 -0
  165. package/dist/core/prompt-guard.d.ts +30 -0
  166. package/dist/core/prompt-guard.js +119 -0
  167. package/dist/core/proxy-config.d.ts +90 -0
  168. package/dist/core/proxy-config.js +172 -0
  169. package/dist/core/quick-answer.d.ts +53 -0
  170. package/dist/core/quick-answer.js +833 -0
  171. package/dist/core/rate-governor.d.ts +80 -0
  172. package/dist/core/rate-governor.js +238 -0
  173. package/dist/core/readability.d.ts +57 -0
  174. package/dist/core/readability.js +533 -0
  175. package/dist/core/research.d.ts +66 -0
  176. package/dist/core/research.js +270 -0
  177. package/dist/core/retry.d.ts +60 -0
  178. package/dist/core/retry.js +119 -0
  179. package/dist/core/safe-browsing.d.ts +30 -0
  180. package/dist/core/safe-browsing.js +206 -0
  181. package/dist/core/schema-extraction.d.ts +66 -0
  182. package/dist/core/schema-extraction.js +352 -0
  183. package/dist/core/schema-postprocess.d.ts +32 -0
  184. package/dist/core/schema-postprocess.js +469 -0
  185. package/dist/core/schema-templates.d.ts +19 -0
  186. package/dist/core/schema-templates.js +143 -0
  187. package/dist/core/screenshot.d.ts +224 -0
  188. package/dist/core/screenshot.js +207 -0
  189. package/dist/core/search-engines.d.ts +25 -0
  190. package/dist/core/search-engines.js +182 -0
  191. package/dist/core/search-provider.d.ts +243 -0
  192. package/dist/core/search-provider.js +1629 -0
  193. package/dist/core/searxng-provider.d.ts +35 -0
  194. package/dist/core/searxng-provider.js +105 -0
  195. package/dist/core/selective-evidence.d.ts +151 -0
  196. package/dist/core/selective-evidence.js +389 -0
  197. package/dist/core/site-search.d.ts +44 -0
  198. package/dist/core/site-search.js +252 -0
  199. package/dist/core/sitemap.d.ts +23 -0
  200. package/dist/core/sitemap.js +105 -0
  201. package/dist/core/source-credibility.d.ts +29 -0
  202. package/dist/core/source-credibility.js +584 -0
  203. package/dist/core/source-scoring.d.ts +166 -0
  204. package/dist/core/source-scoring.js +396 -0
  205. package/dist/core/stemmer.d.ts +38 -0
  206. package/dist/core/stemmer.js +509 -0
  207. package/dist/core/strategies.d.ts +104 -0
  208. package/dist/core/strategies.js +1044 -0
  209. package/dist/core/strategy-hooks.d.ts +145 -0
  210. package/dist/core/strategy-hooks.js +74 -0
  211. package/dist/core/structured-extract.d.ts +43 -0
  212. package/dist/core/structured-extract.js +550 -0
  213. package/dist/core/summarize.d.ts +17 -0
  214. package/dist/core/summarize.js +78 -0
  215. package/dist/core/synonyms.d.ts +42 -0
  216. package/dist/core/synonyms.js +184 -0
  217. package/dist/core/system-monitor.d.ts +61 -0
  218. package/dist/core/system-monitor.js +133 -0
  219. package/dist/core/table-format.d.ts +30 -0
  220. package/dist/core/table-format.js +146 -0
  221. package/dist/core/threat-feeds.d.ts +23 -0
  222. package/dist/core/threat-feeds.js +104 -0
  223. package/dist/core/timing.d.ts +21 -0
  224. package/dist/core/timing.js +33 -0
  225. package/dist/core/transcript-export.d.ts +47 -0
  226. package/dist/core/transcript-export.js +107 -0
  227. package/dist/core/user-agents.d.ts +82 -0
  228. package/dist/core/user-agents.js +239 -0
  229. package/dist/core/vertical-search.d.ts +54 -0
  230. package/dist/core/vertical-search.js +158 -0
  231. package/dist/core/watch-manager.d.ts +175 -0
  232. package/dist/core/watch-manager.js +416 -0
  233. package/dist/core/watch.d.ts +101 -0
  234. package/dist/core/watch.js +389 -0
  235. package/dist/core/youtube.d.ts +130 -0
  236. package/dist/core/youtube.js +1175 -0
  237. package/dist/ee/challenge-re-export.d.ts +1 -0
  238. package/dist/ee/challenge-re-export.js +1 -0
  239. package/dist/ee/challenge-solver.d.ts +72 -0
  240. package/dist/ee/challenge-solver.js +720 -0
  241. package/dist/ee/domain-extractors.d.ts +8 -0
  242. package/dist/ee/domain-extractors.js +8 -0
  243. package/dist/ee/domain-intel.d.ts +16 -0
  244. package/dist/ee/domain-intel.js +133 -0
  245. package/dist/ee/extractors/allrecipes.d.ts +2 -0
  246. package/dist/ee/extractors/allrecipes.js +120 -0
  247. package/dist/ee/extractors/amazon.d.ts +2 -0
  248. package/dist/ee/extractors/amazon.js +78 -0
  249. package/dist/ee/extractors/arxiv.d.ts +2 -0
  250. package/dist/ee/extractors/arxiv.js +137 -0
  251. package/dist/ee/extractors/bestbuy.d.ts +2 -0
  252. package/dist/ee/extractors/bestbuy.js +78 -0
  253. package/dist/ee/extractors/carscom.d.ts +2 -0
  254. package/dist/ee/extractors/carscom.js +121 -0
  255. package/dist/ee/extractors/coingecko.d.ts +2 -0
  256. package/dist/ee/extractors/coingecko.js +134 -0
  257. package/dist/ee/extractors/craigslist.d.ts +2 -0
  258. package/dist/ee/extractors/craigslist.js +92 -0
  259. package/dist/ee/extractors/devto.d.ts +2 -0
  260. package/dist/ee/extractors/devto.js +135 -0
  261. package/dist/ee/extractors/ebay.d.ts +2 -0
  262. package/dist/ee/extractors/ebay.js +90 -0
  263. package/dist/ee/extractors/espn.d.ts +2 -0
  264. package/dist/ee/extractors/espn.js +260 -0
  265. package/dist/ee/extractors/etsy.d.ts +2 -0
  266. package/dist/ee/extractors/etsy.js +52 -0
  267. package/dist/ee/extractors/facebook.d.ts +2 -0
  268. package/dist/ee/extractors/facebook.js +46 -0
  269. package/dist/ee/extractors/github.d.ts +2 -0
  270. package/dist/ee/extractors/github.js +196 -0
  271. package/dist/ee/extractors/google-flights.d.ts +2 -0
  272. package/dist/ee/extractors/google-flights.js +176 -0
  273. package/dist/ee/extractors/hackernews.d.ts +2 -0
  274. package/dist/ee/extractors/hackernews.js +147 -0
  275. package/dist/ee/extractors/imdb.d.ts +2 -0
  276. package/dist/ee/extractors/imdb.js +172 -0
  277. package/dist/ee/extractors/index.d.ts +26 -0
  278. package/dist/ee/extractors/index.js +247 -0
  279. package/dist/ee/extractors/instagram.d.ts +2 -0
  280. package/dist/ee/extractors/instagram.js +102 -0
  281. package/dist/ee/extractors/kalshi.d.ts +2 -0
  282. package/dist/ee/extractors/kalshi.js +121 -0
  283. package/dist/ee/extractors/kayak-cars.d.ts +2 -0
  284. package/dist/ee/extractors/kayak-cars.js +270 -0
  285. package/dist/ee/extractors/linkedin.d.ts +2 -0
  286. package/dist/ee/extractors/linkedin.js +113 -0
  287. package/dist/ee/extractors/medium.d.ts +2 -0
  288. package/dist/ee/extractors/medium.js +130 -0
  289. package/dist/ee/extractors/news.d.ts +4 -0
  290. package/dist/ee/extractors/news.js +173 -0
  291. package/dist/ee/extractors/npm.d.ts +2 -0
  292. package/dist/ee/extractors/npm.js +86 -0
  293. package/dist/ee/extractors/pdf.d.ts +2 -0
  294. package/dist/ee/extractors/pdf.js +108 -0
  295. package/dist/ee/extractors/pinterest.d.ts +2 -0
  296. package/dist/ee/extractors/pinterest.js +34 -0
  297. package/dist/ee/extractors/polymarket.d.ts +2 -0
  298. package/dist/ee/extractors/polymarket.js +358 -0
  299. package/dist/ee/extractors/producthunt.d.ts +2 -0
  300. package/dist/ee/extractors/producthunt.js +88 -0
  301. package/dist/ee/extractors/pubmed.d.ts +2 -0
  302. package/dist/ee/extractors/pubmed.js +162 -0
  303. package/dist/ee/extractors/pypi.d.ts +2 -0
  304. package/dist/ee/extractors/pypi.js +80 -0
  305. package/dist/ee/extractors/reddit.d.ts +2 -0
  306. package/dist/ee/extractors/reddit.js +438 -0
  307. package/dist/ee/extractors/redfin.d.ts +2 -0
  308. package/dist/ee/extractors/redfin.js +156 -0
  309. package/dist/ee/extractors/semanticscholar.d.ts +2 -0
  310. package/dist/ee/extractors/semanticscholar.js +131 -0
  311. package/dist/ee/extractors/shared.d.ts +12 -0
  312. package/dist/ee/extractors/shared.js +76 -0
  313. package/dist/ee/extractors/soundcloud.d.ts +2 -0
  314. package/dist/ee/extractors/soundcloud.js +34 -0
  315. package/dist/ee/extractors/sportsbetting.d.ts +2 -0
  316. package/dist/ee/extractors/sportsbetting.js +37 -0
  317. package/dist/ee/extractors/spotify.d.ts +2 -0
  318. package/dist/ee/extractors/spotify.js +34 -0
  319. package/dist/ee/extractors/stackoverflow.d.ts +2 -0
  320. package/dist/ee/extractors/stackoverflow.js +61 -0
  321. package/dist/ee/extractors/substack.d.ts +2 -0
  322. package/dist/ee/extractors/substack.js +115 -0
  323. package/dist/ee/extractors/substackroot.d.ts +2 -0
  324. package/dist/ee/extractors/substackroot.js +46 -0
  325. package/dist/ee/extractors/tiktok.d.ts +2 -0
  326. package/dist/ee/extractors/tiktok.js +29 -0
  327. package/dist/ee/extractors/tradingview.d.ts +2 -0
  328. package/dist/ee/extractors/tradingview.js +182 -0
  329. package/dist/ee/extractors/twitch.d.ts +2 -0
  330. package/dist/ee/extractors/twitch.js +36 -0
  331. package/dist/ee/extractors/twitter.d.ts +2 -0
  332. package/dist/ee/extractors/twitter.js +327 -0
  333. package/dist/ee/extractors/types.d.ts +14 -0
  334. package/dist/ee/extractors/types.js +1 -0
  335. package/dist/ee/extractors/walmart.d.ts +2 -0
  336. package/dist/ee/extractors/walmart.js +50 -0
  337. package/dist/ee/extractors/weather.d.ts +2 -0
  338. package/dist/ee/extractors/weather.js +133 -0
  339. package/dist/ee/extractors/wikipedia.d.ts +4 -0
  340. package/dist/ee/extractors/wikipedia.js +235 -0
  341. package/dist/ee/extractors/yelp.d.ts +2 -0
  342. package/dist/ee/extractors/yelp.js +216 -0
  343. package/dist/ee/extractors/youtube.d.ts +2 -0
  344. package/dist/ee/extractors/youtube.js +189 -0
  345. package/dist/ee/extractors/zillow.d.ts +54 -0
  346. package/dist/ee/extractors/zillow.js +247 -0
  347. package/dist/ee/extractors-re-export.d.ts +1 -0
  348. package/dist/ee/extractors-re-export.js +1 -0
  349. package/dist/ee/premium-hooks.d.ts +20 -0
  350. package/dist/ee/premium-hooks.js +50 -0
  351. package/dist/ee/spa-detection.d.ts +2 -0
  352. package/dist/ee/spa-detection.js +2 -0
  353. package/dist/ee/stability.d.ts +4 -0
  354. package/dist/ee/stability.js +29 -0
  355. package/dist/ee/swr-cache.d.ts +14 -0
  356. package/dist/ee/swr-cache.js +34 -0
  357. package/dist/index.d.ts +143 -0
  358. package/dist/index.js +291 -0
  359. package/dist/integrations/index.d.ts +2 -0
  360. package/dist/integrations/index.js +2 -0
  361. package/dist/integrations/langchain.d.ts +64 -0
  362. package/dist/integrations/langchain.js +115 -0
  363. package/dist/integrations/llamaindex.d.ts +50 -0
  364. package/dist/integrations/llamaindex.js +91 -0
  365. package/dist/mcp/handlers/act.d.ts +5 -0
  366. package/dist/mcp/handlers/act.js +34 -0
  367. package/dist/mcp/handlers/definitions.d.ts +6 -0
  368. package/dist/mcp/handlers/definitions.js +395 -0
  369. package/dist/mcp/handlers/extract.d.ts +7 -0
  370. package/dist/mcp/handlers/extract.js +135 -0
  371. package/dist/mcp/handlers/fetch.d.ts +6 -0
  372. package/dist/mcp/handlers/fetch.js +98 -0
  373. package/dist/mcp/handlers/find.d.ts +5 -0
  374. package/dist/mcp/handlers/find.js +137 -0
  375. package/dist/mcp/handlers/index.d.ts +13 -0
  376. package/dist/mcp/handlers/index.js +63 -0
  377. package/dist/mcp/handlers/legacy.d.ts +25 -0
  378. package/dist/mcp/handlers/legacy.js +450 -0
  379. package/dist/mcp/handlers/meta.d.ts +6 -0
  380. package/dist/mcp/handlers/meta.js +40 -0
  381. package/dist/mcp/handlers/monitor.d.ts +5 -0
  382. package/dist/mcp/handlers/monitor.js +41 -0
  383. package/dist/mcp/handlers/observe.d.ts +8 -0
  384. package/dist/mcp/handlers/observe.js +37 -0
  385. package/dist/mcp/handlers/read.d.ts +6 -0
  386. package/dist/mcp/handlers/read.js +78 -0
  387. package/dist/mcp/handlers/see.d.ts +5 -0
  388. package/dist/mcp/handlers/see.js +75 -0
  389. package/dist/mcp/handlers/types.d.ts +29 -0
  390. package/dist/mcp/handlers/types.js +28 -0
  391. package/dist/mcp/server.d.ts +7 -0
  392. package/dist/mcp/server.js +108 -0
  393. package/dist/mcp/smart-router.d.ts +23 -0
  394. package/dist/mcp/smart-router.js +178 -0
  395. package/dist/server/app.d.ts +14 -0
  396. package/dist/server/app.js +632 -0
  397. package/dist/server/auth-store.d.ts +28 -0
  398. package/dist/server/auth-store.js +88 -0
  399. package/dist/server/bull-queues.d.ts +60 -0
  400. package/dist/server/bull-queues.js +90 -0
  401. package/dist/server/email-service.d.ts +55 -0
  402. package/dist/server/email-service.js +291 -0
  403. package/dist/server/job-queue.d.ts +100 -0
  404. package/dist/server/job-queue.js +145 -0
  405. package/dist/server/logger.d.ts +10 -0
  406. package/dist/server/logger.js +37 -0
  407. package/dist/server/middleware/audit-log.d.ts +14 -0
  408. package/dist/server/middleware/audit-log.js +73 -0
  409. package/dist/server/middleware/auth.d.ts +35 -0
  410. package/dist/server/middleware/auth.js +225 -0
  411. package/dist/server/middleware/rate-limit.d.ts +50 -0
  412. package/dist/server/middleware/rate-limit.js +270 -0
  413. package/dist/server/middleware/scope-guard.d.ts +25 -0
  414. package/dist/server/middleware/scope-guard.js +45 -0
  415. package/dist/server/middleware/url-validator.d.ts +15 -0
  416. package/dist/server/middleware/url-validator.js +201 -0
  417. package/dist/server/openapi.yaml +6418 -0
  418. package/dist/server/pg-auth-store.d.ts +146 -0
  419. package/dist/server/pg-auth-store.js +576 -0
  420. package/dist/server/pg-job-queue.d.ts +59 -0
  421. package/dist/server/pg-job-queue.js +375 -0
  422. package/dist/server/routes/activity.d.ts +6 -0
  423. package/dist/server/routes/activity.js +79 -0
  424. package/dist/server/routes/admin-active.d.ts +7 -0
  425. package/dist/server/routes/admin-active.js +120 -0
  426. package/dist/server/routes/admin-stats.d.ts +7 -0
  427. package/dist/server/routes/admin-stats.js +176 -0
  428. package/dist/server/routes/agent.d.ts +24 -0
  429. package/dist/server/routes/agent.js +480 -0
  430. package/dist/server/routes/answer.d.ts +5 -0
  431. package/dist/server/routes/answer.js +125 -0
  432. package/dist/server/routes/ask.d.ts +28 -0
  433. package/dist/server/routes/ask.js +295 -0
  434. package/dist/server/routes/batch.d.ts +6 -0
  435. package/dist/server/routes/batch.js +493 -0
  436. package/dist/server/routes/cache-warm.d.ts +25 -0
  437. package/dist/server/routes/cache-warm.js +212 -0
  438. package/dist/server/routes/cli-usage.d.ts +6 -0
  439. package/dist/server/routes/cli-usage.js +127 -0
  440. package/dist/server/routes/compat.d.ts +23 -0
  441. package/dist/server/routes/compat.js +652 -0
  442. package/dist/server/routes/crawl.d.ts +13 -0
  443. package/dist/server/routes/crawl.js +287 -0
  444. package/dist/server/routes/deep-fetch.d.ts +8 -0
  445. package/dist/server/routes/deep-fetch.js +57 -0
  446. package/dist/server/routes/deep-research.d.ts +11 -0
  447. package/dist/server/routes/deep-research.js +232 -0
  448. package/dist/server/routes/demo.d.ts +24 -0
  449. package/dist/server/routes/demo.js +517 -0
  450. package/dist/server/routes/do.d.ts +8 -0
  451. package/dist/server/routes/do.js +72 -0
  452. package/dist/server/routes/extract.d.ts +14 -0
  453. package/dist/server/routes/extract.js +325 -0
  454. package/dist/server/routes/feed.d.ts +15 -0
  455. package/dist/server/routes/feed.js +311 -0
  456. package/dist/server/routes/fetch-queue.d.ts +13 -0
  457. package/dist/server/routes/fetch-queue.js +357 -0
  458. package/dist/server/routes/fetch.d.ts +7 -0
  459. package/dist/server/routes/fetch.js +1274 -0
  460. package/dist/server/routes/go.d.ts +14 -0
  461. package/dist/server/routes/go.js +81 -0
  462. package/dist/server/routes/health.d.ts +11 -0
  463. package/dist/server/routes/health.js +141 -0
  464. package/dist/server/routes/jobs.d.ts +7 -0
  465. package/dist/server/routes/jobs.js +574 -0
  466. package/dist/server/routes/map.d.ts +11 -0
  467. package/dist/server/routes/map.js +116 -0
  468. package/dist/server/routes/mcp.d.ts +14 -0
  469. package/dist/server/routes/mcp.js +197 -0
  470. package/dist/server/routes/metrics.d.ts +37 -0
  471. package/dist/server/routes/metrics.js +149 -0
  472. package/dist/server/routes/oauth.d.ts +9 -0
  473. package/dist/server/routes/oauth.js +396 -0
  474. package/dist/server/routes/playground.d.ts +17 -0
  475. package/dist/server/routes/playground.js +283 -0
  476. package/dist/server/routes/reader.d.ts +18 -0
  477. package/dist/server/routes/reader.js +192 -0
  478. package/dist/server/routes/research.d.ts +14 -0
  479. package/dist/server/routes/research.js +482 -0
  480. package/dist/server/routes/screenshot.d.ts +22 -0
  481. package/dist/server/routes/screenshot.js +820 -0
  482. package/dist/server/routes/search.d.ts +6 -0
  483. package/dist/server/routes/search.js +874 -0
  484. package/dist/server/routes/session.d.ts +17 -0
  485. package/dist/server/routes/session.js +548 -0
  486. package/dist/server/routes/share.d.ts +18 -0
  487. package/dist/server/routes/share.js +462 -0
  488. package/dist/server/routes/smart-search/handlers/cars.d.ts +2 -0
  489. package/dist/server/routes/smart-search/handlers/cars.js +102 -0
  490. package/dist/server/routes/smart-search/handlers/flights.d.ts +2 -0
  491. package/dist/server/routes/smart-search/handlers/flights.js +72 -0
  492. package/dist/server/routes/smart-search/handlers/general.d.ts +13 -0
  493. package/dist/server/routes/smart-search/handlers/general.js +717 -0
  494. package/dist/server/routes/smart-search/handlers/hotels.d.ts +2 -0
  495. package/dist/server/routes/smart-search/handlers/hotels.js +88 -0
  496. package/dist/server/routes/smart-search/handlers/products.d.ts +2 -0
  497. package/dist/server/routes/smart-search/handlers/products.js +1309 -0
  498. package/dist/server/routes/smart-search/handlers/rental.d.ts +2 -0
  499. package/dist/server/routes/smart-search/handlers/rental.js +154 -0
  500. package/dist/server/routes/smart-search/handlers/restaurants.d.ts +2 -0
  501. package/dist/server/routes/smart-search/handlers/restaurants.js +225 -0
  502. package/dist/server/routes/smart-search/handlers/transit-verdict.d.ts +41 -0
  503. package/dist/server/routes/smart-search/handlers/transit-verdict.js +224 -0
  504. package/dist/server/routes/smart-search/index.d.ts +19 -0
  505. package/dist/server/routes/smart-search/index.js +546 -0
  506. package/dist/server/routes/smart-search/intent.d.ts +3 -0
  507. package/dist/server/routes/smart-search/intent.js +264 -0
  508. package/dist/server/routes/smart-search/llm.d.ts +16 -0
  509. package/dist/server/routes/smart-search/llm.js +70 -0
  510. package/dist/server/routes/smart-search/sources/reddit.d.ts +18 -0
  511. package/dist/server/routes/smart-search/sources/reddit.js +34 -0
  512. package/dist/server/routes/smart-search/sources/yelp.d.ts +25 -0
  513. package/dist/server/routes/smart-search/sources/yelp.js +171 -0
  514. package/dist/server/routes/smart-search/sources/youtube.d.ts +8 -0
  515. package/dist/server/routes/smart-search/sources/youtube.js +9 -0
  516. package/dist/server/routes/smart-search/types.d.ts +81 -0
  517. package/dist/server/routes/smart-search/types.js +1 -0
  518. package/dist/server/routes/smart-search/utils.d.ts +20 -0
  519. package/dist/server/routes/smart-search/utils.js +146 -0
  520. package/dist/server/routes/stats.d.ts +6 -0
  521. package/dist/server/routes/stats.js +71 -0
  522. package/dist/server/routes/stripe.d.ts +15 -0
  523. package/dist/server/routes/stripe.js +296 -0
  524. package/dist/server/routes/transcript-export.d.ts +10 -0
  525. package/dist/server/routes/transcript-export.js +178 -0
  526. package/dist/server/routes/usage.d.ts +9 -0
  527. package/dist/server/routes/usage.js +279 -0
  528. package/dist/server/routes/users.d.ts +8 -0
  529. package/dist/server/routes/users.js +1867 -0
  530. package/dist/server/routes/watch.d.ts +15 -0
  531. package/dist/server/routes/watch.js +309 -0
  532. package/dist/server/routes/webhooks.d.ts +26 -0
  533. package/dist/server/routes/webhooks.js +170 -0
  534. package/dist/server/routes/youtube.d.ts +6 -0
  535. package/dist/server/routes/youtube.js +130 -0
  536. package/dist/server/sentry.d.ts +14 -0
  537. package/dist/server/sentry.js +104 -0
  538. package/dist/server/types.d.ts +15 -0
  539. package/dist/server/types.js +7 -0
  540. package/dist/server/utils/response.d.ts +44 -0
  541. package/dist/server/utils/response.js +69 -0
  542. package/dist/server/utils/sse.d.ts +22 -0
  543. package/dist/server/utils/sse.js +38 -0
  544. package/dist/types.d.ts +552 -0
  545. package/dist/types.js +39 -0
  546. package/llms.txt +105 -0
  547. package/package.json +189 -0
@@ -0,0 +1,1175 @@
1
+ /**
2
+ * YouTube transcript extraction — no API key required.
3
+ *
4
+ * YouTube embeds caption/transcript data directly in the page HTML as JSON
5
+ * (inside ytInitialPlayerResponse). We parse that JSON, extract caption
6
+ * track URLs, fetch the timedtext XML, and return structured transcript data.
7
+ */
8
+ import { execFile } from 'node:child_process';
9
+ import * as http from 'node:http';
10
+ import * as https from 'node:https';
11
+ import * as tls from 'node:tls';
12
+ import { readFile, unlink } from 'node:fs/promises';
13
+ import { tmpdir } from 'node:os';
14
+ import { join } from 'node:path';
15
+ import { fetchTranscript as ytpFetchTranscript } from 'youtube-transcript-plus';
16
+ import { simpleFetch } from './fetcher.js';
17
+ import { getBrowser, getRandomUserAgent, applyStealthScripts } from './browser-pool.js';
18
+ import { hasWebshareProxy as _hasWebshareProxy } from './proxy-config.js';
19
+ import { createLogger } from './logger.js';
// ---------------------------------------------------------------------------
// yt-dlp startup diagnostics
// ---------------------------------------------------------------------------
const _ytLog = createLogger('youtube');
/**
 * Whether a working `yt-dlp` binary was detected at startup.
 * Starts out false and is flipped to true asynchronously once the probe
 * below completes successfully.
 */
let ytdlpAvailable = false;
// Probe for yt-dlp without blocking the event loop (the previous synchronous
// probe could stall module start-up for up to the 5 s timeout).
// Skipped in test environments (VITEST) to avoid interfering with mocked paths.
// Uses logger.debug (→ stderr) so it never pollutes stdout JSON output when piped.
if (!process.env.VITEST) {
    try {
        execFile('yt-dlp', ['--version'], {
            timeout: 5000,
            // Prepend the usual install locations; they may be missing from PATH.
            env: { ...process.env, PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}` },
        }, (err, stdout) => {
            if (err) {
                _ytLog.debug('yt-dlp NOT available — falling back to HTTP extraction');
                return;
            }
            ytdlpAvailable = true;
            _ytLog.debug(`yt-dlp available: v${String(stdout).trim()}`);
        });
    }
    catch {
        // execFile itself refused to start (e.g. invalid environment).
        _ytLog.debug('yt-dlp NOT available — falling back to HTTP extraction');
    }
}
// ---------------------------------------------------------------------------
// URL parsing
// ---------------------------------------------------------------------------
/**
 * Extract the video ID from any common YouTube URL format.
 * Returns null if the URL is not a recognisable YouTube URL.
 *
 * Supported formats:
 *   https://www.youtube.com/watch?v=VIDEO_ID
 *   https://youtu.be/VIDEO_ID
 *   https://www.youtube.com/embed/VIDEO_ID
 *   https://www.youtube.com/shorts/VIDEO_ID
 *   https://www.youtube.com/v/VIDEO_ID (old embed format)
 *   https://www.youtube.com/live/VIDEO_ID
 *   https://m.youtube.com/watch?v=VIDEO_ID
 *   https://music.youtube.com/watch?v=VIDEO_ID
 *   https://www.youtube-nocookie.com/embed/VIDEO_ID
 *   URLs with extra params (&t=120, &list=PLxxx, etc.)
 *
 * @param {string} url - Candidate URL; surrounding whitespace is ignored.
 * @returns {string|null} The 11-character video ID, or null.
 */
export function parseYouTubeUrl(url) {
    if (!url || typeof url !== 'string')
        return null;
    let parsed;
    try {
        parsed = new URL(url.trim());
    }
    catch {
        return null;
    }
    // Normalise away the "www.", "m." and "music." sub-domains.
    const host = parsed.hostname
        .toLowerCase()
        .replace(/^www\./, '')
        .replace(/^(?:m|music)\./, '');
    if (host === 'youtu.be') {
        // https://youtu.be/VIDEO_ID
        const id = parsed.pathname.slice(1).split('/')[0];
        return isValidVideoId(id) ? id : null;
    }
    if (host !== 'youtube.com' && host !== 'youtube-nocookie.com')
        return null;
    // /watch?v=VIDEO_ID
    if (parsed.pathname === '/watch' || parsed.pathname === '/watch/') {
        const id = parsed.searchParams.get('v');
        return isValidVideoId(id) ? id : null;
    }
    // /embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID, /live/VIDEO_ID
    const [, prefix, id] = parsed.pathname.split('/');
    if (['embed', 'shorts', 'v', 'live'].includes(prefix) && isValidVideoId(id))
        return id;
    return null;
}
/**
 * True when `id` is a string of exactly 11 characters from [A-Za-z0-9_-].
 */
function isValidVideoId(id) {
    return typeof id === 'string' && /^[A-Za-z0-9_-]{11}$/.test(id);
}
// ---------------------------------------------------------------------------
// Video info extraction
// ---------------------------------------------------------------------------
/**
 * Build a video metadata record from YouTube watch-page HTML.
 *
 * Values come from the embedded player response (`videoDetails` first, then
 * the player microformat), with Open Graph meta tags as a last resort for
 * title and description. Missing fields default to the empty string; the
 * thumbnail falls back to the img.youtube.com "maxresdefault" URL.
 *
 * @param {string} html - Raw HTML of the watch page.
 * @returns {{videoId: string, title: string, channel: string, description: string,
 *   duration: string, publishDate: string, viewCount: string, likeCount: string,
 *   thumbnail: string}}
 */
export function extractVideoInfo(html) {
    const player = extractPlayerResponse(html);
    const details = player?.videoDetails ?? {};
    const micro = player?.microformat?.playerMicroformatRenderer ?? {};
    // The last entry of a thumbnails list is the one the original code picks.
    const lastThumbnailUrl = (holder) => holder.thumbnail?.thumbnails?.slice(-1)[0]?.url;
    const videoId = details.videoId ?? '';
    const lengthSeconds = parseInt(details.lengthSeconds ?? micro.lengthSeconds ?? '0', 10);
    return {
        videoId,
        title: details.title ??
            micro.title?.simpleText ??
            extractMetaTag(html, 'og:title') ??
            '',
        channel: details.author ?? micro.ownerChannelName ?? '',
        description: details.shortDescription ??
            micro.description?.simpleText ??
            extractMetaTag(html, 'og:description') ??
            '',
        duration: formatDuration(lengthSeconds),
        publishDate: micro.publishDate ?? micro.uploadDate ?? '',
        viewCount: details.viewCount ?? micro.viewCount ?? '',
        // likeCount is often not available without auth
        likeCount: details.likeCount ?? '',
        thumbnail: lastThumbnailUrl(details) ??
            lastThumbnailUrl(micro) ??
            `https://img.youtube.com/vi/${videoId}/maxresdefault.jpg`,
    };
}
// ---------------------------------------------------------------------------
// Structured content helpers
// ---------------------------------------------------------------------------
/**
 * Pull chapter markers out of a video description.
 *
 * A chapter line starts with a timestamp ("0:00", "1:23" or "1:23:45")
 * followed by whitespace and a title, e.g. "0:00 Intro\n2:34 Main topic".
 *
 * @param {string} description - Video description text.
 * @returns {{time: string, title: string}[]} Chapters in order of appearance,
 *   or an empty array when fewer than two timestamped lines are found.
 */
export function parseChaptersFromDescription(description) {
    if (!description)
        return [];
    const timestampLine = /^(\d+:\d{2}(?::\d{2})?)\s+(.+)$/gm;
    const found = [];
    for (const [, stamp, label] of String(description).matchAll(timestampLine)) {
        const title = label.trim();
        if (title) {
            found.push({ time: stamp.trim(), title });
        }
    }
    // A lone timestamp is probably not a chapter list.
    return found.length < 2 ? [] : found;
}
/**
 * Convert "M:SS" or "H:MM:SS" into a number of seconds.
 * Any other number of colon-separated fields yields 0.
 */
function timeStringToSeconds(timeStr) {
    const fields = timeStr.split(':').map(Number);
    switch (fields.length) {
        case 3: {
            const [hours, minutes, seconds] = fields;
            return hours * 3600 + minutes * 60 + seconds;
        }
        case 2: {
            const [minutes, seconds] = fields;
            return minutes * 60 + seconds;
        }
        default:
            return 0;
    }
}
/**
 * Break text into trimmed, non-empty sentences.
 * A boundary is whitespace that directly follows ".", "!" or "?".
 */
function splitSentences(text) {
    const sentences = [];
    for (const piece of text.split(/(?<=[.!?])\s+/)) {
        const trimmed = piece.trim();
        if (trimmed) {
            sentences.push(trimmed);
        }
    }
    return sentences;
}
/**
 * Pick one representative sentence per time block of a transcript.
 *
 * Blocks follow the chapter timestamps when at least two chapters are given;
 * otherwise the timeline is cut into 2-minute windows. For each block the
 * first sentence containing five or more words is kept.
 *
 * @param {{text: string, start: number, duration: number}[]} segments
 * @param {{time: string, title: string}[]} chapters
 * @param {number} durationSeconds - Video length; when falsy, the end of the
 *   last segment is used instead.
 * @returns {string[]} At most 12 key-point sentences.
 */
export function extractKeyPoints(segments, chapters, durationSeconds) {
    if (segments.length === 0)
        return [];
    const finalSegment = segments[segments.length - 1];
    const totalDuration = durationSeconds || (finalSegment.start + finalSegment.duration);
    // Build time blocks
    let blocks;
    if (chapters.length >= 2) {
        blocks = chapters.map((chapter, index) => {
            const following = chapters[index + 1];
            return {
                start: timeStringToSeconds(chapter.time),
                end: following ? timeStringToSeconds(following.time) : totalDuration || Infinity,
            };
        });
    }
    else {
        // Auto-segment every 2 minutes
        const blockDuration = 120;
        const horizon = totalDuration || 600;
        blocks = [];
        let cursor = 0;
        while (cursor < horizon) {
            blocks.push({ start: cursor, end: cursor + blockDuration });
            cursor += blockDuration;
        }
        if (blocks.length === 0) {
            blocks = [{ start: 0, end: Infinity }];
        }
    }
    const keyPoints = [];
    for (const { start, end } of blocks) {
        const inBlock = segments.filter((seg) => seg.start >= start && seg.start < end);
        if (inBlock.length === 0)
            continue;
        const blockText = inBlock
            .map((seg) => seg.text)
            .join(' ')
            .replace(/\s+/g, ' ')
            .trim();
        // First sentence with at least 5 words
        const candidate = splitSentences(blockText).find((sentence) => sentence.split(/\s+/).length >= 5);
        if (candidate) {
            keyPoints.push(candidate.trim());
        }
    }
    return keyPoints.slice(0, 12);
}
/**
 * Produce a summary made of the first ~200 whitespace-separated words.
 *
 * @param {string} fullText - Complete transcript text.
 * @returns {string} '' for falsy input, the text unchanged when it has 200
 *   words or fewer, otherwise the first 200 words followed by '...'.
 */
export function extractSummary(fullText) {
    if (!fullText)
        return '';
    const words = fullText.split(/\s+/);
    return words.length > 200
        ? words.slice(0, 200).join(' ') + '...'
        : fullText;
}
// ---------------------------------------------------------------------------
// Proxy-based InnerTube transcript extraction
// ---------------------------------------------------------------------------
/**
 * Read a positive integer from an environment variable.
 * Returns `fallback` when the variable is unset, empty, not a number, or not
 * greater than zero, so a malformed value can never turn into NaN further down.
 */
function readPositiveIntEnv(name, fallback) {
    const parsed = Number.parseInt(process.env[name] ?? '', 10);
    return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}
// Webshare residential proxy config — reads from env vars via proxy-config.ts.
// Locally, falls back to direct fetch (residential IP already works).
// These constants are kept for use in proxyRequestSlotted() which does
// low-level HTTP CONNECT tunneling (not Playwright-level proxy).
const PROXY_HOST = process.env.WEBSHARE_PROXY_HOST || 'p.webshare.io';
const PROXY_BASE_PORT = readPositiveIntEnv('WEBSHARE_PROXY_PORT', 10000);
const PROXY_USER = process.env.WEBSHARE_PROXY_USER || '';
const PROXY_PASS = process.env.WEBSHARE_PROXY_PASS || '';
// With paid Webshare backbone plan, each US slot has its own port:
// slot N → port (PROXY_BASE_PORT + N - 1), username: USER-US-N
const PROXY_MAX_US_SLOTS = readPositiveIntEnv('WEBSHARE_PROXY_SLOTS', 44744);
/**
 * Whether the Webshare proxy is configured.
 * Delegates to the shared proxy-config helper for consistency.
 */
function isProxyConfigured() {
    return _hasWebshareProxy();
}
/**
 * Make an HTTPS request through the Webshare CONNECT proxy with a specific
 * slotted username (e.g. "argtnlhz-5"). This ensures both the /player call
 * and the caption XML fetch go through the same residential IP.
 *
 * Flow: HTTP CONNECT to the proxy → TLS handshake over the tunnelled socket →
 * HTTPS request on that TLS socket.
 *
 * @param {string} slottedUser - Proxy username including the slot suffix.
 * @param {number} proxyPort - Proxy port for that slot.
 * @param {string} targetUrl - Absolute URL to request through the tunnel.
 * @param {{method?: string, body?: string, headers?: Object, timeoutMs?: number}} [opts]
 *   Request options; `timeoutMs` defaults to 20000 and covers the whole exchange.
 * @returns {Promise<{status: number, body: string}>} Resolves with the status
 *   code (0 if unknown) and the UTF-8 decoded body. Rejects on proxy, TLS or
 *   request errors, on a non-200 CONNECT reply, or on timeout.
 */
function proxyRequestSlotted(slottedUser, proxyPort, targetUrl, opts = {}) {
    const url = new URL(targetUrl);
    const timeout = opts.timeoutMs ?? 20000;
    return new Promise((resolve, reject) => {
        let tlsSocket = null;
        let timer = null;
        let settled = false;
        // Settle exactly once and always cancel the timeout.
        const settle = (action, value) => {
            if (settled)
                return;
            settled = true;
            clearTimeout(timer);
            action(value);
        };
        const fail = (err) => settle(reject, err);
        const proxyAuth = Buffer.from(`${slottedUser}:${PROXY_PASS}`).toString('base64');
        const proxyReq = http.request({
            host: PROXY_HOST,
            port: proxyPort,
            method: 'CONNECT',
            // Honour an explicit port in the target URL; default to 443.
            path: `${url.hostname}:${url.port || 443}`,
            headers: { 'Proxy-Authorization': `Basic ${proxyAuth}` },
        });
        timer = setTimeout(() => {
            // Tear down both the CONNECT request and, if already established,
            // the tunnelled TLS socket so no connection is left dangling.
            proxyReq.destroy();
            tlsSocket?.destroy();
            fail(new Error('Proxy request timed out'));
        }, timeout);
        proxyReq.on('connect', (res, socket) => {
            if (res.statusCode !== 200) {
                socket.destroy();
                fail(new Error(`Proxy CONNECT failed: ${res.statusCode}`));
                return;
            }
            tlsSocket = tls.connect({ host: url.hostname, socket, servername: url.hostname }, () => {
                const reqHeaders = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Cookie': 'CONSENT=YES+; SOCS=CAI',
                    ...(opts.headers ?? {}),
                };
                const req = https.request({
                    hostname: url.hostname,
                    path: url.pathname + url.search,
                    method: opts.method ?? 'GET',
                    // Reuse the already-established tunnel instead of dialling out.
                    createConnection: () => tlsSocket,
                    headers: reqHeaders,
                }, (response) => {
                    // Decode as a stream so multi-byte UTF-8 characters that
                    // straddle chunk boundaries are not corrupted.
                    response.setEncoding('utf8');
                    let data = '';
                    response.on('data', (chunk) => {
                        data += chunk;
                    });
                    response.on('end', () => {
                        settle(resolve, { status: response.statusCode ?? 0, body: data });
                    });
                    response.on('error', fail);
                });
                req.on('error', fail);
                if (opts.body)
                    req.write(opts.body);
                req.end();
            });
            tlsSocket.on('error', fail);
        });
        proxyReq.on('error', fail);
        proxyReq.end();
    });
}
/**
 * Fetch YouTube transcript via InnerTube /player API through Webshare proxy.
 *
 * This replicates the approach used by the Python `youtube-transcript-api` library:
 * 1. POST to /youtubei/v1/player with ANDROID client context
 * 2. Get caption track URLs WITHOUT the `exp=xpe` parameter
 * 3. Fetch caption XML from those clean URLs (returns actual data, not 0 bytes)
 *
 * All requests go through the residential proxy to bypass YouTube's cloud IP blocking.
 * Up to 5 randomly chosen proxy slots are tried; any failure on a slot (HTTP
 * error, missing tracks, empty or unparsable XML, thrown exception) moves on
 * to the next slot.
 *
 * @param {string} videoId - YouTube video ID.
 * @param {string} preferredLang - Preferred caption language code; falls back
 *   to 'en', then to the first available track.
 * @returns {Promise<object|null>} The transcript result, or null when every
 *   attempted slot failed.
 */
async function getTranscriptViaProxy(videoId, preferredLang) {
    // Try multiple proxy slots from the 44K+ US residential pool.
    // Pick random slots across the pool for even distribution and to avoid
    // rate-limited IPs. Try up to MAX_RETRIES different slots.
    const MAX_RETRIES = 5;
    const usedSlots = new Set();
    // Public YouTube web-client InnerTube key embedded in their shipped client, not a WebPeel secret.
    const INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8';
    for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        // Pick a random US slot we haven't tried yet
        let slot;
        do {
            slot = Math.floor(Math.random() * PROXY_MAX_US_SLOTS) + 1;
        } while (usedSlots.has(slot) && usedSlots.size < PROXY_MAX_US_SLOTS);
        usedSlots.add(slot);
        const proxyUser = `${PROXY_USER}-US-${slot}`;
        const proxyPort = PROXY_BASE_PORT + slot - 1;
        const doProxyRequest = (url, opts = {}) => proxyRequestSlotted(proxyUser, proxyPort, url, opts);
        try {
            // Step 1: Call InnerTube /player with ANDROID client
            // ANDROID client returns caption URLs WITHOUT exp=xpe (avoids 0-byte responses).
            const playerResp = await doProxyRequest(`https://www.youtube.com/youtubei/v1/player?key=${INNERTUBE_API_KEY}`, {
                method: 'POST',
                body: JSON.stringify({
                    context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
                    videoId,
                }),
                headers: { 'Content-Type': 'application/json' },
            });
            if (playerResp.status !== 200) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): /player returned ${playerResp.status}`);
                continue;
            }
            const playerData = JSON.parse(playerResp.body);
            const captionTracks = playerData?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
            if (!captionTracks || captionTracks.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no caption tracks`);
                continue;
            }
            // Pick best matching language track
            let track = captionTracks.find((t) => t.languageCode === preferredLang);
            if (!track) {
                track = captionTracks.find((t) => t.languageCode === 'en') ?? captionTracks[0];
            }
            const captionUrl = track.baseUrl;
            if (captionUrl.includes('exp=xpe')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption URL has exp=xpe, skipping`);
                continue;
            }
            // Step 2: Fetch caption XML through the SAME proxy slot (same residential IP)
            const capResp = await doProxyRequest(captionUrl);
            if (!capResp.body ||
                capResp.body.length === 0 ||
                capResp.status === 429 ||
                capResp.body.includes('<title>Sorry...</title>')) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): caption XML failed (status=${capResp.status}, bytes=${capResp.body?.length ?? 0})`);
                continue; // Try next slot
            }
            // Parse XML segments — handles both <text start="" dur=""> and <p t="" d=""> formats.
            // The tag name is captured because it decides the time unit below.
            const xmlSegments = [
                ...capResp.body.matchAll(/<(text|p)\s[^>]*?\b(?:start|t)="([^"]*)"[^>]*?\b(?:dur|d)="([^"]*)"[^>]*>([\s\S]*?)<\/(?:text|p)>/g),
            ];
            if (xmlSegments.length === 0) {
                console.log(`[webpeel] [youtube] Proxy US-${slot} (port ${proxyPort}): no segments parsed from XML`);
                continue;
            }
            const segments = xmlSegments
                .map(([, tag, startAttr, durAttr, inner]) => {
                    // <p t d> carries milliseconds, <text start dur> carries seconds.
                    // Deciding by tag (not by the presence of a ".") keeps
                    // integer-second values such as start="12" correct.
                    const divisor = tag === 'p' ? 1000 : 1;
                    return {
                        text: decodeHtmlEntities(inner.replace(/<[^>]+>/g, '').replace(/\n/g, ' ').trim()),
                        start: parseFloat(startAttr) / divisor,
                        duration: parseFloat(durAttr) / divisor,
                    };
                })
                .filter((s) => s.text.length > 0);
            if (segments.length === 0)
                continue;
            // Extract metadata from player response
            const vd = playerData.videoDetails ?? {};
            const mf = playerData.microformat?.playerMicroformatRenderer ?? {};
            const title = vd.title ?? '';
            const channel = vd.author ?? '';
            const lengthSeconds = parseInt(vd.lengthSeconds ?? mf.lengthSeconds ?? '0', 10);
            const description = (vd.shortDescription ?? mf.description?.simpleText ?? '').trim();
            const publishDate = mf.publishDate ?? mf.uploadDate ?? '';
            const availableLanguages = captionTracks.map((t) => t.languageCode);
            const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
            const wordCount = fullText.split(/\s+/).filter(Boolean).length;
            const chapters = parseChaptersFromDescription(description);
            const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
            const summary = extractSummary(fullText);
            const viewCount = vd.viewCount ?? mf.viewCount ?? '';
            const likeCount = vd.likeCount ?? '';
            console.log(`[webpeel] [youtube] Proxy slot ${slot} success: ${segments.length} segments, ${wordCount} words`);
            return {
                videoId,
                title,
                channel,
                duration: formatDuration(lengthSeconds),
                language: track.languageCode ?? preferredLang,
                segments,
                fullText,
                availableLanguages,
                description,
                publishDate,
                chapters: chapters.length > 0 ? chapters : undefined,
                keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
                summary,
                wordCount,
                viewCount: viewCount || undefined,
                likeCount: likeCount || undefined,
            };
        }
        catch (err) {
            // Network, JSON or parsing failure on this slot — try another one.
            console.log(`[webpeel] [youtube] Proxy slot ${slot} error:`, err?.message);
            continue;
        }
    }
    // All slots exhausted
    console.log('[webpeel] [youtube] All proxy slots exhausted');
    return null;
}
// ---------------------------------------------------------------------------
// Transcript extraction
// ---------------------------------------------------------------------------
/**
 * Fetch and return the transcript for a YouTube video.
 *
 * Strategies are attempted in order, each falling through to the next on failure:
 *   P. InnerTube /player via the residential proxy (only when configured)
 *   0. youtube-transcript-plus
 *   1. yt-dlp (only when detected at startup)
 *   2. Plain HTTP fetch of the watch page + caption XML
 *   3. Browser intercept
 * Paths P and 0 are skipped when the VITEST env var is set.
 *
 * @param url - Any YouTube URL format
 * @param options.language - Preferred language code (default: "en")
 * @throws {Error} When the URL is not a valid YouTube URL, or when path 2
 *   establishes that the video has no captions.
 */
export async function getYouTubeTranscript(url, options = {}) {
    const videoId = parseYouTubeUrl(url);
    if (!videoId) {
        throw new Error(`Not a valid YouTube URL: ${url}`);
    }
    const preferredLang = options.language ?? 'en';
    const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
    // Small local helpers shared by the result-building code of each path.
    const joinSegmentText = (segs) => segs.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
    const countWords = (text) => text.split(/\s+/).filter(Boolean).length;
    const orUndefined = (list) => (list.length > 0 ? list : undefined);
    // --- Path P: Proxy-based InnerTube (primary for cloud servers) ---
    // Webshare residential proxy + ANDROID InnerTube /player API.
    if (!process.env.VITEST && isProxyConfigured()) {
        console.log('[webpeel] [youtube] Trying path P: proxy-based InnerTube (residential proxy)');
        try {
            const viaProxy = await getTranscriptViaProxy(videoId, preferredLang);
            if (viaProxy && viaProxy.segments.length > 0) {
                console.log(`[webpeel] [youtube] Path P success: ${viaProxy.segments.length} segments, ${viaProxy.wordCount} words`);
                return viaProxy;
            }
            console.log('[webpeel] [youtube] Path P returned empty/null, falling through');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path P failed:', err?.message);
        }
    }
    // --- Path 0: youtube-transcript-plus ---
    // Skipped in test mode: tests mock HTTP, but this path makes real InnerTube calls.
    if (!process.env.VITEST) {
        console.log('[webpeel] [youtube] Trying path 0: youtube-transcript-plus (InnerTube API)');
        try {
            const rawSegments = await ytpFetchTranscript(videoId, { lang: preferredLang });
            if (rawSegments && rawSegments.length > 0) {
                // Segments are in hand; page metadata is a best-effort extra.
                const meta = {
                    title: '',
                    channel: '',
                    lengthSeconds: 0,
                    description: '',
                    publishDate: '',
                    availableLanguages: [preferredLang],
                };
                try {
                    const pageResp = await fetch(videoUrl, {
                        headers: {
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                            'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
                        },
                        signal: AbortSignal.timeout(8000),
                    });
                    const pageHtml = await pageResp.text();
                    const player = extractPlayerResponse(pageHtml);
                    if (player) {
                        const details = player.videoDetails ?? {};
                        const micro = player.microformat?.playerMicroformatRenderer ?? {};
                        meta.title = details.title ?? '';
                        meta.channel = details.author ?? '';
                        meta.lengthSeconds = parseInt(details.lengthSeconds ?? micro.lengthSeconds ?? '0', 10);
                        meta.description = (details.shortDescription ?? micro.description?.simpleText ?? '').trim();
                        meta.publishDate = micro.publishDate ?? micro.uploadDate ?? '';
                        const tracks = extractCaptionTracks(player);
                        if (tracks.length > 0) {
                            meta.availableLanguages = tracks.map((t) => t.languageCode);
                        }
                    }
                }
                catch { /* metadata fetch failed — segments are enough */ }
                // Convert youtube-transcript-plus entries (offset/duration in ms) to our shape.
                const segments = rawSegments
                    .map((entry) => ({
                        text: decodeHtmlEntities((entry.text ?? '').replace(/\n/g, ' ').trim()),
                        start: (entry.offset ?? 0) / 1000,
                        duration: (entry.duration ?? 0) / 1000,
                    }))
                    .filter((seg) => seg.text.length > 0);
                const fullText = joinSegmentText(segments);
                const wordCount = countWords(fullText);
                const chapters = parseChaptersFromDescription(meta.description);
                const keyPoints = extractKeyPoints(segments, chapters, meta.lengthSeconds);
                const summary = extractSummary(fullText);
                console.log(`[webpeel] [youtube] Path 0 success: ${segments.length} segments, ${wordCount} words`);
                return {
                    videoId,
                    title: meta.title,
                    channel: meta.channel,
                    duration: formatDuration(meta.lengthSeconds),
                    language: rawSegments[0]?.lang ?? preferredLang,
                    segments,
                    fullText,
                    availableLanguages: meta.availableLanguages,
                    description: meta.description,
                    publishDate: meta.publishDate,
                    chapters: orUndefined(chapters),
                    keyPoints: orUndefined(keyPoints),
                    summary,
                    wordCount,
                    viewCount: undefined, // not available in this path without extra fetch
                    likeCount: undefined,
                };
            }
            console.log('[webpeel] [youtube] Path 0 returned empty segments');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path 0 failed:', err?.message);
        }
    }
    const ytUserAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36';
    const ytHeaders = {
        'Cookie': 'SOCS=CAISNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwNTE1LjA3X3AxGgJlbiADGgYIgLv3tQY; CONSENT=PENDING+987',
        'Accept-Language': 'en-US,en;q=0.9',
    };
    // --- Path 1: yt-dlp ---
    if (!ytdlpAvailable) {
        console.log('[webpeel] [youtube] Skipping path 1: yt-dlp not available');
    }
    else {
        console.log('[webpeel] [youtube] Trying path 1: yt-dlp');
        try {
            const viaYtDlp = await getTranscriptViaYtDlp(videoId, preferredLang);
            if (viaYtDlp && viaYtDlp.segments.length > 0) {
                return viaYtDlp;
            }
            console.log('[webpeel] [youtube] Path 1 failed: yt-dlp returned no segments');
        }
        catch (err) {
            console.log('[webpeel] [youtube] Path 1 failed:', err?.message);
        }
    }
    // --- Path 2: HTTP fetch ---
    // simpleFetch first; when it reports a block/challenge, retry with native
    // fetch(). The SOCS consent cookie is sent in both cases.
    console.log('[webpeel] [youtube] Trying path 2: native fetch');
    try {
        let html;
        try {
            ({ html } = await simpleFetch(videoUrl, ytUserAgent, 15000, ytHeaders));
        }
        catch (simpleFetchErr) {
            const loweredMessage = (simpleFetchErr?.message ?? '').toLowerCase();
            const looksBlocked = simpleFetchErr?.constructor?.name === 'BlockedError' ||
                ['blocked', 'challenge', 'cloudflare'].some((needle) => loweredMessage.includes(needle));
            if (!looksBlocked) {
                throw simpleFetchErr;
            }
            console.log('[webpeel] [youtube] simpleFetch BlockedError — retrying with native fetch');
            const nativeResp = await fetch(videoUrl, {
                headers: {
                    'User-Agent': ytUserAgent,
                    ...ytHeaders,
                },
                redirect: 'follow',
                signal: AbortSignal.timeout(15000),
            });
            html = await nativeResp.text();
        }
        const looksLikeVideoPage = html.includes('ytInitialPlayerResponse') || html.includes('ytInitialData');
        if (!looksLikeVideoPage) {
            throw new Error('YouTube served non-video page (likely challenge/consent)');
        }
        const player = extractPlayerResponse(html);
        if (!player) {
            throw new Error('Could not parse player response');
        }
        const details = player.videoDetails ?? {};
        const micro = player.microformat?.playerMicroformatRenderer ?? {};
        const title = details.title ?? '';
        const channel = details.author ?? '';
        const lengthSeconds = parseInt(details.lengthSeconds ?? micro.lengthSeconds ?? '0', 10);
        const description = (details.shortDescription ?? micro.description?.simpleText ?? '').trim();
        const publishDate = micro.publishDate ?? micro.uploadDate ?? '';
        const tracks = extractCaptionTracks(player);
        if (tracks.length === 0) {
            throw new Error('No captions available');
        }
        const availableLanguages = tracks.map((t) => t.languageCode);
        const chosenTrack = selectBestTrack(tracks, preferredLang);
        // Same cookies + user-agent for the caption fetch — the URL is session-locked.
        const captionXml = await fetchCaptionXml(chosenTrack.baseUrl, ytUserAgent, ytHeaders);
        const segments = parseCaptionXml(captionXml);
        if (segments.length === 0) {
            // Empty caption content (common when ip=0.0.0.0 in signature);
            // thrown so the browser intercept path gets a chance.
            throw new Error('Caption XML returned empty — session-locked URL');
        }
        const fullText = joinSegmentText(segments);
        const wordCount = countWords(fullText);
        const chapters = parseChaptersFromDescription(description);
        const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
        const summary = extractSummary(fullText);
        return {
            videoId,
            title,
            channel,
            duration: formatDuration(lengthSeconds),
            language: chosenTrack.languageCode,
            segments,
            fullText,
            availableLanguages,
            description,
            publishDate,
            chapters: orUndefined(chapters),
            keyPoints: orUndefined(keyPoints),
            summary,
            wordCount,
            viewCount: (details.viewCount ?? micro.viewCount ?? '') || undefined,
            likeCount: (details.likeCount ?? '') || undefined,
        };
    }
    catch (err) {
        // Definitive failures are re-thrown: the browser path would not help.
        const msg = err?.message ?? '';
        const isDefinitive = msg.includes('No captions available') || msg.includes('Not a valid YouTube URL');
        if (isDefinitive) {
            throw err;
        }
        console.log('[webpeel] [youtube] Path 2 failed:', msg);
        // Network/parsing failures — fall through to browser intercept approach
    }
    // --- Path 3: Browser intercept ---
    // Caption URLs are session-specific, so the timedtext request made by the
    // YouTube player itself is intercepted instead.
    console.log('[webpeel] [youtube] Trying path 3: browser intercept');
    return getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang);
}
686
/**
 * Extract a YouTube transcript by running the `yt-dlp` CLI.
 *
 * yt-dlp handles the signature challenges (player JS deciphering, multiple
 * API endpoints) that defeat server-side HTTP fetch approaches. It is asked
 * to write the auto-generated subtitles (json3 format) and the video info
 * JSON under a unique prefix in the OS temp directory. Both files are read,
 * parsed, and removed again — including when yt-dlp fails part-way.
 *
 * @param {string} videoId - YouTube video id (used in the URL and temp prefix).
 * @param {string} preferredLang - Language code passed to `--sub-lang`.
 * @returns {Promise<object|null>} Transcript result, or `null` when yt-dlp is
 *   missing, errors, times out (60 s), or produced no usable subtitle file.
 */
async function getTranscriptViaYtDlp(videoId, preferredLang) {
    const outPath = join(tmpdir(), `webpeel_yt_${videoId}_${Date.now()}`);
    const videoUrl = `https://www.youtube.com/watch?v=${videoId}`;
    const infoPath = `${outPath}.info.json`;
    // Preferred language first, English as fallback; de-duplicated so that
    // preferredLang === 'en' does not probe the same file twice.
    const candidateLangs = [...new Set([preferredLang, 'en'])];
    const subtitlePath = (lang) => `${outPath}.${lang}.json3`;
    const readJson = async (file) => JSON.parse(await readFile(file, 'utf-8'));
    // Best-effort removal of every file yt-dlp may have written for this run.
    const removeTempFiles = () => Promise.all([...candidateLangs.map(subtitlePath), infoPath]
        .map((file) => unlink(file).catch(() => { })));
    // Turn the files written by yt-dlp into the transcript result (or null).
    const buildResult = async () => {
        let subData = null;
        let language = preferredLang;
        for (const lang of candidateLangs) {
            try {
                subData = await readJson(subtitlePath(lang));
                language = lang; // report the language that was actually read
                break;
            }
            catch { /* missing or unparsable — try next candidate */ }
        }
        if (!subData || !subData.events) {
            return null;
        }
        // Metadata is optional: a missing/unparsable info file must not
        // discard an otherwise valid transcript.
        let infoData = null;
        try {
            infoData = await readJson(infoPath);
        }
        catch { /* metadata is optional */ }
        const segments = subData.events
            .filter((e) => e.segs)
            .map((e) => ({
            text: e.segs.map((s) => s.utf8 || '').join('').trim(),
            start: (e.tStartMs || 0) / 1000,
            duration: (e.dDurationMs || 0) / 1000,
        }))
            .filter((s) => s.text.length > 0);
        const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
        const wordCount = fullText.split(/\s+/).filter(Boolean).length;
        const lengthSeconds = infoData?.duration || 0;
        const description = infoData?.description || '';
        // yt-dlp reports upload_date as YYYYMMDD; reformat to YYYY-MM-DD.
        const uploadDate = infoData?.upload_date;
        const publishDate = uploadDate
            ? `${uploadDate.slice(0, 4)}-${uploadDate.slice(4, 6)}-${uploadDate.slice(6, 8)}`
            : '';
        const chapters = parseChaptersFromDescription(description);
        const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
        return {
            videoId,
            title: infoData?.title || '',
            channel: infoData?.uploader || infoData?.channel || '',
            duration: formatDuration(lengthSeconds),
            language,
            segments,
            fullText,
            availableLanguages: [language],
            description,
            publishDate,
            chapters: chapters.length > 0 ? chapters : undefined,
            keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
            summary: extractSummary(fullText),
            wordCount,
            viewCount: (infoData?.view_count?.toString() ?? '') || undefined,
            likeCount: (infoData?.like_count?.toString() ?? '') || undefined,
        };
    };
    return new Promise((resolve) => {
        const args = [
            '--skip-download',
            '--write-auto-sub',
            '--sub-lang', preferredLang,
            '--sub-format', 'json3',
            '--write-info-json',
            '--output', outPath,
            '--no-warnings',
            '--quiet',
            videoUrl,
        ];
        // Pass explicit PATH so yt-dlp is found in Docker containers:
        // pip3 installs to /usr/local/bin which may not be in Node's process.env.PATH
        const execEnv = {
            ...process.env,
            PATH: `/usr/local/bin:/usr/bin:/bin:${process.env.PATH ?? ''}`,
        };
        const proc = execFile('yt-dlp', args, { timeout: 60000, env: execEnv }, (err) => {
            if (err) {
                // yt-dlp not installed, timed out, or failed
                console.error('[webpeel] yt-dlp error:', err.message);
            }
            const outcome = err ? Promise.resolve(null) : buildResult().catch(() => null);
            // Always clean up before handing the result back.
            outcome.finally(removeTempFiles).then(resolve);
        });
        // Safety: if the process cannot be spawned, resolve null
        proc.on('error', () => resolve(null));
    });
}
792
/**
 * Use a real browser with network route interception to capture the
 * YouTube caption JSON that the player fetches automatically on page load.
 * This preserves the session context needed for timedtext API requests.
 *
 * When no caption payload is captured (or it parses to zero segments) and the
 * video description is longer than 50 characters, the description is returned
 * as the transcript text instead.
 *
 * @param {string} videoId - YouTube video id (used in results and errors).
 * @param {string} videoUrl - Watch-page URL to navigate to.
 * @param {string} preferredLang - Language reported when the intercepted
 *   request URL carries no `lang` query parameter.
 * @returns {Promise<object>} Transcript result.
 * @throws {Error} When neither captions nor a usable description are available,
 *   or when the browser/navigation itself fails.
 */
async function getTranscriptViaBrowserIntercept(videoId, videoUrl, preferredLang) {
    const browser = await getBrowser();
    const ua = getRandomUserAgent();
    const context = await browser.newContext({ userAgent: ua });
    let page = null;
    // Everything after context creation runs inside try/finally so a failure
    // during page setup cannot leak the context or page.
    try {
        page = await context.newPage();
        await applyStealthScripts(page);
        let capturedJson = null;
        let capturedLang = preferredLang;
        // Intercept YouTube's timedtext API requests (the player fetches these automatically)
        await page.route('**/api/timedtext**', async (route) => {
            try {
                const response = await route.fetch();
                const text = await response.text();
                if (text && text.length > 100 && (text.includes('events') || text.includes('segs'))) {
                    try {
                        capturedJson = JSON.parse(text);
                        // Try to extract language from URL
                        const urlObj = new URL(route.request().url());
                        capturedLang = urlObj.searchParams.get('lang') || preferredLang;
                    }
                    catch { /* not JSON — keep waiting for another request */ }
                }
                await route.fulfill({ response });
            }
            catch {
                // The route may already be handled or the page closed; a
                // failing continue() must not reject out of the handler.
                try {
                    await route.continue();
                }
                catch { /* nothing left to do for this request */ }
            }
        });
        await page.goto(videoUrl, { waitUntil: 'domcontentloaded', timeout: 35000 });
        // Wait (up to 12 s) for a timedtext request to be intercepted
        const startWait = Date.now();
        while (!capturedJson && Date.now() - startWait < 12000) {
            await page.waitForTimeout(200);
        }
        // Also grab page HTML for video metadata
        const html = await page.content();
        const playerResponse = extractPlayerResponse(html);
        const videoDetails = playerResponse?.videoDetails ?? {};
        const microformat = playerResponse?.microformat?.playerMicroformatRenderer ?? {};
        const title = videoDetails.title ?? '';
        const channel = videoDetails.author ?? '';
        const lengthSeconds = parseInt(videoDetails.lengthSeconds ?? microformat.lengthSeconds ?? '0', 10);
        const description = (videoDetails.shortDescription ?? microformat.description?.simpleText ?? '').trim();
        const publishDate = microformat.publishDate ?? microformat.uploadDate ?? '';
        const captionTracks = playerResponse ? extractCaptionTracks(playerResponse) : [];
        const availableLanguages = captionTracks.map((t) => t.languageCode);
        const descriptionChapters = parseChaptersFromDescription(description);
        // Description-as-transcript result shared by both fallback paths below
        // (better than nothing when captions are unavailable).
        const descriptionOnlyResult = () => ({
            videoId,
            title,
            channel,
            duration: formatDuration(lengthSeconds),
            language: 'en',
            segments: [],
            fullText: description,
            availableLanguages,
            description,
            publishDate: publishDate || undefined,
            chapters: descriptionChapters.length > 0 ? descriptionChapters : undefined,
            wordCount: description.split(/\s+/).filter(Boolean).length,
        });
        // If no captions were intercepted, fall back to the video description
        if (!capturedJson) {
            if (description.length > 50) {
                return descriptionOnlyResult();
            }
            throw new Error(`No captions available for video ${videoId} — captions may be disabled`);
        }
        // Parse the JSON3 format (YouTube's native caption format)
        const segments = parseJson3Events(capturedJson);
        if (segments.length === 0) {
            if (description.length > 50) {
                return descriptionOnlyResult();
            }
            throw new Error(`Captured caption response had no segments for video ${videoId}`);
        }
        const fullText = segments.map((s) => s.text).join(' ').replace(/\s+/g, ' ').trim();
        const wordCount = fullText.split(/\s+/).filter(Boolean).length;
        const chapters = descriptionChapters;
        const keyPoints = extractKeyPoints(segments, chapters, lengthSeconds);
        const summary = extractSummary(fullText);
        return {
            videoId,
            title,
            channel,
            duration: formatDuration(lengthSeconds),
            language: capturedLang,
            segments,
            fullText,
            availableLanguages,
            description,
            publishDate: publishDate || undefined,
            chapters: chapters.length > 0 ? chapters : undefined,
            keyPoints: keyPoints.length > 0 ? keyPoints : undefined,
            summary,
            wordCount,
            viewCount: undefined, // browser path doesn't reliably get this
            likeCount: undefined,
        };
    }
    finally {
        if (page) {
            await page.close().catch(() => { });
        }
        await context.close().catch(() => { });
        // Note: browser itself is pooled — don't close it
    }
}
918
/**
 * Parse YouTube's JSON3 caption format (from intercepted timedtext requests).
 * Format: { events: [{ tStartMs, dDurationMs, segs: [{ utf8: "text" } or { u: "text" }] }] }
 *
 * The payload comes straight off the network, so malformed shapes (missing or
 * non-array `events`, null events, null segs) are skipped rather than thrown on.
 *
 * @param {object|null|undefined} data - Parsed JSON3 payload.
 * @returns {Array<{text: string, start: number, duration: number}>} Non-empty
 *   caption segments with start/duration converted from milliseconds to seconds.
 */
function parseJson3Events(data) {
    const events = Array.isArray(data?.events) ? data.events : [];
    return events
        .filter((event) => Array.isArray(event?.segs) && event.segs.some((seg) => seg?.utf8 || seg?.u))
        .map((event) => ({
        // YouTube uses 'utf8' key in modern responses, 'u' in some older ones
        text: decodeHtmlEntities(event.segs
            .map((seg) => seg?.utf8 ?? seg?.u ?? '')
            .join('')
            .replace(/\n/g, ' ')
            .trim()),
        start: (event.tStartMs || 0) / 1000,
        duration: (event.dDurationMs || 0) / 1000,
    }))
        .filter((segment) => segment.text.length > 0);
}
934
/**
 * Locate and parse the `ytInitialPlayerResponse` object embedded in a YouTube
 * watch page.
 *
 * Strategy, in order:
 *   1. Regex-capture the assignment and JSON.parse the capture.
 *   2. If that capture is not valid JSON, walk balanced braces from the first
 *      `{` after the first `ytInitialPlayerResponse` occurrence.
 *   3. As a last resort, parse the object that directly encloses the
 *      `"captionTracks"` key.
 *
 * @param {string} html - Page HTML.
 * @returns {object|null} The parsed object, or `null` when nothing parses.
 */
export function extractPlayerResponse(html) {
    const MARKER = 'ytInitialPlayerResponse';
    const embeddings = [
        // Modern: var ytInitialPlayerResponse = {...};
        /var ytInitialPlayerResponse\s*=\s*(\{.+?\});\s*(?:var|<\/script>)/s,
        // Also try without trailing var (some pages end differently)
        /ytInitialPlayerResponse\s*=\s*(\{.+?\})(?:;|\s*<\/script>)/s,
    ];
    for (const embedding of embeddings) {
        const found = html.match(embedding);
        if (!found) {
            continue;
        }
        try {
            return JSON.parse(found[1]);
        }
        catch {
            // The lazy capture stopped too early; recover below by brace walking.
        }
        const markerAt = html.indexOf(MARKER);
        const openBrace = markerAt === -1 ? -1 : html.indexOf('{', markerAt);
        if (openBrace === -1) {
            continue;
        }
        const balanced = extractJsonObject(html, openBrace);
        if (!balanced) {
            continue;
        }
        try {
            return JSON.parse(balanced);
        }
        catch (e) {
            if (process.env.DEBUG)
                console.debug('[webpeel]', 'player response parse failed:', e instanceof Error ? e.message : e);
        }
    }
    // Fallback: search for captionTracks directly and take the enclosing object
    const tracksKeyAt = html.indexOf('"captionTracks"');
    const enclosingBrace = tracksKeyAt === -1 ? -1 : html.lastIndexOf('{', tracksKeyAt);
    if (enclosingBrace !== -1) {
        const candidate = extractJsonObject(html, enclosingBrace);
        if (candidate) {
            try {
                return JSON.parse(candidate);
            }
            catch { /* ignore */ }
        }
    }
    return null;
}
989
/**
 * Slice one balanced JSON value out of `str`, beginning at index `start`.
 *
 * Tracks `{`/`[` nesting while ignoring brackets inside double-quoted string
 * literals (backslash escapes are honoured inside strings). Bracket kinds are
 * counted together, not matched against each other.
 *
 * @param {string} str - Text containing the JSON value.
 * @param {number} start - Index of the opening bracket.
 * @returns {string|null} The substring from `start` through the bracket that
 *   returns nesting to zero, or `null` if the input ends first.
 */
function extractJsonObject(str, start) {
    let nesting = 0;
    let insideString = false;
    let skipNext = false;
    for (let idx = start; idx < str.length; idx += 1) {
        const c = str[idx];
        if (skipNext) {
            // Character following a backslash inside a string: take it literally.
            skipNext = false;
        }
        else if (insideString) {
            if (c === '\\') {
                skipNext = true;
            }
            else if (c === '"') {
                insideString = false;
            }
        }
        else {
            switch (c) {
                case '"':
                    insideString = true;
                    break;
                case '{':
                case '[':
                    nesting += 1;
                    break;
                case '}':
                case ']':
                    nesting -= 1;
                    if (nesting === 0) {
                        return str.slice(start, idx + 1);
                    }
                    break;
                default:
                    break;
            }
        }
    }
    return null;
}
1024
/**
 * Normalise the caption track list found in a player response.
 *
 * Reads `captions.playerCaptionsTracklistRenderer.captionTracks` and maps each
 * entry to `{ baseUrl, languageCode, name, isAutoGenerated }`. Tracks without
 * a `baseUrl` are dropped. A track counts as auto-generated when its kind is
 * `asr`, its `vssId` starts with `a.`, or its display name contains "auto".
 *
 * @param {object|null|undefined} playerResponse - Parsed player response.
 * @returns {Array<object>} Normalised tracks; `[]` when the list is missing or
 *   any entry is malformed enough to throw.
 */
function extractCaptionTracks(playerResponse) {
    try {
        const rawTracks = playerResponse?.captions?.playerCaptionsTracklistRenderer?.captionTracks;
        if (!Array.isArray(rawTracks)) {
            return [];
        }
        return rawTracks.flatMap((raw) => {
            const label = raw.name?.simpleText;
            const entry = {
                baseUrl: raw.baseUrl ?? '',
                languageCode: (raw.languageCode ?? 'unknown').toLowerCase(),
                name: label ?? raw.name?.runs?.[0]?.text ?? raw.languageCode ?? '',
                isAutoGenerated: raw.kind === 'asr' ||
                    (raw.vssId?.startsWith('a.') ?? false) ||
                    String(label ?? '').toLowerCase().includes('auto'),
            };
            // Every field is computed before this check so a malformed entry
            // still aborts the whole list, even when it has no URL.
            return entry.baseUrl ? [entry] : [];
        });
    }
    catch {
        return [];
    }
}
1046
/**
 * Choose the caption track that best fits the requested language.
 *
 * Rules are tried in order and the first track satisfying a rule wins:
 *   1. manual track in the preferred language
 *   2. auto-generated track in the preferred language
 *   3. any manual track
 * If no rule matches, the first track in the list is returned.
 *
 * @param {Array<object>} tracks - Tracks with `languageCode` and `isAutoGenerated`.
 * @param {string} preferredLang - Requested language; only the part before the
 *   first `-` is compared (e.g. "en-US" → "en").
 * @returns {object|undefined} The chosen track, or `undefined` for an empty list.
 */
function selectBestTrack(tracks, preferredLang) {
    const [baseLang] = preferredLang.toLowerCase().split('-');
    const inPreferredLang = (track) => track.languageCode.startsWith(baseLang);
    const rankedRules = [
        (track) => !track.isAutoGenerated && inPreferredLang(track),
        (track) => track.isAutoGenerated && inPreferredLang(track),
        (track) => !track.isAutoGenerated,
    ];
    for (const rule of rankedRules) {
        const hit = tracks.find(rule);
        if (hit) {
            return hit;
        }
    }
    return tracks[0];
}
1067
/**
 * Download caption XML from a timedtext URL.
 *
 * The request should carry the same cookies/UA as the page fetch because the
 * URLs are session-locked. `simpleFetch` is tried first; only when it fails
 * with what looks like a block (error class named `BlockedError`, or a message
 * mentioning "blocked", "challenge" or "cloudflare") is the request retried
 * with native `fetch()`. Any other failure is rethrown unchanged.
 *
 * @param {string} baseUrl - Caption track URL.
 * @param {string} [userAgent] - User-Agent to send.
 * @param {object} [headers] - Extra request headers (override User-Agent on clash).
 * @returns {Promise<string>} Response body text.
 */
async function fetchCaptionXml(baseUrl, userAgent, headers) {
    const TIMEOUT_MS = 10000;
    let failure;
    try {
        const { html } = await simpleFetch(baseUrl, userAgent, TIMEOUT_MS, headers);
        return html;
    }
    catch (caught) {
        failure = caught;
    }
    const message = (failure?.message ?? '').toLowerCase();
    const looksBlocked = failure?.constructor?.name === 'BlockedError' ||
        ['blocked', 'challenge', 'cloudflare'].some((marker) => message.includes(marker));
    if (!looksBlocked) {
        throw failure;
    }
    // BlockedError: retry with native fetch
    const retryHeaders = {
        ...(userAgent ? { 'User-Agent': userAgent } : {}),
        ...(headers ? headers : {}),
    };
    const reply = await fetch(baseUrl, {
        headers: retryHeaders,
        redirect: 'follow',
        signal: AbortSignal.timeout(TIMEOUT_MS),
    });
    return reply.text();
}
1100
/**
 * Convert YouTube caption XML into transcript segments.
 *
 * Expected shape:
 *   <transcript><text start="0.5" dur="2.1">Hello &amp; world</text>...</transcript>
 *
 * Each `<text>` element yields `{ text, start, duration }`; `start` and `dur`
 * default to 0 when the attribute is absent, and elements whose decoded text
 * is empty are skipped.
 *
 * @param {string} xml - Raw caption XML.
 * @returns {Array<{text: string, start: number, duration: number}>}
 */
export function parseCaptionXml(xml) {
    const cues = [];
    // One match per <text ...>body</text>; group 1 = attributes, group 2 = body.
    for (const [, attributeText, body] of xml.matchAll(/<text\s+([^>]*)>([\s\S]*?)<\/text>/g)) {
        const start = parseFloat(extractAttr(attributeText, 'start') ?? '0');
        const duration = parseFloat(extractAttr(attributeText, 'dur') ?? '0');
        const text = decodeHtmlEntities(body.trim());
        if (text) {
            cues.push({ text, start, duration });
        }
    }
    return cues;
}
1122
/**
 * Extract a double-quoted attribute value from an HTML/XML attribute string.
 *
 * The name is matched case-insensitively as a whole attribute name: it must
 * sit at the start of the string or after whitespace, so looking up `dur`
 * does not pick up `xdur="…"` or `data-dur="…"`. Regex metacharacters in
 * `name` are escaped, and optional whitespace around `=` is tolerated.
 *
 * @param {string} attrs - Attribute text, e.g. `start="0.5" dur="2.1"`.
 * @param {string} name - Attribute name to look up.
 * @returns {string|null} The attribute value, or `null` when absent.
 */
function extractAttr(attrs, name) {
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(`(?:^|\\s)${escapedName}\\s*=\\s*"([^"]*)"`, 'i');
    const m = attrs.match(regex);
    return m ? m[1] : null;
}
1130
/**
 * Decode common HTML entities found in YouTube caption XML.
 *
 * Order of operations:
 *   1. Strip real HTML tags (e.g. <font color="...">) — these appear literally in the XML
 *   2. Decode named entities, then decimal and hexadecimal numeric entities
 *      (including &lt; → < which represents literal angle brackets)
 *
 * Because `&amp;` is decoded before the later replacements run, a
 * double-escaped sequence such as `&amp;quot;` ends up fully decoded.
 *
 * Numeric entities are decoded by code point, so characters outside the
 * Basic Multilingual Plane (e.g. emoji) decode correctly. A numeric entity
 * whose value is not a valid Unicode code point is left untouched.
 *
 * @param {string} text - Text possibly containing tags and entities.
 * @returns {string} Decoded, trimmed text.
 */
export function decodeHtmlEntities(text) {
    const MAX_CODE_POINT = 0x10ffff;
    // String.fromCodePoint throws a RangeError on out-of-range input, so
    // guard it and fall back to the original entity text.
    const decodeCodePoint = (entity, codePoint) => (Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= MAX_CODE_POINT
        ? String.fromCodePoint(codePoint)
        : entity);
    return text
        // Step 1: strip real inline HTML tags (literal <...> in the text, not entities)
        .replace(/<[^>]+>/g, '')
        // Step 2: decode HTML entities
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&amp;/g, '&')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&apos;/g, "'")
        .replace(/&#x27;/g, "'")
        .replace(/&#x2F;/g, '/')
        .replace(/&#(\d+);/g, (entity, code) => decodeCodePoint(entity, parseInt(code, 10)))
        .replace(/&#x([0-9A-Fa-f]+);/g, (entity, hex) => decodeCodePoint(entity, parseInt(hex, 16)))
        .trim();
}
1154
/**
 * Format a duration in seconds as `M:SS`, or `H:MM:SS` from one hour upward.
 *
 * Fractional seconds are truncated. Anything that is not a positive finite
 * number after numeric coercion (0, NaN, undefined, null, negative values,
 * ±Infinity) formats as `0:00`.
 *
 * @param {number|string} seconds - Duration in seconds.
 * @returns {string} Formatted duration.
 */
export function formatDuration(seconds) {
    const total = Number(seconds);
    if (!Number.isFinite(total) || total <= 0) {
        return '0:00';
    }
    const h = Math.floor(total / 3600);
    const m = Math.floor((total % 3600) / 60);
    const s = Math.floor(total % 60);
    const ss = String(s).padStart(2, '0');
    if (h > 0) {
        return `${h}:${String(m).padStart(2, '0')}:${ss}`;
    }
    return `${m}:${ss}`;
}
1168
/**
 * Extract a meta tag value from HTML (og:title, og:description, etc.)
 *
 * Matches `<meta property|name="…" content="…">` with the attributes in either
 * order. The content value is read up to its own closing quote, so a
 * double-quoted value may contain apostrophes (and vice versa) without being
 * truncated. `property` is matched literally (regex metacharacters are
 * escaped) and case-insensitively. Empty content values are not matched.
 *
 * @param {string} html - Page HTML.
 * @param {string} property - Value of the `property` or `name` attribute.
 * @returns {string|null} Entity-decoded content, or `null` when not found.
 */
function extractMetaTag(html, property) {
    const prop = property.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const key = `(?:property|name)=["']${prop}["']`;
    // Group 1 captures a double-quoted value, group 2 a single-quoted one.
    const content = `content=(?:"([^"]+)"|'([^']+)')`;
    const m = html.match(new RegExp(`<meta[^>]+${key}[^>]+${content}`, 'i'))
        ?? html.match(new RegExp(`<meta[^>]+${content}[^>]+${key}`, 'i'));
    return m ? decodeHtmlEntities(m[1] ?? m[2]) : null;
}