webscout 8.2.2__py3-none-any.whl → 2026.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (483) hide show
  1. webscout/AIauto.py +524 -143
  2. webscout/AIbase.py +247 -123
  3. webscout/AIutel.py +68 -132
  4. webscout/Bard.py +1072 -535
  5. webscout/Extra/GitToolkit/__init__.py +2 -2
  6. webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
  7. webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
  8. webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
  10. webscout/Extra/GitToolkit/gitapi/search.py +162 -0
  11. webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
  12. webscout/Extra/GitToolkit/gitapi/user.py +128 -96
  13. webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
  14. webscout/Extra/YTToolkit/README.md +443 -0
  15. webscout/Extra/YTToolkit/YTdownloader.py +953 -957
  16. webscout/Extra/YTToolkit/__init__.py +3 -3
  17. webscout/Extra/YTToolkit/transcriber.py +595 -476
  18. webscout/Extra/YTToolkit/ytapi/README.md +230 -0
  19. webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
  20. webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
  21. webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
  22. webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
  23. webscout/Extra/YTToolkit/ytapi/extras.py +178 -45
  24. webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
  25. webscout/Extra/YTToolkit/ytapi/https.py +89 -88
  26. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
  27. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
  28. webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
  29. webscout/Extra/YTToolkit/ytapi/query.py +143 -40
  30. webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
  31. webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
  32. webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
  33. webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
  34. webscout/Extra/YTToolkit/ytapi/video.py +189 -18
  35. webscout/Extra/__init__.py +2 -3
  36. webscout/Extra/gguf.py +1298 -682
  37. webscout/Extra/tempmail/README.md +488 -0
  38. webscout/Extra/tempmail/__init__.py +28 -28
  39. webscout/Extra/tempmail/async_utils.py +143 -141
  40. webscout/Extra/tempmail/base.py +172 -161
  41. webscout/Extra/tempmail/cli.py +191 -187
  42. webscout/Extra/tempmail/emailnator.py +88 -84
  43. webscout/Extra/tempmail/mail_tm.py +378 -361
  44. webscout/Extra/tempmail/temp_mail_io.py +304 -292
  45. webscout/Extra/weather.py +196 -194
  46. webscout/Extra/weather_ascii.py +17 -15
  47. webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
  48. webscout/Provider/AISEARCH/Perplexity.py +237 -304
  49. webscout/Provider/AISEARCH/README.md +106 -0
  50. webscout/Provider/AISEARCH/__init__.py +16 -10
  51. webscout/Provider/AISEARCH/brave_search.py +298 -0
  52. webscout/Provider/AISEARCH/iask_search.py +130 -209
  53. webscout/Provider/AISEARCH/monica_search.py +200 -246
  54. webscout/Provider/AISEARCH/webpilotai_search.py +242 -281
  55. webscout/Provider/Algion.py +413 -0
  56. webscout/Provider/Andi.py +74 -69
  57. webscout/Provider/Apriel.py +313 -0
  58. webscout/Provider/Ayle.py +323 -0
  59. webscout/Provider/ChatSandbox.py +329 -0
  60. webscout/Provider/ClaudeOnline.py +365 -0
  61. webscout/Provider/Cohere.py +232 -208
  62. webscout/Provider/DeepAI.py +367 -0
  63. webscout/Provider/Deepinfra.py +343 -173
  64. webscout/Provider/EssentialAI.py +217 -0
  65. webscout/Provider/ExaAI.py +274 -261
  66. webscout/Provider/Gemini.py +60 -54
  67. webscout/Provider/GithubChat.py +385 -367
  68. webscout/Provider/Gradient.py +286 -0
  69. webscout/Provider/Groq.py +556 -670
  70. webscout/Provider/HadadXYZ.py +323 -0
  71. webscout/Provider/HeckAI.py +392 -233
  72. webscout/Provider/HuggingFace.py +387 -0
  73. webscout/Provider/IBM.py +340 -0
  74. webscout/Provider/Jadve.py +317 -266
  75. webscout/Provider/K2Think.py +306 -0
  76. webscout/Provider/Koboldai.py +221 -381
  77. webscout/Provider/Netwrck.py +273 -228
  78. webscout/Provider/Nvidia.py +310 -0
  79. webscout/Provider/OPENAI/DeepAI.py +489 -0
  80. webscout/Provider/OPENAI/K2Think.py +423 -0
  81. webscout/Provider/OPENAI/PI.py +463 -0
  82. webscout/Provider/OPENAI/README.md +890 -0
  83. webscout/Provider/OPENAI/TogetherAI.py +405 -0
  84. webscout/Provider/OPENAI/TwoAI.py +255 -0
  85. webscout/Provider/OPENAI/__init__.py +148 -25
  86. webscout/Provider/OPENAI/ai4chat.py +348 -0
  87. webscout/Provider/OPENAI/akashgpt.py +436 -0
  88. webscout/Provider/OPENAI/algion.py +303 -0
  89. webscout/Provider/OPENAI/ayle.py +365 -0
  90. webscout/Provider/OPENAI/base.py +253 -46
  91. webscout/Provider/OPENAI/cerebras.py +296 -0
  92. webscout/Provider/OPENAI/chatgpt.py +514 -193
  93. webscout/Provider/OPENAI/chatsandbox.py +233 -0
  94. webscout/Provider/OPENAI/deepinfra.py +403 -272
  95. webscout/Provider/OPENAI/e2b.py +2370 -1350
  96. webscout/Provider/OPENAI/elmo.py +278 -0
  97. webscout/Provider/OPENAI/exaai.py +186 -138
  98. webscout/Provider/OPENAI/freeassist.py +446 -0
  99. webscout/Provider/OPENAI/gradient.py +448 -0
  100. webscout/Provider/OPENAI/groq.py +380 -0
  101. webscout/Provider/OPENAI/hadadxyz.py +292 -0
  102. webscout/Provider/OPENAI/heckai.py +100 -104
  103. webscout/Provider/OPENAI/huggingface.py +321 -0
  104. webscout/Provider/OPENAI/ibm.py +425 -0
  105. webscout/Provider/OPENAI/llmchat.py +253 -0
  106. webscout/Provider/OPENAI/llmchatco.py +378 -327
  107. webscout/Provider/OPENAI/meta.py +541 -0
  108. webscout/Provider/OPENAI/netwrck.py +110 -84
  109. webscout/Provider/OPENAI/nvidia.py +317 -0
  110. webscout/Provider/OPENAI/oivscode.py +348 -0
  111. webscout/Provider/OPENAI/openrouter.py +328 -0
  112. webscout/Provider/OPENAI/pydantic_imports.py +1 -0
  113. webscout/Provider/OPENAI/sambanova.py +397 -0
  114. webscout/Provider/OPENAI/sonus.py +126 -115
  115. webscout/Provider/OPENAI/textpollinations.py +218 -133
  116. webscout/Provider/OPENAI/toolbaz.py +136 -166
  117. webscout/Provider/OPENAI/typefully.py +419 -0
  118. webscout/Provider/OPENAI/typliai.py +279 -0
  119. webscout/Provider/OPENAI/utils.py +314 -211
  120. webscout/Provider/OPENAI/wisecat.py +103 -125
  121. webscout/Provider/OPENAI/writecream.py +185 -156
  122. webscout/Provider/OPENAI/x0gpt.py +227 -136
  123. webscout/Provider/OPENAI/zenmux.py +380 -0
  124. webscout/Provider/OpenRouter.py +386 -0
  125. webscout/Provider/Openai.py +337 -496
  126. webscout/Provider/PI.py +443 -344
  127. webscout/Provider/QwenLM.py +346 -254
  128. webscout/Provider/STT/__init__.py +28 -0
  129. webscout/Provider/STT/base.py +303 -0
  130. webscout/Provider/STT/elevenlabs.py +264 -0
  131. webscout/Provider/Sambanova.py +317 -0
  132. webscout/Provider/TTI/README.md +69 -0
  133. webscout/Provider/TTI/__init__.py +37 -12
  134. webscout/Provider/TTI/base.py +147 -0
  135. webscout/Provider/TTI/claudeonline.py +393 -0
  136. webscout/Provider/TTI/magicstudio.py +292 -0
  137. webscout/Provider/TTI/miragic.py +180 -0
  138. webscout/Provider/TTI/pollinations.py +331 -0
  139. webscout/Provider/TTI/together.py +334 -0
  140. webscout/Provider/TTI/utils.py +14 -0
  141. webscout/Provider/TTS/README.md +186 -0
  142. webscout/Provider/TTS/__init__.py +43 -7
  143. webscout/Provider/TTS/base.py +523 -0
  144. webscout/Provider/TTS/deepgram.py +286 -156
  145. webscout/Provider/TTS/elevenlabs.py +189 -111
  146. webscout/Provider/TTS/freetts.py +218 -0
  147. webscout/Provider/TTS/murfai.py +288 -113
  148. webscout/Provider/TTS/openai_fm.py +364 -0
  149. webscout/Provider/TTS/parler.py +203 -111
  150. webscout/Provider/TTS/qwen.py +334 -0
  151. webscout/Provider/TTS/sherpa.py +286 -0
  152. webscout/Provider/TTS/speechma.py +693 -180
  153. webscout/Provider/TTS/streamElements.py +275 -333
  154. webscout/Provider/TTS/utils.py +280 -280
  155. webscout/Provider/TextPollinationsAI.py +221 -121
  156. webscout/Provider/TogetherAI.py +450 -0
  157. webscout/Provider/TwoAI.py +309 -199
  158. webscout/Provider/TypliAI.py +311 -0
  159. webscout/Provider/UNFINISHED/ChatHub.py +219 -0
  160. webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +160 -145
  161. webscout/Provider/UNFINISHED/GizAI.py +300 -0
  162. webscout/Provider/UNFINISHED/Marcus.py +218 -0
  163. webscout/Provider/UNFINISHED/Qodo.py +481 -0
  164. webscout/Provider/UNFINISHED/XenAI.py +330 -0
  165. webscout/Provider/{Youchat.py → UNFINISHED/Youchat.py} +64 -47
  166. webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
  167. webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
  168. webscout/Provider/UNFINISHED/liner.py +342 -0
  169. webscout/Provider/UNFINISHED/liner_api_request.py +246 -0
  170. webscout/Provider/UNFINISHED/samurai.py +231 -0
  171. webscout/Provider/WiseCat.py +256 -196
  172. webscout/Provider/WrDoChat.py +390 -0
  173. webscout/Provider/__init__.py +115 -198
  174. webscout/Provider/ai4chat.py +181 -202
  175. webscout/Provider/akashgpt.py +330 -342
  176. webscout/Provider/cerebras.py +397 -242
  177. webscout/Provider/cleeai.py +236 -213
  178. webscout/Provider/elmo.py +291 -234
  179. webscout/Provider/geminiapi.py +343 -208
  180. webscout/Provider/julius.py +245 -223
  181. webscout/Provider/learnfastai.py +333 -266
  182. webscout/Provider/llama3mitril.py +230 -180
  183. webscout/Provider/llmchat.py +308 -213
  184. webscout/Provider/llmchatco.py +321 -311
  185. webscout/Provider/meta.py +996 -794
  186. webscout/Provider/oivscode.py +332 -0
  187. webscout/Provider/searchchat.py +316 -293
  188. webscout/Provider/sonus.py +264 -208
  189. webscout/Provider/toolbaz.py +359 -320
  190. webscout/Provider/turboseek.py +332 -219
  191. webscout/Provider/typefully.py +262 -280
  192. webscout/Provider/x0gpt.py +332 -256
  193. webscout/__init__.py +31 -38
  194. webscout/__main__.py +5 -5
  195. webscout/cli.py +585 -293
  196. webscout/client.py +1497 -0
  197. webscout/conversation.py +140 -565
  198. webscout/exceptions.py +383 -339
  199. webscout/litagent/__init__.py +29 -29
  200. webscout/litagent/agent.py +492 -455
  201. webscout/litagent/constants.py +60 -60
  202. webscout/models.py +505 -181
  203. webscout/optimizers.py +32 -378
  204. webscout/prompt_manager.py +376 -274
  205. webscout/sanitize.py +1514 -0
  206. webscout/scout/README.md +452 -0
  207. webscout/scout/__init__.py +8 -8
  208. webscout/scout/core/__init__.py +7 -7
  209. webscout/scout/core/crawler.py +330 -140
  210. webscout/scout/core/scout.py +800 -568
  211. webscout/scout/core/search_result.py +51 -96
  212. webscout/scout/core/text_analyzer.py +64 -63
  213. webscout/scout/core/text_utils.py +412 -277
  214. webscout/scout/core/web_analyzer.py +54 -52
  215. webscout/scout/element.py +872 -460
  216. webscout/scout/parsers/__init__.py +70 -69
  217. webscout/scout/parsers/html5lib_parser.py +182 -172
  218. webscout/scout/parsers/html_parser.py +238 -236
  219. webscout/scout/parsers/lxml_parser.py +203 -178
  220. webscout/scout/utils.py +38 -37
  221. webscout/search/__init__.py +47 -0
  222. webscout/search/base.py +201 -0
  223. webscout/search/bing_main.py +45 -0
  224. webscout/search/brave_main.py +92 -0
  225. webscout/search/duckduckgo_main.py +57 -0
  226. webscout/search/engines/__init__.py +127 -0
  227. webscout/search/engines/bing/__init__.py +15 -0
  228. webscout/search/engines/bing/base.py +35 -0
  229. webscout/search/engines/bing/images.py +114 -0
  230. webscout/search/engines/bing/news.py +96 -0
  231. webscout/search/engines/bing/suggestions.py +36 -0
  232. webscout/search/engines/bing/text.py +109 -0
  233. webscout/search/engines/brave/__init__.py +19 -0
  234. webscout/search/engines/brave/base.py +47 -0
  235. webscout/search/engines/brave/images.py +213 -0
  236. webscout/search/engines/brave/news.py +353 -0
  237. webscout/search/engines/brave/suggestions.py +318 -0
  238. webscout/search/engines/brave/text.py +167 -0
  239. webscout/search/engines/brave/videos.py +364 -0
  240. webscout/search/engines/duckduckgo/__init__.py +25 -0
  241. webscout/search/engines/duckduckgo/answers.py +80 -0
  242. webscout/search/engines/duckduckgo/base.py +189 -0
  243. webscout/search/engines/duckduckgo/images.py +100 -0
  244. webscout/search/engines/duckduckgo/maps.py +183 -0
  245. webscout/search/engines/duckduckgo/news.py +70 -0
  246. webscout/search/engines/duckduckgo/suggestions.py +22 -0
  247. webscout/search/engines/duckduckgo/text.py +221 -0
  248. webscout/search/engines/duckduckgo/translate.py +48 -0
  249. webscout/search/engines/duckduckgo/videos.py +80 -0
  250. webscout/search/engines/duckduckgo/weather.py +84 -0
  251. webscout/search/engines/mojeek.py +61 -0
  252. webscout/search/engines/wikipedia.py +77 -0
  253. webscout/search/engines/yahoo/__init__.py +41 -0
  254. webscout/search/engines/yahoo/answers.py +19 -0
  255. webscout/search/engines/yahoo/base.py +34 -0
  256. webscout/search/engines/yahoo/images.py +323 -0
  257. webscout/search/engines/yahoo/maps.py +19 -0
  258. webscout/search/engines/yahoo/news.py +258 -0
  259. webscout/search/engines/yahoo/suggestions.py +140 -0
  260. webscout/search/engines/yahoo/text.py +273 -0
  261. webscout/search/engines/yahoo/translate.py +19 -0
  262. webscout/search/engines/yahoo/videos.py +302 -0
  263. webscout/search/engines/yahoo/weather.py +220 -0
  264. webscout/search/engines/yandex.py +67 -0
  265. webscout/search/engines/yep/__init__.py +13 -0
  266. webscout/search/engines/yep/base.py +34 -0
  267. webscout/search/engines/yep/images.py +101 -0
  268. webscout/search/engines/yep/suggestions.py +38 -0
  269. webscout/search/engines/yep/text.py +99 -0
  270. webscout/search/http_client.py +172 -0
  271. webscout/search/results.py +141 -0
  272. webscout/search/yahoo_main.py +57 -0
  273. webscout/search/yep_main.py +48 -0
  274. webscout/server/__init__.py +48 -0
  275. webscout/server/config.py +78 -0
  276. webscout/server/exceptions.py +69 -0
  277. webscout/server/providers.py +286 -0
  278. webscout/server/request_models.py +131 -0
  279. webscout/server/request_processing.py +404 -0
  280. webscout/server/routes.py +642 -0
  281. webscout/server/server.py +351 -0
  282. webscout/server/ui_templates.py +1171 -0
  283. webscout/swiftcli/__init__.py +79 -809
  284. webscout/swiftcli/core/__init__.py +7 -0
  285. webscout/swiftcli/core/cli.py +574 -0
  286. webscout/swiftcli/core/context.py +98 -0
  287. webscout/swiftcli/core/group.py +268 -0
  288. webscout/swiftcli/decorators/__init__.py +28 -0
  289. webscout/swiftcli/decorators/command.py +243 -0
  290. webscout/swiftcli/decorators/options.py +247 -0
  291. webscout/swiftcli/decorators/output.py +392 -0
  292. webscout/swiftcli/exceptions.py +21 -0
  293. webscout/swiftcli/plugins/__init__.py +9 -0
  294. webscout/swiftcli/plugins/base.py +134 -0
  295. webscout/swiftcli/plugins/manager.py +269 -0
  296. webscout/swiftcli/utils/__init__.py +58 -0
  297. webscout/swiftcli/utils/formatting.py +251 -0
  298. webscout/swiftcli/utils/parsing.py +368 -0
  299. webscout/update_checker.py +280 -136
  300. webscout/utils.py +28 -14
  301. webscout/version.py +2 -1
  302. webscout/version.py.bak +3 -0
  303. webscout/zeroart/__init__.py +218 -55
  304. webscout/zeroart/base.py +70 -60
  305. webscout/zeroart/effects.py +155 -99
  306. webscout/zeroart/fonts.py +1799 -816
  307. webscout-2026.1.19.dist-info/METADATA +638 -0
  308. webscout-2026.1.19.dist-info/RECORD +312 -0
  309. {webscout-8.2.2.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
  310. webscout-2026.1.19.dist-info/entry_points.txt +4 -0
  311. webscout-2026.1.19.dist-info/top_level.txt +1 -0
  312. inferno/__init__.py +0 -6
  313. inferno/__main__.py +0 -9
  314. inferno/cli.py +0 -6
  315. webscout/DWEBS.py +0 -477
  316. webscout/Extra/autocoder/__init__.py +0 -9
  317. webscout/Extra/autocoder/autocoder.py +0 -849
  318. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  319. webscout/LLM.py +0 -442
  320. webscout/Litlogger/__init__.py +0 -67
  321. webscout/Litlogger/core/__init__.py +0 -6
  322. webscout/Litlogger/core/level.py +0 -23
  323. webscout/Litlogger/core/logger.py +0 -165
  324. webscout/Litlogger/handlers/__init__.py +0 -12
  325. webscout/Litlogger/handlers/console.py +0 -33
  326. webscout/Litlogger/handlers/file.py +0 -143
  327. webscout/Litlogger/handlers/network.py +0 -173
  328. webscout/Litlogger/styles/__init__.py +0 -7
  329. webscout/Litlogger/styles/colors.py +0 -249
  330. webscout/Litlogger/styles/formats.py +0 -458
  331. webscout/Litlogger/styles/text.py +0 -87
  332. webscout/Litlogger/utils/__init__.py +0 -6
  333. webscout/Litlogger/utils/detectors.py +0 -153
  334. webscout/Litlogger/utils/formatters.py +0 -200
  335. webscout/Local/__init__.py +0 -12
  336. webscout/Local/__main__.py +0 -9
  337. webscout/Local/api.py +0 -576
  338. webscout/Local/cli.py +0 -516
  339. webscout/Local/config.py +0 -75
  340. webscout/Local/llm.py +0 -287
  341. webscout/Local/model_manager.py +0 -253
  342. webscout/Local/server.py +0 -721
  343. webscout/Local/utils.py +0 -93
  344. webscout/Provider/AI21.py +0 -177
  345. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  346. webscout/Provider/AISEARCH/ISou.py +0 -256
  347. webscout/Provider/AISEARCH/felo_search.py +0 -228
  348. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  349. webscout/Provider/AISEARCH/hika_search.py +0 -194
  350. webscout/Provider/AISEARCH/scira_search.py +0 -324
  351. webscout/Provider/Aitopia.py +0 -292
  352. webscout/Provider/AllenAI.py +0 -413
  353. webscout/Provider/Blackboxai.py +0 -229
  354. webscout/Provider/C4ai.py +0 -432
  355. webscout/Provider/ChatGPTClone.py +0 -226
  356. webscout/Provider/ChatGPTES.py +0 -237
  357. webscout/Provider/ChatGPTGratis.py +0 -194
  358. webscout/Provider/Chatify.py +0 -175
  359. webscout/Provider/Cloudflare.py +0 -273
  360. webscout/Provider/DeepSeek.py +0 -196
  361. webscout/Provider/ElectronHub.py +0 -709
  362. webscout/Provider/ExaChat.py +0 -342
  363. webscout/Provider/Free2GPT.py +0 -241
  364. webscout/Provider/GPTWeb.py +0 -193
  365. webscout/Provider/Glider.py +0 -211
  366. webscout/Provider/HF_space/__init__.py +0 -0
  367. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  368. webscout/Provider/HuggingFaceChat.py +0 -462
  369. webscout/Provider/Hunyuan.py +0 -272
  370. webscout/Provider/LambdaChat.py +0 -392
  371. webscout/Provider/Llama.py +0 -200
  372. webscout/Provider/Llama3.py +0 -204
  373. webscout/Provider/Marcus.py +0 -148
  374. webscout/Provider/OLLAMA.py +0 -396
  375. webscout/Provider/OPENAI/c4ai.py +0 -367
  376. webscout/Provider/OPENAI/chatgptclone.py +0 -460
  377. webscout/Provider/OPENAI/exachat.py +0 -433
  378. webscout/Provider/OPENAI/freeaichat.py +0 -352
  379. webscout/Provider/OPENAI/opkfc.py +0 -488
  380. webscout/Provider/OPENAI/scirachat.py +0 -463
  381. webscout/Provider/OPENAI/standardinput.py +0 -425
  382. webscout/Provider/OPENAI/typegpt.py +0 -346
  383. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  384. webscout/Provider/OPENAI/venice.py +0 -413
  385. webscout/Provider/OPENAI/yep.py +0 -327
  386. webscout/Provider/OpenGPT.py +0 -199
  387. webscout/Provider/Perplexitylabs.py +0 -415
  388. webscout/Provider/Phind.py +0 -535
  389. webscout/Provider/PizzaGPT.py +0 -198
  390. webscout/Provider/Reka.py +0 -214
  391. webscout/Provider/StandardInput.py +0 -278
  392. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  393. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  394. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  395. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  396. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  397. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  398. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  399. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  400. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  401. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  402. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  403. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  404. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  405. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  406. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  407. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  408. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  409. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  410. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  411. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  412. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  413. webscout/Provider/TTI/artbit/__init__.py +0 -22
  414. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  415. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  416. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  417. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  418. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  419. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  420. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  421. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  422. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  423. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  424. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  425. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  426. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  427. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  428. webscout/Provider/TTI/talkai/__init__.py +0 -4
  429. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  430. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  431. webscout/Provider/TTS/gesserit.py +0 -127
  432. webscout/Provider/TeachAnything.py +0 -187
  433. webscout/Provider/Venice.py +0 -219
  434. webscout/Provider/VercelAI.py +0 -234
  435. webscout/Provider/WebSim.py +0 -228
  436. webscout/Provider/Writecream.py +0 -211
  437. webscout/Provider/WritingMate.py +0 -197
  438. webscout/Provider/aimathgpt.py +0 -189
  439. webscout/Provider/askmyai.py +0 -158
  440. webscout/Provider/asksteve.py +0 -203
  441. webscout/Provider/bagoodex.py +0 -145
  442. webscout/Provider/chatglm.py +0 -205
  443. webscout/Provider/copilot.py +0 -428
  444. webscout/Provider/freeaichat.py +0 -271
  445. webscout/Provider/gaurish.py +0 -244
  446. webscout/Provider/geminiprorealtime.py +0 -160
  447. webscout/Provider/granite.py +0 -187
  448. webscout/Provider/hermes.py +0 -219
  449. webscout/Provider/koala.py +0 -268
  450. webscout/Provider/labyrinth.py +0 -340
  451. webscout/Provider/lepton.py +0 -194
  452. webscout/Provider/llamatutor.py +0 -192
  453. webscout/Provider/multichat.py +0 -325
  454. webscout/Provider/promptrefine.py +0 -193
  455. webscout/Provider/scira_chat.py +0 -277
  456. webscout/Provider/scnet.py +0 -187
  457. webscout/Provider/talkai.py +0 -194
  458. webscout/Provider/tutorai.py +0 -252
  459. webscout/Provider/typegpt.py +0 -232
  460. webscout/Provider/uncovr.py +0 -312
  461. webscout/Provider/yep.py +0 -376
  462. webscout/litprinter/__init__.py +0 -59
  463. webscout/scout/core.py +0 -881
  464. webscout/tempid.py +0 -128
  465. webscout/webscout_search.py +0 -1346
  466. webscout/webscout_search_async.py +0 -877
  467. webscout/yep_search.py +0 -297
  468. webscout-8.2.2.dist-info/METADATA +0 -734
  469. webscout-8.2.2.dist-info/RECORD +0 -309
  470. webscout-8.2.2.dist-info/entry_points.txt +0 -5
  471. webscout-8.2.2.dist-info/top_level.txt +0 -3
  472. webstoken/__init__.py +0 -30
  473. webstoken/classifier.py +0 -189
  474. webstoken/keywords.py +0 -216
  475. webstoken/language.py +0 -128
  476. webstoken/ner.py +0 -164
  477. webstoken/normalizer.py +0 -35
  478. webstoken/processor.py +0 -77
  479. webstoken/sentiment.py +0 -206
  480. webstoken/stemmer.py +0 -73
  481. webstoken/tagger.py +0 -60
  482. webstoken/tokenizer.py +0 -158
  483. {webscout-8.2.2.dist-info → webscout-2026.1.19.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,476 +1,595 @@
1
- """
2
- >>> from webscout import YTTranscriber
3
- >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
4
- >>> print(transcript)
5
- {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
6
-
7
- """
8
-
9
- import requests
10
- import http.cookiejar as cookiejar
11
- import json
12
- from xml.etree import ElementTree
13
- import re
14
- import html
15
- from typing import List, Dict, Union, Optional
16
- from functools import lru_cache #
17
- from concurrent.futures import ThreadPoolExecutor
18
- from webscout.exceptions import *
19
-
20
- WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
21
- MAX_WORKERS = 4
22
-
23
- class YTTranscriber:
24
- """Transcribe YouTube videos with style! 🎤
25
-
26
- >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
27
- >>> print(transcript[0]['text'])
28
- 'Never gonna give you up'
29
- """
30
-
31
- _session = None
32
- _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
33
-
34
- @classmethod
35
- def _get_session(cls):
36
- if cls._session is None:
37
- cls._session = requests.Session()
38
- cls._session.headers.update({
39
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
40
- })
41
- return cls._session
42
-
43
- @classmethod
44
- @lru_cache(maxsize=100)
45
- def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
46
- proxies: Dict[str, str] = None,
47
- cookies: str = None,
48
- preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
49
- """
50
- Retrieves the transcript for a given YouTube video URL.
51
-
52
- Args:
53
- video_url (str): YouTube video URL (supports various formats).
54
- languages (str, optional): Language code for the transcript.
55
- If None, fetches the auto-generated transcript.
56
- Defaults to 'en'.
57
- proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
58
- cookies (str, optional): Path to the cookie file. Defaults to None.
59
- preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
60
-
61
- Returns:
62
- List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
63
- - 'text': The transcribed text.
64
- - 'start': The start time of the text segment (in seconds).
65
- - 'duration': The duration of the text segment (in seconds).
66
-
67
- Raises:
68
- TranscriptRetrievalError: If there's an error retrieving the transcript.
69
- """
70
- video_id = cls._extract_video_id(video_url)
71
- http_client = cls._get_session()
72
-
73
- if proxies:
74
- http_client.proxies.update(proxies)
75
-
76
- if cookies:
77
- cls._load_cookies(cookies, video_id)
78
-
79
- transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
80
- language_codes = [languages] if languages else None
81
- transcript = transcript_list.find_transcript(language_codes)
82
-
83
- return transcript.fetch(preserve_formatting)
84
-
85
- @staticmethod
86
- def _extract_video_id(video_url: str) -> str:
87
- """Extracts the video ID from different YouTube URL formats."""
88
- patterns = [
89
- r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
90
- r'youtu\.be\/([0-9A-Za-z_-]{11})',
91
- r'youtube\.com\/embed\/([0-9A-Za-z_-]{11})'
92
- ]
93
-
94
- for pattern in patterns:
95
- match = re.search(pattern, video_url)
96
- if match:
97
- return match.group(1)
98
-
99
- if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
100
- return video_url
101
-
102
- raise InvalidVideoIdError(video_url)
103
-
104
- @staticmethod
105
- def _load_cookies(cookies: str, video_id: str) -> None:
106
- """Loads cookies from a file."""
107
- try:
108
- cj = cookiejar.MozillaCookieJar(cookies)
109
- cj.load()
110
- return cj
111
- except (cookiejar.LoadError, FileNotFoundError):
112
- raise CookiePathInvalidError(video_id)
113
-
114
- class TranscriptListFetcher:
115
- """Fetches the list of transcripts for a YouTube video."""
116
-
117
- def __init__(self, http_client: requests.Session):
118
- """Initializes TranscriptListFetcher."""
119
- self._http_client = http_client
120
-
121
- def fetch(self, video_id: str):
122
- """Fetches and returns a TranscriptList."""
123
- return TranscriptList.build(
124
- self._http_client,
125
- video_id,
126
- self._extract_captions_json(self._fetch_video_html(video_id), video_id),
127
- )
128
-
129
- def _extract_captions_json(self, html: str, video_id: str) -> dict:
130
- """Extracts the captions JSON data from the video's HTML."""
131
- splitted_html = html.split('"captions":')
132
-
133
- if len(splitted_html) <= 1:
134
- if video_id.startswith('http://') or video_id.startswith('https://'):
135
- raise InvalidVideoIdError(video_id)
136
- if 'class="g-recaptcha"' in html:
137
- raise TooManyRequestsError(video_id)
138
- if '"playabilityStatus":' not in html:
139
- raise VideoUnavailableError(video_id)
140
-
141
- raise TranscriptsDisabledError(video_id)
142
-
143
- captions_json = json.loads(
144
- splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
145
- ).get('playerCaptionsTracklistRenderer')
146
- if captions_json is None:
147
- raise TranscriptsDisabledError(video_id)
148
-
149
- if 'captionTracks' not in captions_json:
150
- raise TranscriptsDisabledError(video_id)
151
-
152
- return captions_json
153
-
154
- def _create_consent_cookie(self, html, video_id):
155
- match = re.search('name="v" value="(.*?)"', html)
156
- if match is None:
157
- raise FailedToCreateConsentCookieError(video_id)
158
- self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
159
-
160
- def _fetch_video_html(self, video_id):
161
- html = self._fetch_html(video_id)
162
- if 'action="https://consent.youtube.com/s"' in html:
163
- self._create_consent_cookie(html, video_id)
164
- html = self._fetch_html(video_id)
165
- if 'action="https://consent.youtube.com/s"' in html:
166
- raise FailedToCreateConsentCookieError(video_id)
167
- return html
168
-
169
- def _fetch_html(self, video_id):
170
- response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
171
- return html.unescape(_raise_http_errors(response, video_id).text)
172
-
173
-
174
class TranscriptList:
    """The transcripts available for one video, split into manual and generated ones.

    >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript)
    en ("English")[TRANSLATABLE]
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        :param video_id: the id of the video this list belongs to
        :param manually_created_transcripts: dict mapping language code -> Transcript
        :param generated_transcripts: dict mapping language code -> Transcript
        :param translation_languages: list of ``{'language', 'language_code'}`` dicts
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        translation_languages = [
            {
                'language': entry['languageName']['simpleText'],
                'language_code': entry['languageCode'],
            }
            for entry in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # Tracks of kind 'asr' are the automatically generated ones.
            is_generated = caption.get('kind', '') == 'asr'
            target = generated_transcripts if is_generated else manually_created_transcripts
            target[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                is_generated,
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterate over the manually created transcripts first, then the generated ones."""
        return iter([
            *self._manually_created_transcripts.values(),
            *self._generated_transcripts.values(),
        ])

    def find_transcript(self, language_codes):
        """
        Finds a transcript for the given language codes; manually created
        transcripts win over generated ones for the same code. If no language
        is provided (``None`` or empty), or ``'any'`` is among the codes, the
        first available transcript is returned.

        :param language_codes: A list of language codes in a descending priority.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        # A missing list used to crash on the membership test below.
        if not language_codes:
            language_codes = ['any']

        if 'any' in language_codes:
            for transcript in self:
                return transcript
        return self._find_transcript(
            language_codes,
            [self._manually_created_transcripts, self._generated_transcripts],
        )

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so. ``None``, an empty list or ``'any'`` selects the first
            generated transcript.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        if not language_codes:
            language_codes = ['any']

        if 'any' in language_codes:
            for transcript in self:
                if transcript.is_generated:
                    return transcript
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so. ``None``, an empty list or ``'any'`` selects the first
            manually created transcript.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        if not language_codes:
            language_codes = ['any']

        # Same 'any' convention as the two sibling finders.
        if 'any' in language_codes:
            for transcript in self._manually_created_transcripts.values():
                return transcript
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """Return the first hit, trying each code (in priority order) against each dict."""
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(self.video_id, language_codes, self)

    def __str__(self):
        """Describe which manual, generated and translation languages are available."""
        manual = self._get_language_description(
            str(transcript) for transcript in self._manually_created_transcripts.values()
        )
        generated = self._get_language_description(
            str(transcript) for transcript in self._generated_transcripts.values()
        )
        translations = self._get_language_description(
            '{language_code} ("{language}")'.format(
                language=entry['language'],
                language_code=entry['language_code'],
            )
            for entry in self._translation_languages
        )
        return (
            f'For this video ({self.video_id}) transcripts are available in the following languages:\n\n'
            f'(MANUALLY CREATED)\n{manual}\n\n'
            f'(GENERATED)\n{generated}\n\n'
            f'(TRANSLATION LANGUAGES)\n{translations}'
        )

    def _get_language_description(self, transcript_strings):
        """Render the strings as a `` - item`` bullet list, or ``'None'`` when there are none."""
        description = '\n'.join(f' - {transcript}' for transcript in transcript_strings)
        return description if description else 'None'
327
-
328
-
329
class Transcript:
    """One caption track of a video: metadata plus the means to download it.

    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript.language)
    'English'
    >>> if transcript.is_translatable:
    ...     es_transcript = transcript.translate('es')
    ...     print(es_transcript.language)
    'Spanish'
    """

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """Store the track metadata and index the translation targets by language code."""
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages

        # language_code -> display name, used by translate() for lookups.
        names_by_code = {}
        for entry in translation_languages:
            names_by_code[entry['language_code']] = entry['language']
        self._translation_languages_dict = names_by_code

    def fetch(self, preserve_formatting=False):
        """Download this track and parse it.

        Args:
            preserve_formatting (bool): Passed on to TranscriptParser. Defaults to False.

        Returns:
            list: Whatever ``TranscriptParser.parse`` produces for the response text.
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        parser = TranscriptParser(preserve_formatting=preserve_formatting)
        checked = _raise_http_errors(response, self.video_id)
        return parser.parse(checked.text)

    def __str__(self):
        """Return ``code ("Language")``, suffixed with ``[TRANSLATABLE]`` when applicable."""
        if self.is_translatable:
            suffix = '[TRANSLATABLE]'
        else:
            suffix = ''
        return f'{self.language_code} ("{self.language}"){suffix}'

    @property
    def is_translatable(self):
        """True when at least one translation language is attached to this track."""
        return len(self.translation_languages) > 0

    def translate(self, language_code):
        """Return a new Transcript that requests this track translated to ``language_code``.

        Args:
            language_code (str): Code of the target language.

        Returns:
            Transcript: Marked as generated and carrying no translation languages of its own.

        Raises:
            NotTranslatableError: If this track has no translation languages.
            TranslationLanguageNotAvailableError: If ``language_code`` is not among them.
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        known_targets = self._translation_languages_dict
        if language_code not in known_targets:
            raise TranslationLanguageNotAvailableError(self.video_id)

        translated_url = f'{self._url}&tlang={language_code}'
        return Transcript(
            self._http_client,
            self.video_id,
            translated_url,
            known_targets[language_code],
            language_code,
            True,
            [],
        )
410
-
411
-
412
class TranscriptParser:
    """Turns the transcript XML into a list of ``text`` / ``start`` / ``duration`` dicts.

    >>> parser = TranscriptParser(preserve_formatting=True)
    >>> data = parser.parse(xml_data)
    >>> print(data[0])
    {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
    """

    # Tags that are left in the text when preserve_formatting is requested.
    _FORMATTING_TAGS = [
        'strong',
        'em',
        'b',
        'i',
        'mark',
        'small',
        'del',
        'ins',
        'sub',
        'sup',
    ]

    def __init__(self, preserve_formatting=False):
        """Compile the tag-stripping regex once for this parser."""
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting):
        """Return the regex whose matches are removed from each text.

        Without ``preserve_formatting`` every tag matches; with it, the tags
        listed in ``_FORMATTING_TAGS`` are excluded by a negative lookahead.
        """
        if not preserve_formatting:
            return re.compile(r'<[^>]*>', re.IGNORECASE)

        kept_tags = '|'.join(self._FORMATTING_TAGS)
        return re.compile(r'<\/?(?!\/?(' + kept_tags + r')\b).*?\b>', re.IGNORECASE)

    def parse(self, plain_data):
        """Parse the XML document and return one dict per child element that has text."""
        entries = []
        for node in ElementTree.fromstring(plain_data):
            if node.text is None:
                continue
            entries.append({
                'text': self._html_regex.sub('', html.unescape(node.text)),
                'start': float(node.attrib['start']),
                # 'dur' is optional; a missing value counts as zero seconds.
                'duration': float(node.attrib.get('dur', '0.0')),
            })
        return entries
459
-
460
-
461
- def _raise_http_errors(response, video_id):
462
- """Handle those HTTP errors with style! 🛠️"""
463
- try:
464
- response.raise_for_status()
465
- return response
466
- except requests.exceptions.HTTPError as error:
467
- raise YouTubeRequestFailedError(video_id, error)
468
-
469
-
470
if __name__ == "__main__":
    # Manual smoke test: fetch the first available transcript of a sample video.
    from rich import print

    demo_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    demo_transcript = YTTranscriber.get_transcript(demo_url, languages=None)
    print("Here's what we got! 🔥")
    print(demo_transcript)
1
+ """
2
+ >>> from webscout import YTTranscriber
3
+ >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
4
+ >>> print(transcript[0])
5
+ {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
6
+
7
+ """
8
+
9
+ import html
10
+ import http.cookiejar as cookiejar
11
+ import re
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from functools import lru_cache
14
+ from typing import Dict, List, Optional, Union
15
+ from xml.etree import ElementTree
16
+
17
+ from curl_cffi.requests import Session
18
+
19
+ from webscout.exceptions import (
20
+ CookiePathInvalidError,
21
+ FailedToCreateConsentCookieError,
22
+ InvalidVideoIdError,
23
+ NoTranscriptFoundError,
24
+ NotTranslatableError,
25
+ TooManyRequestsError,
26
+ TranscriptRetrievalError,
27
+ TranscriptsDisabledError,
28
+ TranslationLanguageNotAvailableError,
29
+ VideoUnavailableError,
30
+ YouTubeRequestFailedError,
31
+ )
32
+ from webscout.litagent import LitAgent
33
+
34
# YouTube API settings
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key={api_key}"
# Client context sent with every InnerTube player request.
INNERTUBE_CONTEXT = {"client": {"clientName": "ANDROID", "clientVersion": "20.10.38"}}
# Size of the thread pool owned by YTTranscriber.
MAX_WORKERS = 4


class YTTranscriber:
    """Fetch the transcript of a YouTube video.

    >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
    >>> print(transcript[0]['text'])
    'Never gonna give you up'
    """

    # HTTP session shared by all calls; created on first use by _get_session().
    _session = None
    # Kept for backward compatibility; nothing in this class submits work to it.
    _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

    @classmethod
    def _get_session(cls):
        """Return the shared HTTP session, creating and configuring it on first use."""
        if cls._session is None:
            session = Session()
            session.headers.update({'User-Agent': LitAgent().random()})
            cls._session = session
        return cls._session

    @classmethod
    def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
                       proxies: Optional[Dict[str, str]] = None,
                       cookies: Optional[str] = None,
                       preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
        """
        Retrieves the transcript for a given YouTube video URL.

        Results are cached (up to 100 distinct argument combinations). Every
        call returns fresh copies, so callers may modify the result freely.

        Args:
            video_url (str): YouTube video URL (supports various formats) or a bare video id.
            languages (str, optional): Language code for the transcript.
                                       If None, fetches the first available transcript.
                                       Defaults to 'en'.
            proxies (Dict[str, str], optional): Proxies to use for the request. They are
                merged into the shared session. Defaults to None.
            cookies (str, optional): Path to a Mozilla-format cookie file. Defaults to None.
            preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.

        Returns:
            List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
                - 'text': The transcribed text.
                - 'start': The start time of the text segment (in seconds).
                - 'duration': The duration of the text segment (in seconds).

        Raises:
            TranscriptRetrievalError: If there's an error retrieving the transcript.
        """
        # lru_cache needs hashable arguments and a dict is not hashable, so the
        # proxies are frozen into a sorted tuple before reaching the cached helper.
        proxy_items = tuple(sorted(proxies.items())) if proxies else None
        segments = cls._get_transcript_cached(
            video_url, languages, proxy_items, cookies, preserve_formatting,
        )
        # Copy so that mutations by the caller cannot corrupt the cached entry.
        return [dict(segment) for segment in segments]

    @classmethod
    @lru_cache(maxsize=100)
    def _get_transcript_cached(cls, video_url, languages, proxy_items, cookies, preserve_formatting):
        """Do the actual retrieval for get_transcript(); all arguments are hashable."""
        video_id = cls._extract_video_id(video_url)
        http_client = cls._get_session()

        if proxy_items:
            http_client.proxies.update(dict(proxy_items))

        if cookies:
            # Copy the loaded cookies into the session; previously the jar was
            # loaded and then dropped, so the cookies were never sent.
            for cookie in cls._load_cookies(cookies, video_id):
                http_client.cookies.set(cookie.name, cookie.value or '', domain=cookie.domain)

        transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
        language_codes = [languages] if languages else None
        transcript = transcript_list.find_transcript(language_codes)

        return tuple(transcript.fetch(preserve_formatting))

    # get_transcript used to be the lru_cache-wrapped function itself; keep its
    # cache controls reachable under the same names.
    get_transcript.__func__.cache_clear = _get_transcript_cached.__func__.cache_clear
    get_transcript.__func__.cache_info = _get_transcript_cached.__func__.cache_info

    @staticmethod
    def _extract_video_id(video_url: str) -> str:
        """Extracts the 11-character video ID from different YouTube URL formats.

        A bare video id is returned as is.

        Raises:
            InvalidVideoIdError: If no video id can be found.
        """
        # The lookahead rejects an 11-character prefix of a longer token
        # (a channel id, for instance).
        video_id = r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
        # Host-specific forms first; the generic "v=" / path-segment form is the
        # fallback. Listed first, it would shadow all the others.
        patterns = [
            r'youtu\.be\/' + video_id,
            r'youtube\.com\/embed\/' + video_id,
            r'youtube\.com\/shorts\/' + video_id,
            r'(?:v=|\/)' + video_id,
        ]

        for pattern in patterns:
            match = re.search(pattern, video_url)
            if match:
                return match.group(1)

        if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
            return video_url

        raise InvalidVideoIdError(video_url)

    @staticmethod
    def _load_cookies(cookies: str, video_id: str) -> cookiejar.MozillaCookieJar:
        """Loads cookies from a Mozilla-format cookie file.

        Raises:
            CookiePathInvalidError: If the file is missing or cannot be parsed.
        """
        try:
            cj = cookiejar.MozillaCookieJar(cookies)
            cj.load()
        except (cookiejar.LoadError, FileNotFoundError) as error:
            raise CookiePathInvalidError(video_id) from error
        return cj
132
+
133
+
134
class TranscriptListFetcher:
    """Fetches the list of transcripts for a YouTube video using InnerTube API."""

    # Present in the watch page when YouTube serves its consent form instead.
    _CONSENT_FORM_MARKER = 'action="https://consent.youtube.com/s"'

    def __init__(self, http_client: "Session"):
        """Initializes TranscriptListFetcher with the HTTP session used for all requests."""
        self._http_client = http_client

    def fetch(self, video_id: str):
        """Fetches and returns a TranscriptList for ``video_id``."""
        captions_json = self._fetch_captions_json(video_id)
        return TranscriptList.build(self._http_client, video_id, captions_json)

    def _fetch_captions_json(self, video_id: str) -> dict:
        """Fetches captions JSON using InnerTube API."""
        # The API key is only published inside the watch page, so that is fetched first.
        video_html = self._fetch_video_html(video_id)
        api_key = self._extract_innertube_api_key(video_html, video_id)

        innertube_data = self._fetch_innertube_data(video_id, api_key)
        return self._extract_captions_from_innertube(innertube_data, video_id)

    def _extract_innertube_api_key(self, html_content: str, video_id: str) -> str:
        """Extracts the InnerTube API key from HTML.

        Raises:
            TooManyRequestsError: If the page contains a reCAPTCHA instead of the key.
            TranscriptRetrievalError: If the key is missing for any other reason.
        """
        match = re.search(r'"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"', html_content)
        if match:
            return match.group(1)

        # Check for IP block
        if 'class="g-recaptcha"' in html_content:
            raise TooManyRequestsError(video_id)

        raise TranscriptRetrievalError(video_id, "Could not extract InnerTube API key")

    def _fetch_innertube_data(self, video_id: str, api_key: str) -> dict:
        """Fetches video data from InnerTube API and returns the decoded JSON body."""
        response = self._http_client.post(
            INNERTUBE_API_URL.format(api_key=api_key),
            json={
                "context": INNERTUBE_CONTEXT,
                "videoId": video_id,
            },
        )
        return _raise_http_errors(response, video_id).json()

    def _extract_captions_from_innertube(self, innertube_data: dict, video_id: str) -> dict:
        """Extracts captions JSON from InnerTube API response.

        Raises:
            VideoUnavailableError: Status ``ERROR`` with "unavailable" in the reason.
            TooManyRequestsError: Status ``LOGIN_REQUIRED`` with "bot" in the reason.
            TranscriptRetrievalError: Any other ``ERROR`` / ``LOGIN_REQUIRED`` status.
            TranscriptsDisabledError: If the response carries no caption tracks.
        """
        # ``or`` instead of a .get() default: a key that is present with a JSON
        # null value would otherwise come back as None and crash below.
        playability_status = innertube_data.get("playabilityStatus") or {}
        status = playability_status.get("status")
        reason = playability_status.get("reason") or ""
        lowered_reason = reason.lower()

        if status == "ERROR":
            if "unavailable" in lowered_reason:
                raise VideoUnavailableError(video_id)
            raise TranscriptRetrievalError(video_id, reason or "Unknown error")

        if status == "LOGIN_REQUIRED":
            if "bot" in lowered_reason:
                raise TooManyRequestsError(video_id)
            if "age" in lowered_reason or "inappropriate" in lowered_reason:
                raise TranscriptRetrievalError(video_id, "Video is age-restricted")
            raise TranscriptRetrievalError(video_id, reason or "Login required")

        captions = innertube_data.get("captions") or {}
        captions_json = captions.get("playerCaptionsTracklistRenderer")

        if captions_json is None or "captionTracks" not in captions_json:
            raise TranscriptsDisabledError(video_id)

        return captions_json

    def _create_consent_cookie(self, html_content, video_id):
        """Set the ``CONSENT`` cookie from the ``v`` form field of the consent page.

        Raises:
            FailedToCreateConsentCookieError: If the page has no such field.
        """
        match = re.search(r'name="v" value="(.*?)"', html_content)
        if match is None:
            raise FailedToCreateConsentCookieError(video_id)
        self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """Return the watch-page HTML, accepting the consent form once if it is shown.

        Raises:
            FailedToCreateConsentCookieError: If the consent form is still served after the retry.
        """
        html_content = self._fetch_html(video_id)
        if self._CONSENT_FORM_MARKER not in html_content:
            return html_content

        self._create_consent_cookie(html_content, video_id)
        html_content = self._fetch_html(video_id)
        if self._CONSENT_FORM_MARKER in html_content:
            raise FailedToCreateConsentCookieError(video_id)
        return html_content

    def _fetch_html(self, video_id):
        """Download the watch page and return its HTML-unescaped text."""
        response = self._http_client.get(
            WATCH_URL.format(video_id=video_id),
            headers={'Accept-Language': 'en-US'},
        )
        return html.unescape(_raise_http_errors(response, video_id).text)
231
+
232
+
233
class TranscriptList:
    """The transcripts available for one video, split into manual and generated ones.

    >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript)
    en ("English")[TRANSLATABLE]
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        :param video_id: the id of the video this list belongs to
        :param manually_created_transcripts: dict mapping language code -> Transcript
        :param generated_transcripts: dict mapping language code -> Transcript
        :param translation_languages: list of ``{'language', 'language_code'}`` dicts
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def _display_name(name_field, fallback):
        """
        Returns the label stored in a name field of the captions JSON.

        Understands the ``{'runs': [{'text': ...}]}`` format, the older
        ``{'simpleText': ...}`` format and plain values. ``fallback`` is
        returned when the field holds no label (missing, empty, empty ``runs``).
        """
        if isinstance(name_field, dict):
            runs = name_field.get('runs')
            if runs:
                return runs[0]['text']
            if 'simpleText' in name_field:
                return name_field['simpleText']
            return fallback
        return str(name_field) if name_field else fallback

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube API
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype TranscriptList:
        """
        translation_languages = [
            {
                'language': TranscriptList._display_name(entry.get('languageName'), entry['languageCode']),
                'language_code': entry['languageCode'],
            }
            for entry in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            language_code = caption['languageCode']
            # Tracks of kind 'asr' are the automatically generated ones.
            is_generated = caption.get('kind', '') == 'asr'
            target = generated_transcripts if is_generated else manually_created_transcripts

            # Remove &fmt=srv3 from URL as it can cause issues
            base_url = caption['baseUrl'].replace("&fmt=srv3", "")

            target[language_code] = Transcript(
                http_client,
                video_id,
                base_url,
                TranscriptList._display_name(caption.get('name'), language_code),
                language_code,
                is_generated,
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterate over the manually created transcripts first, then the generated ones."""
        return iter([
            *self._manually_created_transcripts.values(),
            *self._generated_transcripts.values(),
        ])

    def find_transcript(self, language_codes):
        """
        Finds a transcript for the given language codes; manually created
        transcripts win over generated ones for the same code. If no language
        is provided (``None`` or empty), or ``'any'`` is among the codes, the
        first available transcript is returned.

        :param language_codes: A list of language codes in a descending priority.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        if not language_codes:
            language_codes = ['any']

        if 'any' in language_codes:
            for transcript in self:
                return transcript
        return self._find_transcript(
            language_codes,
            [self._manually_created_transcripts, self._generated_transcripts],
        )

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so. ``None``, an empty list or ``'any'`` selects the first
            generated transcript.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        if not language_codes:
            language_codes = ['any']

        if 'any' in language_codes:
            for transcript in self:
                if transcript.is_generated:
                    return transcript
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so. ``None``, an empty list or ``'any'`` selects the first
            manually created transcript.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
        if not language_codes:
            language_codes = ['any']

        # Without this branch the ['any'] default above could never match a
        # language code, so "no preference" always ended in NoTranscriptFound.
        if 'any' in language_codes:
            for transcript in self._manually_created_transcripts.values():
                return transcript
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """Return the first hit, trying each code (in priority order) against each dict."""
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(self.video_id, language_codes, self)

    def __str__(self):
        """Describe which manual, generated and translation languages are available."""
        manual = self._get_language_description(
            str(transcript) for transcript in self._manually_created_transcripts.values()
        )
        generated = self._get_language_description(
            str(transcript) for transcript in self._generated_transcripts.values()
        )
        translations = self._get_language_description(
            '{language_code} ("{language}")'.format(
                language=entry['language'],
                language_code=entry['language_code'],
            )
            for entry in self._translation_languages
        )
        return (
            f'For this video ({self.video_id}) transcripts are available in the following languages:\n\n'
            f'(MANUALLY CREATED)\n{manual}\n\n'
            f'(GENERATED)\n{generated}\n\n'
            f'(TRANSLATION LANGUAGES)\n{translations}'
        )

    def _get_language_description(self, transcript_strings):
        """Render the strings as a `` - item`` bullet list, or ``'None'`` when there are none."""
        description = '\n'.join(f' - {transcript}' for transcript in transcript_strings)
        return description if description else 'None'
421
+
422
+
423
class Transcript:
    """One caption track of a video: metadata plus the means to download it.

    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript.language)
    'English'
    >>> if transcript.is_translatable:
    ...     es_transcript = transcript.translate('es')
    ...     print(es_transcript.language)
    'Spanish'
    """

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """Store the track metadata and index the translation targets by language code."""
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages

        # language_code -> display name, used by translate() for lookups.
        names_by_code = {}
        for entry in translation_languages:
            names_by_code[entry['language_code']] = entry['language']
        self._translation_languages_dict = names_by_code

    def fetch(self, preserve_formatting=False):
        """Download this track and parse it.

        Args:
            preserve_formatting (bool): Passed on to TranscriptParser. Defaults to False.

        Returns:
            list: Whatever ``TranscriptParser.parse`` produces for the response text.
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        parser = TranscriptParser(preserve_formatting=preserve_formatting)
        checked = _raise_http_errors(response, self.video_id)
        return parser.parse(checked.text)

    def __str__(self):
        """Return ``code ("Language")``, suffixed with ``[TRANSLATABLE]`` when applicable."""
        if self.is_translatable:
            suffix = '[TRANSLATABLE]'
        else:
            suffix = ''
        return f'{self.language_code} ("{self.language}"){suffix}'

    @property
    def is_translatable(self):
        """True when at least one translation language is attached to this track."""
        return len(self.translation_languages) > 0

    def translate(self, language_code):
        """Return a new Transcript that requests this track translated to ``language_code``.

        Args:
            language_code (str): Code of the target language.

        Returns:
            Transcript: Marked as generated and carrying no translation languages of its own.

        Raises:
            NotTranslatableError: If this track has no translation languages.
            TranslationLanguageNotAvailableError: If ``language_code`` is not among them.
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        known_targets = self._translation_languages_dict
        if language_code not in known_targets:
            raise TranslationLanguageNotAvailableError(self.video_id)

        translated_url = f'{self._url}&tlang={language_code}'
        return Transcript(
            self._http_client,
            self.video_id,
            translated_url,
            known_targets[language_code],
            language_code,
            True,
            [],
        )
504
+
505
+
506
class TranscriptParser:
    """Turns the transcript XML into a list of ``text`` / ``start`` / ``duration`` dicts.

    >>> parser = TranscriptParser(preserve_formatting=True)
    >>> data = parser.parse(xml_data)
    >>> print(data[0])
    {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
    """

    # Tags that are left in the text when preserve_formatting is requested.
    _FORMATTING_TAGS = [
        'strong',
        'em',
        'b',
        'i',
        'mark',
        'small',
        'del',
        'ins',
        'sub',
        'sup',
    ]

    # Used by the fallback parser only: one <text ...>...</text> element, and
    # the two attributes of interest in any order.
    _TEXT_ELEMENT_REGEX = re.compile(r'<text\b([^>]*)>(.*?)</text>', re.DOTALL)
    _START_ATTR_REGEX = re.compile(r'\bstart="([^"]+)"')
    _DUR_ATTR_REGEX = re.compile(r'\bdur="([^"]+)"')

    def __init__(self, preserve_formatting=False):
        """Compile the tag-stripping regex once for this parser."""
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting):
        """Return the regex whose matches are removed from each text.

        Without ``preserve_formatting`` every tag matches; with it, the tags
        listed in ``_FORMATTING_TAGS`` are excluded by a negative lookahead.
        """
        if not preserve_formatting:
            return re.compile(r'<[^>]*>', re.IGNORECASE)

        kept_tags = '|'.join(self._FORMATTING_TAGS)
        return re.compile(r'<\/?(?!\/?(' + kept_tags + r')\b).*?\b>', re.IGNORECASE)

    def parse(self, plain_data):
        """Parse transcript XML into a list of segment dicts.

        Child elements without text are skipped; a missing ``dur`` attribute
        counts as ``0.0``. If the document is not well-formed XML, the
        regex-based fallback parser is used instead.
        """
        # NOTE(review): xml.etree.ElementTree is not hardened against malicious
        # documents and plain_data comes from the network.
        try:
            root = ElementTree.fromstring(plain_data)
        except ElementTree.ParseError:
            # If XML parsing fails, try to extract text manually
            return self._fallback_parse(plain_data)

        return [
            {
                'text': self._html_regex.sub('', html.unescape(node.text or '')),
                'start': float(node.attrib['start']),
                'duration': float(node.attrib.get('dur', '0.0')),
            }
            for node in root
            if node.text is not None
        ]

    def _fallback_parse(self, plain_data):
        """Best-effort extraction of ``<text>`` elements when XML parsing fails.

        As in parse(), ``dur`` is optional and defaults to ``0.0``. The
        attributes may appear in any order. Elements without a numeric
        ``start``, or whose text is empty after stripping, are skipped.
        """
        results = []
        for attributes, raw_text in self._TEXT_ELEMENT_REGEX.findall(plain_data):
            start_match = self._START_ATTR_REGEX.search(attributes)
            if start_match is None:
                continue
            duration_match = self._DUR_ATTR_REGEX.search(attributes)
            try:
                start = float(start_match.group(1))
                duration = float(duration_match.group(1)) if duration_match else 0.0
            except ValueError:
                # Best effort: drop the malformed entry and keep the rest.
                continue

            text = self._html_regex.sub('', html.unescape(raw_text)).strip()
            if text:
                results.append({
                    'text': text,
                    'start': start,
                    'duration': duration,
                })

        return results
576
+
577
+
578
+ def _raise_http_errors(response, video_id):
579
+ """Handle those HTTP errors with style! 🛠️"""
580
+ try:
581
+ if response.status_code == 429:
582
+ raise TooManyRequestsError(video_id)
583
+ response.raise_for_status()
584
+ return response
585
+ except Exception as error:
586
+ raise YouTubeRequestFailedError(video_id, error)
587
+
588
+
589
if __name__ == "__main__":
    # Manual smoke test: fetch the first available transcript of a sample
    # video and show its first five segments.
    from rich import print

    demo_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    demo_transcript = YTTranscriber.get_transcript(demo_url, languages=None)
    print("Here's what we got! 🔥")
    print(demo_transcript[:5])