webscout 7.0__py3-none-any.whl → 7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (147) hide show
  1. webscout/AIauto.py +191 -191
  2. webscout/AIbase.py +122 -122
  3. webscout/AIutel.py +440 -440
  4. webscout/Bard.py +343 -161
  5. webscout/DWEBS.py +489 -492
  6. webscout/Extra/YTToolkit/YTdownloader.py +995 -995
  7. webscout/Extra/YTToolkit/__init__.py +2 -2
  8. webscout/Extra/YTToolkit/transcriber.py +476 -479
  9. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  10. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  11. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  12. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  13. webscout/Extra/YTToolkit/ytapi/video.py +103 -103
  14. webscout/Extra/autocoder/__init__.py +9 -9
  15. webscout/Extra/autocoder/autocoder_utiles.py +199 -199
  16. webscout/Extra/autocoder/rawdog.py +5 -7
  17. webscout/Extra/autollama.py +230 -230
  18. webscout/Extra/gguf.py +3 -3
  19. webscout/Extra/weather.py +171 -171
  20. webscout/LLM.py +442 -442
  21. webscout/Litlogger/__init__.py +67 -681
  22. webscout/Litlogger/core/__init__.py +6 -0
  23. webscout/Litlogger/core/level.py +20 -0
  24. webscout/Litlogger/core/logger.py +123 -0
  25. webscout/Litlogger/handlers/__init__.py +12 -0
  26. webscout/Litlogger/handlers/console.py +50 -0
  27. webscout/Litlogger/handlers/file.py +143 -0
  28. webscout/Litlogger/handlers/network.py +174 -0
  29. webscout/Litlogger/styles/__init__.py +7 -0
  30. webscout/Litlogger/styles/colors.py +231 -0
  31. webscout/Litlogger/styles/formats.py +377 -0
  32. webscout/Litlogger/styles/text.py +87 -0
  33. webscout/Litlogger/utils/__init__.py +6 -0
  34. webscout/Litlogger/utils/detectors.py +154 -0
  35. webscout/Litlogger/utils/formatters.py +200 -0
  36. webscout/Provider/AISEARCH/DeepFind.py +250 -250
  37. webscout/Provider/Blackboxai.py +136 -137
  38. webscout/Provider/ChatGPTGratis.py +226 -0
  39. webscout/Provider/Cloudflare.py +91 -78
  40. webscout/Provider/DeepSeek.py +218 -0
  41. webscout/Provider/Deepinfra.py +59 -35
  42. webscout/Provider/Free2GPT.py +131 -124
  43. webscout/Provider/Gemini.py +100 -115
  44. webscout/Provider/Glider.py +74 -59
  45. webscout/Provider/Groq.py +30 -18
  46. webscout/Provider/Jadve.py +108 -77
  47. webscout/Provider/Llama3.py +117 -94
  48. webscout/Provider/Marcus.py +191 -137
  49. webscout/Provider/Netwrck.py +62 -50
  50. webscout/Provider/PI.py +79 -124
  51. webscout/Provider/PizzaGPT.py +129 -83
  52. webscout/Provider/QwenLM.py +311 -0
  53. webscout/Provider/TTI/AiForce/__init__.py +22 -22
  54. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
  55. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
  56. webscout/Provider/TTI/Nexra/__init__.py +22 -22
  57. webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
  58. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
  59. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
  60. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
  61. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
  62. webscout/Provider/TTI/artbit/__init__.py +22 -22
  63. webscout/Provider/TTI/artbit/async_artbit.py +184 -184
  64. webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
  65. webscout/Provider/TTI/blackbox/__init__.py +4 -4
  66. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
  67. webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
  68. webscout/Provider/TTI/deepinfra/__init__.py +4 -4
  69. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
  70. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
  71. webscout/Provider/TTI/huggingface/__init__.py +22 -22
  72. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
  73. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
  74. webscout/Provider/TTI/imgninza/__init__.py +4 -4
  75. webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
  76. webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
  77. webscout/Provider/TTI/talkai/__init__.py +4 -4
  78. webscout/Provider/TTI/talkai/async_talkai.py +229 -229
  79. webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
  80. webscout/Provider/TTS/deepgram.py +182 -182
  81. webscout/Provider/TTS/elevenlabs.py +136 -136
  82. webscout/Provider/TTS/gesserit.py +150 -150
  83. webscout/Provider/TTS/murfai.py +138 -138
  84. webscout/Provider/TTS/parler.py +133 -134
  85. webscout/Provider/TTS/streamElements.py +360 -360
  86. webscout/Provider/TTS/utils.py +280 -280
  87. webscout/Provider/TTS/voicepod.py +116 -116
  88. webscout/Provider/TextPollinationsAI.py +74 -47
  89. webscout/Provider/WiseCat.py +193 -0
  90. webscout/Provider/__init__.py +144 -136
  91. webscout/Provider/cerebras.py +242 -227
  92. webscout/Provider/chatglm.py +204 -204
  93. webscout/Provider/dgaf.py +67 -39
  94. webscout/Provider/gaurish.py +105 -66
  95. webscout/Provider/geminiapi.py +208 -208
  96. webscout/Provider/granite.py +223 -0
  97. webscout/Provider/hermes.py +218 -218
  98. webscout/Provider/llama3mitril.py +179 -179
  99. webscout/Provider/llamatutor.py +72 -62
  100. webscout/Provider/llmchat.py +60 -35
  101. webscout/Provider/meta.py +794 -794
  102. webscout/Provider/multichat.py +331 -230
  103. webscout/Provider/typegpt.py +359 -356
  104. webscout/Provider/yep.py +5 -5
  105. webscout/__main__.py +5 -5
  106. webscout/cli.py +319 -319
  107. webscout/conversation.py +241 -242
  108. webscout/exceptions.py +328 -328
  109. webscout/litagent/__init__.py +28 -28
  110. webscout/litagent/agent.py +2 -3
  111. webscout/litprinter/__init__.py +0 -58
  112. webscout/scout/__init__.py +8 -8
  113. webscout/scout/core.py +884 -884
  114. webscout/scout/element.py +459 -459
  115. webscout/scout/parsers/__init__.py +69 -69
  116. webscout/scout/parsers/html5lib_parser.py +172 -172
  117. webscout/scout/parsers/html_parser.py +236 -236
  118. webscout/scout/parsers/lxml_parser.py +178 -178
  119. webscout/scout/utils.py +38 -38
  120. webscout/swiftcli/__init__.py +811 -811
  121. webscout/update_checker.py +2 -12
  122. webscout/version.py +1 -1
  123. webscout/webscout_search.py +1142 -1140
  124. webscout/webscout_search_async.py +635 -635
  125. webscout/zeroart/__init__.py +54 -54
  126. webscout/zeroart/base.py +60 -60
  127. webscout/zeroart/effects.py +99 -99
  128. webscout/zeroart/fonts.py +816 -816
  129. {webscout-7.0.dist-info → webscout-7.2.dist-info}/METADATA +21 -28
  130. webscout-7.2.dist-info/RECORD +217 -0
  131. webstoken/__init__.py +30 -30
  132. webstoken/classifier.py +189 -189
  133. webstoken/keywords.py +216 -216
  134. webstoken/language.py +128 -128
  135. webstoken/ner.py +164 -164
  136. webstoken/normalizer.py +35 -35
  137. webstoken/processor.py +77 -77
  138. webstoken/sentiment.py +206 -206
  139. webstoken/stemmer.py +73 -73
  140. webstoken/tagger.py +60 -60
  141. webstoken/tokenizer.py +158 -158
  142. webscout/Provider/RUBIKSAI.py +0 -272
  143. webscout-7.0.dist-info/RECORD +0 -199
  144. {webscout-7.0.dist-info → webscout-7.2.dist-info}/LICENSE.md +0 -0
  145. {webscout-7.0.dist-info → webscout-7.2.dist-info}/WHEEL +0 -0
  146. {webscout-7.0.dist-info → webscout-7.2.dist-info}/entry_points.txt +0 -0
  147. {webscout-7.0.dist-info → webscout-7.2.dist-info}/top_level.txt +0 -0
@@ -1,480 +1,477 @@
1
- """Wassup fam! 🔥 This module is your go-to for getting those YouTube transcripts!
2
-
3
- >>> from webscout import YTTranscriber
4
- >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
5
- >>> print(transcript)
6
- {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
7
-
8
- Built different by @HelpingAI 👑
9
- """
10
-
11
- import requests # For making those HTTP requests like a boss 🌐
12
- import http.cookiejar as cookiejar # Handling cookies and stuff 🍪
13
- import json # JSON parsing - keeping it clean! 📝
14
- from xml.etree import ElementTree # XML parsing magic
15
- import re # Regex for pattern matching 🎯
16
- import html # HTML stuff made easy 💪
17
- from typing import List, Dict, Union, Optional # Type hints for that clean code 💯
18
- from functools import lru_cache # Cache that data for speed! ⚡
19
- from concurrent.futures import ThreadPoolExecutor # Parallel processing gang 🚀
20
- import asyncio # Async/await swag 😎
21
- from webscout.exceptions import * # All our custom exceptions 🛠️
22
-
23
- WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
24
- MAX_WORKERS = 4 # Keeping it optimal fam! 💪
25
-
26
- class YTTranscriber:
27
- """Your boy for getting those YouTube transcripts! 🎥
28
-
29
- >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
30
- >>> print(transcript[0]['text'])
31
- 'Never gonna give you up'
32
- """
33
-
34
- _session = None
35
- _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
36
-
37
- @classmethod
38
- def _get_session(cls):
39
- if cls._session is None:
40
- cls._session = requests.Session()
41
- cls._session.headers.update({
42
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
43
- })
44
- return cls._session
45
-
46
- @classmethod
47
- @lru_cache(maxsize=100)
48
- def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
49
- proxies: Dict[str, str] = None,
50
- cookies: str = None,
51
- preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
52
- """
53
- Retrieves the transcript for a given YouTube video URL.
54
-
55
- Args:
56
- video_url (str): YouTube video URL (supports various formats).
57
- languages (str, optional): Language code for the transcript.
58
- If None, fetches the auto-generated transcript.
59
- Defaults to 'en'.
60
- proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
61
- cookies (str, optional): Path to the cookie file. Defaults to None.
62
- preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
63
-
64
- Returns:
65
- List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
66
- - 'text': The transcribed text.
67
- - 'start': The start time of the text segment (in seconds).
68
- - 'duration': The duration of the text segment (in seconds).
69
-
70
- Raises:
71
- TranscriptRetrievalError: If there's an error retrieving the transcript.
72
- """
73
- video_id = cls._extract_video_id(video_url)
74
- http_client = cls._get_session()
75
-
76
- if proxies:
77
- http_client.proxies.update(proxies)
78
-
79
- if cookies:
80
- cls._load_cookies(cookies, video_id)
81
-
82
- transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
83
- language_codes = [languages] if languages else None
84
- transcript = transcript_list.find_transcript(language_codes)
85
-
86
- return transcript.fetch(preserve_formatting)
87
-
88
- @staticmethod
89
- def _extract_video_id(video_url: str) -> str:
90
- """Extracts the video ID from different YouTube URL formats."""
91
- patterns = [
92
- r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
93
- r'youtu\.be\/([0-9A-Za-z_-]{11})',
94
- r'youtube\.com\/embed\/([0-9A-Za-z_-]{11})'
95
- ]
96
-
97
- for pattern in patterns:
98
- match = re.search(pattern, video_url)
99
- if match:
100
- return match.group(1)
101
-
102
- if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
103
- return video_url
104
-
105
- raise InvalidVideoIdError(video_url)
106
-
107
- @staticmethod
108
- def _load_cookies(cookies: str, video_id: str) -> None:
109
- """Loads cookies from a file."""
110
- try:
111
- cj = cookiejar.MozillaCookieJar(cookies)
112
- cj.load()
113
- return cj
114
- except (cookiejar.LoadError, FileNotFoundError):
115
- raise CookiePathInvalidError(video_id)
116
-
117
- class TranscriptListFetcher:
118
- """Fetches the list of transcripts for a YouTube video."""
119
-
120
- def __init__(self, http_client: requests.Session):
121
- """Initializes TranscriptListFetcher."""
122
- self._http_client = http_client
123
-
124
- def fetch(self, video_id: str):
125
- """Fetches and returns a TranscriptList."""
126
- return TranscriptList.build(
127
- self._http_client,
128
- video_id,
129
- self._extract_captions_json(self._fetch_video_html(video_id), video_id),
130
- )
131
-
132
- def _extract_captions_json(self, html: str, video_id: str) -> dict:
133
- """Extracts the captions JSON data from the video's HTML."""
134
- splitted_html = html.split('"captions":')
135
-
136
- if len(splitted_html) <= 1:
137
- if video_id.startswith('http://') or video_id.startswith('https://'):
138
- raise InvalidVideoIdError(video_id)
139
- if 'class="g-recaptcha"' in html:
140
- raise TooManyRequestsError(video_id)
141
- if '"playabilityStatus":' not in html:
142
- raise VideoUnavailableError(video_id)
143
-
144
- raise TranscriptsDisabledError(video_id)
145
-
146
- captions_json = json.loads(
147
- splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
148
- ).get('playerCaptionsTracklistRenderer')
149
- if captions_json is None:
150
- raise TranscriptsDisabledError(video_id)
151
-
152
- if 'captionTracks' not in captions_json:
153
- raise TranscriptsDisabledError(video_id)
154
-
155
- return captions_json
156
-
157
- def _create_consent_cookie(self, html, video_id):
158
- match = re.search('name="v" value="(.*?)"', html)
159
- if match is None:
160
- raise FailedToCreateConsentCookieError(video_id)
161
- self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
162
-
163
- def _fetch_video_html(self, video_id):
164
- html = self._fetch_html(video_id)
165
- if 'action="https://consent.youtube.com/s"' in html:
166
- self._create_consent_cookie(html, video_id)
167
- html = self._fetch_html(video_id)
168
- if 'action="https://consent.youtube.com/s"' in html:
169
- raise FailedToCreateConsentCookieError(video_id)
170
- return html
171
-
172
- def _fetch_html(self, video_id):
173
- response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
174
- return html.unescape(_raise_http_errors(response, video_id).text)
175
-
176
-
177
- class TranscriptList:
178
- """Yo fam! This class is all about managing those YouTube transcript lists! 🎯
179
-
180
- >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
181
- >>> transcript = transcript_list.find_transcript(['en'])
182
- >>> print(transcript)
183
- en ("English")[TRANSLATABLE]
184
- """
185
-
186
- def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
187
- """Init that transcript list with all the good stuff! 💯"""
188
- self.video_id = video_id
189
- self._manually_created_transcripts = manually_created_transcripts
190
- self._generated_transcripts = generated_transcripts
191
- self._translation_languages = translation_languages
192
-
193
- @staticmethod
194
- def build(http_client, video_id, captions_json):
195
- """
196
- Factory method for TranscriptList.
197
-
198
- :param http_client: http client which is used to make the transcript retrieving http calls
199
- :type http_client: requests.Session
200
- :param video_id: the id of the video this TranscriptList is for
201
- :type video_id: str
202
- :param captions_json: the JSON parsed from the YouTube pages static HTML
203
- :type captions_json: dict
204
- :return: the created TranscriptList
205
- :rtype TranscriptList:
206
- """
207
- translation_languages = [
208
- {
209
- 'language': translation_language['languageName']['simpleText'],
210
- 'language_code': translation_language['languageCode'],
211
- } for translation_language in captions_json.get('translationLanguages', [])
212
- ]
213
-
214
- manually_created_transcripts = {}
215
- generated_transcripts = {}
216
-
217
- for caption in captions_json['captionTracks']:
218
- if caption.get('kind', '') == 'asr':
219
- transcript_dict = generated_transcripts
220
- else:
221
- transcript_dict = manually_created_transcripts
222
-
223
- transcript_dict[caption['languageCode']] = Transcript(
224
- http_client,
225
- video_id,
226
- caption['baseUrl'],
227
- caption['name']['simpleText'],
228
- caption['languageCode'],
229
- caption.get('kind', '') == 'asr',
230
- translation_languages if caption.get('isTranslatable', False) else [],
231
- )
232
-
233
- return TranscriptList(
234
- video_id,
235
- manually_created_transcripts,
236
- generated_transcripts,
237
- translation_languages,
238
- )
239
-
240
- def __iter__(self):
241
- return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
242
-
243
- def find_transcript(self, language_codes):
244
- """
245
- Finds a transcript for a given language code. If no language is provided, it will
246
- return the auto-generated transcript.
247
-
248
- :param language_codes: A list of language codes in a descending priority.
249
- :type languages: list[str]
250
- :return: the found Transcript
251
- :rtype Transcript:
252
- :raises: NoTranscriptFound
253
- """
254
- if 'any' in language_codes:
255
- for transcript in self:
256
- return transcript
257
- return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
258
-
259
- def find_generated_transcript(self, language_codes):
260
- """
261
- Finds an automatically generated transcript for a given language code.
262
-
263
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
264
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
265
- it fails to do so.
266
- :type languages: list[str]
267
- :return: the found Transcript
268
- :rtype Transcript:
269
- :raises: NoTranscriptFound
270
- """
271
- if 'any' in language_codes:
272
- for transcript in self:
273
- if transcript.is_generated:
274
- return transcript
275
- return self._find_transcript(language_codes, [self._generated_transcripts])
276
-
277
- def find_manually_created_transcript(self, language_codes):
278
- """
279
- Finds a manually created transcript for a given language code.
280
-
281
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
282
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
283
- it fails to do so.
284
- :type languages: list[str]
285
- :return: the found Transcript
286
- :rtype Transcript:
287
- :raises: NoTranscriptFound
288
- """
289
- return self._find_transcript(language_codes, [self._manually_created_transcripts])
290
-
291
- def _find_transcript(self, language_codes, transcript_dicts):
292
- for language_code in language_codes:
293
- for transcript_dict in transcript_dicts:
294
- if language_code in transcript_dict:
295
- return transcript_dict[language_code]
296
-
297
- raise NoTranscriptFoundError(
298
- self.video_id,
299
- language_codes,
300
- self
301
- )
302
-
303
- def __str__(self):
304
- return (
305
- 'For this video ({video_id}) transcripts are available in the following languages:\n\n'
306
- '(MANUALLY CREATED)\n'
307
- '{available_manually_created_transcript_languages}\n\n'
308
- '(GENERATED)\n'
309
- '{available_generated_transcripts}\n\n'
310
- '(TRANSLATION LANGUAGES)\n'
311
- '{available_translation_languages}'
312
- ).format(
313
- video_id=self.video_id,
314
- available_manually_created_transcript_languages=self._get_language_description(
315
- str(transcript) for transcript in self._manually_created_transcripts.values()
316
- ),
317
- available_generated_transcripts=self._get_language_description(
318
- str(transcript) for transcript in self._generated_transcripts.values()
319
- ),
320
- available_translation_languages=self._get_language_description(
321
- '{language_code} ("{language}")'.format(
322
- language=translation_language['language'],
323
- language_code=translation_language['language_code'],
324
- ) for translation_language in self._translation_languages
325
- )
326
- )
327
-
328
- def _get_language_description(self, transcript_strings):
329
- description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
330
- return description if description else 'None'
331
-
332
-
333
- class Transcript:
334
- """Your personal transcript handler! 🎭
335
-
336
- >>> transcript = transcript_list.find_transcript(['en'])
337
- >>> print(transcript.language)
338
- 'English'
339
- >>> if transcript.is_translatable:
340
- ... es_transcript = transcript.translate('es')
341
- ... print(es_transcript.language)
342
- 'Spanish'
343
- """
344
-
345
- def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
346
- """Initialize with all the goodies! 🎁"""
347
- self._http_client = http_client
348
- self.video_id = video_id
349
- self._url = url
350
- self.language = language
351
- self.language_code = language_code
352
- self.is_generated = is_generated
353
- self.translation_languages = translation_languages
354
- self._translation_languages_dict = {
355
- translation_language['language_code']: translation_language['language']
356
- for translation_language in translation_languages
357
- }
358
-
359
- def fetch(self, preserve_formatting=False):
360
- """Get that transcript data! 🎯
361
-
362
- Args:
363
- preserve_formatting (bool): Keep HTML formatting? Default is nah fam.
364
-
365
- Returns:
366
- list: That sweet transcript data with text, start time, and duration! 📝
367
- """
368
- response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
369
- return TranscriptParser(preserve_formatting=preserve_formatting).parse(
370
- _raise_http_errors(response, self.video_id).text,
371
- )
372
-
373
- def __str__(self):
374
- """String representation looking clean! 💅"""
375
- return '{language_code} ("{language}"){translation_description}'.format(
376
- language=self.language,
377
- language_code=self.language_code,
378
- translation_description='[TRANSLATABLE]' if self.is_translatable else ''
379
- )
380
-
381
- @property
382
- def is_translatable(self):
383
- """Can we translate this? 🌍"""
384
- return len(self.translation_languages) > 0
385
-
386
- def translate(self, language_code):
387
- """Translate to another language! 🌎
388
-
389
- Args:
390
- language_code (str): Which language you want fam?
391
-
392
- Returns:
393
- Transcript: A fresh transcript in your requested language! 🔄
394
-
395
- Raises:
396
- NotTranslatableError: If we can't translate this one 😢
397
- TranslationLanguageNotAvailableError: If that language isn't available 🚫
398
- """
399
- if not self.is_translatable:
400
- raise NotTranslatableError(self.video_id)
401
-
402
- if language_code not in self._translation_languages_dict:
403
- raise TranslationLanguageNotAvailableError(self.video_id)
404
-
405
- return Transcript(
406
- self._http_client,
407
- self.video_id,
408
- '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
409
- self._translation_languages_dict[language_code],
410
- language_code,
411
- True,
412
- [],
413
- )
414
-
415
-
416
- class TranscriptParser:
417
- """Parsing those transcripts like a pro! 🎯
418
-
419
- >>> parser = TranscriptParser(preserve_formatting=True)
420
- >>> data = parser.parse(xml_data)
421
- >>> print(data[0])
422
- {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
423
- """
424
-
425
- _FORMATTING_TAGS = [
426
- 'strong', # For that extra emphasis 💪
427
- 'em', # When you need that italic swag 🎨
428
- 'b', # Bold and beautiful 💯
429
- 'i', # More italic vibes
430
- 'mark', # Highlight that text 🌟
431
- 'small', # Keep it lowkey 🤫
432
- 'del', # Strike it out ⚡
433
- 'ins', # Insert new stuff 🆕
434
- 'sub', # Subscript gang 📉
435
- 'sup', # Superscript squad 📈
436
- ]
437
-
438
- def __init__(self, preserve_formatting=False):
439
- """Get ready to parse with style! 🎨"""
440
- self._html_regex = self._get_html_regex(preserve_formatting)
441
-
442
- def _get_html_regex(self, preserve_formatting):
443
- """Get that regex pattern ready! 🎯"""
444
- if preserve_formatting:
445
- formats_regex = '|'.join(self._FORMATTING_TAGS)
446
- formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
447
- html_regex = re.compile(formats_regex, re.IGNORECASE)
448
- else:
449
- html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
450
- return html_regex
451
-
452
- def parse(self, plain_data):
453
- """Parse that XML data into something beautiful! ✨"""
454
- return [
455
- {
456
- 'text': re.sub(self._html_regex, '', html.unescape(xml_element.text)),
457
- 'start': float(xml_element.attrib['start']),
458
- 'duration': float(xml_element.attrib.get('dur', '0.0')),
459
- }
460
- for xml_element in ElementTree.fromstring(plain_data)
461
- if xml_element.text is not None
462
- ]
463
-
464
-
465
- def _raise_http_errors(response, video_id):
466
- """Handle those HTTP errors with style! 🛠️"""
467
- try:
468
- response.raise_for_status()
469
- return response
470
- except requests.exceptions.HTTPError as error:
471
- raise YouTubeRequestFailedError(video_id, error)
472
-
473
-
474
- if __name__ == "__main__":
475
- # Let's get this party started! 🎉
476
- from rich import print
477
- video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
478
- transcript = YTTranscriber.get_transcript(video_url, languages=None)
479
- print("Here's what we got! 🔥")
1
+ """
2
+ >>> from webscout import YTTranscriber
3
+ >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
4
+ >>> print(transcript)
5
+ {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
6
+
7
+ """
8
+
9
+ import requests # For making those HTTP requests like a boss 🌐
10
+ import http.cookiejar as cookiejar # Handling cookies and stuff 🍪
11
+ import json # JSON parsing - keeping it clean! 📝
12
+ from xml.etree import ElementTree # XML parsing magic
13
+ import re # Regex for pattern matching 🎯
14
+ import html # HTML stuff made easy 💪
15
+ from typing import List, Dict, Union, Optional # Type hints for that clean code 💯
16
+ from functools import lru_cache # Cache that data for speed! ⚡
17
+ from concurrent.futures import ThreadPoolExecutor # Parallel processing gang 🚀
18
+ import asyncio # Async/await swag 😎
19
+ from webscout.exceptions import * # All our custom exceptions 🛠️
20
+
21
+ WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
22
+ MAX_WORKERS = 4 # Keeping it optimal fam! 💪
23
+
24
+ class YTTranscriber:
25
+ """Your boy for getting those YouTube transcripts! 🎥
26
+
27
+ >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
28
+ >>> print(transcript[0]['text'])
29
+ 'Never gonna give you up'
30
+ """
31
+
32
+ _session = None
33
+ _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
34
+
35
+ @classmethod
36
+ def _get_session(cls):
37
+ if cls._session is None:
38
+ cls._session = requests.Session()
39
+ cls._session.headers.update({
40
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
41
+ })
42
+ return cls._session
43
+
44
+ @classmethod
45
+ @lru_cache(maxsize=100)
46
+ def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
47
+ proxies: Dict[str, str] = None,
48
+ cookies: str = None,
49
+ preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
50
+ """
51
+ Retrieves the transcript for a given YouTube video URL.
52
+
53
+ Args:
54
+ video_url (str): YouTube video URL (supports various formats).
55
+ languages (str, optional): Language code for the transcript.
56
+ If None, fetches the auto-generated transcript.
57
+ Defaults to 'en'.
58
+ proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
59
+ cookies (str, optional): Path to the cookie file. Defaults to None.
60
+ preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
61
+
62
+ Returns:
63
+ List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
64
+ - 'text': The transcribed text.
65
+ - 'start': The start time of the text segment (in seconds).
66
+ - 'duration': The duration of the text segment (in seconds).
67
+
68
+ Raises:
69
+ TranscriptRetrievalError: If there's an error retrieving the transcript.
70
+ """
71
+ video_id = cls._extract_video_id(video_url)
72
+ http_client = cls._get_session()
73
+
74
+ if proxies:
75
+ http_client.proxies.update(proxies)
76
+
77
+ if cookies:
78
+ cls._load_cookies(cookies, video_id)
79
+
80
+ transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
81
+ language_codes = [languages] if languages else None
82
+ transcript = transcript_list.find_transcript(language_codes)
83
+
84
+ return transcript.fetch(preserve_formatting)
85
+
86
+ @staticmethod
87
+ def _extract_video_id(video_url: str) -> str:
88
+ """Extracts the video ID from different YouTube URL formats."""
89
+ patterns = [
90
+ r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
91
+ r'youtu\.be\/([0-9A-Za-z_-]{11})',
92
+ r'youtube\.com\/embed\/([0-9A-Za-z_-]{11})'
93
+ ]
94
+
95
+ for pattern in patterns:
96
+ match = re.search(pattern, video_url)
97
+ if match:
98
+ return match.group(1)
99
+
100
+ if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
101
+ return video_url
102
+
103
+ raise InvalidVideoIdError(video_url)
104
+
105
+ @staticmethod
106
+ def _load_cookies(cookies: str, video_id: str) -> None:
107
+ """Loads cookies from a file."""
108
+ try:
109
+ cj = cookiejar.MozillaCookieJar(cookies)
110
+ cj.load()
111
+ return cj
112
+ except (cookiejar.LoadError, FileNotFoundError):
113
+ raise CookiePathInvalidError(video_id)
114
+
115
class TranscriptListFetcher:
    """Downloads a video's watch page and builds a TranscriptList from it."""

    def __init__(self, http_client: requests.Session):
        """Keep the HTTP client that is used for every request of this fetcher."""
        self._http_client = http_client

    def fetch(self, video_id: str):
        """Download the watch page of ``video_id`` and return its TranscriptList."""
        page = self._fetch_video_html(video_id)
        captions = self._extract_captions_json(page, video_id)
        return TranscriptList.build(self._http_client, video_id, captions)

    def _extract_captions_json(self, html: str, video_id: str) -> dict:
        """Return the ``playerCaptionsTracklistRenderer`` dict embedded in ``html``.

        Raises InvalidVideoIdError, TooManyRequestsError, VideoUnavailableError or
        TranscriptsDisabledError when the page carries no usable captions data.
        """
        parts = html.split('"captions":')

        if len(parts) > 1:
            # The captions object sits between '"captions":' and ',"videoDetails'.
            raw_json = parts[1].split(',"videoDetails')[0].replace('\n', '')
            captions = json.loads(raw_json).get('playerCaptionsTracklistRenderer')
            if captions is None or 'captionTracks' not in captions:
                raise TranscriptsDisabledError(video_id)
            return captions

        # No captions marker at all: work out the most specific reason.
        if video_id.startswith(('http://', 'https://')):
            raise InvalidVideoIdError(video_id)
        if 'class="g-recaptcha"' in html:
            raise TooManyRequestsError(video_id)
        if '"playabilityStatus":' not in html:
            raise VideoUnavailableError(video_id)
        raise TranscriptsDisabledError(video_id)

    def _create_consent_cookie(self, html, video_id):
        """Set the CONSENT cookie on the client from the ``v`` form value in ``html``."""
        found = re.search('name="v" value="(.*?)"', html)
        if found is None:
            raise FailedToCreateConsentCookieError(video_id)
        consent_value = 'YES+' + found.group(1)
        self._http_client.cookies.set('CONSENT', consent_value, domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """Return the watch page, retrying once with a consent cookie if required."""
        consent_form = 'action="https://consent.youtube.com/s"'

        page = self._fetch_html(video_id)
        if consent_form not in page:
            return page

        # The page contains the consent form: set the cookie and fetch again.
        self._create_consent_cookie(page, video_id)
        page = self._fetch_html(video_id)
        if consent_form in page:
            raise FailedToCreateConsentCookieError(video_id)
        return page

    def _fetch_html(self, video_id):
        """GET the watch URL of ``video_id`` and return its HTML-unescaped body."""
        watch_url = WATCH_URL.format(video_id=video_id)
        response = self._http_client.get(watch_url, headers={'Accept-Language': 'en-US'})
        checked = _raise_http_errors(response, video_id)
        return html.unescape(checked.text)
173
+
174
+
175
class TranscriptList:
    """
    Holds every transcript available for one video and offers lookups by language.

    >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript)
    en ("English")[TRANSLATABLE]
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        Store the transcripts of a video.

        :param video_id: the id of the video this TranscriptList is for
        :param manually_created_transcripts: dict mapping language code to manually created transcript
        :param generated_transcripts: dict mapping language code to automatically generated transcript
        :param translation_languages: list of dicts with the keys ``language`` and ``language_code``
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype: TranscriptList
        """
        translation_languages = [
            {
                'language': entry['languageName']['simpleText'],
                'language_code': entry['languageCode'],
            }
            for entry in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # Tracks whose kind is 'asr' are filed as generated, all others as manual.
            is_generated = caption.get('kind', '') == 'asr'
            target = generated_transcripts if is_generated else manually_created_transcripts
            target[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                is_generated,
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterate over all manually created transcripts, then all generated ones."""
        return iter([
            *self._manually_created_transcripts.values(),
            *self._generated_transcripts.values(),
        ])

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are
        preferred over generated ones. If ``'any'`` is among the language codes, the first
        available transcript is returned regardless of its language.

        :param language_codes: A list of language codes in a descending priority.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFoundError
        """
        if 'any' in language_codes:
            for transcript in self:
                return transcript
        return self._find_transcript(
            language_codes,
            [self._manually_created_transcripts, self._generated_transcripts],
        )

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code. If ``'any'``
        is among the language codes, the first generated transcript is returned.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFoundError
        """
        if 'any' in language_codes:
            for transcript in self:
                if transcript.is_generated:
                    return transcript
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code. If ``'any'`` is among
        the language codes, the first manually created transcript is returned.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFoundError
        """
        # Honour the 'any' wildcard here as well, so all three finders behave alike.
        if 'any' in language_codes:
            for transcript in self._manually_created_transcripts.values():
                return transcript
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """Return the first match, trying each language code against each dict in order."""
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        """Return a human readable overview of all transcripts and translation languages."""
        manual = self._get_language_description(
            str(transcript) for transcript in self._manually_created_transcripts.values()
        )
        generated = self._get_language_description(
            str(transcript) for transcript in self._generated_transcripts.values()
        )
        translations = self._get_language_description(
            f'{entry["language_code"]} ("{entry["language"]}")'
            for entry in self._translation_languages
        )
        return (
            f'For this video ({self.video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            f'{manual}\n\n'
            '(GENERATED)\n'
            f'{generated}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            f'{translations}'
        )

    def _get_language_description(self, transcript_strings):
        """Format the strings as a bullet list, or return 'None' when there are none."""
        description = '\n'.join(f' - {entry}' for entry in transcript_strings)
        return description if description else 'None'
328
+
329
+
330
class Transcript:
    """A single transcript of a video in one language.

    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript.language)
    'English'
    >>> if transcript.is_translatable:
    ...     es_transcript = transcript.translate('es')
    ...     print(es_transcript.language)
    'Spanish'
    """

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """Store the transcript metadata and index the translation languages by code.

        Args:
            http_client: Client used by ``fetch`` to download the transcript.
            video_id: Id of the video this transcript belongs to.
            url: URL the transcript data is downloaded from.
            language: Display name of the transcript language.
            language_code: Code of the transcript language.
            is_generated: Whether the transcript is marked as generated.
            translation_languages: Dicts with the keys ``language`` and
                ``language_code``; an empty list means not translatable.
        """
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # Lookup table: language code -> language name.
        self._translation_languages_dict = {
            entry['language_code']: entry['language'] for entry in translation_languages
        }

    def fetch(self, preserve_formatting=False):
        """Download the transcript and return its parsed entries.

        Args:
            preserve_formatting (bool): Passed on to TranscriptParser. Defaults to False.

        Returns:
            list: Dicts with the keys ``text``, ``start`` and ``duration``.
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        parser = TranscriptParser(preserve_formatting=preserve_formatting)
        checked = _raise_http_errors(response, self.video_id)
        return parser.parse(checked.text)

    def __str__(self):
        """Return ``code ("Language")``, followed by ``[TRANSLATABLE]`` if applicable."""
        suffix = '[TRANSLATABLE]' if self.is_translatable else ''
        return f'{self.language_code} ("{self.language}"){suffix}'

    @property
    def is_translatable(self):
        """True when at least one translation language is available."""
        return len(self.translation_languages) != 0

    def translate(self, language_code):
        """Return a new Transcript that requests this one translated to ``language_code``.

        Args:
            language_code (str): Code of the target language.

        Returns:
            Transcript: Marked as generated and without translation languages;
                its URL is this transcript's URL with ``&tlang=<code>`` appended.

        Raises:
            NotTranslatableError: If this transcript has no translation languages.
            TranslationLanguageNotAvailableError: If ``language_code`` is not among them.
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        if language_code not in self._translation_languages_dict:
            raise TranslationLanguageNotAvailableError(self.video_id)

        translated_url = f'{self._url}&tlang={language_code}'
        target_language = self._translation_languages_dict[language_code]
        return Transcript(
            self._http_client,
            self.video_id,
            translated_url,
            target_language,
            language_code,
            True,
            [],
        )
411
+
412
+
413
class TranscriptParser:
    """Turns transcript XML into a list of text/start/duration dicts.

    >>> parser = TranscriptParser(preserve_formatting=True)
    >>> data = parser.parse(xml_data)
    >>> print(data[0])
    {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
    """

    # Tags that are kept in the text when preserve_formatting is enabled.
    _FORMATTING_TAGS = [
        'strong',
        'em',
        'b',
        'i',
        'mark',
        'small',
        'del',
        'ins',
        'sub',
        'sup',
    ]

    def __init__(self, preserve_formatting=False):
        """Compile the tag-stripping pattern that matches the requested mode."""
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting):
        """Return the compiled pattern for the tags that ``parse`` removes."""
        if not preserve_formatting:
            # Strip every tag.
            return re.compile(r'<[^>]*>', re.IGNORECASE)

        # Strip every tag except the opening/closing forms of the formatting tags.
        kept_tags = '|'.join(self._FORMATTING_TAGS)
        return re.compile(r'<\/?(?!\/?(' + kept_tags + r')\b).*?\b>', re.IGNORECASE)

    def parse(self, plain_data):
        """Parse ``plain_data`` (XML) into a list of transcript entries.

        Child elements of the root that have no text are skipped. A missing
        ``dur`` attribute yields a duration of 0.0.
        """
        # NOTE(review): the xml.etree documentation warns that it is not secure
        # against maliciously constructed data.
        entries = []
        for element in ElementTree.fromstring(plain_data):
            if element.text is None:
                continue
            entries.append({
                'text': self._html_regex.sub('', html.unescape(element.text)),
                'start': float(element.attrib['start']),
                'duration': float(element.attrib.get('dur', '0.0')),
            })
        return entries
460
+
461
+
462
def _raise_http_errors(response, video_id):
    """Return ``response`` unchanged unless its status signals an HTTP error.

    Args:
        response: Response object; its ``raise_for_status()`` is called.
        video_id: Id of the video the request was made for; passed to the raised error.

    Returns:
        The same ``response`` object when ``raise_for_status()`` does not raise.

    Raises:
        YouTubeRequestFailedError: If ``raise_for_status()`` raises
            ``requests.exceptions.HTTPError``; the HTTP error is attached as the cause.
    """
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # Chain explicitly so the traceback shows the HTTP error as the direct cause.
        raise YouTubeRequestFailedError(video_id, error) from error
    return response
469
+
470
+
471
if __name__ == "__main__":
    # Manual demo: fetch the transcript of a sample video and print it with rich.
    from rich import print

    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    fetched = YTTranscriber.get_transcript(sample_url, languages=None)
    print("Here's what we got! 🔥")
    print(fetched)