webscout 7.1__py3-none-any.whl → 7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (144)
  1. webscout/AIauto.py +191 -191
  2. webscout/AIbase.py +122 -122
  3. webscout/AIutel.py +440 -440
  4. webscout/Bard.py +343 -161
  5. webscout/DWEBS.py +489 -492
  6. webscout/Extra/YTToolkit/YTdownloader.py +995 -995
  7. webscout/Extra/YTToolkit/__init__.py +2 -2
  8. webscout/Extra/YTToolkit/transcriber.py +476 -479
  9. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  10. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  11. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  12. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  13. webscout/Extra/YTToolkit/ytapi/video.py +103 -103
  14. webscout/Extra/autocoder/__init__.py +9 -9
  15. webscout/Extra/autocoder/autocoder_utiles.py +199 -199
  16. webscout/Extra/autocoder/rawdog.py +5 -7
  17. webscout/Extra/autollama.py +230 -230
  18. webscout/Extra/gguf.py +3 -3
  19. webscout/Extra/weather.py +171 -171
  20. webscout/LLM.py +442 -442
  21. webscout/Litlogger/__init__.py +67 -681
  22. webscout/Litlogger/core/__init__.py +6 -0
  23. webscout/Litlogger/core/level.py +20 -0
  24. webscout/Litlogger/core/logger.py +123 -0
  25. webscout/Litlogger/handlers/__init__.py +12 -0
  26. webscout/Litlogger/handlers/console.py +50 -0
  27. webscout/Litlogger/handlers/file.py +143 -0
  28. webscout/Litlogger/handlers/network.py +174 -0
  29. webscout/Litlogger/styles/__init__.py +7 -0
  30. webscout/Litlogger/styles/colors.py +231 -0
  31. webscout/Litlogger/styles/formats.py +377 -0
  32. webscout/Litlogger/styles/text.py +87 -0
  33. webscout/Litlogger/utils/__init__.py +6 -0
  34. webscout/Litlogger/utils/detectors.py +154 -0
  35. webscout/Litlogger/utils/formatters.py +200 -0
  36. webscout/Provider/AISEARCH/DeepFind.py +250 -250
  37. webscout/Provider/Blackboxai.py +3 -3
  38. webscout/Provider/ChatGPTGratis.py +226 -0
  39. webscout/Provider/Cloudflare.py +3 -4
  40. webscout/Provider/DeepSeek.py +218 -0
  41. webscout/Provider/Deepinfra.py +3 -3
  42. webscout/Provider/Free2GPT.py +131 -124
  43. webscout/Provider/Gemini.py +100 -115
  44. webscout/Provider/Glider.py +3 -3
  45. webscout/Provider/Groq.py +5 -1
  46. webscout/Provider/Jadve.py +3 -3
  47. webscout/Provider/Marcus.py +191 -192
  48. webscout/Provider/Netwrck.py +3 -3
  49. webscout/Provider/PI.py +2 -2
  50. webscout/Provider/PizzaGPT.py +2 -3
  51. webscout/Provider/QwenLM.py +311 -0
  52. webscout/Provider/TTI/AiForce/__init__.py +22 -22
  53. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
  54. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
  55. webscout/Provider/TTI/Nexra/__init__.py +22 -22
  56. webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
  57. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
  58. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
  59. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
  60. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
  61. webscout/Provider/TTI/artbit/__init__.py +22 -22
  62. webscout/Provider/TTI/artbit/async_artbit.py +184 -184
  63. webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
  64. webscout/Provider/TTI/blackbox/__init__.py +4 -4
  65. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
  66. webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
  67. webscout/Provider/TTI/deepinfra/__init__.py +4 -4
  68. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
  69. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
  70. webscout/Provider/TTI/huggingface/__init__.py +22 -22
  71. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
  72. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
  73. webscout/Provider/TTI/imgninza/__init__.py +4 -4
  74. webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
  75. webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
  76. webscout/Provider/TTI/talkai/__init__.py +4 -4
  77. webscout/Provider/TTI/talkai/async_talkai.py +229 -229
  78. webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
  79. webscout/Provider/TTS/deepgram.py +182 -182
  80. webscout/Provider/TTS/elevenlabs.py +136 -136
  81. webscout/Provider/TTS/gesserit.py +150 -150
  82. webscout/Provider/TTS/murfai.py +138 -138
  83. webscout/Provider/TTS/parler.py +133 -134
  84. webscout/Provider/TTS/streamElements.py +360 -360
  85. webscout/Provider/TTS/utils.py +280 -280
  86. webscout/Provider/TTS/voicepod.py +116 -116
  87. webscout/Provider/TextPollinationsAI.py +2 -3
  88. webscout/Provider/WiseCat.py +193 -0
  89. webscout/Provider/__init__.py +144 -134
  90. webscout/Provider/cerebras.py +242 -227
  91. webscout/Provider/chatglm.py +204 -204
  92. webscout/Provider/dgaf.py +2 -3
  93. webscout/Provider/gaurish.py +2 -3
  94. webscout/Provider/geminiapi.py +208 -208
  95. webscout/Provider/granite.py +223 -0
  96. webscout/Provider/hermes.py +218 -218
  97. webscout/Provider/llama3mitril.py +179 -179
  98. webscout/Provider/llamatutor.py +3 -3
  99. webscout/Provider/llmchat.py +2 -3
  100. webscout/Provider/meta.py +794 -794
  101. webscout/Provider/multichat.py +331 -331
  102. webscout/Provider/typegpt.py +359 -359
  103. webscout/Provider/yep.py +2 -2
  104. webscout/__main__.py +5 -5
  105. webscout/cli.py +319 -319
  106. webscout/conversation.py +241 -242
  107. webscout/exceptions.py +328 -328
  108. webscout/litagent/__init__.py +28 -28
  109. webscout/litagent/agent.py +2 -3
  110. webscout/litprinter/__init__.py +0 -58
  111. webscout/scout/__init__.py +8 -8
  112. webscout/scout/core.py +884 -884
  113. webscout/scout/element.py +459 -459
  114. webscout/scout/parsers/__init__.py +69 -69
  115. webscout/scout/parsers/html5lib_parser.py +172 -172
  116. webscout/scout/parsers/html_parser.py +236 -236
  117. webscout/scout/parsers/lxml_parser.py +178 -178
  118. webscout/scout/utils.py +38 -38
  119. webscout/swiftcli/__init__.py +811 -811
  120. webscout/update_checker.py +2 -12
  121. webscout/version.py +1 -1
  122. webscout/webscout_search.py +5 -4
  123. webscout/zeroart/__init__.py +54 -54
  124. webscout/zeroart/base.py +60 -60
  125. webscout/zeroart/effects.py +99 -99
  126. webscout/zeroart/fonts.py +816 -816
  127. {webscout-7.1.dist-info → webscout-7.2.dist-info}/METADATA +4 -3
  128. webscout-7.2.dist-info/RECORD +217 -0
  129. webstoken/__init__.py +30 -30
  130. webstoken/classifier.py +189 -189
  131. webstoken/keywords.py +216 -216
  132. webstoken/language.py +128 -128
  133. webstoken/ner.py +164 -164
  134. webstoken/normalizer.py +35 -35
  135. webstoken/processor.py +77 -77
  136. webstoken/sentiment.py +206 -206
  137. webstoken/stemmer.py +73 -73
  138. webstoken/tagger.py +60 -60
  139. webstoken/tokenizer.py +158 -158
  140. webscout-7.1.dist-info/RECORD +0 -198
  141. {webscout-7.1.dist-info → webscout-7.2.dist-info}/LICENSE.md +0 -0
  142. {webscout-7.1.dist-info → webscout-7.2.dist-info}/WHEEL +0 -0
  143. {webscout-7.1.dist-info → webscout-7.2.dist-info}/entry_points.txt +0 -0
  144. {webscout-7.1.dist-info → webscout-7.2.dist-info}/top_level.txt +0 -0
@@ -1,480 +1,477 @@
1
- """Wassup fam! 🔥 This module is your go-to for getting those YouTube transcripts!
2
-
3
- >>> from webscout import YTTranscriber
4
- >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
5
- >>> print(transcript)
6
- {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
7
-
8
- Built different by @HelpingAI 👑
9
- """
10
-
11
- import requests # For making those HTTP requests like a boss 🌐
12
- import http.cookiejar as cookiejar # Handling cookies and stuff 🍪
13
- import json # JSON parsing - keeping it clean! 📝
14
- from xml.etree import ElementTree # XML parsing magic
15
- import re # Regex for pattern matching 🎯
16
- import html # HTML stuff made easy 💪
17
- from typing import List, Dict, Union, Optional # Type hints for that clean code 💯
18
- from functools import lru_cache # Cache that data for speed! ⚡
19
- from concurrent.futures import ThreadPoolExecutor # Parallel processing gang 🚀
20
- import asyncio # Async/await swag 😎
21
- from webscout.exceptions import * # All our custom exceptions 🛠️
22
-
23
- WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
24
- MAX_WORKERS = 4 # Keeping it optimal fam! 💪
25
-
26
- class YTTranscriber:
27
- """Your boy for getting those YouTube transcripts! 🎥
28
-
29
- >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
30
- >>> print(transcript[0]['text'])
31
- 'Never gonna give you up'
32
- """
33
-
34
- _session = None
35
- _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
36
-
37
- @classmethod
38
- def _get_session(cls):
39
- if cls._session is None:
40
- cls._session = requests.Session()
41
- cls._session.headers.update({
42
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
43
- })
44
- return cls._session
45
-
46
- @classmethod
47
- @lru_cache(maxsize=100)
48
- def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
49
- proxies: Dict[str, str] = None,
50
- cookies: str = None,
51
- preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
52
- """
53
- Retrieves the transcript for a given YouTube video URL.
54
-
55
- Args:
56
- video_url (str): YouTube video URL (supports various formats).
57
- languages (str, optional): Language code for the transcript.
58
- If None, fetches the auto-generated transcript.
59
- Defaults to 'en'.
60
- proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
61
- cookies (str, optional): Path to the cookie file. Defaults to None.
62
- preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
63
-
64
- Returns:
65
- List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
66
- - 'text': The transcribed text.
67
- - 'start': The start time of the text segment (in seconds).
68
- - 'duration': The duration of the text segment (in seconds).
69
-
70
- Raises:
71
- TranscriptRetrievalError: If there's an error retrieving the transcript.
72
- """
73
- video_id = cls._extract_video_id(video_url)
74
- http_client = cls._get_session()
75
-
76
- if proxies:
77
- http_client.proxies.update(proxies)
78
-
79
- if cookies:
80
- cls._load_cookies(cookies, video_id)
81
-
82
- transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
83
- language_codes = [languages] if languages else None
84
- transcript = transcript_list.find_transcript(language_codes)
85
-
86
- return transcript.fetch(preserve_formatting)
87
-
88
- @staticmethod
89
- def _extract_video_id(video_url: str) -> str:
90
- """Extracts the video ID from different YouTube URL formats."""
91
- patterns = [
92
- r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
93
- r'youtu\.be\/([0-9A-Za-z_-]{11})',
94
- r'youtube\.com\/embed\/([0-9A-Za-z_-]{11})'
95
- ]
96
-
97
- for pattern in patterns:
98
- match = re.search(pattern, video_url)
99
- if match:
100
- return match.group(1)
101
-
102
- if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
103
- return video_url
104
-
105
- raise InvalidVideoIdError(video_url)
106
-
107
- @staticmethod
108
- def _load_cookies(cookies: str, video_id: str) -> None:
109
- """Loads cookies from a file."""
110
- try:
111
- cj = cookiejar.MozillaCookieJar(cookies)
112
- cj.load()
113
- return cj
114
- except (cookiejar.LoadError, FileNotFoundError):
115
- raise CookiePathInvalidError(video_id)
116
-
117
- class TranscriptListFetcher:
118
- """Fetches the list of transcripts for a YouTube video."""
119
-
120
- def __init__(self, http_client: requests.Session):
121
- """Initializes TranscriptListFetcher."""
122
- self._http_client = http_client
123
-
124
- def fetch(self, video_id: str):
125
- """Fetches and returns a TranscriptList."""
126
- return TranscriptList.build(
127
- self._http_client,
128
- video_id,
129
- self._extract_captions_json(self._fetch_video_html(video_id), video_id),
130
- )
131
-
132
- def _extract_captions_json(self, html: str, video_id: str) -> dict:
133
- """Extracts the captions JSON data from the video's HTML."""
134
- splitted_html = html.split('"captions":')
135
-
136
- if len(splitted_html) <= 1:
137
- if video_id.startswith('http://') or video_id.startswith('https://'):
138
- raise InvalidVideoIdError(video_id)
139
- if 'class="g-recaptcha"' in html:
140
- raise TooManyRequestsError(video_id)
141
- if '"playabilityStatus":' not in html:
142
- raise VideoUnavailableError(video_id)
143
-
144
- raise TranscriptsDisabledError(video_id)
145
-
146
- captions_json = json.loads(
147
- splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
148
- ).get('playerCaptionsTracklistRenderer')
149
- if captions_json is None:
150
- raise TranscriptsDisabledError(video_id)
151
-
152
- if 'captionTracks' not in captions_json:
153
- raise TranscriptsDisabledError(video_id)
154
-
155
- return captions_json
156
-
157
- def _create_consent_cookie(self, html, video_id):
158
- match = re.search('name="v" value="(.*?)"', html)
159
- if match is None:
160
- raise FailedToCreateConsentCookieError(video_id)
161
- self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
162
-
163
- def _fetch_video_html(self, video_id):
164
- html = self._fetch_html(video_id)
165
- if 'action="https://consent.youtube.com/s"' in html:
166
- self._create_consent_cookie(html, video_id)
167
- html = self._fetch_html(video_id)
168
- if 'action="https://consent.youtube.com/s"' in html:
169
- raise FailedToCreateConsentCookieError(video_id)
170
- return html
171
-
172
- def _fetch_html(self, video_id):
173
- response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'})
174
- return html.unescape(_raise_http_errors(response, video_id).text)
175
-
176
-
177
- class TranscriptList:
178
- """Yo fam! This class is all about managing those YouTube transcript lists! 🎯
179
-
180
- >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
181
- >>> transcript = transcript_list.find_transcript(['en'])
182
- >>> print(transcript)
183
- en ("English")[TRANSLATABLE]
184
- """
185
-
186
- def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
187
- """Init that transcript list with all the good stuff! 💯"""
188
- self.video_id = video_id
189
- self._manually_created_transcripts = manually_created_transcripts
190
- self._generated_transcripts = generated_transcripts
191
- self._translation_languages = translation_languages
192
-
193
- @staticmethod
194
- def build(http_client, video_id, captions_json):
195
- """
196
- Factory method for TranscriptList.
197
-
198
- :param http_client: http client which is used to make the transcript retrieving http calls
199
- :type http_client: requests.Session
200
- :param video_id: the id of the video this TranscriptList is for
201
- :type video_id: str
202
- :param captions_json: the JSON parsed from the YouTube pages static HTML
203
- :type captions_json: dict
204
- :return: the created TranscriptList
205
- :rtype TranscriptList:
206
- """
207
- translation_languages = [
208
- {
209
- 'language': translation_language['languageName']['simpleText'],
210
- 'language_code': translation_language['languageCode'],
211
- } for translation_language in captions_json.get('translationLanguages', [])
212
- ]
213
-
214
- manually_created_transcripts = {}
215
- generated_transcripts = {}
216
-
217
- for caption in captions_json['captionTracks']:
218
- if caption.get('kind', '') == 'asr':
219
- transcript_dict = generated_transcripts
220
- else:
221
- transcript_dict = manually_created_transcripts
222
-
223
- transcript_dict[caption['languageCode']] = Transcript(
224
- http_client,
225
- video_id,
226
- caption['baseUrl'],
227
- caption['name']['simpleText'],
228
- caption['languageCode'],
229
- caption.get('kind', '') == 'asr',
230
- translation_languages if caption.get('isTranslatable', False) else [],
231
- )
232
-
233
- return TranscriptList(
234
- video_id,
235
- manually_created_transcripts,
236
- generated_transcripts,
237
- translation_languages,
238
- )
239
-
240
- def __iter__(self):
241
- return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
242
-
243
- def find_transcript(self, language_codes):
244
- """
245
- Finds a transcript for a given language code. If no language is provided, it will
246
- return the auto-generated transcript.
247
-
248
- :param language_codes: A list of language codes in a descending priority.
249
- :type languages: list[str]
250
- :return: the found Transcript
251
- :rtype Transcript:
252
- :raises: NoTranscriptFound
253
- """
254
- if 'any' in language_codes:
255
- for transcript in self:
256
- return transcript
257
- return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
258
-
259
- def find_generated_transcript(self, language_codes):
260
- """
261
- Finds an automatically generated transcript for a given language code.
262
-
263
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
264
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
265
- it fails to do so.
266
- :type languages: list[str]
267
- :return: the found Transcript
268
- :rtype Transcript:
269
- :raises: NoTranscriptFound
270
- """
271
- if 'any' in language_codes:
272
- for transcript in self:
273
- if transcript.is_generated:
274
- return transcript
275
- return self._find_transcript(language_codes, [self._generated_transcripts])
276
-
277
- def find_manually_created_transcript(self, language_codes):
278
- """
279
- Finds a manually created transcript for a given language code.
280
-
281
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
282
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
283
- it fails to do so.
284
- :type languages: list[str]
285
- :return: the found Transcript
286
- :rtype Transcript:
287
- :raises: NoTranscriptFound
288
- """
289
- return self._find_transcript(language_codes, [self._manually_created_transcripts])
290
-
291
- def _find_transcript(self, language_codes, transcript_dicts):
292
- for language_code in language_codes:
293
- for transcript_dict in transcript_dicts:
294
- if language_code in transcript_dict:
295
- return transcript_dict[language_code]
296
-
297
- raise NoTranscriptFoundError(
298
- self.video_id,
299
- language_codes,
300
- self
301
- )
302
-
303
- def __str__(self):
304
- return (
305
- 'For this video ({video_id}) transcripts are available in the following languages:\n\n'
306
- '(MANUALLY CREATED)\n'
307
- '{available_manually_created_transcript_languages}\n\n'
308
- '(GENERATED)\n'
309
- '{available_generated_transcripts}\n\n'
310
- '(TRANSLATION LANGUAGES)\n'
311
- '{available_translation_languages}'
312
- ).format(
313
- video_id=self.video_id,
314
- available_manually_created_transcript_languages=self._get_language_description(
315
- str(transcript) for transcript in self._manually_created_transcripts.values()
316
- ),
317
- available_generated_transcripts=self._get_language_description(
318
- str(transcript) for transcript in self._generated_transcripts.values()
319
- ),
320
- available_translation_languages=self._get_language_description(
321
- '{language_code} ("{language}")'.format(
322
- language=translation_language['language'],
323
- language_code=translation_language['language_code'],
324
- ) for translation_language in self._translation_languages
325
- )
326
- )
327
-
328
- def _get_language_description(self, transcript_strings):
329
- description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
330
- return description if description else 'None'
331
-
332
-
333
- class Transcript:
334
- """Your personal transcript handler! 🎭
335
-
336
- >>> transcript = transcript_list.find_transcript(['en'])
337
- >>> print(transcript.language)
338
- 'English'
339
- >>> if transcript.is_translatable:
340
- ... es_transcript = transcript.translate('es')
341
- ... print(es_transcript.language)
342
- 'Spanish'
343
- """
344
-
345
- def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
346
- """Initialize with all the goodies! 🎁"""
347
- self._http_client = http_client
348
- self.video_id = video_id
349
- self._url = url
350
- self.language = language
351
- self.language_code = language_code
352
- self.is_generated = is_generated
353
- self.translation_languages = translation_languages
354
- self._translation_languages_dict = {
355
- translation_language['language_code']: translation_language['language']
356
- for translation_language in translation_languages
357
- }
358
-
359
- def fetch(self, preserve_formatting=False):
360
- """Get that transcript data! 🎯
361
-
362
- Args:
363
- preserve_formatting (bool): Keep HTML formatting? Default is nah fam.
364
-
365
- Returns:
366
- list: That sweet transcript data with text, start time, and duration! 📝
367
- """
368
- response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
369
- return TranscriptParser(preserve_formatting=preserve_formatting).parse(
370
- _raise_http_errors(response, self.video_id).text,
371
- )
372
-
373
- def __str__(self):
374
- """String representation looking clean! 💅"""
375
- return '{language_code} ("{language}"){translation_description}'.format(
376
- language=self.language,
377
- language_code=self.language_code,
378
- translation_description='[TRANSLATABLE]' if self.is_translatable else ''
379
- )
380
-
381
- @property
382
- def is_translatable(self):
383
- """Can we translate this? 🌍"""
384
- return len(self.translation_languages) > 0
385
-
386
- def translate(self, language_code):
387
- """Translate to another language! 🌎
388
-
389
- Args:
390
- language_code (str): Which language you want fam?
391
-
392
- Returns:
393
- Transcript: A fresh transcript in your requested language! 🔄
394
-
395
- Raises:
396
- NotTranslatableError: If we can't translate this one 😢
397
- TranslationLanguageNotAvailableError: If that language isn't available 🚫
398
- """
399
- if not self.is_translatable:
400
- raise NotTranslatableError(self.video_id)
401
-
402
- if language_code not in self._translation_languages_dict:
403
- raise TranslationLanguageNotAvailableError(self.video_id)
404
-
405
- return Transcript(
406
- self._http_client,
407
- self.video_id,
408
- '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
409
- self._translation_languages_dict[language_code],
410
- language_code,
411
- True,
412
- [],
413
- )
414
-
415
-
416
- class TranscriptParser:
417
- """Parsing those transcripts like a pro! 🎯
418
-
419
- >>> parser = TranscriptParser(preserve_formatting=True)
420
- >>> data = parser.parse(xml_data)
421
- >>> print(data[0])
422
- {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
423
- """
424
-
425
- _FORMATTING_TAGS = [
426
- 'strong', # For that extra emphasis 💪
427
- 'em', # When you need that italic swag 🎨
428
- 'b', # Bold and beautiful 💯
429
- 'i', # More italic vibes
430
- 'mark', # Highlight that text 🌟
431
- 'small', # Keep it lowkey 🤫
432
- 'del', # Strike it out ⚡
433
- 'ins', # Insert new stuff 🆕
434
- 'sub', # Subscript gang 📉
435
- 'sup', # Superscript squad 📈
436
- ]
437
-
438
- def __init__(self, preserve_formatting=False):
439
- """Get ready to parse with style! 🎨"""
440
- self._html_regex = self._get_html_regex(preserve_formatting)
441
-
442
- def _get_html_regex(self, preserve_formatting):
443
- """Get that regex pattern ready! 🎯"""
444
- if preserve_formatting:
445
- formats_regex = '|'.join(self._FORMATTING_TAGS)
446
- formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
447
- html_regex = re.compile(formats_regex, re.IGNORECASE)
448
- else:
449
- html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
450
- return html_regex
451
-
452
- def parse(self, plain_data):
453
- """Parse that XML data into something beautiful! ✨"""
454
- return [
455
- {
456
- 'text': re.sub(self._html_regex, '', html.unescape(xml_element.text)),
457
- 'start': float(xml_element.attrib['start']),
458
- 'duration': float(xml_element.attrib.get('dur', '0.0')),
459
- }
460
- for xml_element in ElementTree.fromstring(plain_data)
461
- if xml_element.text is not None
462
- ]
463
-
464
-
465
- def _raise_http_errors(response, video_id):
466
- """Handle those HTTP errors with style! 🛠️"""
467
- try:
468
- response.raise_for_status()
469
- return response
470
- except requests.exceptions.HTTPError as error:
471
- raise YouTubeRequestFailedError(video_id, error)
472
-
473
-
474
- if __name__ == "__main__":
475
- # Let's get this party started! 🎉
476
- from rich import print
477
- video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
478
- transcript = YTTranscriber.get_transcript(video_url, languages=None)
479
- print("Here's what we got! 🔥")
1
+ """
2
+ >>> from webscout import YTTranscriber
3
+ >>> transcript = YTTranscriber.get_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
4
+ >>> print(transcript)
5
+ {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
6
+
7
+ """
8
+
9
+ import requests # For making those HTTP requests like a boss 🌐
10
+ import http.cookiejar as cookiejar # Handling cookies and stuff 🍪
11
+ import json # JSON parsing - keeping it clean! 📝
12
+ from xml.etree import ElementTree # XML parsing magic
13
+ import re # Regex for pattern matching 🎯
14
+ import html # HTML stuff made easy 💪
15
+ from typing import List, Dict, Union, Optional # Type hints for that clean code 💯
16
+ from functools import lru_cache # Cache that data for speed! ⚡
17
+ from concurrent.futures import ThreadPoolExecutor # Parallel processing gang 🚀
18
+ import asyncio # Async/await swag 😎
19
+ from webscout.exceptions import * # All our custom exceptions 🛠️
20
+
21
+ WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
22
+ MAX_WORKERS = 4 # Keeping it optimal fam! 💪
23
+
24
+ class YTTranscriber:
25
+ """Your boy for getting those YouTube transcripts! 🎥
26
+
27
+ >>> transcript = YTTranscriber.get_transcript('https://youtu.be/dQw4w9WgXcQ')
28
+ >>> print(transcript[0]['text'])
29
+ 'Never gonna give you up'
30
+ """
31
+
32
+ _session = None
33
+ _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
34
+
35
+ @classmethod
36
+ def _get_session(cls):
37
+ if cls._session is None:
38
+ cls._session = requests.Session()
39
+ cls._session.headers.update({
40
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
41
+ })
42
+ return cls._session
43
+
44
+ @classmethod
45
+ @lru_cache(maxsize=100)
46
+ def get_transcript(cls, video_url: str, languages: Optional[str] = 'en',
47
+ proxies: Dict[str, str] = None,
48
+ cookies: str = None,
49
+ preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
50
+ """
51
+ Retrieves the transcript for a given YouTube video URL.
52
+
53
+ Args:
54
+ video_url (str): YouTube video URL (supports various formats).
55
+ languages (str, optional): Language code for the transcript.
56
+ If None, fetches the auto-generated transcript.
57
+ Defaults to 'en'.
58
+ proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
59
+ cookies (str, optional): Path to the cookie file. Defaults to None.
60
+ preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
61
+
62
+ Returns:
63
+ List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
64
+ - 'text': The transcribed text.
65
+ - 'start': The start time of the text segment (in seconds).
66
+ - 'duration': The duration of the text segment (in seconds).
67
+
68
+ Raises:
69
+ TranscriptRetrievalError: If there's an error retrieving the transcript.
70
+ """
71
+ video_id = cls._extract_video_id(video_url)
72
+ http_client = cls._get_session()
73
+
74
+ if proxies:
75
+ http_client.proxies.update(proxies)
76
+
77
+ if cookies:
78
+ cls._load_cookies(cookies, video_id)
79
+
80
+ transcript_list = TranscriptListFetcher(http_client).fetch(video_id)
81
+ language_codes = [languages] if languages else None
82
+ transcript = transcript_list.find_transcript(language_codes)
83
+
84
+ return transcript.fetch(preserve_formatting)
85
+
86
+ @staticmethod
87
+ def _extract_video_id(video_url: str) -> str:
88
+ """Extracts the video ID from different YouTube URL formats."""
89
+ patterns = [
90
+ r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
91
+ r'youtu\.be\/([0-9A-Za-z_-]{11})',
92
+ r'youtube\.com\/embed\/([0-9A-Za-z_-]{11})'
93
+ ]
94
+
95
+ for pattern in patterns:
96
+ match = re.search(pattern, video_url)
97
+ if match:
98
+ return match.group(1)
99
+
100
+ if re.match(r'^[0-9A-Za-z_-]{11}$', video_url):
101
+ return video_url
102
+
103
+ raise InvalidVideoIdError(video_url)
104
+
105
+ @staticmethod
106
+ def _load_cookies(cookies: str, video_id: str) -> None:
107
+ """Loads cookies from a file."""
108
+ try:
109
+ cj = cookiejar.MozillaCookieJar(cookies)
110
+ cj.load()
111
+ return cj
112
+ except (cookiejar.LoadError, FileNotFoundError):
113
+ raise CookiePathInvalidError(video_id)
114
+
115
+ class TranscriptListFetcher:
116
+ """Fetches the list of transcripts for a YouTube video."""
117
+
118
+ def __init__(self, http_client: requests.Session):
119
+ """Initializes TranscriptListFetcher."""
120
+ self._http_client = http_client
121
+
122
+ def fetch(self, video_id: str):
123
+ """Fetches and returns a TranscriptList."""
124
+ return TranscriptList.build(
125
+ self._http_client,
126
+ video_id,
127
+ self._extract_captions_json(self._fetch_video_html(video_id), video_id),
128
+ )
129
+
130
+ def _extract_captions_json(self, html: str, video_id: str) -> dict:
131
+ """Extracts the captions JSON data from the video's HTML."""
132
+ splitted_html = html.split('"captions":')
133
+
134
+ if len(splitted_html) <= 1:
135
+ if video_id.startswith('http://') or video_id.startswith('https://'):
136
+ raise InvalidVideoIdError(video_id)
137
+ if 'class="g-recaptcha"' in html:
138
+ raise TooManyRequestsError(video_id)
139
+ if '"playabilityStatus":' not in html:
140
+ raise VideoUnavailableError(video_id)
141
+
142
+ raise TranscriptsDisabledError(video_id)
143
+
144
+ captions_json = json.loads(
145
+ splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
146
+ ).get('playerCaptionsTracklistRenderer')
147
+ if captions_json is None:
148
+ raise TranscriptsDisabledError(video_id)
149
+
150
+ if 'captionTracks' not in captions_json:
151
+ raise TranscriptsDisabledError(video_id)
152
+
153
+ return captions_json
154
+
155
+ def _create_consent_cookie(self, html, video_id):
156
+ match = re.search('name="v" value="(.*?)"', html)
157
+ if match is None:
158
+ raise FailedToCreateConsentCookieError(video_id)
159
+ self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
160
+
161
def _fetch_video_html(self, video_id):
    """Fetch the watch page, getting past the consent form if one is served.

    When the first response contains the consent form, a consent cookie is
    created from it and the page is requested one more time.

    :param video_id: id of the video to fetch
    :return: source of the watch page
    :raises FailedToCreateConsentCookieError: the consent form is still served
        after the cookie was set (or the cookie could not be created)
    """
    consent_form = 'action="https://consent.youtube.com/s"'

    page = self._fetch_html(video_id)
    if consent_form not in page:
        return page

    self._create_consent_cookie(page, video_id)
    page = self._fetch_html(video_id)
    if consent_form in page:
        raise FailedToCreateConsentCookieError(video_id)
    return page
169
+
170
def _fetch_html(self, video_id):
    """Request the watch page of ``video_id`` and return its unescaped source.

    The request is sent with an ``Accept-Language: en-US`` header; HTML
    entities in the response text are unescaped before it is returned.

    :param video_id: id of the video to fetch
    :return: the response text with HTML entities unescaped
    :raises YouTubeRequestFailedError: the response has an HTTP error status
    """
    watch_url = WATCH_URL.format(video_id=video_id)
    response = self._http_client.get(watch_url, headers={'Accept-Language': 'en-US'})
    checked = _raise_http_errors(response, video_id)
    return html.unescape(checked.text)
173
+
174
+
175
class TranscriptList:
    """
    The transcripts available for one video, split into manually created and generated ones.

    Iterating over an instance yields the manually created transcripts first, followed by the
    generated ones.

    >>> transcript_list = TranscriptList.build(http_client, video_id, captions_json)
    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript)
    en ("English")[TRANSLATABLE]
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        :param video_id: the id of the video this TranscriptList is for
        :param manually_created_transcripts: dict mapping a language code to the manually created
            transcript in that language
        :param generated_transcripts: dict mapping a language code to the generated transcript in
            that language
        :param translation_languages: list of dicts with the keys ``language`` and ``language_code``
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts
        self._translation_languages = translation_languages

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        Caption tracks whose ``kind`` is ``'asr'`` are registered as generated transcripts, all
        other tracks as manually created ones. If several tracks of the same group share a
        language code, the last one wins.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype: TranscriptList
        """
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            }
            for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            is_generated = caption.get('kind', '') == 'asr'
            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                is_generated,
                # Only translatable tracks get the list of target languages.
                translation_languages if caption.get('isTranslatable', False) else [],
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
            translation_languages,
        )

    def __iter__(self):
        """Iterate over all transcripts: manually created ones first, then generated ones."""
        return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))

    def find_transcript(self, language_codes):
        """
        Finds a transcript for the given language codes. Manually created transcripts are
        preferred over generated ones of the same language.

        If ``'any'`` is one of the language codes, the first transcript of this list (in
        iteration order) is returned, whatever its language. If the list is empty the regular
        lookup runs and raises.

        :param language_codes: A list of language codes in a descending priority.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises NoTranscriptFoundError: if none of the requested languages is available
        """
        if 'any' in language_codes:
            for transcript in self:
                return transcript
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for the given language codes.

        If ``'any'`` is one of the language codes, the first transcript of this list that is
        flagged as generated is returned, whatever its language.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises NoTranscriptFoundError: if none of the requested languages is available
        """
        if 'any' in language_codes:
            for transcript in self:
                if transcript.is_generated:
                    return transcript
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for the given language codes.

        If ``'any'`` is one of the language codes, the first manually created transcript is
        returned, whatever its language. This mirrors the wildcard handling of
        ``find_transcript`` and ``find_generated_transcript``.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type language_codes: list[str]
        :return: the found Transcript
        :rtype: Transcript
        :raises NoTranscriptFoundError: if none of the requested languages is available
        """
        if 'any' in language_codes:
            for transcript in self._manually_created_transcripts.values():
                return transcript
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """
        Returns the first match, trying every dict of ``transcript_dicts`` (in order) for each
        language code (in order) before moving on to the next language code.

        :raises NoTranscriptFoundError: if no dict contains any of the language codes
        """
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        raise NoTranscriptFoundError(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        """Human readable overview of the available transcripts and translation languages."""
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}\n\n'
            '(TRANSLATION LANGUAGES)\n'
            '{available_translation_languages}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                str(transcript) for transcript in self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                str(transcript) for transcript in self._generated_transcripts.values()
            ),
            available_translation_languages=self._get_language_description(
                '{language_code} ("{language}")'.format(
                    language=translation_language['language'],
                    language_code=translation_language['language_code'],
                ) for translation_language in self._translation_languages
            )
        )

    def _get_language_description(self, transcript_strings):
        """Formats the strings as a bullet list, or returns ``'None'`` if there are none."""
        description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
        return description or 'None'
328
+
329
+
330
class Transcript:
    """One caption track of a video, which can be fetched and possibly translated.

    >>> transcript = transcript_list.find_transcript(['en'])
    >>> print(transcript.language)
    'English'
    >>> if transcript.is_translatable:
    ...     es_transcript = transcript.translate('es')
    ...     print(es_transcript.language)
    'Spanish'
    """

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """Store the track's metadata and index its translation languages by code.

        Args:
            http_client: Client used by ``fetch`` to download the transcript.
            video_id: Id of the video the track belongs to.
            url: URL the transcript data is downloaded from.
            language: Display name of the track's language.
            language_code: Code of the track's language.
            is_generated: Whether the track was generated automatically.
            translation_languages: List of dicts with the keys ``language`` and
                ``language_code``; an empty list makes the track non-translatable.
        """
        self._http_client = http_client
        self.video_id = video_id
        self._url = url
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages

        # language code -> language name, used by ``translate`` for validation and naming.
        languages_by_code = {}
        for entry in translation_languages:
            languages_by_code[entry['language_code']] = entry['language']
        self._translation_languages_dict = languages_by_code

    def fetch(self, preserve_formatting=False):
        """Download and parse the transcript.

        Args:
            preserve_formatting (bool): Passed on to ``TranscriptParser``; when true,
                formatting tags are kept in the text.

        Returns:
            list: The entries produced by ``TranscriptParser.parse``.
        """
        response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
        parser = TranscriptParser(preserve_formatting=preserve_formatting)
        checked = _raise_http_errors(response, self.video_id)
        return parser.parse(checked.text)

    def __str__(self):
        """Return ``code ("language")``, suffixed with ``[TRANSLATABLE]`` when translatable."""
        if self.is_translatable:
            suffix = '[TRANSLATABLE]'
        else:
            suffix = ''
        template = '{language_code} ("{language}"){translation_description}'
        return template.format(
            language=self.language,
            language_code=self.language_code,
            translation_description=suffix,
        )

    @property
    def is_translatable(self):
        """Whether at least one translation language is available for this track."""
        return len(self.translation_languages) != 0

    def translate(self, language_code):
        """Create the Transcript for a translation of this track.

        The new transcript points at this track's URL with ``&tlang=<language_code>``
        appended, is flagged as generated and is itself not translatable.

        Args:
            language_code (str): Code of the language to translate into.

        Returns:
            Transcript: The translated transcript (not fetched yet).

        Raises:
            NotTranslatableError: This track has no translation languages.
            TranslationLanguageNotAvailableError: ``language_code`` is not one of them.
        """
        if not self.is_translatable:
            raise NotTranslatableError(self.video_id)

        if language_code not in self._translation_languages_dict:
            raise TranslationLanguageNotAvailableError(self.video_id)

        translated_url = '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
        target_language = self._translation_languages_dict[language_code]
        return Transcript(
            self._http_client,
            self.video_id,
            translated_url,
            target_language,
            language_code,
            True,
            [],
        )
411
+
412
+
413
class TranscriptParser:
    """Turns the transcript XML into a list of ``text`` / ``start`` / ``duration`` dicts.

    >>> parser = TranscriptParser(preserve_formatting=True)
    >>> data = parser.parse(xml_data)
    >>> print(data[0])
    {'text': 'Never gonna give you up', 'start': 0.0, 'duration': 4.5}
    """

    # Tags that survive when ``preserve_formatting`` is enabled.
    _FORMATTING_TAGS = [
        'strong',  # strong emphasis
        'em',  # emphasis
        'b',  # bold
        'i',  # italic
        'mark',  # highlighted text
        'small',  # small print
        'del',  # deleted text
        'ins',  # inserted text
        'sub',  # subscript
        'sup',  # superscript
    ]

    def __init__(self, preserve_formatting=False):
        """
        Args:
            preserve_formatting (bool): Keep the tags listed in ``_FORMATTING_TAGS``
                in the parsed text instead of stripping every tag.
        """
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting):
        """Compile the pattern matching the tags that ``parse`` removes.

        Args:
            preserve_formatting (bool): When true the pattern skips the opening and
                closing forms of the tags in ``_FORMATTING_TAGS``; otherwise it
                matches every tag.

        Returns:
            re.Pattern: Case-insensitive pattern matching one tag per match.
        """
        if not preserve_formatting:
            return re.compile(r'<[^>]*>', re.IGNORECASE)

        kept_tags = '|'.join(self._FORMATTING_TAGS)
        # A match ends at the tag's own '>' ([^>]*>). The previous tail `.*?\b>` needed a
        # word character right before '>', so a tag ending in a quoted attribute, e.g.
        # <font color="#CCCCCC">, matched on through the following text up to the next
        # closing tag and deleted the caption text in between.
        return re.compile(r'<\/?(?!\/?(?:' + kept_tags + r')\b)[^>]*>', re.IGNORECASE)

    def parse(self, plain_data):
        """Parse the transcript XML into a list of entries.

        Children of the root element without text are skipped. A missing ``dur``
        attribute results in a duration of ``0.0``.

        Args:
            plain_data (str): The transcript XML document.

        Returns:
            list[dict]: One dict per element with the keys ``text`` (HTML entities
            unescaped, tags removed according to ``preserve_formatting``), ``start``
            and ``duration`` (both floats).
        """
        # NOTE(review): plain_data is downloaded from a remote server and xml.etree is not
        # hardened against maliciously constructed XML; consider a hardened parser if the
        # source cannot be trusted.
        return [
            {
                'text': self._html_regex.sub('', html.unescape(xml_element.text)),
                'start': float(xml_element.attrib['start']),
                'duration': float(xml_element.attrib.get('dur', '0.0')),
            }
            for xml_element in ElementTree.fromstring(plain_data)
            if xml_element.text is not None
        ]
460
+
461
+
462
def _raise_http_errors(response, video_id):
    """Return ``response`` unchanged unless it carries an HTTP error status.

    Args:
        response: The response to check; its ``raise_for_status()`` is called.
        video_id: Id of the video the request was made for (passed to the raised error).

    Returns:
        The same ``response`` object, so the call can be chained.

    Raises:
        YouTubeRequestFailedError: ``raise_for_status()`` raised an ``HTTPError``; the
            original error is passed along and set as the explicit cause.
    """
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # Chain explicitly so tracebacks present the HTTP error as the direct cause.
        raise YouTubeRequestFailedError(video_id, error) from error
    return response
469
+
470
+
471
if __name__ == "__main__":
    # Manual smoke test: fetch the transcript of a sample video and print it with rich.
    from rich import print

    sample_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    fetched = YTTranscriber.get_transcript(sample_url, languages=None)
    print("Here's what we got! 🔥")
    print(fetched)