webscout 5.1-py3-none-any.whl → 5.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIauto.py +83 -277
- webscout/AIbase.py +106 -4
- webscout/AIutel.py +41 -10
- webscout/Agents/Onlinesearcher.py +91 -104
- webscout/Agents/__init__.py +2 -1
- webscout/Agents/ai.py +186 -0
- webscout/Agents/functioncall.py +57 -27
- webscout/Bing_search.py +73 -43
- webscout/DWEBS.py +99 -77
- webscout/Local/_version.py +1 -1
- webscout/Provider/AI21.py +177 -0
- webscout/Provider/Chatify.py +174 -0
- webscout/Provider/Cloudflare.py +0 -4
- webscout/Provider/EDITEE.py +215 -0
- webscout/Provider/{Berlin4h.py → NetFly.py} +81 -82
- webscout/Provider/RUBIKSAI.py +11 -5
- webscout/Provider/TTI/PollinationsAI.py +138 -0
- webscout/Provider/TTI/__init__.py +2 -0
- webscout/Provider/TTI/deepinfra.py +148 -0
- webscout/Provider/TTS/__init__.py +2 -0
- webscout/Provider/TTS/streamElements.py +292 -0
- webscout/Provider/TTS/voicepod.py +118 -0
- webscout/Provider/{liaobots.py → TeachAnything.py} +31 -122
- webscout/Provider/__init__.py +14 -4
- webscout/Provider/ai4chat.py +14 -8
- webscout/Provider/cerebras.py +199 -0
- webscout/Provider/felo_search.py +28 -68
- webscout/Provider/x0gpt.py +181 -0
- webscout/__init__.py +4 -2
- webscout/exceptions.py +2 -1
- webscout/transcriber.py +195 -140
- webscout/version.py +1 -1
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/METADATA +41 -82
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/RECORD +38 -28
- webscout/async_providers.py +0 -21
- webscout/voice.py +0 -34
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/LICENSE.md +0 -0
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/WHEEL +0 -0
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/entry_points.txt +0 -0
- {webscout-5.1.dist-info → webscout-5.3.dist-info}/top_level.txt +0 -0
webscout/transcriber.py
CHANGED
```diff
@@ -1,144 +1,228 @@
 import requests
 import http.cookiejar as cookiejar
-import sys
 import json
 from xml.etree import ElementTree
 import re
-from requests import HTTPError
 import html.parser
+from typing import List, Dict, Union, Optional
 
 html_parser = html.parser.HTMLParser()
-
+
 
 def unescape(string):
     return html.unescape(string)
-WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
 
-class TranscriptRetrievalError(Exception):
-    """
-    Base class for exceptions raised when a transcript cannot be retrieved.
-    """
-    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
-    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
-    CAUSE_MESSAGE = ''
-    GITHUB_REFERRAL = (
-        '\n\nIf you are sure that the described cause is not responsible for this error '
-        'and that a transcript should be retrievable, please create an issue at '
-        'https://github.com/OE-LUCIFER/Webscout/issues. '
-        'Please add which version of webscout you are using '
-        'and provide the information needed to replicate the error. '
-    )
 
-
-        self.video_id = video_id
-        super(TranscriptRetrievalError, self).__init__(self._build_error_message())
+WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
 
-    def _build_error_message(self):
-        cause = self.cause
-        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
 
-
-
+class TranscriptRetrievalError(Exception):
+    """Base class for transcript retrieval errors."""
 
-
+    def __init__(self, video_id, message):
+        super().__init__(message.format(video_url=WATCH_URL.format(video_id=video_id)))
+        self.video_id = video_id
 
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE
 
 class YouTubeRequestFailedError(TranscriptRetrievalError):
-
+    """Raised when a request to YouTube fails."""
 
     def __init__(self, video_id, http_error):
-
-        super(
+        message = 'Request to YouTube failed: {reason}'
+        super().__init__(video_id, message.format(reason=str(http_error)))
 
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(reason=self.reason)
 
 class VideoUnavailableError(TranscriptRetrievalError):
-
+    """Raised when the video is unavailable."""
+
+    def __init__(self, video_id):
+        message = 'The video is no longer available'
+        super().__init__(video_id, message)
+
 
 class InvalidVideoIdError(TranscriptRetrievalError):
-
-
-
-
-
+    """Raised when an invalid video ID is provided."""
+
+    def __init__(self, video_id):
+        message = (
+            'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n'
+            'Do NOT run: `YTTranscriber.get_transcript("https://www.youtube.com/watch?v=1234")`\n'
+            'Instead run: `YTTranscriber.get_transcript("1234")`'
+        )
+        super().__init__(video_id, message)
+
 
 class TooManyRequestsError(TranscriptRetrievalError):
-
-
-
-
-
-
-
-
-
+    """Raised when YouTube rate limits the requests."""
+
+    def __init__(self, video_id):
+        message = (
+            'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
+            'One of the following things can be done to work around this:\n\
+- Manually solve the captcha in a browser and export the cookie. '
+            '- Use a different IP address\n\
+- Wait until the ban on your IP has been lifted'
+        )
+        super().__init__(video_id, message)
+
 
 class TranscriptsDisabledError(TranscriptRetrievalError):
-
+    """Raised when transcripts are disabled for the video."""
+
+    def __init__(self, video_id):
+        message = 'Subtitles are disabled for this video'
+        super().__init__(video_id, message)
+
 
 class NoTranscriptAvailableError(TranscriptRetrievalError):
-
+    """Raised when no transcripts are available for the video."""
+
+    def __init__(self, video_id):
+        message = 'No transcripts are available for this video'
+        super().__init__(video_id, message)
+
 
 class NotTranslatableError(TranscriptRetrievalError):
-
+    """Raised when the transcript is not translatable."""
+
+    def __init__(self, video_id):
+        message = 'The requested language is not translatable'
+        super().__init__(video_id, message)
+
 
 class TranslationLanguageNotAvailableError(TranscriptRetrievalError):
-
+    """Raised when the requested translation language is not available."""
+
+    def __init__(self, video_id):
+        message = 'The requested translation language is not available'
+        super().__init__(video_id, message)
+
 
 class CookiePathInvalidError(TranscriptRetrievalError):
-
+    """Raised when the cookie path is invalid."""
+
+    def __init__(self, video_id):
+        message = 'The provided cookie file was unable to be loaded'
+        super().__init__(video_id, message)
+
 
 class CookiesInvalidError(TranscriptRetrievalError):
-
+    """Raised when the provided cookies are invalid."""
+
+    def __init__(self, video_id):
+        message = 'The cookies provided are not valid (may have expired)'
+        super().__init__(video_id, message)
+
 
 class FailedToCreateConsentCookieError(TranscriptRetrievalError):
-
+    """Raised when consent cookie creation fails."""
+
+    def __init__(self, video_id):
+        message = 'Failed to automatically give consent to saving cookies'
+        super().__init__(video_id, message)
+
 
 class NoTranscriptFoundError(TranscriptRetrievalError):
-
-        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
-        '{transcript_data}'
-    )
+    """Raised when no transcript is found for the requested language codes."""
 
     def __init__(self, video_id, requested_language_codes, transcript_data):
-
-
-
-
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(
-            requested_language_codes=self._requested_language_codes,
-            transcript_data=str(self._transcript_data),
+        message = (
+            'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
+            '{transcript_data}'
         )
+        super().__init__(video_id, message.format(
+            requested_language_codes=requested_language_codes,
+            transcript_data=str(transcript_data)
+        ))
 
 
+class YTTranscriber:
+    """
+    Main class for retrieving YouTube transcripts.
+    """
+
+    @staticmethod
+    def get_transcript(video_url: str, languages: Optional[str] = 'en',
+                       proxies: Dict[str, str] = None,
+                       cookies: str = None,
+                       preserve_formatting: bool = False) -> List[Dict[str, Union[str, float]]]:
+        """
+        Retrieves the transcript for a given YouTube video URL.
+
+        Args:
+            video_url (str): YouTube video URL (supports various formats).
+            languages (str, optional): Language code for the transcript.
+                If None, fetches the auto-generated transcript.
+                Defaults to 'en'.
+            proxies (Dict[str, str], optional): Proxies to use for the request. Defaults to None.
+            cookies (str, optional): Path to the cookie file. Defaults to None.
+            preserve_formatting (bool, optional): Whether to preserve formatting tags. Defaults to False.
+
+        Returns:
+            List[Dict[str, Union[str, float]]]: A list of dictionaries, each containing:
+                - 'text': The transcribed text.
+                - 'start': The start time of the text segment (in seconds).
+                - 'duration': The duration of the text segment (in seconds).
+
+        Raises:
+            TranscriptRetrievalError: If there's an error retrieving the transcript.
+        """
+        video_id = YTTranscriber._extract_video_id(video_url)
+
+        with requests.Session() as http_client:
+            if cookies:
+                http_client.cookies = YTTranscriber._load_cookies(cookies, video_id)
+            http_client.proxies = proxies if proxies else {}
+            transcript_list_fetcher = TranscriptListFetcher(http_client)
+            transcript_list = transcript_list_fetcher.fetch(video_id)
+
+            if languages is None:  # Get auto-generated transcript
+                return transcript_list.find_generated_transcript(['any']).fetch(
+                    preserve_formatting=preserve_formatting)
+            else:
+                return transcript_list.find_transcript([languages]).fetch(preserve_formatting=preserve_formatting)
+
+    @staticmethod
+    def _extract_video_id(video_url: str) -> str:
+        """Extracts the video ID from different YouTube URL formats."""
+        if 'youtube.com/watch?v=' in video_url:
+            video_id = video_url.split('youtube.com/watch?v=')[1].split('&')[0]
+        elif 'youtu.be/' in video_url:
+            video_id = video_url.split('youtu.be/')[1].split('?')[0]
+        else:
+            raise InvalidVideoIdError(video_url)
+        return video_id
+
+    @staticmethod
+    def _load_cookies(cookies: str, video_id: str) -> cookiejar.MozillaCookieJar:
+        """Loads cookies from a file."""
+        try:
+            cookie_jar = cookiejar.MozillaCookieJar()
+            cookie_jar.load(cookies)
+            if not cookie_jar:
+                raise CookiesInvalidError(video_id)
+            return cookie_jar
+        except:
+            raise CookiePathInvalidError(video_id)
 
-def _raise_http_errors(response, video_id):
-    try:
-        response.raise_for_status()
-        return response
-    except HTTPError as error:
-        raise YouTubeRequestFailedError(error, video_id)
 
+class TranscriptListFetcher:
+    """Fetches the list of transcripts for a YouTube video."""
 
-
-
+    def __init__(self, http_client: requests.Session):
+        """Initializes TranscriptListFetcher."""
         self._http_client = http_client
 
-    def fetch(self, video_id):
+    def fetch(self, video_id: str):
+        """Fetches and returns a TranscriptList."""
         return TranscriptList.build(
             self._http_client,
             video_id,
             self._extract_captions_json(self._fetch_video_html(video_id), video_id),
         )
 
-    def _extract_captions_json(self, html, video_id):
+    def _extract_captions_json(self, html: str, video_id: str) -> dict:
+        """Extracts the captions JSON data from the video's HTML."""
         splitted_html = html.split('"captions":')
 
         if len(splitted_html) <= 1:
@@ -182,11 +266,8 @@ class TranscriptListFetcher(object):
         return unescape(_raise_http_errors(response, video_id).text)
 
 
-class TranscriptList(object):
-    """
-    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
-    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
-    """
+class TranscriptList:
+    """Represents a list of available transcripts."""
 
     def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
         """
@@ -258,18 +339,18 @@ class TranscriptList(object):
 
     def find_transcript(self, language_codes):
         """
-        Finds a transcript for a given language code.
-
-        `find_manually_created_transcript` instead.
+        Finds a transcript for a given language code. If no language is provided, it will
+        return the auto-generated transcript.
 
-        :param language_codes: A list of language codes in a descending priority.
-        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
-        it fails to do so.
+        :param language_codes: A list of language codes in a descending priority.
         :type languages: list[str]
         :return: the found Transcript
         :rtype Transcript:
         :raises: NoTranscriptFound
         """
+        if 'any' in language_codes:
+            for transcript in self:
+                return transcript
         return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
 
     def find_generated_transcript(self, language_codes):
@@ -284,6 +365,10 @@ class TranscriptList(object):
         :rtype Transcript:
         :raises: NoTranscriptFound
         """
+        if 'any' in language_codes:
+            for transcript in self:
+                if transcript.is_generated:
+                    return transcript
         return self._find_transcript(language_codes, [self._generated_transcripts])
 
     def find_manually_created_transcript(self, language_codes):
@@ -342,7 +427,9 @@ class TranscriptList(object):
         return description if description else 'None'
 
 
-class Transcript(object):
+class Transcript:
+    """Represents a single transcript."""
+
     def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
         """
         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
@@ -379,7 +466,7 @@ class Transcript(object):
         :rtype [{'text': str, 'start': float, 'end': float}]:
         """
         response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
-        return
+        return TranscriptParser(preserve_formatting=preserve_formatting).parse(
             _raise_http_errors(response, self.video_id).text,
         )
 
@@ -412,7 +499,8 @@ class Transcript(object):
         )
 
 
-class
+class TranscriptParser:
+    """Parses the transcript data from XML."""
     _FORMATTING_TAGS = [
         'strong',  # important
         'em',  # emphasized
@@ -449,49 +537,16 @@ class _TranscriptParser(object):
             if xml_element.text is not None
         ]
 
-WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
-
-class transcriber(object):
-    @classmethod
-    def list_transcripts(cls, video_id, proxies=None, cookies=None):
-        with requests.Session() as http_client:
-            if cookies:
-                http_client.cookies = cls._load_cookies(cookies, video_id)
-            http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id)
-
-    @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
-                        cookies=None, preserve_formatting=False):
-
-        assert isinstance(video_ids, list), "`video_ids` must be a list of strings"
-
-        data = {}
-        unretrievable_videos = []
-
-        for video_id in video_ids:
-            try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
-            except Exception as exception:
-                if not continue_after_error:
-                    raise exception
-
-                unretrievable_videos.append(video_id)
-
-        return data, unretrievable_videos
 
-
-
-
-            return
-
-
-
-
-
-
-
-
-            return cookie_jar
-        except:
-            raise CookiePathInvalidError(video_id)
+def _raise_http_errors(response, video_id):
+    try:
+        response.raise_for_status()
+        return response
+    except requests.exceptions.HTTPError as error:
+        raise YouTubeRequestFailedError(video_id, error)
+
+if __name__ == "__main__":
+    from rich import print
+    video_url = input("Enter the YouTube video URL: ")
+    transcript = YTTranscriber.get_transcript(video_url, languages=None)
+    print(transcript)
```
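The diff above replaces the old module-level `transcriber` class with a static `YTTranscriber` API and a flattened exception hierarchy rooted at `TranscriptRetrievalError`. Below is a minimal usage sketch of the new interface, not part of the diff: it assumes `YTTranscriber` is exported from the top-level package (as the updated README further down shows) and that the exception class can be imported from `webscout.transcriber`; the video URL is a placeholder.

```python
# Sketch only: exercises the YTTranscriber API introduced in this diff.
from webscout import YTTranscriber
from webscout.transcriber import TranscriptRetrievalError  # assumed import path

try:
    # languages='en' requests an English transcript; languages=None falls back to
    # the auto-generated one via find_generated_transcript(['any']).
    segments = YTTranscriber.get_transcript(
        "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
        languages="en",
    )
    for seg in segments[:5]:
        print(f"{seg['start']:.2f}s (+{seg['duration']:.2f}s): {seg['text']}")
except TranscriptRetrievalError as err:
    # Disabled subtitles, rate limiting, bad cookies, etc. all derive from this base class.
    print(f"Could not fetch transcript: {err}")
```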
webscout/version.py
CHANGED
```diff
@@ -1,2 +1,2 @@
-__version__ = "
+__version__ = "5.2"
 __prog__ = "webscout"
```
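Note that the bundled `webscout/version.py` now reports `__version__ = "5.2"`, while the wheel name and the METADATA below say 5.3. A hedged sketch for comparing the two values at runtime, assuming the package is installed from this wheel (`importlib.metadata` is standard library on Python 3.8+):

```python
# Compare the module-level version string with the installed distribution metadata.
from importlib.metadata import version as dist_version

from webscout.version import __version__ as module_version

print("webscout.version.__version__:", module_version)        # "5.2" according to this diff
print("installed wheel metadata:", dist_version("webscout"))  # "5.3" per METADATA
```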
{webscout-5.1.dist-info → webscout-5.3.dist-info}/METADATA
CHANGED

````diff
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: webscout
-Version: 5.
-Summary: Search for anything using Google, DuckDuckGo,
+Version: 5.3
+Summary: Search for anything using Google, DuckDuckGo, phind.com, Contains AI models, can transcribe yt videos, temporary email and phone number generation, has TTS support, webai (terminal gpt and open interpreter) and offline LLMs and more
 Author: OEvortex
 Author-email: helpingai5@gmail.com
 License: HelpingAI
@@ -339,57 +339,12 @@ if __name__ == '__main__':
 ## Transcriber
 The transcriber function in webscout is a handy tool that transcribes YouTube videos. Here's an example code demonstrating its usage:
 ```python
-import
-
-
-
-
-
-transcript_list = transcriber.list_transcripts(video_id)
-for transcript in transcript_list:
-    transcript_data_list = transcript.fetch()
-    lang = transcript.language
-    transcript_text = ""
-    if transcript.language_code == 'en':
-        for line in transcript_data_list:
-            start_time = line['start']
-            end_time = start_time + line['duration']
-            formatted_line = f"{start_time:.2f} - {end_time:.2f}: {line['text']}\n"
-            transcript_text += formatted_line
-        return transcript_text
-    elif transcript.is_translatable:
-        english_transcript_list = transcript.translate('en').fetch()
-        for line in english_transcript_list:
-            start_time = line['start']
-            end_time = start_time + line['duration']
-            formatted_line = f"{start_time:.2f} - {end_time:.2f}: {line['text']}\n"
-            transcript_text += formatted_line
-        return transcript_text
-    print("Transcript extraction failed. Please check the video URL.")
-except Exception as e:
-    print(f"Error: {e}")
-
-def main():
-    video_url = input("Enter the video link: ")
-
-    if video_url:
-        video_id = video_url.split("=")[1]
-        print("Video URL:", video_url)
-        submit = input("Press 'Enter' to get the transcript or type 'exit' to quit: ")
-        if submit == '':
-            print("Extracting Transcript...")
-            transcript = extract_transcript(video_id)
-            print('Transcript:')
-            print(transcript)
-            print("__________________________________________________________________________________")
-        elif submit.lower() == 'exit':
-            print("Exiting...")
-            sys.exit()
-        else:
-            print("Invalid input. Please try again.")
-
-if __name__ == "__main__":
-    main()
+from webscout import YTTranscriber
+yt = YTTranscriber()
+from rich import print
+video_url = input("Enter the YouTube video URL: ")
+transcript = yt.get_transcript(video_url, languages=None)
+print(transcript)
 ```
 
 ## GoogleS -- formerly DWEBS
@@ -397,7 +352,7 @@ if __name__ == "__main__":
 from webscout import GoogleS
 from rich import print
 searcher = GoogleS()
-results = searcher.search("HelpingAI-9B", max_results=20,
+results = searcher.search("HelpingAI-9B", max_results=20, extract_text=False, max_text_length=200)
 for result in results:
     print(result)
 ```
@@ -406,41 +361,25 @@ for result in results:
 from webscout import BingS
 from rich import print
 searcher = BingS()
-results = searcher.search("
+results = searcher.search("HelpingAI-9B", max_results=20, extract_webpage_text=True, max_extract_characters=1000)
 for result in results:
     print(result)
 ```
 
-## Text-to-Speech:
-```python
-from webscout import play_audio
-
-message = "This is an example of text-to-speech."
-audio_content = play_audio(message, voice="Brian")
-
-# Save the audio to a file
-with open("output.mp3", "wb") as f:
-    f.write(audio_content)
-```
-### Available TTS Voices:
-You can choose from a wide range of voices, including:
-- Filiz, Astrid, Tatyana, Maxim, Carmen, Ines, Cristiano, Vitoria, Ricardo, Maja, Jan, Jacek, Ewa, Ruben, Lotte, Liv, Seoyeon, Takumi, Mizuki, Giorgio, Carla, Bianca, Karl, Dora, Mathieu, Celine, Chantal, Penelope, Miguel, Mia, Enrique, Conchita, Geraint, Salli, Matthew, Kimberly, Kendra, Justin, Joey, Joanna, Ivy, Raveena, Aditi, Emma, Brian, Amy, Russell, Nicole, Vicki, Marlene, Hans, Naja, Mads, Gwyneth, Zhiyu
-- Standard and WaveNet voices for various languages (e.g., en-US, es-ES, ja-JP, etc.)
-
 
 The WEBS and AsyncWEBS classes are used to retrieve search results from DuckDuckGo.com
 To use the AsyncWEBS class, you can perform asynchronous operations using Python's asyncio library.
 To initialize an instance of the WEBS or AsyncWEBS classes, you can provide the following optional arguments:
 
 Here is an example of initializing the WEBS class:
-```
+```python
 from webscout import WEBS
 
 R = WEBS().text("python programming", max_results=5)
 print(R)
 ```
 Here is an example of initializing the AsyncWEBS class:
-```
+```python
 import asyncio
 import logging
 import sys
@@ -905,14 +844,34 @@ print(result)
 ___
 </details>
 
-
-
+### Text to images - DeepInfraImager, PollinationsAI
+```python
+from webscout import DeepInfraImager
+bot = DeepInfraImager()
+resp = bot.generate("AI-generated image - webscout", 1)
+print(bot.save(resp))
+```
+
+### Text to speech - Voicepods, StreamElements
+```python
+from webscout import Voicepods
+voicepods = Voicepods()
+text = "Hello, this is a test of the Voicepods text-to-speech"
+
+print("Generating audio...")
+audio_file = voicepods.tts(text)
+
+print("Playing audio...")
+voicepods.play_audio(audio_file)
+```
+
+### `Duckchat` - chat with LLM
 ```python
 from webscout import WEBS as w
 R = w().chat("Who are you", model='gpt-4o-mini') # GPT-3.5 Turbo, mixtral-8x7b, llama-3-70b, claude-3-haiku, gpt-4o-mini
 print(R)
 ```
-###
+### `PhindSearch` - Search using Phind.com
 
 ```python
 from webscout import PhindSearch
@@ -949,7 +908,7 @@ print(message)
 ```
 
 
-###
+### `You.com` - search/chat with you.com - Not working
 ```python
 
 from webscout import YouChat
@@ -976,7 +935,7 @@ message = ai.get_message(response)
 print(message)
 ```
 
-###
+### `Gemini` - search with google gemini
 
 ```python
 import webscout
@@ -994,7 +953,7 @@ gemini = GEMINI(cookie_file=COOKIE_FILE, proxy=PROXIES)
 response = gemini.chat("websearch about HelpingAI and who is its developer")
 print(response)
 ```
-###
+### `Berlin4h` - chat with Berlin4h
 ```python
 from webscout import Berlin4h
 
@@ -1015,7 +974,7 @@ prompt = "Explain the concept of recursion in simple terms."
 response = ai.chat(prompt)
 print(response)
 ```
-###
+### `BlackBox` - Search/chat With BlackBox
 ```python
 from webscout import BLACKBOXAI
 from rich import print
@@ -1046,7 +1005,7 @@ while True:
     r = ai.chat(prompt)
     print(r)
 ```
-###
+### `PERPLEXITY` - Search With PERPLEXITY
 ```python
 from webscout import Perplexity
 from rich import print
@@ -1472,7 +1431,7 @@ if "error" not in function_call_data:
 else:
     print(f"Error: {function_call_data['error']}")
 ```
-### LLAMA3, pizzagpt, RUBIKSAI, Koala, Darkai, AI4Chat, Farfalle, PIAI, Felo, XDASH, Julius, YouChat, YEPCHAT, Cloudflare, TurboSeek,
+### LLAMA3, pizzagpt, RUBIKSAI, Koala, Darkai, AI4Chat, Farfalle, PIAI, Felo, XDASH, Julius, YouChat, YEPCHAT, Cloudflare, TurboSeek, NetFly, Editee, AI21, Chatify, Cerebras, X0GPT
 code similar to other provider
 ### `LLM`
 ```python
````