abstract-webtools 0.1.4.40__py3-none-any.whl → 0.1.4.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/abstract_webtools.py +454 -534
- abstract_webtools/big_user_agent_list.py +1 -0
- {abstract_webtools-0.1.4.40.dist-info → abstract_webtools-0.1.4.42.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.4.40.dist-info → abstract_webtools-0.1.4.42.dist-info}/RECORD +7 -6
- {abstract_webtools-0.1.4.40.dist-info → abstract_webtools-0.1.4.42.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.4.40.dist-info → abstract_webtools-0.1.4.42.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.4.40.dist-info → abstract_webtools-0.1.4.42.dist-info}/top_level.txt +0 -0
@@ -71,14 +71,13 @@ import requests
|
|
71
71
|
import os
|
72
72
|
# Google Chrome Driver
|
73
73
|
from selenium import webdriver
|
74
|
-
import yt_dlp
|
75
74
|
import ssl
|
76
75
|
import re
|
76
|
+
import yt_dlp
|
77
77
|
import threading
|
78
78
|
import requests
|
79
|
-
import time
|
80
79
|
from requests.adapters import HTTPAdapter
|
81
|
-
from typing import Optional, List
|
80
|
+
from typing import Optional, List,Union
|
82
81
|
from requests.packages.urllib3.poolmanager import PoolManager
|
83
82
|
from requests.packages.urllib3.util import ssl_
|
84
83
|
from urllib.parse import urlparse
|
@@ -220,6 +219,7 @@ class DynamicRateLimiterManagerSingleton:
|
|
220
219
|
DynamicRateLimiterManagerSingleton._instance = DynamicRateLimiterManager(service_name=service_name, low_limit=low_limit, high_limit=limit_epoch, limit_epoch=60,starting_tokens=starting_tokens,epoch_cycle_adjustment=epoch_cycle_adjustment)
|
221
220
|
return DynamicRateLimiterManagerSingleton._instance
|
222
221
|
|
222
|
+
|
223
223
|
class CipherManager:
|
224
224
|
@staticmethod
|
225
225
|
def get_default_ciphers()-> list:
|
@@ -259,124 +259,15 @@ class CipherManagerSingleton:
|
|
259
259
|
CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
|
260
260
|
return CipherManagerSingleton._instance
|
261
261
|
class SSLManager:
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
def get_default_tls_options():
|
268
|
-
return ["OP_NO_TLSv1", "OP_NO_TLSv1_1", "OP_NO_COMPRESSION"]
|
269
|
-
|
270
|
-
@staticmethod
|
271
|
-
def get_all_tls_options() -> int:
|
272
|
-
"""
|
273
|
-
Returns the SSL options to be used when creating the SSL context.
|
274
|
-
[
|
275
|
-
ssl.OP_SINGLE_ECDH_USE,
|
276
|
-
ssl.OP_SINGLE_DH_USE,
|
277
|
-
ssl.OP_NO_TLSv1_3,
|
278
|
-
ssl.OP_NO_TLSv1_2,
|
279
|
-
ssl.OP_NO_TLSv1_1,
|
280
|
-
ssl.OP_NO_TLSv1,
|
281
|
-
ssl.OP_NO_TICKET,
|
282
|
-
ssl.OP_NO_RENEGOTIATION,
|
283
|
-
ssl.OP_NO_QUERY_MTU,
|
284
|
-
ssl.OP_NO_COMPRESSION,
|
285
|
-
ssl.OP_CIPHER_SERVER_PREFERENCE,
|
286
|
-
ssl.OP_ALLOW_NO_DHE_KEX,
|
287
|
-
ssl.OP_ALL
|
288
|
-
]
|
289
|
-
The `ssl` module in the Python standard library provides several constants that you can use to set various SSL options. Here are the available options as of Python 3.9:
|
290
|
-
|
291
|
-
1. `ssl.OP_ALL`:
|
292
|
-
- Enables a collection of various bug workaround options.
|
293
|
-
|
294
|
-
2. `ssl.OP_ALLOW_NO_DHE_KEX`:
|
295
|
-
- Allow a non-(EC)DHE handshake on a server socket if no suitable security level can be reached.
|
296
|
-
|
297
|
-
3. `ssl.OP_CIPHER_SERVER_PREFERENCE`:
|
298
|
-
- Uses the server's cipher ordering preference rather than the client's.
|
299
|
-
|
300
|
-
4. `ssl.OP_NO_COMPRESSION`:
|
301
|
-
- Prevents using SSL/TLS compression to avoid CRIME attacks.
|
302
|
-
|
303
|
-
5. `ssl.OP_NO_QUERY_MTU`:
|
304
|
-
- Disables automatic querying of kernel for MTU.
|
305
|
-
|
306
|
-
6. `ssl.OP_NO_RENEGOTIATION`:
|
307
|
-
- Disallows all renegotiation.
|
262
|
+
def __init__(self, ciphers=None, ssl_options=None, certification=None):
|
263
|
+
self.ciphers = ciphers or "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-SHA256:AES256-SHA:AES128-SHA"
|
264
|
+
self.ssl_options = ssl_options or ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION
|
265
|
+
self.certification = certification or ssl.CERT_REQUIRED
|
266
|
+
self.ssl_context = self.get_context()
|
308
267
|
|
309
|
-
|
310
|
-
|
268
|
+
def get_context(self):
|
269
|
+
return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
|
311
270
|
|
312
|
-
8. `ssl.OP_NO_TLSv1`:
|
313
|
-
- Prevents the use of TLSv1.
|
314
|
-
|
315
|
-
9. `ssl.OP_NO_TLSv1_1`:
|
316
|
-
- Prevents the use of TLSv1.1.
|
317
|
-
|
318
|
-
10. `ssl.OP_NO_TLSv1_2`:
|
319
|
-
- Prevents the use of TLSv1.2.
|
320
|
-
|
321
|
-
11. `ssl.OP_NO_TLSv1_3`:
|
322
|
-
- Prevents the use of TLSv1.3.
|
323
|
-
|
324
|
-
12. `ssl.OP_SINGLE_DH_USE`:
|
325
|
-
- Always create a new key when using temporary/ephemeral DH parameters. This option provides forward secrecy.
|
326
|
-
|
327
|
-
13. `ssl.OP_SINGLE_ECDH_USE`:
|
328
|
-
- Always create a new key when using temporary/ephemeral ECDH parameters. This option provides forward secrecy.
|
329
|
-
|
330
|
-
These constants can be combined using the bitwise OR (`|`) operator to set multiple options. For example, to prevent the use of TLSv1 and TLSv1.1, you would use:
|
331
|
-
Please note that the availability of some options might vary depending on the version of OpenSSL that Python's `ssl` module is linked against and the version of Python itself. You can always check the Python documentation specific to your version to get the most accurate and updated list.
|
332
|
-
|
333
|
-
Returns:
|
334
|
-
int: The SSL options.
|
335
|
-
|
336
|
-
"""
|
337
|
-
return [
|
338
|
-
"OP_SINGLE_ECDH_USE",
|
339
|
-
"OP_SINGLE_DH_USE",
|
340
|
-
"OP_NO_TLSv1_3",
|
341
|
-
"OP_NO_TLSv1_2",
|
342
|
-
"OP_NO_TLSv1_1",
|
343
|
-
"OP_NO_TLSv1",
|
344
|
-
"OP_NO_TICKET",
|
345
|
-
"OP_NO_RENEGOTIATION",
|
346
|
-
"OP_NO_QUERY_MTU",
|
347
|
-
"OP_NO_COMPRESSION",
|
348
|
-
"OP_CIPHER_SERVER_PREFERENCE",
|
349
|
-
"OP_ALLOW_NO_DHE_KEX",
|
350
|
-
"OP_ALL"
|
351
|
-
]
|
352
|
-
|
353
|
-
@staticmethod
|
354
|
-
def get_context(ciphers=None, options=None, cert_reqs=None):
|
355
|
-
|
356
|
-
return ssl_.create_urllib3_context(ciphers=ciphers, cert_reqs=cert_reqs, options=options)
|
357
|
-
|
358
|
-
def __init__(self, ciphers=None, ssl_options_list=None, certification=None):
|
359
|
-
self.ssl_options_list = ssl_options_list
|
360
|
-
self.create_list()
|
361
|
-
self.ssl_options_values = self.get_options_values()
|
362
|
-
self.ssl_options = self.combine_ssl_options()
|
363
|
-
self.certification = certification or self.get_default_certification()
|
364
|
-
self.cipher_manager = CipherManagerSingleton().get_instance(cipher_list=ciphers)
|
365
|
-
self.ssl_context = self.get_context(ciphers=self.cipher_manager.ciphers_string, options=self.ssl_options, cert_reqs=self.certification)
|
366
|
-
def create_list(self):
|
367
|
-
if self.ssl_options_list == None:
|
368
|
-
self.ssl_options_list= []
|
369
|
-
elif isinstance(self.ssl_options_list, str):
|
370
|
-
self.ssl_options_list=self.ssl_options_list.split(',')
|
371
|
-
if isinstance(self.ssl_options_list, str):
|
372
|
-
self.ssl_options_list=[self.ssl_options_list]
|
373
|
-
def get_options_values(self):
|
374
|
-
return [getattr(ssl, option_name) for option_name in self.ssl_options_list]
|
375
|
-
def combine_ssl_options(self):
|
376
|
-
combined_options = 0
|
377
|
-
for option in self.ssl_options_values:
|
378
|
-
combined_options |= option
|
379
|
-
return combined_options
|
380
271
|
class SSLManagerSingleton:
|
381
272
|
_instance = None
|
382
273
|
@staticmethod
|
@@ -387,60 +278,49 @@ class SSLManagerSingleton:
|
|
387
278
|
SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
|
388
279
|
return SSLManagerSingleton._instance
|
389
280
|
class TLSAdapter(HTTPAdapter):
|
390
|
-
def __init__(self, ciphers
|
391
|
-
|
392
|
-
|
393
|
-
self.
|
394
|
-
self.
|
395
|
-
|
396
|
-
self.
|
397
|
-
self.ssl_manager = SSLManagerSingleton.get_instance(
|
398
|
-
ciphers=self.cipher_manager.ciphers_string,
|
399
|
-
ssl_options_list=ssl_options,
|
400
|
-
certification=certification
|
401
|
-
)
|
281
|
+
def __init__(self, ssl_manager=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
|
282
|
+
if ssl_manager == None:
|
283
|
+
ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
|
284
|
+
self.ssl_manager = ssl_manager
|
285
|
+
self.ciphers = ssl_manager.ciphers
|
286
|
+
self.certification = ssl_manager.certification
|
287
|
+
self.ssl_options = ssl_manager.ssl_options
|
402
288
|
self.ssl_context = self.ssl_manager.ssl_context
|
289
|
+
super().__init__()
|
403
290
|
|
404
291
|
def init_poolmanager(self, *args, **kwargs):
|
292
|
+
kwargs['ssl_context'] = self.ssl_context
|
405
293
|
return super().init_poolmanager(*args, **kwargs)
|
406
|
-
|
407
|
-
|
408
294
|
class TLSAdapterSingleton:
|
409
|
-
_instance = None
|
295
|
+
_instance: Optional[TLSAdapter] = None
|
296
|
+
|
410
297
|
@staticmethod
|
411
|
-
def get_instance(ciphers=None, certification=None, ssl_options=None):
|
412
|
-
if TLSAdapterSingleton._instance
|
413
|
-
TLSAdapterSingleton._instance
|
414
|
-
|
298
|
+
def get_instance(ciphers: Optional[List[str]] = None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None) -> TLSAdapter:
|
299
|
+
if (not TLSAdapterSingleton._instance) or (
|
300
|
+
TLSAdapterSingleton._instance.ciphers != ciphers or
|
301
|
+
TLSAdapterSingleton._instance.certification != certification or
|
302
|
+
TLSAdapterSingleton._instance.ssl_options != ssl_options
|
303
|
+
):
|
415
304
|
TLSAdapterSingleton._instance = TLSAdapter(ciphers=ciphers, certification=certification, ssl_options=ssl_options)
|
416
305
|
return TLSAdapterSingleton._instance
|
417
306
|
class UserAgentManager:
|
307
|
+
def __init__(self, user_agent=None):
|
308
|
+
if user_agent == None:
|
309
|
+
user_agent = self.desktop_user_agents()[0]
|
310
|
+
self.user_agent = user_agent
|
311
|
+
self.user_agent=self.get_user_agent(self.user_agent)
|
418
312
|
@staticmethod
|
419
|
-
|
420
313
|
def desktop_user_agents() -> list:
|
421
|
-
|
422
|
-
Returns a list of popular desktop user-agent strings for various browsers.
|
423
|
-
|
424
|
-
Returns:
|
425
|
-
list: A list of desktop user-agent strings.
|
426
|
-
"""
|
427
|
-
return ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59','Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko','Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14']
|
428
|
-
|
314
|
+
return ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59','Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko','Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14']
|
429
315
|
@staticmethod
|
430
|
-
def
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
dict: A dictionary containing the 'user-agent' header.
|
439
|
-
"""
|
440
|
-
return {"user-agent": user_agent}
|
441
|
-
def __init__(self,user_agent=desktop_user_agents()[0]):
|
442
|
-
self.user_agent = user_agent
|
443
|
-
self.get_user_agent(user_agent=user_agent)
|
316
|
+
def big_user_agent_list(n=0):
|
317
|
+
from .big_user_agent_list import big_user_agent_list
|
318
|
+
return big_user_agent_list[n]
|
319
|
+
@staticmethod
|
320
|
+
def get_user_agent(user_agent: str = desktop_user_agents()[0]) -> dict:
|
321
|
+
if isinstance(user_agent,dict):
|
322
|
+
return user_agent
|
323
|
+
return {"user-agent": user_agent}
|
444
324
|
class UserAgentManagerSingleton:
|
445
325
|
_instance = None
|
446
326
|
@staticmethod
|
@@ -450,215 +330,20 @@ class UserAgentManagerSingleton:
|
|
450
330
|
elif UserAgentManagerSingleton._instance.user_agent != user_agent:
|
451
331
|
UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
|
452
332
|
return UserAgentManagerSingleton._instance
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
session=None,
|
468
|
-
headers=None,
|
469
|
-
cookies=None,
|
470
|
-
adapter=None,
|
471
|
-
protocol=None,
|
472
|
-
proxies=None,
|
473
|
-
auth=None,
|
474
|
-
stream=False,
|
475
|
-
last_request_time=None,
|
476
|
-
max_retries=None,
|
477
|
-
request_wait_limit=None):
|
478
|
-
self.url_manager=url_manager
|
479
|
-
self.session=session or requests.Session()
|
480
|
-
self.headers = headers or {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}
|
481
|
-
self.cookies=cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
|
482
|
-
self.adapter=adapter or TLSAdapterSingleton().get_instance()
|
483
|
-
self.protocol=protocol or 'https://'
|
484
|
-
self.proxies=None or {}
|
485
|
-
self.auth=auth
|
486
|
-
self.stream=stream if isinstance(stream,bool) else False
|
487
|
-
self.last_request_time=last_request_time
|
488
|
-
self.max_retries = max_retries or 3
|
489
|
-
self.request_wait_limit = request_wait_limit or 1.5
|
490
|
-
self.session = self.initialize_session()
|
491
|
-
self.make_request()
|
492
|
-
self.source_code = None
|
493
|
-
self.source_code_bytes=None
|
494
|
-
self.source_code_json = {}
|
495
|
-
self.react_source_code=[]
|
496
|
-
self.get_response()
|
497
|
-
def initialize_session(self):
|
498
|
-
s = self.session
|
499
|
-
s.proxies=self.proxies
|
500
|
-
s.auth=self.auth
|
501
|
-
s.cookies["cf_clearance"] = self.cookies
|
502
|
-
s.headers.update(self.headers)
|
503
|
-
# Add any other headers or cookie settings here
|
504
|
-
s.mount(self.protocol, self.adapter)
|
505
|
-
return s
|
506
|
-
def get_response(self):
|
507
|
-
if self.request:
|
508
|
-
self.source_code = self.request.text
|
509
|
-
self.source_code_bytes=self.request.content
|
510
|
-
self.source_code_json = {}
|
511
|
-
if self.request.headers.get('content-type') == 'application/json':
|
512
|
-
data = convert_to_json(self.source_code)
|
513
|
-
if data:
|
514
|
-
self.source_code_json=data.get("response", data)
|
515
|
-
self.get_react_source_code()
|
516
|
-
def get_react_source_code(self) -> list:
|
517
|
-
"""
|
518
|
-
Fetches the source code of the specified URL and extracts JavaScript and JSX source code (React components).
|
519
|
-
|
520
|
-
Args:
|
521
|
-
url (str): The URL to fetch the source code from.
|
522
|
-
|
523
|
-
Returns:
|
524
|
-
list: A list of strings containing JavaScript and JSX source code found in <script> tags.
|
525
|
-
"""
|
526
|
-
if self.url_manager.correct_url is None:
|
527
|
-
return []
|
528
|
-
soup = BeautifulSoup(self.source_code_bytes,"html.parser")
|
529
|
-
script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
|
530
|
-
for script_tag in script_tags:
|
531
|
-
self.react_source_code.append(script_tag.string)
|
532
|
-
|
533
|
-
|
534
|
-
def get_status(url:str=None) -> int:
|
535
|
-
"""
|
536
|
-
Gets the HTTP status code of the given URL.
|
537
|
-
|
538
|
-
Args:
|
539
|
-
url (str): The URL to check the status of.
|
540
|
-
|
541
|
-
Returns:
|
542
|
-
int: The HTTP status code of the URL, or None if the request fails.
|
543
|
-
"""
|
544
|
-
# Get the status code of the URL
|
545
|
-
return try_request(url=url).status_code
|
546
|
-
def wait_between_requests(self):
|
547
|
-
"""
|
548
|
-
Wait between requests based on the request_wait_limit.
|
549
|
-
"""
|
550
|
-
if self.last_request_time:
|
551
|
-
sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
|
552
|
-
if sleep_time > 0:
|
553
|
-
logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
|
554
|
-
get_sleep(sleep_time)
|
555
|
-
|
556
|
-
def make_request(self):
|
557
|
-
"""
|
558
|
-
Make a request and handle potential errors.
|
559
|
-
"""
|
560
|
-
# Update the instance attributes if they are passed
|
561
|
-
|
562
|
-
self.wait_between_requests()
|
563
|
-
for _ in range(self.max_retries):
|
564
|
-
try:
|
565
|
-
self.try_request() # 10 seconds timeout
|
566
|
-
if self.request:
|
567
|
-
if self.request.status_code == 200:
|
568
|
-
self.last_request_time = get_time_stamp()
|
569
|
-
return self.request
|
570
|
-
elif self.request.status_code == 429:
|
571
|
-
logging.warning(f"Rate limited by {self.url_manager.correct_url}. Retrying...")
|
572
|
-
get_sleep(5) # adjust this based on the server's rate limit reset time
|
573
|
-
except requests.Timeout as e:
|
574
|
-
logging.error(f"Request to {cleaned_url} timed out: {e}")
|
575
|
-
except requests.ConnectionError:
|
576
|
-
logging.error(f"Connection error for URL {self.url_manager.correct_url}.")
|
577
|
-
except requests.Timeout:
|
578
|
-
logging.error(f"Request timeout for URL {self.url_manager.correct_url}.")
|
579
|
-
except requests.RequestException as e:
|
580
|
-
logging.error(f"Request exception for URL {self.url_manager.correct_url}: {e}")
|
581
|
-
|
582
|
-
logging.error(f"Failed to retrieve content from {self.url_manager.correct_url} after {self.max_retries} retries.")
|
583
|
-
return None
|
584
|
-
def try_request(self,timeout=10) -> (requests.Response or None):
|
585
|
-
"""
|
586
|
-
Tries to make an HTTP request to the given URL using the provided session.
|
587
|
-
|
588
|
-
Args:
|
589
|
-
url (str): The URL to make the request to.
|
590
|
-
session (type(requests.Session), optional): The requests session to use for making HTTP requests.
|
591
|
-
Defaults to requests.
|
592
|
-
|
593
|
-
Returns:
|
594
|
-
requests.Response or None: The response object if the request is successful, or None if the request fails.
|
595
|
-
"""
|
596
|
-
try:
|
597
|
-
self.request = self.session.get(url=self.url_manager.url, timeout=10)
|
598
|
-
except requests.exceptions.RequestException as e:
|
599
|
-
print(e)
|
600
|
-
return False
|
601
|
-
def get_limited_request(self,request_url,service_name="default"):
|
602
|
-
manager = DynamicRateLimiterManagerSingleton.get_instance() # Get the singleton instance
|
603
|
-
unwanted_response=True
|
604
|
-
# Check with the rate limiter if we can make a request
|
605
|
-
while True:
|
606
|
-
if not manager.request(service_name):
|
607
|
-
print("Rate limit reached for coin_gecko. Waiting for the next epoch...")
|
608
|
-
sleep_count_down(manager.services[service_name].get_sleep()["current_sleep"]) # Wait for the limit_epoch duration
|
609
|
-
# Make the actual request
|
610
|
-
response = try_request(request_url=request_url)
|
611
|
-
|
612
|
-
# If you get a rate-limit error (usually 429 status code but can vary), adjust the rate limiter
|
613
|
-
if response.status_code ==429:
|
614
|
-
print(response.json())
|
615
|
-
manager.services[service_name].request_tracker(False)
|
616
|
-
print("Rate limited by coin_gecko. Adjusted limit. Retrying...")
|
617
|
-
if len(manager.services[service_name].calculate_tokens()["succesful"])<2:
|
618
|
-
sleep_count_down(manager.services[service_name].limit_epoch) # Wait for the limit_epoch duration
|
619
|
-
else:
|
620
|
-
manager.services[service_name].current_limit-=1
|
621
|
-
sleep_count_down(manager.services[service_name].limit_epoch/len(manager.services[service_name].calculate_tokens()["succesful"])) # Wait for the limit_epoch duration
|
622
|
-
# Return the data if the request was successful
|
623
|
-
if response.status_code == 200:
|
624
|
-
manager.services[service_name].request_tracker(True)
|
625
|
-
return response.json()
|
626
|
-
elif response.status_code not in [200,429]:
|
627
|
-
print(f"Unexpected response: {response.status_code}. Message: {response.text}")
|
628
|
-
return None
|
629
|
-
@property
|
630
|
-
def url(self):
|
631
|
-
return self.url_manager.url
|
632
|
-
|
633
|
-
@url.setter
|
634
|
-
def url(self, new_url):
|
635
|
-
self._url_manager.url = new_url
|
636
|
-
class SafeRequestSingleton:
|
637
|
-
_instance = None
|
638
|
-
@staticmethod
|
639
|
-
def get_instance(url=None,headers:dict=None,max_retries=3,last_request_time=None,request_wait_limit=1.5):
|
640
|
-
if SafeRequestSingleton._instance is None:
|
641
|
-
SafeRequestSingleton._instance = SafeRequest(url,url_manager=URLManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
|
642
|
-
elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
|
643
|
-
SafeRequestSingleton._instance = SafeRequest(url,url_manager=URLManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
|
644
|
-
return SafeRequestSingleton._instance
|
645
|
-
## ##
|
646
|
-
# Usage
|
647
|
-
## safe_requester = SafeRequest()
|
648
|
-
##
|
649
|
-
## url = "example.com" # replace with your URL
|
650
|
-
#### if safe_requester.is_valid_url(url):
|
651
|
-
## response = safe_requester.make_request(url)
|
652
|
-
## if response:
|
653
|
-
## print(response.text)
|
654
|
-
## else:
|
655
|
-
## logging.error(f"Invalid URL: {url}")
|
656
|
-
# Usage 2
|
657
|
-
## safe_requester = SafeRequest()
|
658
|
-
## source_code = safe_requester.get_source_code('https://www.example.com')
|
659
|
-
## if source_code:
|
660
|
-
## print(source_code)
|
661
|
-
## ##
|
333
|
+
class NetworkManager:
|
334
|
+
def __init__(self, user_agent_manager=None,ssl_manager=None, tls_adapter=None,user_agent=None,proxies=None,auth=None,cookies=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
|
335
|
+
if ssl_manager == None:
|
336
|
+
ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
|
337
|
+
self.ssl_manager=ssl_manager
|
338
|
+
if tls_adapter == None:
|
339
|
+
tls_adapter=TLSAdapter(ssl_manager=ssl_manager,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
|
340
|
+
self.tls_adapter=tls_adapter
|
341
|
+
self.ciphers=tls_adapter.ciphers
|
342
|
+
self.certification=tls_adapter.certification
|
343
|
+
self.ssl_options=tls_adapter.ssl_options
|
344
|
+
self.proxies=None or {}
|
345
|
+
self.auth=auth
|
346
|
+
self.cookies=cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
|
662
347
|
class MySocketClient:
|
663
348
|
def __init__(self, ip_address=None, port=None,domain_name=None):
|
664
349
|
self.sock
|
@@ -705,161 +390,21 @@ class MySocketClient():
|
|
705
390
|
elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or URLManagerSingleton._instance.domain_name != domain_name:
|
706
391
|
MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
|
707
392
|
return MySocketClient
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
self.video_extention=video_extention
|
720
|
-
self.download_directory=download_directory
|
721
|
-
self.video_extention=video_extention
|
722
|
-
self.header = {}#UserAgentManagerSingleton().get_instance(user_agent=user_agent)
|
723
|
-
self.base_name = os.path.basename(self.url)
|
724
|
-
self.file_name,self.ext = os.path.splitext(self.base_name)
|
725
|
-
self.video_urls = [self.url]
|
726
|
-
self.fetch_video_urls()
|
727
|
-
self.info={}
|
728
|
-
self.starttime = None
|
729
|
-
self.downloaded = 0
|
730
|
-
self.video_urls = url if isinstance(url,list) else [url]
|
731
|
-
self.send_to_dl()
|
732
|
-
|
733
|
-
|
734
|
-
def get_request(self,url):
|
735
|
-
self.request_manager = SafeRequestSingleton.get_instance(url=url)
|
736
|
-
return self.request_manager
|
737
|
-
def send_to_dl(self):
|
738
|
-
if self.standalone_download:
|
739
|
-
self.standalone_downloader()
|
740
|
-
else:
|
741
|
-
self.start()
|
742
|
-
def get_headers(self,url):
|
743
|
-
# Send an HTTP GET request to the URL
|
744
|
-
response = requests.get(url)
|
745
|
-
# Check if the request was successful (status code 200)
|
746
|
-
if response.status_code == 200:
|
747
|
-
# Access and print the response headers
|
748
|
-
return response.headers
|
749
|
-
else:
|
750
|
-
print(f"Failed to retrieve the web page. Status code: {response.status_code}")
|
751
|
-
def progress_callback(self, stream, chunk, bytes_remaining):
|
752
|
-
total_size = stream.filesize
|
753
|
-
self.downloaded = total_size - bytes_remaining
|
754
|
-
def get_directory_path(directory,name,video_extention):
|
755
|
-
file_path=os.path.join(directory,name+'.'+video_extention)
|
756
|
-
i=0
|
757
|
-
while os.path.exists(file_path) == True:
|
758
|
-
file_path=os.path.join(directory,name+f'_{i}'+'.'+video_extention)
|
759
|
-
i+=1
|
760
|
-
return file_path
|
761
|
-
def fetch_video_urls(self):
|
762
|
-
driver = webdriver.Chrome()
|
763
|
-
driver.get(self.url)
|
764
|
-
self.page_source = driver.page_source
|
765
|
-
for each in self.page_source.split('<source ')[1:]:
|
766
|
-
# NOTE: Make sure to import the `eatAll` function and use it here.
|
767
|
-
self.video_urls.append(eatInner(each.split('.{self.video_extention}'.replace('..','.'))[0].split('http')[-1],['h','t','t','p','s',':','//','/','s','=',' ','\n','\t',''])+'.mp4')
|
768
|
-
def download(self):
|
769
|
-
for video_url in self.video_urls:
|
770
|
-
ydl_opts = {
|
771
|
-
'external_downloader': 'ffmpeg',
|
772
|
-
'external_downloader_args': "-ss 00:01:00.00 -to 00:02:00.00",
|
773
|
-
'format': 'best',
|
774
|
-
'outtmpl': f'{self.download_directory} %(title)s-%(id)s.%(ext)s',
|
775
|
-
'noprogress': True
|
776
|
-
}
|
777
|
-
self.output_video = ydl_opts['outtmpl']
|
778
|
-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
779
|
-
if self.get_download:
|
780
|
-
self.info = ydl.extract_info(video_url, download=self.get_download).noprogress
|
781
|
-
else:
|
782
|
-
self.info = ydl.extract_info(video_url, download=self.get_download)
|
783
|
-
self.starttime = time.time()
|
784
|
-
if self.auto_file_gen:
|
785
|
-
file_path = ydl.prepare_filename(self.info)
|
786
|
-
if self.get_info == True:
|
787
|
-
self.info['file_path']=file_path
|
788
|
-
|
789
|
-
if self.get_info == True:
|
790
|
-
self.monitoring = False
|
791
|
-
self.pause_event.set()
|
792
|
-
return self.info
|
793
|
-
self.monitoring = False
|
794
|
-
self.pause_event.set()
|
795
|
-
def monitor(self):
|
796
|
-
name = self.title or False
|
797
|
-
while self.monitoring:
|
798
|
-
print("Monitoring...")
|
799
|
-
self.pause_event.wait(60) # check every minute
|
800
|
-
if self.starttime:
|
801
|
-
elapsed_time = time.time() - self.starttime
|
802
|
-
if self.downloaded != 0 and elapsed_time !=0:
|
803
|
-
percent = self.downloaded / (self.downloaded + elapsed_time)
|
804
|
-
else:
|
805
|
-
percent = 0
|
806
|
-
if elapsed_time !=0:
|
807
|
-
try:
|
808
|
-
# operations that can cause ZeroDivisionError
|
809
|
-
percent = self.downloaded / (self.downloaded + elapsed_time)
|
810
|
-
downloaded_minutes = elapsed_time / 60
|
811
|
-
estimated_download_time = downloaded_minutes / percent - downloaded_minutes
|
812
|
-
except ZeroDivisionError:
|
813
|
-
print("Caught a division by zero!")
|
814
|
-
continue
|
815
|
-
|
816
|
-
if downloaded_minutes != 0 and (percent - downloaded_minutes) !=0:
|
817
|
-
estimated_download_time = downloaded_minutes / percent - downloaded_minutes
|
818
|
-
print(estimated_download_time)
|
819
|
-
if estimated_download_time >= 1.5:
|
820
|
-
print("Seems like YouTube is limiting our download speed, restarting the download to mitigate the problem..")
|
821
|
-
# TODO: Find a way to stop the current download and restart. This may not work efficiently since pytube doesn't expose a cancel download method.
|
822
|
-
self.start() # Restart the download process
|
823
|
-
def standalone_downloader(self):
|
824
|
-
name = self.title or False
|
825
|
-
for video_url in self.video_urls:
|
826
|
-
self.request_manager
|
827
|
-
self.info=True
|
828
|
-
self.download_video=False
|
829
|
-
headers = self.get_headers(video_url)
|
830
|
-
for v in soup.select("video source[src]"):
|
831
|
-
print("Downloading {}".format(v["src"]))
|
832
|
-
if not name_chosen:
|
833
|
-
name = v["src"].split("/")[-1].strip()
|
834
|
-
file_path = get_directory_path(directory,name,video_extention)
|
835
|
-
print(f"saving to {file_path}")
|
836
|
-
with open(get_directory_path(directory,name,video_extention), "wb") as f_out:
|
837
|
-
f_out.write(requests.get(v["src"].strip(), headers=headers).content)
|
838
|
-
|
839
|
-
def start(self):
|
840
|
-
self.download_thread = threading.Thread(target=self.download)
|
841
|
-
self.download_thread.daemon = True
|
842
|
-
self.monitor_thread = threading.Thread(target=self.monitor)
|
843
|
-
self.download_thread.start()
|
844
|
-
self.monitor_thread.start()
|
845
|
-
self.download_thread.join()
|
846
|
-
self.monitor_thread.join()
|
847
|
-
def stop(self):
|
848
|
-
self.monitoring = False
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
class VideoDownloaderSingleton():
|
853
|
-
_instance = None
|
854
|
-
@staticmethod
|
855
|
-
def get_instance(url_manager,request_manager,title=None,video_extention='mp4',download_directory=os.getcwd(),user_agent=None,download=True,get_info=False):
|
856
|
-
if VideoDownloaderSingleton._instance is None:
|
857
|
-
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
|
858
|
-
elif VideoDownloaderSingleton._instance.title != title or video_extention != VideoDownloaderSingleton._instance.video_extention or url != VideoDownloaderSingleton._instance.url or download_directory != VideoDownloaderSingleton._instance.download_directory or user_agent != VideoDownloaderSingleton._instance.user_agent:
|
859
|
-
VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
|
860
|
-
return VideoDownloaderSingleton._instance
|
393
|
+
def safe_json_loads(data):
|
394
|
+
try:
|
395
|
+
return json.loads(data)
|
396
|
+
except json.JSONDecodeError:
|
397
|
+
return None
|
398
|
+
def convert_to_json(obj):
|
399
|
+
if isinstance(obj, dict):
|
400
|
+
return obj
|
401
|
+
if isinstance(obj, str):
|
402
|
+
return safe_json_loads(obj)
|
403
|
+
return None
|
861
404
|
class URLManager:
|
862
|
-
def __init__(self, url, session=requests):
|
405
|
+
def __init__(self, url=None, session=requests):
|
406
|
+
if url==None:
|
407
|
+
url='www.example.com'
|
863
408
|
self.url = url
|
864
409
|
self.session = session
|
865
410
|
|
@@ -983,7 +528,225 @@ class URLManagerSingleton:
|
|
983
528
|
elif URLManagerSingleton._instance.session != session or URLManagerSingleton._instance.url != url:
|
984
529
|
URLManagerSingleton._instance = URLManager(url,session=session)
|
985
530
|
return URLManagerSingleton._instance
|
531
|
+
class SafeRequest:
    """HTTP fetch helper that wires together URL/user-agent/network managers,
    retries with basic rate-limit handling, and exposes the fetched body as
    text, bytes, JSON, and extracted React/JS <script> sources."""

    def __init__(self,
                 url=None,
                 url_manager=None,
                 network_manager=None,
                 user_agent_manager=None,
                 ssl_manager=None,
                 tls_adapter=None,
                 user_agent=None,
                 proxies=None,
                 headers=None,
                 auth=None,
                 cookies=None,
                 session=None,
                 adapter=None,
                 protocol=None,
                 ciphers=None,
                 certification=None,
                 ssl_options=None,
                 stream=False,
                 last_request_time=None,
                 max_retries=None,
                 request_wait_limit=None):
        # Build default managers for anything the caller did not supply.
        if url_manager is None:
            url_manager = URLManager(url=url)
        self.url_manager = url_manager
        if network_manager is None:
            network_manager = NetworkManager(user_agent_manager=user_agent_manager,
                                             ssl_manager=ssl_manager,
                                             tls_adapter=tls_adapter,
                                             user_agent=user_agent,
                                             proxies=proxies,
                                             auth=auth,
                                             cookies=cookies,
                                             ciphers=ciphers,
                                             certification=certification,
                                             ssl_options=ssl_options)
        if user_agent_manager is None:
            user_agent_manager = UserAgentManager(user_agent=user_agent)
        self.user_agent_manager = user_agent_manager
        self.user_agent = self.user_agent_manager.user_agent
        self.network_manager = network_manager
        # Mirror the network manager's settings locally for convenient access.
        self.tls_adapter = self.network_manager.tls_adapter
        self.ciphers = self.network_manager.ciphers
        self.certification = self.network_manager.certification
        self.ssl_options = self.network_manager.ssl_options
        self.proxies = self.network_manager.proxies
        self.auth = self.network_manager.auth
        self.cookies = self.network_manager.cookies
        self.session = session or requests.session()
        self.protocol = protocol or 'https://'
        if headers:
            self.headers = headers
        elif self.user_agent:
            # Was: the raw UA string itself was assigned as `headers`, which
            # made session.headers.update() blow up; wrap it in a mapping.
            self.headers = {'User-Agent': self.user_agent}
        else:
            self.headers = {'Accept': '*/*'}
        self.stream = stream if isinstance(stream, bool) else False
        self.initialize_session()
        self.last_request_time = last_request_time
        self.max_retries = max_retries or 3
        self.request_wait_limit = request_wait_limit or 1.5
        self._response = None
        self.make_request()
        self.source_code = None
        self.source_code_bytes = None
        self.source_code_json = {}
        self.react_source_code = []
        self._response_data = None
        self.process_response_data()

    @property
    def response(self):
        """Lazy-loading of response."""
        if self._response is None:
            self._response = self.fetch_response()
        return self._response

    def fetch_response(self) -> Union[requests.Response, None]:
        """Actually fetches the response from the server."""
        return self.try_request()

    def initialize_session(self):
        """Apply proxies, auth, cookies, headers and the TLS adapter to the session."""
        s = self.session
        s.proxies = self.network_manager.proxies  # Use the proxies from the NetworkManager
        s.auth = self.network_manager.auth  # Use the auth from SafeRequest (if provided)
        # NOTE(review): stores the whole cookies value under "cf_clearance";
        # confirm the network manager's `cookies` really is that single token.
        s.cookies["cf_clearance"] = self.network_manager.cookies
        s.headers.update(self.headers)
        s.mount(self.protocol, self.network_manager.tls_adapter)  # TLSAdapter from the NetworkManager
        return s

    def process_response_data(self):
        """Processes the fetched response data into text/bytes/json attributes."""
        if not self.response:
            return  # No data to process

        self.source_code = self.response.text
        self.source_code_bytes = self.response.content

        # Substring test so "application/json; charset=utf-8" also matches
        # (was an exact equality check).
        if 'application/json' in (self.response.headers.get('content-type') or ''):
            data = convert_to_json(self.source_code)
            if isinstance(data, dict):
                self.source_code_json = data.get("response", data)
            elif data:
                # JSON arrays etc. have no .get(); store them as-is.
                self.source_code_json = data

        self.get_react_source_code()

    def get_react_source_code(self) -> list:
        """
        Extracts JavaScript and JSX source code (React components) from the
        fetched page into self.react_source_code.

        Returns:
            list: strings of JavaScript/JSX found in <script> tags.
        """
        if self.url_manager.correct_url is None:
            return []
        soup = BeautifulSoup(self.source_code_bytes, "html.parser")
        script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
        for script_tag in script_tags:
            # Skip external/empty scripts instead of appending None.
            if script_tag.string:
                self.react_source_code.append(script_tag.string)

    def get_status(self, url: str = None) -> int:
        """
        Gets the HTTP status code of the managed URL.

        Fixed: the original signature was missing ``self`` and called an
        unbound ``try_request``.

        Returns:
            int: The HTTP status code, or None if the request fails.
        """
        response = self.try_request()
        return response.status_code if response is not None else None

    def wait_between_requests(self):
        """
        Wait between requests based on the request_wait_limit.
        """
        if self.last_request_time:
            sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
            if sleep_time > 0:
                logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
                get_sleep(sleep_time)

    def make_request(self):
        """
        Make a request and handle potential errors, retrying up to
        self.max_retries times. Returns the response or None.
        """
        self.wait_between_requests()
        for _ in range(self.max_retries):
            try:
                self.try_request()  # 10 seconds timeout
                if self.response:
                    if self.response.status_code == 200:
                        self.last_request_time = get_time_stamp()
                        return self.response
                    elif self.response.status_code == 429:
                        logging.warning(f"Rate limited by {self.url_manager.correct_url}. Retrying...")
                        get_sleep(5)  # adjust this based on the server's rate limit reset time
            except requests.Timeout as e:
                # Was: logged an undefined name `cleaned_url`.
                logging.error(f"Request to {self.url_manager.correct_url} timed out: {e}")
            except requests.ConnectionError:
                logging.error(f"Connection error for URL {self.url_manager.correct_url}.")
            except requests.RequestException as e:
                logging.error(f"Request exception for URL {self.url_manager.correct_url}: {e}")
        # (A second, unreachable `except requests.Timeout` was removed.)
        logging.error(f"Failed to retrieve content from {self.url_manager.correct_url} after {self.max_retries} retries.")
        return None

    def try_request(self, timeout=10) -> Union[requests.Response, None]:
        """
        Tries to make an HTTP request to the managed URL using the session.

        Args:
            timeout (int): Timeout for the request.

        Returns:
            requests.Response or None: The response if successful, else None.
        """
        try:
            return self.session.get(url=self.url_manager.url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(e)
            return None

    def get_limited_request(self, request_url, service_name="default"):
        """Fetch *request_url* through the dynamic rate limiter, adapting the
        limit on HTTP 429 responses. Returns parsed JSON or None."""
        manager = DynamicRateLimiterManagerSingleton.get_instance()  # Get the singleton instance
        # Check with the rate limiter if we can make a request
        while True:
            if not manager.request(service_name):
                print("Rate limit reached for coin_gecko. Waiting for the next epoch...")
                sleep_count_down(manager.services[service_name].get_sleep()["current_sleep"])  # Wait for the limit_epoch duration
            # Make the actual request (was: an unbound `try_request(request_url=...)`).
            response = self.session.get(request_url)

            # On a rate-limit error (usually 429), adjust the rate limiter
            if response.status_code == 429:
                print(response.json())
                manager.services[service_name].request_tracker(False)
                print("Rate limited by coin_gecko. Adjusted limit. Retrying...")
                if len(manager.services[service_name].calculate_tokens()["succesful"]) < 2:
                    sleep_count_down(manager.services[service_name].limit_epoch)  # Wait for the limit_epoch duration
                else:
                    manager.services[service_name].current_limit -= 1
                    sleep_count_down(manager.services[service_name].limit_epoch / len(manager.services[service_name].calculate_tokens()["succesful"]))
            # Return the data if the request was successful
            if response.status_code == 200:
                manager.services[service_name].request_tracker(True)
                return response.json()
            elif response.status_code not in [200, 429]:
                print(f"Unexpected response: {response.status_code}. Message: {response.text}")
                return None

    @property
    def url(self):
        return self.url_manager.url

    @url.setter
    def url(self, new_url):
        # Was: `self._url_manager`, an attribute that never exists (AttributeError).
        self.url_manager.url = new_url
|
741
|
+
class SafeRequestSingleton:
    """Process-wide SafeRequest holder; rebuilt when the requested config differs."""

    _instance = None

    @staticmethod
    def get_instance(url=None, headers: dict = None, max_retries=3, last_request_time=None, request_wait_limit=1.5):
        """Return the shared SafeRequest, recreating it if any setting changed."""
        inst = SafeRequestSingleton._instance
        stale = inst is None or (
            inst.url != url
            or inst.headers != headers
            or inst.max_retries != max_retries
            or inst.request_wait_limit != request_wait_limit
        )
        if stale:
            SafeRequestSingleton._instance = SafeRequest(
                url,
                url_manager=URLManagerSingleton,
                headers=headers,
                max_retries=max_retries,
                last_request_time=last_request_time,
                request_wait_limit=request_wait_limit,
            )
        return SafeRequestSingleton._instance
|
987
750
|
class SoupManager:
|
988
751
|
def __init__(self,url_manager,request_manager, parse_type="html.parser"):
|
989
752
|
self.soup=[]
|
@@ -1188,6 +951,172 @@ class SoupManager:
|
|
1188
951
|
@url.setter
|
1189
952
|
def url(self, new_url):
|
1190
953
|
self._url = new_url
|
954
|
+
|
955
|
+
class SoupManagerSingleton():
    """Process-wide SoupManager holder; rebuilt when parse settings change."""

    _instance = None

    @staticmethod
    def get_instance(url_manager, request_manager, parse_type="html.parser", source_code=None):
        """Return the shared SoupManager, recreating it when the parse type or
        source code differs from the cached instance."""
        inst = SoupManagerSingleton._instance
        if inst is None or parse_type != inst.parse_type or source_code != inst.source_code:
            SoupManagerSingleton._instance = SoupManager(
                url_manager, request_manager, parse_type=parse_type, source_code=source_code
            )
        return SoupManagerSingleton._instance
|
964
|
+
class VideoDownloader:
    """Threaded video downloader.

    Two modes: the default yt-dlp pipeline (download + watchdog monitor
    threads) and a "standalone" mode that scrapes <video><source> tags with
    Selenium and saves each stream directly.
    """

    def __init__(self, url, title=None, download_directory=os.getcwd(), user_agent=None,
                 video_extention='mp4', download_video=True, get_info=False,
                 auto_file_gen=True, standalone_download=False):
        # NOTE: download_directory default is evaluated at import time.
        self.url = url
        self.monitoring = True
        self.downloading = True
        self.pause_event = threading.Event()
        self.get_download = download_video
        self.get_info = get_info
        self.user_agent = user_agent
        self.title = title
        self.auto_file_gen = auto_file_gen
        self.standalone_download = standalone_download
        self.video_extention = video_extention
        self.download_directory = download_directory
        self.header = {}  # UserAgentManagerSingleton().get_instance(user_agent=user_agent)
        # `url` may be a single URL or a list; normalise once (was assigned twice,
        # and os.path.basename crashed when url was a list).
        self.video_urls = url if isinstance(url, list) else [url]
        self.base_name = os.path.basename(self.video_urls[0])
        self.file_name, self.ext = os.path.splitext(self.base_name)
        self.info = {}
        self.starttime = None
        self.downloaded = 0
        self.send_to_dl()

    def get_request(self, url):
        """Create/refresh the shared SafeRequest for *url*."""
        self.request_manager = SafeRequestSingleton.get_instance(url=url)
        return self.request_manager

    def send_to_dl(self):
        """Dispatch to the standalone scraper or the yt-dlp pipeline."""
        if self.standalone_download:
            self.standalone_downloader()
        else:
            self.start()

    def get_headers(self, url):
        """Return the response headers for *url*, or None when the GET fails."""
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Access and print the response headers
            return response.headers
        else:
            print(f"Failed to retrieve the web page. Status code: {response.status_code}")

    def progress_callback(self, stream, chunk, bytes_remaining):
        # pytube-style callback: record progress so monitor() can estimate speed.
        total_size = stream.filesize
        self.downloaded = total_size - bytes_remaining

    def get_directory_path(self, directory, name, video_extention):
        """Return a collision-free file path for name.video_extention in *directory*.

        Fixed: the original definition was missing ``self``.
        """
        file_path = os.path.join(directory, name + '.' + video_extention)
        i = 0
        while os.path.exists(file_path):
            file_path = os.path.join(directory, name + f'_{i}' + '.' + video_extention)
            i += 1
        return file_path

    def fetch_video_urls(self):
        """Scrape <source> tags from the page with Selenium and append video URLs."""
        driver = webdriver.Chrome()
        try:
            driver.get(self.url)
            self.page_source = driver.page_source
            for each in self.page_source.split('<source ')[1:]:
                # Fixed: was a plain string '.{self.video_extention}' (no f-prefix),
                # so the split never matched the real extension.
                candidate = each.split(f'.{self.video_extention}')[0].split('http')[-1]
                self.video_urls.append(
                    eatInner(candidate, ['h', 't', 't', 'p', 's', ':', '//', '/', 's', '=', ' ', '\n', '\t', '']) + '.mp4'
                )
        finally:
            driver.quit()  # don't leak the browser process

    def download(self):
        """Download every URL in self.video_urls via yt-dlp; returns the last info dict."""
        def downloading_while():
            # Heartbeat printed while yt-dlp works.
            while self.downloading:
                self.pause_event.wait(3)
                print('downloading...')
        for video_url in self.video_urls:
            ydl_opts = {
                'external_downloader': 'ffmpeg',
                'external_downloader_args': "-ss 00:01:00.00 -to 00:02:00.00",
                'format': 'best',
                # Fixed: directory and template were joined with a literal space.
                'outtmpl': os.path.join(self.download_directory, '%(title)s-%(id)s.%(ext)s'),
                'noprogress': True
            }
            self.output_video = ydl_opts['outtmpl']
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                self.downloading = True
                self.downloading_while = threading.Thread(target=downloading_while)
                self.downloading_while.start()
                self.info = ydl.extract_info(video_url, download=self.get_download)
                self.downloading = False
                self.starttime = get_time_stamp()
                if self.auto_file_gen:
                    file_path = ydl.prepare_filename(self.info)
                    if self.get_info:
                        self.info['file_path'] = file_path
                if self.get_info:
                    self.stop()
                    return self.info
        self.stop()
        return self.info

    def monitor(self):
        """Watchdog: estimate remaining time and restart downloads that look throttled."""
        while self.monitoring:
            print("Monitoring...")
            self.pause_event.wait(60)  # check every minute
            if not self.starttime:
                continue
            elapsed_time = get_time_stamp() - self.starttime
            if elapsed_time == 0:
                continue
            try:
                # operations that can cause ZeroDivisionError
                percent = self.downloaded / (self.downloaded + elapsed_time)
                downloaded_minutes = elapsed_time / 60
                estimated_download_time = downloaded_minutes / percent - downloaded_minutes
            except ZeroDivisionError:
                # percent is 0 while nothing has been downloaded yet
                print("Caught a division by zero!")
                continue
            # Fixed: downloaded_minutes/estimate could previously be read unbound.
            print(estimated_download_time)
            if estimated_download_time >= 1.5:
                print("Seems like YouTube is limiting our download speed, restarting the download to mitigate the problem..")
                # TODO: Find a way to stop the current download and restart cleanly;
                # restarting blindly can stack threads.
                self.start()  # Restart the download process

    def standalone_downloader(self):
        """Direct download path: scrape <video><source> tags and save each stream.

        Fixed: the original referenced several unbound names (fetch_video_urls,
        soup, name_chosen, directory, video_extention, get_directory_path) and
        computed the save path twice, yielding mismatched collision suffixes.
        """
        name = self.title or False
        self.fetch_video_urls()
        soup = BeautifulSoup(self.page_source, "html.parser")
        for video_url in self.video_urls:
            headers = self.get_headers(video_url)
            for v in soup.select("video source[src]"):
                print("Downloading {}".format(v["src"]))
                if not name:
                    name = v["src"].split("/")[-1].strip()
                file_path = self.get_directory_path(self.download_directory, name, self.video_extention)
                print(f"saving to {file_path}")
                with open(file_path, "wb") as f_out:
                    f_out.write(requests.get(v["src"].strip(), headers=headers).content)

    def start(self):
        """Run download and monitor threads and wait for both to finish."""
        self.download_thread = threading.Thread(target=self.download)
        self.download_thread.daemon = True
        self.monitor_thread = threading.Thread(target=self.monitor)
        self.download_thread.start()
        self.monitor_thread.start()
        self.download_thread.join()
        self.monitor_thread.join()

    def stop(self):
        """Signal both threads to exit and wake any pause_event waits."""
        self.monitoring = False
        self.pause_event.set()
|
1107
|
+
|
1108
|
+
|
1109
|
+
|
1110
|
+
class VideoDownloaderSingleton():
    """Process-wide VideoDownloader holder; rebuilt when settings change."""

    _instance = None

    @staticmethod
    def get_instance(url_manager, request_manager, title=None, video_extention='mp4',
                     download_directory=os.getcwd(), user_agent=None, download=True,
                     get_info=False, url=None):
        """Return the shared VideoDownloader, recreating it on config change.

        Fixed: ``url`` was previously read but never defined (guaranteed
        NameError); it is now an explicit keyword argument. Also fixed:
        VideoDownloader takes ``download_video``, not ``download``.
        """
        inst = VideoDownloaderSingleton._instance
        if inst is None or (
            inst.title != title
            or video_extention != inst.video_extention
            or url != inst.url
            or download_directory != inst.download_directory
            or user_agent != inst.user_agent
        ):
            VideoDownloaderSingleton._instance = VideoDownloader(
                url=url, title=title, video_extention=video_extention,
                download_directory=download_directory, download_video=download,
                get_info=get_info, user_agent=user_agent)
        return VideoDownloaderSingleton._instance
|
1119
|
+
|
1191
1120
|
class LinkManager:
|
1192
1121
|
def __init__(self,url_manager,soup_manager,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
|
1193
1122
|
self.url_manager= url_manager
|
@@ -1254,15 +1183,7 @@ class LinkManager:
|
|
1254
1183
|
valid_assiciated_attrs[-1]["link"]=valid_attr
|
1255
1184
|
desired_links.append(valid_assiciated_attrs)
|
1256
1185
|
return desired_links
|
1257
|
-
|
1258
|
-
_instance = None
|
1259
|
-
@staticmethod
|
1260
|
-
def get_instance(url_manager,request_manager,parse_type="html.parser",source_code=None):
|
1261
|
-
if SoupManagerSingleton._instance is None:
|
1262
|
-
SoupManagerSingleton._instance = SoupManager(url_manager,request_manager,parse_type=parse_type,source_code=source_code)
|
1263
|
-
elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
|
1264
|
-
SoupManagerSingleton._instance = SoupManager(url_manager,request_manager,parse_type=parse_type,source_code=source_code)
|
1265
|
-
return SoupManagerSingleton._instance
|
1186
|
+
|
1266
1187
|
def CrawlManager():
|
1267
1188
|
def __init__(self,url=None,source_code=None,parse_type="html.parser"):
|
1268
1189
|
self.url=url
|
@@ -1450,4 +1371,3 @@ class CrawlManagerSingleton():
|
|
1450
1371
|
elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
|
1451
1372
|
CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
|
1452
1373
|
return CrawlManagerSingleton._instance
|
1453
|
-
|