abstract-webtools 0.1.6.146__py3-none-any.whl → 0.1.6.147__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/__init__.py +6 -0
- abstract_webtools/abstract_webtools.py +1768 -0
- abstract_webtools/managers/requestManager/requestManager.py +1 -1
- abstract_webtools/managers/seleneumManager.py +1 -0
- abstract_webtools/managers/seleniumManager.py +241 -0
- abstract_webtools/url_grabber.py +73 -1
- abstract_webtools-0.1.6.147.dist-info/METADATA +482 -0
- {abstract_webtools-0.1.6.146.dist-info → abstract_webtools-0.1.6.147.dist-info}/RECORD +11 -10
- abstract_webtools-0.1.6.146.dist-info/METADATA +0 -196
- /abstract_webtools/managers/{allss//.py" → allss.py} +0 -0
- {abstract_webtools-0.1.6.146.dist-info → abstract_webtools-0.1.6.147.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.146.dist-info → abstract_webtools-0.1.6.147.dist-info}/top_level.txt +0 -0
@@ -84,6 +84,1773 @@ from requests.adapters import HTTPAdapter
|
|
84
84
|
from urllib.parse import urlparse, urljoin
|
85
85
|
from requests.packages.urllib3.util import ssl_
|
86
86
|
from requests.packages.urllib3.poolmanager import PoolManager
|
87
|
+
from abstract_utilities import get_time_stamp,get_sleep,sleep_count_down,eatInner,eatAll,eatOuter,ThreadManager
|
88
|
+
logging.basicConfig(level=logging.INFO)
|
89
|
+
|
90
|
+
class DynamicRateLimiterManager:
|
91
|
+
def __init__(self):
|
92
|
+
# Key: Service Name, Value: DynamicRateLimiter instance
|
93
|
+
self.services = {}
|
94
|
+
|
95
|
+
def add_service(self, service_name="default", low_limit=10, high_limit=30, limit_epoch=60,starting_tokens=10,epoch_cycle_adjustment=True):
|
96
|
+
if service_name in self.services:
|
97
|
+
print(f"Service {service_name} already exists!")
|
98
|
+
return
|
99
|
+
self.services[service_name] = DynamicRateLimiter(service_name=service_name, low_limit=low_limit, high_limit=limit_epoch, limit_epoch=60,starting_tokens=starting_tokens,epoch_cycle_adjustment=epoch_cycle_adjustment)
|
100
|
+
|
101
|
+
def request(self, service_name):
|
102
|
+
if service_name not in self.services:
|
103
|
+
raise ValueError(f"Service {service_name} not found!")
|
104
|
+
|
105
|
+
limiter = self.services[service_name]
|
106
|
+
can_request = limiter.request()
|
107
|
+
|
108
|
+
# Log the outcome of the request attempt
|
109
|
+
self.log_request(service_name, can_request)
|
110
|
+
|
111
|
+
return can_request
|
112
|
+
|
113
|
+
def log_request(self, service_name, success):
|
114
|
+
# Placeholder logging method, replace with actual logging implementation
|
115
|
+
print(f"[{service_name}] Request {'succeeded' if success else 'denied'}. Current tokens: {self.services[service_name].get_current_tokens()}")
|
116
|
+
class DynamicRateLimiter:
|
117
|
+
def __init__(self, low_limit, high_limit, limit_epoch, starting_tokens=None,epoch_cycle_adjustment:int=None):
|
118
|
+
self.low_limit = low_limit
|
119
|
+
self.high_limit = high_limit
|
120
|
+
self.limit_epoch = limit_epoch # in seconds
|
121
|
+
self.request_status_json = {"succesful":[],"unsuccesful":[],"last_requested":get_time_stamp(),"first_requested":get_time_stamp(),"epoch_left":self.limit_epoch,"last_fail":get_time_stamp(),"count_since_fail":0}
|
122
|
+
self.current_limit = starting_tokens or low_limit # Default to high_limit if starting_tokens isn't provided
|
123
|
+
self.epoch_cycle_adjustment = epoch_cycle_adjustment
|
124
|
+
# Additional attributes for tracking adjustment logic
|
125
|
+
self.last_adjusted_time = get_time_stamp()
|
126
|
+
self.successful_epochs_since_last_adjustment = 0
|
127
|
+
self.request_count_in_current_epoch = 0
|
128
|
+
|
129
|
+
def _refill_tokens(self):
|
130
|
+
time_since_last_request = get_time_stamp() - self.request_status_json["last_requested"]
|
131
|
+
new_tokens = (time_since_last_request / self.limit_epoch) * self.current_limit
|
132
|
+
self.tokens = min(self.current_limit, self.get_current_tokens())
|
133
|
+
def request_tracker(self,success):
|
134
|
+
if success:
|
135
|
+
self.request_status_json["succesful"].append(get_time_stamp())
|
136
|
+
else:
|
137
|
+
self.request_status_json["unsuccesful"].append(get_time_stamp())
|
138
|
+
self.request_status_json["last_fail"]=get_time_stamp()
|
139
|
+
self.request_status_json["count_since_fail"]=0
|
140
|
+
self.adjust_limit()
|
141
|
+
self.request_status_json["last_requested"]=get_time_stamp()
|
142
|
+
def calculate_tokens(self):
|
143
|
+
successful = []
|
144
|
+
for each in self.request_status_json["succesful"]:
|
145
|
+
if (get_time_stamp() - each)<self.limit_epoch:
|
146
|
+
successful.append(each)
|
147
|
+
self.request_status_json["succesful"]=successful
|
148
|
+
unsuccessful = []
|
149
|
+
for each in self.request_status_json["unsuccesful"]:
|
150
|
+
if (get_time_stamp() - each)<self.limit_epoch:
|
151
|
+
unsuccessful.append(each)
|
152
|
+
self.request_status_json["unsuccesful"]=unsuccessful
|
153
|
+
if len(successful)==0 and len(unsuccessful)==0:
|
154
|
+
pass
|
155
|
+
elif len(successful)!=0 and len(unsuccessful)==0:
|
156
|
+
self.request_status_json["first_requested"] = successful[0]
|
157
|
+
elif len(successful)==0 and len(unsuccessful)!=0:
|
158
|
+
self.request_status_json["first_requested"] = unsuccessful[0]
|
159
|
+
else:
|
160
|
+
self.request_status_json["first_requested"] = min(unsuccessful[0],successful[0])
|
161
|
+
self.request_status_json["epoch_left"]=self.limit_epoch-(self.request_status_json["last_requested"]-self.request_status_json["first_requested"])
|
162
|
+
|
163
|
+
return self.request_status_json
|
164
|
+
def get_current_tokens(self):
|
165
|
+
self.request_status_json = self.calculate_tokens()
|
166
|
+
total_requests = len(self.request_status_json["succesful"])+len(self.request_status_json["unsuccesful"])
|
167
|
+
return max(0,self.current_limit-total_requests)
|
168
|
+
def get_sleep(self):
|
169
|
+
self.request_status_json = self.calculate_tokens()
|
170
|
+
self.request_status_json["current_sleep"]=self.request_status_json["epoch_left"]/max(1,self.get_current_tokens())
|
171
|
+
return self.request_status_json
|
172
|
+
def request(self):
|
173
|
+
self._refill_tokens()
|
174
|
+
if self.tokens > 0:
|
175
|
+
return True # The request can be made
|
176
|
+
else:
|
177
|
+
if self.tokens == 0:
|
178
|
+
self.request_status_json["count_since_fail"]+=1
|
179
|
+
if self.epoch_cycle_adjustment != None:
|
180
|
+
if self.request_status_json["count_since_fail"] >=self.epoch_cycle_adjustment:
|
181
|
+
self.current_limit=min(self.current_limit+1,self.high_limit)
|
182
|
+
return False # The request cannot be made
|
183
|
+
def _adjust_limit(self):
|
184
|
+
current_time = get_time_stamp()
|
185
|
+
if current_time - self.last_adjusted_time >= self.limit_epoch:
|
186
|
+
if len(self.clear_epoch()["succesful"]) >= self.tokens:
|
187
|
+
# We hit the rate limit this epoch, decrease our limit
|
188
|
+
self.tokens = max(1, self.tokens - 1)
|
189
|
+
else:
|
190
|
+
self.successful_epochs_since_last_adjustment += 1
|
191
|
+
if self.successful_epochs_since_last_adjustment >= 5:
|
192
|
+
# We've had 5 successful epochs, increase our limit
|
193
|
+
self.current_limit = min(self.high_limit, self.tokens + 1)
|
194
|
+
self.successful_epochs_since_last_adjustment = 0
|
195
|
+
|
196
|
+
# Reset our counters for the new epoch
|
197
|
+
self.last_adjusted_time = current_time
|
198
|
+
self.request_count_in_current_epoch = 0
|
199
|
+
def adjust_limit(self):
|
200
|
+
# Set the tokens to succesful requests_made - 1
|
201
|
+
self.tokens = len(self.calculate_tokens()["succesful"])
|
202
|
+
|
203
|
+
# Adjust the high_limit
|
204
|
+
self.current_limit = self.tokens
|
205
|
+
|
206
|
+
# Log the adjustment
|
207
|
+
print(f"Adjusted tokens to: {self.tokens} and high_limit to: {self.current_limit}")
|
208
|
+
class DynamicRateLimiterManagerSingleton:
|
209
|
+
_instance = None
|
210
|
+
@staticmethod
|
211
|
+
def get_instance(service_name="default", low_limit=10, high_limit=30, limit_epoch=60,starting_tokens=10,epoch_cycle_adjustment=True):
|
212
|
+
if DynamicRateLimiterManagerSingleton._instance is None:
|
213
|
+
DynamicRateLimiterManagerSingleton._instance = DynamicRateLimiterManager(service_name=service_name, low_limit=low_limit, high_limit=limit_epoch, limit_epoch=60,starting_tokens=starting_tokens,epoch_cycle_adjustment=epoch_cycle_adjustment)
|
214
|
+
return DynamicRateLimiterManagerSingleton._instance
|
215
|
+
|
216
|
+
|
217
|
+
class CipherManager:
|
218
|
+
@staticmethod
|
219
|
+
def get_default_ciphers()-> list:
|
220
|
+
return [
|
221
|
+
"ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
|
222
|
+
"ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
|
223
|
+
"ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
|
224
|
+
"ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
|
225
|
+
"ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
|
226
|
+
"AES256-SHA", "AES128-SHA"
|
227
|
+
]
|
228
|
+
|
229
|
+
def __init__(self,cipher_list=None):
|
230
|
+
if cipher_list == None:
|
231
|
+
cipher_list=self.get_default_ciphers()
|
232
|
+
self.cipher_list = cipher_list
|
233
|
+
self.create_list()
|
234
|
+
self.ciphers_string = self.add_string_list()
|
235
|
+
def add_string_list(self):
|
236
|
+
if len(self.cipher_list)==0:
|
237
|
+
return ''
|
238
|
+
return','.join(self.cipher_list)
|
239
|
+
def create_list(self):
|
240
|
+
if self.cipher_list == None:
|
241
|
+
self.cipher_list= []
|
242
|
+
elif isinstance(self.cipher_list, str):
|
243
|
+
self.cipher_list=self.cipher_list.split(',')
|
244
|
+
if isinstance(self.cipher_list, str):
|
245
|
+
self.cipher_list=[self.cipher_list]
|
246
|
+
class CipherManagerSingleton:
|
247
|
+
_instance = None
|
248
|
+
@staticmethod
|
249
|
+
def get_instance(cipher_list=None):
|
250
|
+
if CipherManagerSingleton._instance is None:
|
251
|
+
CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
|
252
|
+
elif CipherManagerSingleton._instance.cipher_list != cipher_list:
|
253
|
+
CipherManagerSingleton._instance = CipherManager(cipher_list=cipher_list)
|
254
|
+
return CipherManagerSingleton._instance
|
255
|
+
class SSLManager:
|
256
|
+
def __init__(self, ciphers=None, ssl_options=None, certification=None):
|
257
|
+
self.ciphers = ciphers or CipherManager().ciphers_string
|
258
|
+
self.ssl_options = ssl_options or self.get_default_ssl_settings()
|
259
|
+
self.certification = certification or ssl.CERT_REQUIRED
|
260
|
+
self.ssl_context = self.get_context()
|
261
|
+
def get_default_ssl_settings(self):
|
262
|
+
return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION
|
263
|
+
def get_context(self):
|
264
|
+
return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
|
265
|
+
|
266
|
+
class SSLManagerSingleton:
|
267
|
+
_instance = None
|
268
|
+
@staticmethod
|
269
|
+
def get_instance(ciphers=None, ssl_options_list=None, certification=None):
|
270
|
+
if SSLManagerSingleton._instance is None:
|
271
|
+
SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
|
272
|
+
elif SSLManagerSingleton._instance.cipher_manager.ciphers_string != ciphers or SSLManagerSingleton._instance.ssl_options_list !=ssl_options_list or SSLManagerSingleton._instance.certification !=certification:
|
273
|
+
SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options_list=ssl_options_list, certification=certification)
|
274
|
+
return SSLManagerSingleton._instance
|
275
|
+
class TLSAdapter(HTTPAdapter):
|
276
|
+
def __init__(self, ssl_manager=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
|
277
|
+
if ssl_manager == None:
|
278
|
+
ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
|
279
|
+
self.ssl_manager = ssl_manager
|
280
|
+
self.ciphers = ssl_manager.ciphers
|
281
|
+
self.certification = ssl_manager.certification
|
282
|
+
self.ssl_options = ssl_manager.ssl_options
|
283
|
+
self.ssl_context = self.ssl_manager.ssl_context
|
284
|
+
super().__init__()
|
285
|
+
|
286
|
+
def init_poolmanager(self, *args, **kwargs):
|
287
|
+
kwargs['ssl_context'] = self.ssl_context
|
288
|
+
return super().init_poolmanager(*args, **kwargs)
|
289
|
+
class TLSAdapterSingleton:
|
290
|
+
_instance: Optional[TLSAdapter] = None
|
291
|
+
|
292
|
+
@staticmethod
|
293
|
+
def get_instance(ciphers: Optional[List[str]] = None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None) -> TLSAdapter:
|
294
|
+
if (not TLSAdapterSingleton._instance) or (
|
295
|
+
TLSAdapterSingleton._instance.ciphers != ciphers or
|
296
|
+
TLSAdapterSingleton._instance.certification != certification or
|
297
|
+
TLSAdapterSingleton._instance.ssl_options != ssl_options
|
298
|
+
):
|
299
|
+
TLSAdapterSingleton._instance = TLSAdapter(ciphers=ciphers, certification=certification, ssl_options=ssl_options)
|
300
|
+
return TLSAdapterSingleton._instance
|
301
|
+
class UserAgentManager:
|
302
|
+
def __init__(self, os=None, browser=None, version=None,user_agent=None):
|
303
|
+
self.os = os or 'Windows'
|
304
|
+
self.browser = browser or "Firefox"
|
305
|
+
self.version = version or '42.0'
|
306
|
+
self.user_agent = user_agent or self.get_user_agent()
|
307
|
+
self.header = self.user_agent_header()
|
308
|
+
@staticmethod
|
309
|
+
def user_agent_db():
|
310
|
+
from .big_user_agent_list import big_user_agent_dict
|
311
|
+
return big_user_agent_dict
|
312
|
+
|
313
|
+
def get_user_agent(self):
|
314
|
+
ua_db = self.user_agent_db()
|
315
|
+
|
316
|
+
if self.os and self.os in ua_db:
|
317
|
+
os_db = ua_db[self.os]
|
318
|
+
else:
|
319
|
+
os_db = random.choice(list(ua_db.values()))
|
320
|
+
|
321
|
+
if self.browser and self.browser in os_db:
|
322
|
+
browser_db = os_db[self.browser]
|
323
|
+
else:
|
324
|
+
browser_db = random.choice(list(os_db.values()))
|
325
|
+
|
326
|
+
if self.version and self.version in browser_db:
|
327
|
+
return browser_db[self.version]
|
328
|
+
else:
|
329
|
+
return random.choice(list(browser_db.values()))
|
330
|
+
|
331
|
+
def user_agent_header(self):
|
332
|
+
return {"user-agent": self.user_agent}
|
333
|
+
class UserAgentManagerSingleton:
|
334
|
+
_instance = None
|
335
|
+
@staticmethod
|
336
|
+
def get_instance(user_agent=UserAgentManager().get_user_agent()[0]):
|
337
|
+
if UserAgentManagerSingleton._instance is None:
|
338
|
+
UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
|
339
|
+
elif UserAgentManagerSingleton._instance.user_agent != user_agent:
|
340
|
+
UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
|
341
|
+
return UserAgentManagerSingleton._instance
|
342
|
+
class NetworkManager:
|
343
|
+
def __init__(self, user_agent_manager=None,ssl_manager=None, tls_adapter=None,user_agent=None,proxies=None,cookies=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
|
344
|
+
if ssl_manager == None:
|
345
|
+
ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
|
346
|
+
self.ssl_manager=ssl_manager
|
347
|
+
if tls_adapter == None:
|
348
|
+
tls_adapter=TLSAdapter(ssl_manager=ssl_manager,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
|
349
|
+
self.tls_adapter=tls_adapter
|
350
|
+
self.ciphers=tls_adapter.ciphers
|
351
|
+
self.certification=tls_adapter.certification
|
352
|
+
self.ssl_options=tls_adapter.ssl_options
|
353
|
+
self.proxies=None or {}
|
354
|
+
self.cookies=cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
|
355
|
+
class MySocketClient:
|
356
|
+
def __init__(self, ip_address=None, port=None,domain=None):
|
357
|
+
self.sock
|
358
|
+
self.ip_address= ip_address or None
|
359
|
+
self.port = port or None
|
360
|
+
|
361
|
+
self.domain = domain or None
|
362
|
+
def receive_data(self):
|
363
|
+
chunks = []
|
364
|
+
while True:
|
365
|
+
chunk = self.sock.recv(4096)
|
366
|
+
if chunk:
|
367
|
+
chunks.append(chunk)
|
368
|
+
else:
|
369
|
+
break
|
370
|
+
return b''.join(chunks).decode('utf-8')
|
371
|
+
def _parse_socket_response_as_json(self, data, *args, **kwargs):
|
372
|
+
return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
|
373
|
+
def process_data(self):
|
374
|
+
data = self.receive_data()
|
375
|
+
return self._parse_socket_response_as_json(data)
|
376
|
+
def _parse_json(self,json_string):
|
377
|
+
return json.loads(json_string)
|
378
|
+
def get_ip(self,domain=None):
|
379
|
+
try:
|
380
|
+
return self.sock.gethostbyname(domain if domain != None else self.domain)
|
381
|
+
except self.sock.gaierror:
|
382
|
+
return None
|
383
|
+
def grt_host_name(self,ip_address=None):
|
384
|
+
return self.sock.gethostbyaddr(ip_address if ip_address != None else self.ip_address)
|
385
|
+
def toggle_sock(self):
|
386
|
+
if self.sock != None:
|
387
|
+
self.sock.close()
|
388
|
+
else:
|
389
|
+
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
390
|
+
if host and socket:
|
391
|
+
self.sock.connect((host, port))
|
392
|
+
class MySocketClient():
|
393
|
+
_instance = None
|
394
|
+
@staticmethod
|
395
|
+
def get_instance(ip_address='local_host',port=22,domain="example.com"):
|
396
|
+
if MySocketClientSingleton._instance is None:
|
397
|
+
MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain=domain)
|
398
|
+
elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or UrlManagerSingleton._instance.domain != domain:
|
399
|
+
MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain=domain)
|
400
|
+
return MySocketClient
|
401
|
+
|
402
|
+
class UrlManager:
|
403
|
+
"""
|
404
|
+
UrlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
|
405
|
+
|
406
|
+
Args:
|
407
|
+
url (str or None): The URL to manage (default is None).
|
408
|
+
session (requests.Session): A custom requests session (default is the requests module's session).
|
409
|
+
|
410
|
+
Attributes:
|
411
|
+
session (requests.Session): The requests session used for making HTTP requests.
|
412
|
+
clean_urls (list): List of cleaned URL variations.
|
413
|
+
url (str): The current URL.
|
414
|
+
protocol (str): The protocol part of the URL (e.g., "https").
|
415
|
+
domain (str): The domain part of the URL (e.g., "example.com").
|
416
|
+
path (str): The path part of the URL (e.g., "/path/to/resource").
|
417
|
+
query (str): The query part of the URL (e.g., "?param=value").
|
418
|
+
all_urls (list): List of all URLs (not used in the provided code).
|
419
|
+
|
420
|
+
Methods:
|
421
|
+
url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
|
422
|
+
clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
|
423
|
+
get_correct_url(url): Get the correct version of the URL from possible variations.
|
424
|
+
update_url(url): Update the URL and related attributes.
|
425
|
+
get_domain(url): Get the domain name from a URL.
|
426
|
+
url_join(url, path): Join a base URL with a path.
|
427
|
+
is_valid_url(url): Check if a URL is valid.
|
428
|
+
make_valid(href, url): Make a URL valid by joining it with a base URL.
|
429
|
+
get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
|
430
|
+
|
431
|
+
Note:
|
432
|
+
- The UrlManager class provides methods for managing URLs, including cleaning and validating them.
|
433
|
+
- It also includes methods for joining and validating relative URLs.
|
434
|
+
"""
|
435
|
+
|
436
|
+
def __init__(self, url=None, session=None):
|
437
|
+
"""
|
438
|
+
Initialize a UrlManager instance.
|
439
|
+
|
440
|
+
Args:
|
441
|
+
url (str or None): The URL to manage (default is None).
|
442
|
+
session (requests.Session): A custom requests session (default is the requests module's session).
|
443
|
+
"""
|
444
|
+
self._url=url or 'www.example.com'
|
445
|
+
self.url = url or 'www.example.com'
|
446
|
+
self.session= session or requests
|
447
|
+
self.clean_urls = self.clean_url(url=url)
|
448
|
+
self.url = self.get_correct_url(clean_urls=self.clean_urls)
|
449
|
+
url_pieces = self.url_to_pieces(url=self.url)
|
450
|
+
self.protocol,self.domain,self.path,self.query=url_pieces
|
451
|
+
self.all_urls = []
|
452
|
+
def url_to_pieces(self, url):
|
453
|
+
|
454
|
+
try:
|
455
|
+
match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
|
456
|
+
if match:
|
457
|
+
protocol = match.group(1) if match.group(1) else None
|
458
|
+
domain = match.group(2) if match.group(1) else None
|
459
|
+
path = match.group(3) if match.group(3) else "" # Handle None
|
460
|
+
query = match.group(4) if match.group(4) else "" # Handle None
|
461
|
+
except:
|
462
|
+
print(f'the url {url} was not reachable')
|
463
|
+
protocol,domain,path,query=None,None,"",""
|
464
|
+
return protocol, domain, path, query
|
465
|
+
|
466
|
+
def clean_url(self,url=None) -> list:
|
467
|
+
"""
|
468
|
+
Given a URL, return a list with potential URL versions including with and without 'www.',
|
469
|
+
and with 'http://' and 'https://'.
|
470
|
+
"""
|
471
|
+
if url == None:
|
472
|
+
url=self.url
|
473
|
+
urls=[]
|
474
|
+
if url:
|
475
|
+
# Remove http:// or https:// prefix
|
476
|
+
cleaned = url.replace("http://", "").replace("https://", "")
|
477
|
+
no_subdomain = cleaned.replace("www.", "", 1)
|
478
|
+
|
479
|
+
urls = [
|
480
|
+
f"https://{cleaned}",
|
481
|
+
f"http://{cleaned}",
|
482
|
+
]
|
483
|
+
|
484
|
+
# Add variants without 'www' if it was present
|
485
|
+
if cleaned != no_subdomain:
|
486
|
+
urls.extend([
|
487
|
+
f"https://{no_subdomain}",
|
488
|
+
f"http://{no_subdomain}",
|
489
|
+
])
|
490
|
+
|
491
|
+
# Add variants with 'www' if it wasn't present
|
492
|
+
else:
|
493
|
+
urls.extend([
|
494
|
+
f"https://www.{cleaned}",
|
495
|
+
f"http://www.{cleaned}",
|
496
|
+
])
|
497
|
+
|
498
|
+
return urls
|
499
|
+
|
500
|
+
def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
|
501
|
+
"""
|
502
|
+
Gets the correct URL from the possible variations by trying each one with an HTTP request.
|
503
|
+
|
504
|
+
Args:
|
505
|
+
url (str): The URL to find the correct version of.
|
506
|
+
session (type(requests.Session), optional): The requests session to use for making HTTP requests.
|
507
|
+
Defaults to requests.
|
508
|
+
|
509
|
+
Returns:
|
510
|
+
str: The correct version of the URL if found, or None if none of the variations are valid.
|
511
|
+
"""
|
512
|
+
if url==None and clean_urls != None:
|
513
|
+
if self.url:
|
514
|
+
url=self.url or clean_urls[0]
|
515
|
+
if url!=None and clean_urls==None:
|
516
|
+
clean_urls=self.clean_url(url)
|
517
|
+
elif url==None and clean_urls==None:
|
518
|
+
url=self.url
|
519
|
+
clean_urls=self.clean_urls
|
520
|
+
# Get the correct URL from the possible variations
|
521
|
+
for url in clean_urls:
|
522
|
+
try:
|
523
|
+
source = self.session.get(url)
|
524
|
+
return url
|
525
|
+
except requests.exceptions.RequestException as e:
|
526
|
+
print(e)
|
527
|
+
return None
|
528
|
+
def update_url(self,url):
|
529
|
+
# These methods seem essential for setting up the UrlManager object.
|
530
|
+
self.url = url
|
531
|
+
self.clean_urls = self.clean_url()
|
532
|
+
self.correct_url = self.get_correct_url()
|
533
|
+
self.url =self.correct_url
|
534
|
+
self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
|
535
|
+
self.all_urls = []
|
536
|
+
def get_domain(self,url):
|
537
|
+
return urlparse(url).netloc
|
538
|
+
def url_join(self,url,path):
|
539
|
+
url = eatOuter(url,['/'])
|
540
|
+
path = eatInner(path,['/'])
|
541
|
+
slash=''
|
542
|
+
if path[0] not in ['?','&']:
|
543
|
+
slash = '/'
|
544
|
+
url = url+slash+path
|
545
|
+
return url
|
546
|
+
@property
|
547
|
+
def url(self):
|
548
|
+
return self._url
|
549
|
+
@url.setter
|
550
|
+
def url(self, new_url):
|
551
|
+
self._url = new_url
|
552
|
+
@staticmethod
|
553
|
+
def is_valid_url(url):
|
554
|
+
"""
|
555
|
+
Check if the given URL is valid.
|
556
|
+
"""
|
557
|
+
parsed = urlparse(url)
|
558
|
+
return bool(parsed.netloc) and bool(parsed.scheme)
|
559
|
+
@staticmethod
|
560
|
+
def make_valid(href,url):
|
561
|
+
def is_valid_url(url):
|
562
|
+
"""
|
563
|
+
Check if the given URL is valid.
|
564
|
+
"""
|
565
|
+
parsed = urlparse(url)
|
566
|
+
return bool(parsed.netloc) and bool(parsed.scheme)
|
567
|
+
if is_valid_url(href):
|
568
|
+
return href
|
569
|
+
new_link=urljoin(url,href)
|
570
|
+
if is_valid_url(new_link):
|
571
|
+
return new_link
|
572
|
+
return False
|
573
|
+
@staticmethod
|
574
|
+
def get_relative_href(url,href):
|
575
|
+
# join the URL if it's relative (not an absolute link)
|
576
|
+
href = urljoin(url, href)
|
577
|
+
parsed_href = urlparse(href)
|
578
|
+
# remove URL GET parameters, URL fragments, etc.
|
579
|
+
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
|
580
|
+
return href
|
581
|
+
def url_basename(url):
|
582
|
+
path = urllib.parse.urlparse(url).path
|
583
|
+
return path.strip('/').split('/')[-1]
|
584
|
+
|
585
|
+
|
586
|
+
def base_url(url):
|
587
|
+
return re.match(r'https?://[^?#]+/', url).group()
|
588
|
+
|
589
|
+
|
590
|
+
def urljoin(base, path):
|
591
|
+
if isinstance(path, bytes):
|
592
|
+
path = path.decode()
|
593
|
+
if not isinstance(path, str) or not path:
|
594
|
+
return None
|
595
|
+
if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
|
596
|
+
return path
|
597
|
+
if isinstance(base, bytes):
|
598
|
+
base = base.decode()
|
599
|
+
if not isinstance(base, str) or not re.match(
|
600
|
+
r'^(?:https?:)?//', base):
|
601
|
+
return None
|
602
|
+
return urllib.parse.urljoin(base, path)
|
603
|
+
class UrlManagerSingleton:
|
604
|
+
_instance = None
|
605
|
+
@staticmethod
|
606
|
+
def get_instance(url=None,session=requests):
|
607
|
+
if UrlManagerSingleton._instance is None:
|
608
|
+
UrlManagerSingleton._instance = UrlManager(url,session=session)
|
609
|
+
elif UrlManagerSingleton._instance.session != session or UrlManagerSingleton._instance.url != url:
|
610
|
+
UrlManagerSingleton._instance = UrlManager(url,session=session)
|
611
|
+
return UrlManagerSingleton._instance
|
612
|
+
class SafeRequest:
|
613
|
+
"""
|
614
|
+
SafeRequest is a class for making HTTP requests with error handling and retries.
|
615
|
+
|
616
|
+
Args:
|
617
|
+
url (str or None): The URL to make requests to (default is None).
|
618
|
+
url_manager (UrlManager or None): An instance of UrlManager (default is None).
|
619
|
+
network_manager (NetworkManager or None): An instance of NetworkManager (default is None).
|
620
|
+
user_agent_manager (UserAgentManager or None): An instance of UserAgentManager (default is None).
|
621
|
+
ssl_manager (SSlManager or None): An instance of SSLManager (default is None).
|
622
|
+
tls_adapter (TLSAdapter or None): An instance of TLSAdapter (default is None).
|
623
|
+
user_agent (str or None): The user agent string to use for requests (default is None).
|
624
|
+
proxies (dict or None): Proxy settings for requests (default is None).
|
625
|
+
headers (dict or None): Additional headers for requests (default is None).
|
626
|
+
cookies (dict or None): Cookie settings for requests (default is None).
|
627
|
+
session (requests.Session or None): A custom requests session (default is None).
|
628
|
+
adapter (str or None): A custom adapter for requests (default is None).
|
629
|
+
protocol (str or None): The protocol to use for requests (default is 'https://').
|
630
|
+
ciphers (str or None): Cipher settings for requests (default is None).
|
631
|
+
auth (tuple or None): Authentication credentials (default is None).
|
632
|
+
login_url (str or None): The URL for authentication (default is None).
|
633
|
+
email (str or None): Email for authentication (default is None).
|
634
|
+
password (str or None): Password for authentication (default is None).
|
635
|
+
certification (str or None): Certification settings for requests (default is None).
|
636
|
+
ssl_options (str or None): SSL options for requests (default is None).
|
637
|
+
stream (bool): Whether to stream the response content (default is False).
|
638
|
+
timeout (float or None): Timeout for requests (default is None).
|
639
|
+
last_request_time (float or None): Timestamp of the last request (default is None).
|
640
|
+
max_retries (int or None): Maximum number of retries for requests (default is None).
|
641
|
+
request_wait_limit (float or None): Wait time between requests (default is None).
|
642
|
+
|
643
|
+
Methods:
|
644
|
+
update_url_manager(url_manager): Update the URL manager and reinitialize the SafeRequest.
|
645
|
+
update_url(url): Update the URL and reinitialize the SafeRequest.
|
646
|
+
re_initialize(): Reinitialize the SafeRequest with the current settings.
|
647
|
+
authenticate(s, login_url=None, email=None, password=None, checkbox=None, dropdown=None): Authenticate and make a request.
|
648
|
+
fetch_response(): Fetch the response from the server.
|
649
|
+
initialize_session(): Initialize the requests session with custom settings.
|
650
|
+
process_response_data(): Process the fetched response data.
|
651
|
+
get_react_source_code(): Extract JavaScript and JSX source code from <script> tags.
|
652
|
+
get_status(url=None): Get the HTTP status code of a URL.
|
653
|
+
wait_between_requests(): Wait between requests based on the request_wait_limit.
|
654
|
+
make_request(): Make a request and handle potential errors.
|
655
|
+
try_request(): Try to make an HTTP request using the provided session.
|
656
|
+
|
657
|
+
Note:
|
658
|
+
- The SafeRequest class is designed for making HTTP requests with error handling and retries.
|
659
|
+
- It provides methods for authentication, response handling, and error management.
|
660
|
+
"""
|
661
|
+
def __init__(self,
|
662
|
+
url=None,
|
663
|
+
source_code=None,
|
664
|
+
url_manager=None,
|
665
|
+
network_manager=None,
|
666
|
+
user_agent_manager=None,
|
667
|
+
ssl_manager=None,
|
668
|
+
ssl_options=None,
|
669
|
+
tls_adapter=None,
|
670
|
+
user_agent=None,
|
671
|
+
proxies=None,
|
672
|
+
headers=None,
|
673
|
+
cookies=None,
|
674
|
+
session=None,
|
675
|
+
adapter=None,
|
676
|
+
protocol=None,
|
677
|
+
ciphers=None,
|
678
|
+
spec_login=False,
|
679
|
+
login_referer=None,
|
680
|
+
login_user_agent=None,
|
681
|
+
auth=None,
|
682
|
+
login_url=None,
|
683
|
+
email = None,
|
684
|
+
password=None,
|
685
|
+
checkbox=None,
|
686
|
+
dropdown=None,
|
687
|
+
certification=None,
|
688
|
+
stream=False,
|
689
|
+
timeout = None,
|
690
|
+
last_request_time=None,
|
691
|
+
max_retries=None,
|
692
|
+
request_wait_limit=None):
|
693
|
+
self._url=url
|
694
|
+
self.url=url
|
695
|
+
if url_manager == None:
|
696
|
+
url_manager = UrlManager(url=self.url)
|
697
|
+
self.url_manager=url_manager
|
698
|
+
self._url_manager = self.url_manager
|
699
|
+
self.user_agent = user_agent
|
700
|
+
self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
|
701
|
+
self.headers= headers or self.user_agent_manager.header or {'Accept': '*/*'}
|
702
|
+
self.user_agent= self.user_agent_manager.user_agent
|
703
|
+
self.ciphers=ciphers or CipherManager().ciphers_string
|
704
|
+
self.certification=certification
|
705
|
+
self.ssl_options=ssl_options
|
706
|
+
self.ssl_manager = ssl_manager or SSLManager(ciphers=self.ciphers, ssl_options=self.ssl_options, certification=self.certification)
|
707
|
+
self.tls_adapter=tls_adapter or TLSAdapter(ssl_manager=self.ssl_manager,certification=self.certification,ssl_options=self.ssl_manager.ssl_options)
|
708
|
+
self.network_manager= network_manager or NetworkManager(user_agent_manager=self.user_agent_manager,ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter,user_agent=user_agent,proxies=proxies,cookies=cookies,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
|
709
|
+
self.stream=stream
|
710
|
+
self.tls_adapter=self.network_manager.tls_adapter
|
711
|
+
self.ciphers=self.network_manager.ciphers
|
712
|
+
self.certification=self.network_manager.certification
|
713
|
+
self.ssl_options=self.network_manager.ssl_options
|
714
|
+
self.proxies=self.network_manager.proxies
|
715
|
+
self.timeout=timeout
|
716
|
+
self.cookies=self.network_manager.cookies
|
717
|
+
self.session = session or requests.session()
|
718
|
+
self.auth = auth
|
719
|
+
self.spec_login=spec_login
|
720
|
+
self.password=password
|
721
|
+
self.email = email
|
722
|
+
self.checkbox=checkbox
|
723
|
+
self.dropdown=dropdown
|
724
|
+
self.login_url=login_url
|
725
|
+
self.login_user_agent=login_user_agent
|
726
|
+
self.login_referer=login_referer
|
727
|
+
self.protocol=protocol or 'https://'
|
728
|
+
|
729
|
+
self.stream=stream if isinstance(stream,bool) else False
|
730
|
+
self.initialize_session()
|
731
|
+
self.last_request_time=last_request_time
|
732
|
+
self.max_retries = max_retries or 3
|
733
|
+
self.request_wait_limit = request_wait_limit or 1.5
|
734
|
+
self._response=None
|
735
|
+
self.make_request()
|
736
|
+
self.source_code = None
|
737
|
+
self.source_code_bytes=None
|
738
|
+
self.source_code_json = {}
|
739
|
+
self.react_source_code=[]
|
740
|
+
self._response_data = None
|
741
|
+
self.process_response_data()
|
742
|
+
def update_url_manager(self,url_manager):
|
743
|
+
self.url_manager=url_manager
|
744
|
+
self.re_initialize()
|
745
|
+
def update_url(self,url):
|
746
|
+
self.url_manager.update_url(url=url)
|
747
|
+
self.re_initialize()
|
748
|
+
def re_initialize(self):
|
749
|
+
self._response=None
|
750
|
+
self.make_request()
|
751
|
+
self.source_code = None
|
752
|
+
self.source_code_bytes=None
|
753
|
+
self.source_code_json = {}
|
754
|
+
self.react_source_code=[]
|
755
|
+
self._response_data = None
|
756
|
+
self.process_response_data()
|
757
|
+
@property
|
758
|
+
def response(self):
|
759
|
+
"""Lazy-loading of response."""
|
760
|
+
if self._response is None:
|
761
|
+
self._response = self.fetch_response()
|
762
|
+
return self._response
|
763
|
+
def authenticate(self,session, login_url=None, email=None, password=None,checkbox=None,dropdown=None):
|
764
|
+
login_urls = login_url or [self.url_manager.url,self.url_manager.domain,self.url_manager.url_join(url=self.url_manager.domain,path='login'),self.url_manager.url_join(url=self.url_manager.domain,path='auth')]
|
765
|
+
s = session
|
766
|
+
if not isinstance(login_urls,list):
|
767
|
+
login_urls=[login_urls]
|
768
|
+
for login_url in login_urls:
|
769
|
+
login_url_manager = UrlManager(login_url)
|
770
|
+
login_url = login_url_manager.url
|
771
|
+
|
772
|
+
r = s.get(login_url)
|
773
|
+
soup = BeautifulSoup(r.content, "html.parser")
|
774
|
+
# Find the token or any CSRF protection token
|
775
|
+
token = soup.find('input', {'name': 'token'}).get('value') if soup.find('input', {'name': 'token'}) else None
|
776
|
+
if token != None:
|
777
|
+
break
|
778
|
+
login_data = {}
|
779
|
+
if email != None:
|
780
|
+
login_data['email']=email
|
781
|
+
if password != None:
|
782
|
+
login_data['password'] = password
|
783
|
+
if checkbox != None:
|
784
|
+
login_data['checkbox'] = checkbox
|
785
|
+
if dropdown != None:
|
786
|
+
login_data['dropdown']=dropdown
|
787
|
+
if token != None:
|
788
|
+
login_data['token'] = token
|
789
|
+
s.post(login_url, data=login_data)
|
790
|
+
return s
|
791
|
+
|
792
|
+
def fetch_response(self) -> Union[requests.Response, None]:
|
793
|
+
"""Actually fetches the response from the server."""
|
794
|
+
# You can further adapt this method to use retries or other logic you had
|
795
|
+
# in your original code, but the main goal here is to fetch and return the response
|
796
|
+
return self.try_request()
|
797
|
+
def spec_auth(self, session=None, email=None, password=None, login_url=None, login_referer=None, login_user_agent=None):
|
798
|
+
s = session or requests.session()
|
799
|
+
|
800
|
+
domain = self.url_manager.url_join(self.url_manager.get_correct_url(self.url_manager.domain),'login') if login_url is None else login_url
|
801
|
+
login_url = self.url_manager.get_correct_url(url=domain)
|
802
|
+
|
803
|
+
login_referer = login_referer or self.url_manager.url_join(url=login_url, path='?role=fast&to=&s=1&m=1&email=YOUR_EMAIL')
|
804
|
+
login_user_agent = login_user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0'
|
805
|
+
|
806
|
+
headers = {"Referer": login_referer, 'User-Agent': login_user_agent}
|
807
|
+
payload = {'email': email, 'pass': password}
|
808
|
+
|
809
|
+
page = s.get(login_url)
|
810
|
+
soup = BeautifulSoup(page.content, 'lxml')
|
811
|
+
action_url = soup.find('form')['action']
|
812
|
+
s.post(action_url, data=payload, headers=headers)
|
813
|
+
return s
|
814
|
+
def initialize_session(self):
|
815
|
+
s = self.session
|
816
|
+
if self.auth:
|
817
|
+
s= self.auth
|
818
|
+
elif self.spec_login:
|
819
|
+
s=self.spec_auth(session=s,email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
|
820
|
+
elif any([self.password, self.email, self.login_url, self.checkbox, self.dropdown]):
|
821
|
+
s=self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)
|
822
|
+
s.proxies = self.proxies
|
823
|
+
s.cookies["cf_clearance"] = self.network_manager.cookies
|
824
|
+
s.headers.update(self.headers)
|
825
|
+
s.mount(self.protocol, self.network_manager.tls_adapter)
|
826
|
+
return s
|
827
|
+
def process_response_data(self):
|
828
|
+
"""Processes the fetched response data."""
|
829
|
+
if not self.response:
|
830
|
+
return # No data to process
|
831
|
+
|
832
|
+
self.source_code = self.response.text
|
833
|
+
self.source_code_bytes = self.response.content
|
834
|
+
|
835
|
+
if self.response.headers.get('content-type') == 'application/json':
|
836
|
+
data = convert_to_json(self.source_code)
|
837
|
+
if data:
|
838
|
+
self.source_code_json = data.get("response", data)
|
839
|
+
|
840
|
+
self.get_react_source_code()
|
841
|
+
def get_react_source_code(self) -> list:
|
842
|
+
"""
|
843
|
+
Fetches the source code of the specified URL and extracts JavaScript and JSX source code (React components).
|
844
|
+
|
845
|
+
Args:
|
846
|
+
url (str): The URL to fetch the source code from.
|
847
|
+
|
848
|
+
Returns:
|
849
|
+
list: A list of strings containing JavaScript and JSX source code found in <script> tags.
|
850
|
+
"""
|
851
|
+
if self.url_manager.url is None:
|
852
|
+
return []
|
853
|
+
soup = BeautifulSoup(self.source_code_bytes,"html.parser")
|
854
|
+
script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
|
855
|
+
for script_tag in script_tags:
|
856
|
+
self.react_source_code.append(script_tag.string)
|
857
|
+
|
858
|
+
|
859
|
+
def get_status(url:str=None) -> int:
|
860
|
+
"""
|
861
|
+
Gets the HTTP status code of the given URL.
|
862
|
+
|
863
|
+
Args:
|
864
|
+
url (str): The URL to check the status of.
|
865
|
+
|
866
|
+
Returns:
|
867
|
+
int: The HTTP status code of the URL, or None if the request fails.
|
868
|
+
"""
|
869
|
+
# Get the status code of the URL
|
870
|
+
return try_request(url=url).status_code
|
871
|
+
def wait_between_requests(self):
|
872
|
+
"""
|
873
|
+
Wait between requests based on the request_wait_limit.
|
874
|
+
"""
|
875
|
+
if self.last_request_time:
|
876
|
+
sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
|
877
|
+
if sleep_time > 0:
|
878
|
+
logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
|
879
|
+
get_sleep(sleep_time)
|
880
|
+
|
881
|
+
def make_request(self):
|
882
|
+
"""
|
883
|
+
Make a request and handle potential errors.
|
884
|
+
"""
|
885
|
+
# Update the instance attributes if they are passed
|
886
|
+
|
887
|
+
self.wait_between_requests()
|
888
|
+
for _ in range(self.max_retries):
|
889
|
+
try:
|
890
|
+
self.try_request() # 10 seconds timeout
|
891
|
+
if self.response:
|
892
|
+
if self.response.status_code == 200:
|
893
|
+
self.last_request_time = get_time_stamp()
|
894
|
+
return self.response
|
895
|
+
elif self.response.status_code == 429:
|
896
|
+
logging.warning(f"Rate limited by {self.url_manager.url}. Retrying...")
|
897
|
+
get_sleep(5) # adjust this based on the server's rate limit reset time
|
898
|
+
except requests.Timeout as e:
|
899
|
+
logging.error(f"Request to {cleaned_url} timed out: {e}")
|
900
|
+
except requests.ConnectionError:
|
901
|
+
logging.error(f"Connection error for URL {self.url_manager.url}.")
|
902
|
+
except requests.Timeout:
|
903
|
+
logging.error(f"Request timeout for URL {self.url_manager.url}.")
|
904
|
+
except requests.RequestException as e:
|
905
|
+
logging.error(f"Request exception for URL {self.url_manager.url}: {e}")
|
906
|
+
|
907
|
+
logging.error(f"Failed to retrieve content from {self.url_manager.url} after {self.max_retries} retries.")
|
908
|
+
return None
|
909
|
+
def try_request(self) -> Union[requests.Response, None]:
|
910
|
+
"""
|
911
|
+
Tries to make an HTTP request to the given URL using the provided session.
|
912
|
+
|
913
|
+
Args:
|
914
|
+
timeout (int): Timeout for the request.
|
915
|
+
|
916
|
+
Returns:
|
917
|
+
requests.Response or None: The response object if the request is successful, or None if the request fails.
|
918
|
+
"""
|
919
|
+
try:
|
920
|
+
return self.session.get(url=self.url_manager.url, timeout=self.timeout,stream=self.stream)
|
921
|
+
except requests.exceptions.RequestException as e:
|
922
|
+
print(e)
|
923
|
+
return None
|
924
|
+
|
925
|
+
def get_limited_request(self,request_url,service_name="default"):
|
926
|
+
manager = DynamicRateLimiterManagerSingleton.get_instance() # Get the singleton instance
|
927
|
+
unwanted_response=True
|
928
|
+
# Check with the rate limiter if we can make a request
|
929
|
+
while True:
|
930
|
+
if not manager.request(service_name):
|
931
|
+
print("Rate limit reached for coin_gecko. Waiting for the next epoch...")
|
932
|
+
sleep_count_down(manager.services[service_name].get_sleep()["current_sleep"]) # Wait for the limit_epoch duration
|
933
|
+
# Make the actual request
|
934
|
+
response = try_request(request_url=request_url)
|
935
|
+
|
936
|
+
# If you get a rate-limit error (usually 429 status code but can vary), adjust the rate limiter
|
937
|
+
if response.status_code ==429:
|
938
|
+
print(response.json())
|
939
|
+
manager.services[service_name].request_tracker(False)
|
940
|
+
print("Rate limited by coin_gecko. Adjusted limit. Retrying...")
|
941
|
+
if len(manager.services[service_name].calculate_tokens()["succesful"])<2:
|
942
|
+
sleep_count_down(manager.services[service_name].limit_epoch) # Wait for the limit_epoch duration
|
943
|
+
else:
|
944
|
+
manager.services[service_name].current_limit-=1
|
945
|
+
sleep_count_down(manager.services[service_name].limit_epoch/len(manager.services[service_name].calculate_tokens()["succesful"])) # Wait for the limit_epoch duration
|
946
|
+
# Return the data if the request was successful
|
947
|
+
if response.status_code == 200:
|
948
|
+
manager.services[service_name].request_tracker(True)
|
949
|
+
return response.json()
|
950
|
+
elif response.status_code not in [200,429]:
|
951
|
+
print(f"Unexpected response: {response.status_code}. Message: {response.text}")
|
952
|
+
return None
|
953
|
+
@property
|
954
|
+
def url(self):
|
955
|
+
return self.url_manager.url
|
956
|
+
|
957
|
+
@url.setter
|
958
|
+
def url(self, new_url):
|
959
|
+
self._url = new_url
|
960
|
+
class SafeRequestSingleton:
|
961
|
+
_instance = None
|
962
|
+
@staticmethod
|
963
|
+
def get_instance(url=None,headers:dict=None,max_retries=3,last_request_time=None,request_wait_limit=1.5):
|
964
|
+
if SafeRequestSingleton._instance is None:
|
965
|
+
SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
|
966
|
+
elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
|
967
|
+
SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
|
968
|
+
return SafeRequestSingleton._instance
|
969
|
+
class SoupManager:
|
970
|
+
"""
|
971
|
+
SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
|
972
|
+
|
973
|
+
Args:
|
974
|
+
url (str or None): The URL to be parsed (default is None).
|
975
|
+
source_code (str or None): The HTML source code (default is None).
|
976
|
+
url_manager (UrlManager or None): An instance of UrlManager (default is None).
|
977
|
+
request_manager (SafeRequest or None): An instance of SafeRequest (default is None).
|
978
|
+
parse_type (str): The type of parser to be used by BeautifulSoup (default is "html.parser").
|
979
|
+
|
980
|
+
Methods:
|
981
|
+
re_initialize(): Reinitialize the SoupManager with the current settings.
|
982
|
+
update_url(url): Update the URL and reinitialize the SoupManager.
|
983
|
+
update_source_code(source_code): Update the source code and reinitialize the SoupManager.
|
984
|
+
update_request_manager(request_manager): Update the request manager and reinitialize the SoupManager.
|
985
|
+
update_url_manager(url_manager): Update the URL manager and reinitialize the SoupManager.
|
986
|
+
update_parse_type(parse_type): Update the parsing type and reinitialize the SoupManager.
|
987
|
+
all_links: A property that provides access to all discovered links.
|
988
|
+
_all_links_get(): A method to load all discovered links.
|
989
|
+
get_all_website_links(tag="a", attr="href"): Get all URLs belonging to the same website.
|
990
|
+
meta_tags: A property that provides access to all discovered meta tags.
|
991
|
+
_meta_tags_get(): A method to load all discovered meta tags.
|
992
|
+
get_meta_tags(): Get all meta tags in the source code.
|
993
|
+
find_all(element, soup=None): Find all instances of an HTML element in the source code.
|
994
|
+
get_class(class_name, soup=None): Get the specified class from the HTML source code.
|
995
|
+
has_attributes(tag, *attrs): Check if an HTML tag has the specified attributes.
|
996
|
+
get_find_all_with_attributes(*attrs): Find all HTML tags with specified attributes.
|
997
|
+
get_all_desired_soup(tag=None, attr=None, attr_value=None): Get HTML tags based on specified criteria.
|
998
|
+
extract_elements(url, tag=None, class_name=None, class_value=None): Extract portions of source code based on filters.
|
999
|
+
find_all_with_attributes(class_name=None, *attrs): Find classes with associated href or src attributes.
|
1000
|
+
get_images(tag_name, class_name, class_value): Get images with specific class and attribute values.
|
1001
|
+
discover_classes_and_meta_images(tag_name, class_name_1, class_name_2, class_value, attrs): Discover classes and meta images.
|
1002
|
+
|
1003
|
+
Note:
|
1004
|
+
- The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
|
1005
|
+
- It provides various methods to extract data and discover elements within the source code.
|
1006
|
+
"""
|
1007
|
+
def __init__(self,url=None,source_code=None,url_manager=None,request_manager=None, parse_type="html.parser"):
|
1008
|
+
self.soup=[]
|
1009
|
+
self.url=url
|
1010
|
+
if url_manager == None:
|
1011
|
+
url_manager=UrlManager(url=self.url)
|
1012
|
+
if self.url != None and url_manager != None and url_manager.url != UrlManager(url=url).url:
|
1013
|
+
url_manager.update_url(url=self.url)
|
1014
|
+
self.url_manager= url_manager
|
1015
|
+
self.url=self.url_manager.url
|
1016
|
+
if request_manager == None:
|
1017
|
+
request_manager = SafeRequest(url_manager=self.url_manager)
|
1018
|
+
self.request_manager = request_manager
|
1019
|
+
if self.request_manager.url_manager != self.url_manager:
|
1020
|
+
self.request_manager.update_url_manager(url_manager=self.url_manager)
|
1021
|
+
self.parse_type = parse_type
|
1022
|
+
if source_code != None:
|
1023
|
+
self.source_code = source_code
|
1024
|
+
else:
|
1025
|
+
self.source_code = self.request_manager.source_code_bytes
|
1026
|
+
self.soup= BeautifulSoup(self.source_code, self.parse_type)
|
1027
|
+
self._all_links_data = None
|
1028
|
+
self._meta_tags_data = None
|
1029
|
+
def re_initialize(self):
|
1030
|
+
self.soup= BeautifulSoup(self.source_code, self.parse_type)
|
1031
|
+
self._all_links_data = None
|
1032
|
+
self._meta_tags_data = None
|
1033
|
+
def update_url(self,url):
|
1034
|
+
self.url_manager.update_url(url=url)
|
1035
|
+
self.url=self.url_manager.url
|
1036
|
+
self.request_manager.update_url(url=url)
|
1037
|
+
self.source_code = self.request_manager.source_code_bytes
|
1038
|
+
self.re_initialize()
|
1039
|
+
def update_source_code(self,source_code):
|
1040
|
+
self.source_code = source_code
|
1041
|
+
self.re_initialize()
|
1042
|
+
def update_request_manager(self,request_manager):
|
1043
|
+
self.request_manager = request_manager
|
1044
|
+
self.url_manager=self.request_manager.url_manager
|
1045
|
+
self.url=self.url_manager.url
|
1046
|
+
self.source_code = self.request_manager.source_code_bytes
|
1047
|
+
self.re_initialize()
|
1048
|
+
def update_url_manager(self,url_manager):
|
1049
|
+
self.url_manager=url_manager
|
1050
|
+
self.url=self.url_manager.url
|
1051
|
+
self.request_manager.update_url_manager(url_manager=self.url_manager)
|
1052
|
+
self.source_code = self.request_manager.source_code_bytes
|
1053
|
+
self.re_initialize()
|
1054
|
+
def update_parse_type(self,parse_type):
|
1055
|
+
self.parse_type=parse_type
|
1056
|
+
self.re_initialize()
|
1057
|
+
@property
|
1058
|
+
def all_links(self):
|
1059
|
+
"""This is a property that provides access to the _all_links_data attribute.
|
1060
|
+
The first time it's accessed, it will load the data."""
|
1061
|
+
if self._all_links_data is None:
|
1062
|
+
print("Loading all links for the first time...")
|
1063
|
+
self._all_links_data = self._all_links_get()
|
1064
|
+
return self._all_links_data
|
1065
|
+
def _all_links_get(self):
|
1066
|
+
"""A method that loads the data (can be replaced with whatever data loading logic you have)."""
|
1067
|
+
return self.get_all_website_links()
|
1068
|
+
def get_all_website_links(self,tag="a",attr="href") -> list:
|
1069
|
+
"""
|
1070
|
+
Returns all URLs that are found on the specified URL and belong to the same website.
|
1071
|
+
|
1072
|
+
Args:
|
1073
|
+
url (str): The URL to search for links.
|
1074
|
+
|
1075
|
+
Returns:
|
1076
|
+
list: A list of URLs that belong to the same website as the specified URL.
|
1077
|
+
"""
|
1078
|
+
all_urls=[self.url_manager.url]
|
1079
|
+
domain = self.url_manager.domain
|
1080
|
+
all_desired=self.get_all_desired_soup(tag=tag,attr=attr)
|
1081
|
+
for tag in all_desired:
|
1082
|
+
href = tag.attrs.get(attr)
|
1083
|
+
if href == "" or href is None:
|
1084
|
+
# href empty tag
|
1085
|
+
continue
|
1086
|
+
href=self.url_manager.get_relative_href(self.url_manager.url,href)
|
1087
|
+
if not self.url_manager.is_valid_url(href):
|
1088
|
+
# not a valid URL
|
1089
|
+
continue
|
1090
|
+
if href in all_urls:
|
1091
|
+
# already in the set
|
1092
|
+
continue
|
1093
|
+
if domain not in href:
|
1094
|
+
# external link
|
1095
|
+
continue
|
1096
|
+
all_urls.append(href)
|
1097
|
+
|
1098
|
+
return all_urls
|
1099
|
+
|
1100
|
+
|
1101
|
+
@property
|
1102
|
+
def meta_tags(self):
|
1103
|
+
"""This is a property that provides access to the _all_links_data attribute.
|
1104
|
+
The first time it's accessed, it will load the data."""
|
1105
|
+
if self._meta_tags_data is None:
|
1106
|
+
print("Loading all links for the first time...")
|
1107
|
+
self._meta_tags_data = self._all_links_get()
|
1108
|
+
return self._meta_tags_data
|
1109
|
+
def _meta_tags_get(self):
|
1110
|
+
"""A method that loads the data (can be replaced with whatever data loading logic you have)."""
|
1111
|
+
return self.get_meta_tags()
|
1112
|
+
def get_meta_tags(self):
|
1113
|
+
tags = self.find_all("meta")
|
1114
|
+
for meta_tag in tags:
|
1115
|
+
for attr, values in meta_tag.attrs.items():
|
1116
|
+
if attr not in self.meta_tags:
|
1117
|
+
self.meta_tags[attr] = []
|
1118
|
+
if values not in self.meta_tags[attr]:
|
1119
|
+
self.meta_tags[attr].append(values)
|
1120
|
+
|
1121
|
+
|
1122
|
+
def find_all(self,element,soup=None):
|
1123
|
+
soup = self.soup if soup == None else soup
|
1124
|
+
return soup.find_all(element)
|
1125
|
+
def get_class(self,class_name,soup=None):
|
1126
|
+
soup = self.soup if soup == None else soup
|
1127
|
+
return soup.get(class_name)
|
1128
|
+
@staticmethod
|
1129
|
+
def has_attributes(tag, *attrs):
|
1130
|
+
return any(tag.has_attr(attr) for attr in attrs)
|
1131
|
+
def get_find_all_with_attributes(self, *attrs):
|
1132
|
+
return self.soup.find_all(lambda t: self.has_attributes(t, *attrs))
|
1133
|
+
def find_tags_by_attributes(self, tag: str = None, attr: str = None, attr_values: List[str] = None) ->List:
|
1134
|
+
if not tag:
|
1135
|
+
tags = self.soup.find_all(True) # get all tags
|
1136
|
+
else:
|
1137
|
+
tags = self.soup.find_all(tag) # get specific tags
|
1138
|
+
|
1139
|
+
extracted_tags = []
|
1140
|
+
for t in tags:
|
1141
|
+
if attr:
|
1142
|
+
attribute_value = t.get(attr)
|
1143
|
+
if not attribute_value: # skip tags without the desired attribute
|
1144
|
+
continue
|
1145
|
+
if attr_values and not any(value in attribute_value for value in attr_values): # skip tags without any of the desired attribute values
|
1146
|
+
continue
|
1147
|
+
extracted_tags.append(t)
|
1148
|
+
return extracted_tags
|
1149
|
+
|
1150
|
+
|
1151
|
+
def extract_elements(self,url:str=None, tag:str=None, class_name:str=None, class_value:str=None) -> list:
|
1152
|
+
"""
|
1153
|
+
Extracts portions of the source code from the specified URL based on provided filters.
|
1154
|
+
|
1155
|
+
Args:
|
1156
|
+
url (str): The URL to fetch the source code from.
|
1157
|
+
element_type (str, optional): The HTML element type to filter by. Defaults to None.
|
1158
|
+
attribute_name (str, optional): The attribute name to filter by. Defaults to None.
|
1159
|
+
class_name (str, optional): The class name to filter by. Defaults to None.
|
1160
|
+
|
1161
|
+
Returns:
|
1162
|
+
list: A list of strings containing portions of the source code that match the provided filters.
|
1163
|
+
"""
|
1164
|
+
elements = []
|
1165
|
+
# If no filters are provided, return the entire source code
|
1166
|
+
if not tag and not class_name and not class_value:
|
1167
|
+
elements.append(str(self.soup))
|
1168
|
+
return elements
|
1169
|
+
# Find elements based on the filters provided
|
1170
|
+
if tag:
|
1171
|
+
elements.extend([str(tags) for tags in self.get_all_desired(tag)])
|
1172
|
+
if class_name:
|
1173
|
+
elements.extend([str(tags) for tags in self.get_all_desired(tag={class_name: True})])
|
1174
|
+
if class_value:
|
1175
|
+
elements.extend([str(tags) for tags in self.get_all_desired(class_name=class_name)])
|
1176
|
+
return elements
|
1177
|
+
def find_all_with_attributes(self, class_name=None, *attrs):
|
1178
|
+
"""
|
1179
|
+
Discovers classes in the HTML content of the provided URL
|
1180
|
+
that have associated href or src attributes.
|
1181
|
+
|
1182
|
+
Args:
|
1183
|
+
base_url (str): The URL from which to discover classes.
|
1184
|
+
|
1185
|
+
Returns:
|
1186
|
+
set: A set of unique class names.
|
1187
|
+
"""
|
1188
|
+
|
1189
|
+
|
1190
|
+
unique_classes = set()
|
1191
|
+
for tag in self.get_find_all_with_attributes(*attrs):
|
1192
|
+
class_list = self.get_class(class_name=class_name, soup=tag)
|
1193
|
+
unique_classes.update(class_list)
|
1194
|
+
return unique_classes
|
1195
|
+
+    def get_images(self, tag_name, class_name, class_value):
+        images = []
+        for tag in self.soup.find_all(tag_name):
+            if class_name in tag.attrs and tag.attrs[class_name] == class_value:
+                content = tag.attrs.get('content', '')
+                if content:
+                    images.append(content)
+        return images
+    def extract_text_sections(self) -> list:
+        """
+        Extract all sections of text from the HTML content using BeautifulSoup.
+
+        Returns:
+            list: A list containing all sections of text.
+        """
+        # Remove any script or style elements to avoid extracting JavaScript or CSS code
+        for script in self.soup(['script', 'style']):
+            script.decompose()
+
+        # Extract text from the remaining elements
+        text_sections = self.soup.stripped_strings
+        return [text for text in text_sections if text]
+    def discover_classes_and_meta_images(self, tag_name, class_name_1, class_name_2, class_value, attrs):
+        """
+        Discovers classes in the HTML content of the current page
+        that have associated href or src attributes. Also fetches
+        image references from meta tags.
+
+        Args:
+            tag_name (str): The tag to search for meta image references (e.g. 'meta').
+            class_name_1 (str): The attribute whose values are collected as classes.
+            class_name_2 (str): The attribute used to match meta image tags.
+            class_value (str): The value class_name_2 must equal (e.g. 'og:image').
+            attrs: Attribute names a tag must carry to be considered.
+
+        Returns:
+            tuple: A set of unique class names and a list of meta images.
+        """
+
+        unique_classes = self.find_all_with_attributes(class_name=class_name_1, *attrs)
+        images = self.get_images(tag_name=tag_name, class_name=class_name_2, class_value=class_value)
+        return unique_classes, images
+    def get_all_tags_and_attribute_names(self):
+        tag_names = set()  # Using a set to ensure uniqueness
+        attribute_names = set()
+        get_all = self.find_tags_by_attributes()
+        for tag in get_all:  # True matches all tags
+            tag_names.add(tag.name)
+            for attr in tag.attrs:
+                attribute_names.add(attr)
+        tag_names_list = list(tag_names)
+        attribute_names_list = list(attribute_names)
+        return {"tags":tag_names_list,"attributes":attribute_names_list}
+
+    def get_all_attribute_values(self):
+        attribute_values={}
+        get_all = self.find_tags_by_attributes()
+        for tag in get_all:  # True matches all tags
+            for attr, value in tag.attrs.items():
+                # If attribute is not yet in the dictionary, add it with an empty set
+                if attr not in attribute_values:
+                    attribute_values[attr] = set()
+                # If the attribute value is a list (e.g., class), extend the set with the list
+                if isinstance(value, list):
+                    attribute_values[attr].update(value)
+                else:
+                    attribute_values[attr].add(value)
+        for attr, values in attribute_values.items():
+            attribute_values[attr] = list(values)
+        return attribute_values
+
+    @property
+    def url(self):
+        return self._url
+    @url.setter
+    def url(self, new_url):
+        self._url = new_url
+
+class SoupManagerSingleton():
+    _instance = None
+    @staticmethod
+    def get_instance(url_manager,request_manager,parse_type="html.parser",source_code=None):
+        if SoupManagerSingleton._instance is None:
+            SoupManagerSingleton._instance = SoupManager(url_manager,request_manager,parse_type=parse_type,source_code=source_code)
+        elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
+            SoupManagerSingleton._instance = SoupManager(url_manager,request_manager,parse_type=parse_type,source_code=source_code)
+        return SoupManagerSingleton._instance
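A minimal usage sketch for the SoupManager helpers above (illustrative only, assuming the UrlManager, SafeRequest and SoupManager constructors defined elsewhere in this module):

    url_mgr = UrlManager(url="https://example.com")
    req_mgr = SafeRequest(url_manager=url_mgr)
    soup_mgr = SoupManager(url_manager=url_mgr, request_manager=req_mgr)
    # every <meta> tag whose 'property' attribute contains 'og:'
    og_meta = soup_mgr.find_tags_by_attributes(tag='meta', attr='property', attr_values=['og:'])
    # {'tags': [...], 'attributes': [...]} inventory of the parsed page
    inventory = soup_mgr.get_all_tags_and_attribute_names()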
+class VideoDownloader:
+    """
+    VideoDownloader is a class for downloading videos from URLs using yt-dlp.
+
+    Args:
+        link (str or list): The URL(s) of the video(s) to be downloaded.
+        temp_directory (str or None): The directory to store temporary video files (default is None, uses video_directory/temp_files).
+        video_directory (str or None): The directory to store downloaded videos (default is None, uses 'videos' in the current working directory).
+        remove_existing (bool): Whether to remove existing video files with the same name (default is True).
+
+    Methods:
+        count_outliers(speed, threshold): Count speed outliers below the threshold.
+        filter_outliers(speeds): Filter out speed outliers in the list of speeds.
+        remove_temps(file_name): Remove temporary video files based on the file name.
+        move_video(): Move the downloaded video to the final directory.
+        yt_dlp_downloader(url, ydl_opts={}, download=True): Download video information using yt-dlp.
+        progress_callback(d): Callback function to monitor download progress.
+        download(): Download video(s) based on the provided URL(s).
+        monitor(): Monitor the download progress.
+        start(): Start the download and monitoring threads.
+
+    Note:
+        - The VideoDownloader class uses yt-dlp to download videos.
+        - It allows downloading from multiple URLs.
+        - You need to have yt-dlp installed to use this class.
+    """
+    def __init__(self, link,temp_directory=None,video_directory=None,remove_existing=True):
+        if video_directory==None:
+            video_directory=os.path.join(os.getcwd(),'videos')
+        if temp_directory == None:
+            temp_directory=os.path.join(video_directory,'temp_files')
+        self.thread_manager = ThreadManager()
+        self.pause_event = self.thread_manager.add_thread('pause_event')
+        self.link = link
+        self.temp_directory = temp_directory
+        self.video_directory = video_directory
+        self.remove_existing=remove_existing
+        self.video_urls=self.link if isinstance(self.link,list) else [self.link]
+        self.starttime = None
+        self.downloaded = 0
+        self.time_interval=60
+        self.monitoring=True
+        self.temp_file_name = None
+        self.file_name = None
+        self.dl_speed = None
+        self.dl_eta=None
+        self.total_bytes_est=None
+        self.percent_speed=None
+        self.percent=None
+        self.speed_track = []
+        self.video_url=None
+        self.last_checked = get_time_stamp()
+        self.num=0
+        self.start()
+    def count_outliers(self,speed,threshold):
+        if speed < threshold:
+            self.outlier_count+=1
+        else:
+            self.outlier_count=0
+    def filter_outliers(self,speeds):
+        # Step 1: Compute initial average
+        initial_avg = sum(speeds) / len(speeds)
+
+        # Step 2: Remove speeds 25% under the average
+        threshold = initial_avg * 0.75  # 25% under average
+        filtered_speeds = [speed for speed in speeds if speed >= threshold]
+
+        # Step 3: Compute the new average of the filtered list
+        if filtered_speeds:  # Ensure the list is not empty
+            self.count_outliers(speeds[-1],threshold)
+            return filtered_speeds
+        else:
+            # This can happen if all values are outliers, it's up to you how to handle it
+            self.outlier_count=0
+            return speeds
+    def remove_temps(self,file_name):
+        for temp_vid in os.listdir(self.temp_directory):
+            if len(file_name)<=len(temp_vid):
+                if temp_vid[:len(file_name)] == file_name:
+                    os.remove(os.path.join(self.temp_directory,temp_vid))
+                    print(f"removing {temp_vid} from {self.temp_directory}")
+    def move_video(self):
+        if os.path.exists(self.temp_file_path):
+            shutil.move(self.temp_file_path, self.video_directory)
+            print(f"moving {self.file_name} from {self.temp_directory} to {self.video_directory}")
+            self.remove_temps(self.file_name)
+            return True
+        if os.path.exists(self.complete_file_path):
+            print(f"{self.file_name} already existed in {self.video_directory}; removing it from {self.temp_directory}")
+            self.remove_temps(self.file_name)
+            return True
+        return False
+    def yt_dlp_downloader(self,url,ydl_opts={},download=True):
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                self.info_dict=ydl.extract_info(url=url, download=download)
+            return True
+        except:
+            return False
+    def progress_callback(self, d):
+        self.status_dict = d
+        keys = ['status',
+                'downloaded_bytes',
+                'fragment_index',
+                'fragment_count',
+                'filename',
+                'tmpfilename',
+                'max_progress',
+                'progress_idx',
+                'elapsed',
+                'total_bytes_estimate',
+                'speed',
+                'eta',
+                '_eta_str',
+                '_speed_str',
+                '_percent_str',
+                '_total_bytes_str',
+                '_total_bytes_estimate_str',
+                '_downloaded_bytes_str',
+                '_elapsed_str',
+                '_default_template']
+        if self.status_dict['status'] == 'finished':
+            print("Done downloading, moving video to final directory...")
+            self.move_video()
+            return
+        if get_time_stamp()-self.last_checked>5:
+            print(self.status_dict['_default_template'])
+            self.last_checked = get_time_stamp()
+        if (get_time_stamp()-self.start_time/5)>6:
+            self.speed_track.append(self.status_dict['speed'])
+            self.speed_track=self.filter_outliers(self.speed_track)
+
+    def download(self):
+        if not os.path.exists(self.video_directory):
+            os.makedirs(self.video_directory,exist_ok=True)
+        if not os.path.exists(self.temp_directory):
+            os.makedirs(self.temp_directory,exist_ok=True)
+        for self.num,video_url in enumerate(self.video_urls):
+            if video_url != self.video_url or self.video_url == None:
+                self.video_url=video_url
+                self.info_dict=None
+                result = self.yt_dlp_downloader(url=self.video_url,ydl_opts={'quiet': True, 'no_warnings': True},download=False)
+                if self.info_dict != None and result:
+                    self.start_time = get_time_stamp()
+                    self.downloaded = 0
+                    self.video_title = self.info_dict.get('title', None)
+                    self.video_ext = self.info_dict.get('ext', 'mp4')
+                    self.file_name =f"{self.video_title}.{self.video_ext}"
+                    self.temp_file_path = os.path.join(self.temp_directory, self.file_name)
+                    self.complete_file_path = os.path.join(self.video_directory, self.file_name)
+                    if not self.move_video():
+                        self.dl_speed = []
+                        self.percent=None
+                        self.dl_eta=None
+                        self.total_bytes_est=None
+                        self.percent_speed=None
+                        self.speed_track = []
+                        self.outlier_count=0
+                        ydl_opts = {
+                            'outtmpl': self.temp_file_path,
+                            'noprogress':True,
+                            'progress_hooks': [self.progress_callback]
+                        }
+
+
+                        print("Starting download...")  # Check if this point in code is reached
+                        result = self.yt_dlp_downloader(url=self.video_url,ydl_opts=ydl_opts,download=True)
+                        if result:
+                            print("Download finished!")  # Check if download completes
+                        else:
+                            print(f'error downloading {self.video_url}')
+                        self.move_video()
+                    else:
+                        print(f"The video from {self.video_url} already exists in the directory {self.video_directory}. Skipping download.")
+                else:
+                    print(f"could not find video info from {self.video_url}. Skipping download.")
+            if self.num==len(self.video_urls)-1:
+                self.monitoring=False
+                self.time_interval=0
+
+    def monitor(self):
+        while self.monitoring:
+            self.thread_manager.wait(name='pause_event',n=self.time_interval)  # check every minute
+            if self.monitoring:
+                if 'eta' in self.status_dict:
+                    if self.outlier_count>=3 and (self.status_dict['eta']/60)>10:
+                        self.start()
+
+    def start(self):
+        download_thread = self.thread_manager.add_thread(name='download_thread',target=self.download)
+        monitor_thread = self.thread_manager.add_thread(name='monitor_thread',target_function=self.monitor)
+        self.thread_manager.start(name='download_thread')
+        self.thread_manager.start(name='monitor_thread')
+        self.thread_manager.join(name='download_thread')
+        self.thread_manager.join(name='monitor_thread')
+class VideoDownloaderSingleton():
+    _instance = None
+    @staticmethod
+    def get_instance(url_manager,request_manager,title=None,video_extention='mp4',download_directory=os.getcwd(),user_agent=None,download=True,get_info=False):
+        if VideoDownloaderSingleton._instance is None:
+            VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
+        elif VideoDownloaderSingleton._instance.title != title or video_extention != VideoDownloaderSingleton._instance.video_extention or url != VideoDownloaderSingleton._instance.url or download_directory != VideoDownloaderSingleton._instance.download_directory or user_agent != VideoDownloaderSingleton._instance.user_agent:
+            VideoDownloaderSingleton._instance = VideoDownloader(url=url,title=title,video_extention=video_extention,download_directory=download_directory,download=download,get_info=get_info,user_agent=user_agent)
+        return VideoDownloaderSingleton._instance
+
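A minimal usage sketch for VideoDownloader (illustrative only; the URLs are placeholders). Because __init__ calls self.start(), constructing the object immediately launches the download and monitor threads:

    VideoDownloader(
        link=["https://example.com/clip1", "https://example.com/clip2"],
        video_directory=None,   # defaults to os.path.join(os.getcwd(), 'videos')
        temp_directory=None,    # defaults to <video_directory>/temp_files
        remove_existing=True,
    )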
+class LinkManager:
+    """
+    LinkManager is a class for managing and extracting links and image links from a web page.
+
+    Args:
+        url (str): The URL of the web page (default is "https://example.com").
+        source_code (str or None): The source code of the web page (default is None).
+        url_manager (UrlManager or None): An instance of UrlManager (default is None).
+        request_manager (SafeRequest or None): An instance of SafeRequest (default is None).
+        soup_manager (SoupManager or None): An instance of SoupManager (default is None).
+        image_link_tags (str): HTML tags to identify image links (default is 'img').
+        img_link_attrs (str): HTML attributes to identify image link URLs (default is 'src').
+        link_tags (str): HTML tags to identify links (default is 'a').
+        link_attrs (str): HTML attributes to identify link URLs (default is 'href').
+        strict_order_tags (bool): Flag to indicate if tags and attributes should be matched strictly (default is False).
+        img_attr_value_desired (list or None): Desired attribute values for image links (default is None).
+        img_attr_value_undesired (list or None): Undesired attribute values for image links (default is None).
+        link_attr_value_desired (list or None): Desired attribute values for links (default is None).
+        link_attr_value_undesired (list or None): Undesired attribute values for links (default is None).
+        associated_data_attr (list): HTML attributes to associate with the extracted links (default is ["data-title", 'alt', 'title']).
+        get_img (list): HTML attributes used to identify associated images (default is ["data-title", 'alt', 'title']).
+
+    Methods:
+        re_initialize(): Reinitialize the LinkManager with the current settings.
+        update_url_manager(url_manager): Update the URL manager with a new instance.
+        update_url(url): Update the URL and reinitialize the LinkManager.
+        update_source_code(source_code): Update the source code and reinitialize the LinkManager.
+        update_soup_manager(soup_manager): Update the SoupManager and reinitialize the LinkManager.
+        update_desired(...): Update the desired settings and reinitialize the LinkManager.
+        find_all_desired(...): Find all desired links or image links based on the specified criteria.
+        find_all_domain(): Find all unique domain names in the extracted links.
+
+    Note:
+        - The LinkManager class helps manage and extract links and image links from web pages.
+        - The class provides flexibility in specifying criteria for link extraction.
+    """
+    def __init__(self,url="https://example.com",source_code=None,url_manager=None,request_manager=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
+        if url_manager==None:
+            url_manager=UrlManager(url=url)
+        self.url_manager= url_manager
+        self.url=self.url_manager.url
+        if request_manager==None:
+            request_manager = SafeRequest(url_manager=self.url_manager)
+        self.request_manager=request_manager
+        if soup_manager == None:
+            soup_manager = SoupManager(url_manager=self.url_manager,request_manager=self.request_manager)
+        self.soup_manager = soup_manager
+        if source_code != None:
+            self.source_code=source_code
+        else:
+            self.source_code=self.request_manager.source_code_bytes
+        if self.source_code != self.soup_manager.source_code:
+            self.soup_manager.update_source_code(source_code=self.source_code)
+        self.strict_order_tags=strict_order_tags
+        self.image_link_tags=image_link_tags
+        self.img_link_attrs=img_link_attrs
+        self.link_tags=link_tags
+        self.link_attrs=link_attrs
+        self.img_attr_value_desired=img_attr_value_desired
+        self.img_attr_value_undesired=img_attr_value_undesired
+        self.link_attr_value_desired=link_attr_value_desired
+        self.link_attr_value_undesired=link_attr_value_undesired
+        self.associated_data_attr=associated_data_attr
+        self.get_img=get_img
+        self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,
+                                                                 attr=self.img_link_attrs,
+                                                                 attr_value_desired=self.img_attr_value_desired,
+                                                                 attr_value_undesired=self.img_attr_value_undesired)
+        self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,
+                                                           attr=self.link_attrs,
+                                                           attr_value_desired=self.link_attr_value_desired,
+                                                           attr_value_undesired=self.link_attr_value_undesired,
+                                                           associated_data_attr=self.associated_data_attr,
+                                                           get_img=get_img)
+    def re_initialize(self):
+        self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,attr=self.img_link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.img_attr_value_desired,attr_value_undesired=self.img_attr_value_undesired)
+        self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,attr=self.link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.link_attr_value_desired,attr_value_undesired=self.link_attr_value_undesired,associated_data_attr=self.associated_data_attr,get_img=self.get_img)
+    def update_url_manager(self,url_manager):
+        self.url_manager=url_manager
+        self.url=self.url_manager.url
+        self.request_manager.update_url_manager(url_manager=self.url_manager)
+        self.soup_manager.update_url_manager(url_manager=self.url_manager)
+        self.source_code=self.soup_manager.source_code
+        self.re_initialize()
+    def update_url(self,url):
+        self.url=url
+        self.url_manager.update_url(url=self.url)
+        self.url=self.url_manager.url
+        self.request_manager.update_url(url=self.url)
+        self.soup_manager.update_url(url=self.url)
+        self.source_code=self.soup_manager.source_code
+        self.re_initialize()
+    def update_source_code(self,source_code):
+        self.source_code=source_code
+        if self.source_code != self.soup_manager.source_code:
+            self.soup_manager.update_source_code(source_code=self.source_code)
+        self.re_initialize()
+    def update_soup_manager(self,soup_manager):
+        self.soup_manager=soup_manager
+        self.source_code=self.soup_manager.source_code
+        self.re_initialize()
+    def update_desired(self,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,image_link_tags=None,img_link_attrs=None,link_tags=None,link_attrs=None,strict_order_tags=None,associated_data_attr=None,get_img=None):
+        self.strict_order_tags = strict_order_tags or self.strict_order_tags
+        self.img_attr_value_desired=img_attr_value_desired or self.img_attr_value_desired
+        self.img_attr_value_undesired=img_attr_value_undesired or self.img_attr_value_undesired
+        self.link_attr_value_desired=link_attr_value_desired or self.link_attr_value_desired
+        self.link_attr_value_undesired=link_attr_value_undesired or self.link_attr_value_undesired
+        self.image_link_tags=image_link_tags or self.image_link_tags
+        self.img_link_attrs=img_link_attrs or self.img_link_attrs
+        self.link_tags=link_tags or self.link_tags
+        self.link_attrs=link_attrs or self.link_attrs
+        self.associated_data_attr=associated_data_attr or self.associated_data_attr
+        self.get_img=get_img or self.get_img
+        self.re_initialize()
+    def find_all_desired(self,tag='img',attr='src',strict_order_tags=False,attr_value_desired=None,attr_value_undesired=None,associated_data_attr=None,get_img=None):
+        def make_list(obj):
+            if isinstance(obj,list) or obj==None:
+                return obj
+            return [obj]
+        def get_desired_value(attr,attr_value_desired=None,attr_value_undesired=None):
+            if attr_value_desired:
+                for value in attr_value_desired:
+                    if value not in attr:
+                        return False
+            if attr_value_undesired:
+                for value in attr_value_undesired:
+                    if value in attr:
+                        return False
+            return True
+        attr_value_desired,attr_value_undesired,associated_data_attr,tags,attribs=make_list(attr_value_desired),make_list(attr_value_undesired),make_list(associated_data_attr),make_list(tag),make_list(attr)
+        desired_ls = []
+        assiciated_data=[]
+        for i,tag in enumerate(tags):
+            attribs_list=attribs
+            if strict_order_tags:
+                if len(attribs)<=i:
+                    attribs_list=[None]
+                else:
+                    attribs_list=make_list(attribs[i])
+            for attr in attribs_list:
+                for component in self.soup_manager.soup.find_all(tag):
+                    if attr in component.attrs and get_desired_value(attr=component[attr],attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired):
+                        if component[attr] not in desired_ls:
+                            desired_ls.append(component[attr])
+                            assiciated_data.append({"value":component[attr]})
+                            if associated_data_attr:
+                                for data in associated_data_attr:
+                                    if data in component.attrs:
+                                        assiciated_data[-1][data]=component.attrs[data]
+                                        if get_img and component.attrs[data]:
+                                            if data in get_img and len(component.attrs[data])!=0:
+                                                for each in self.soup_manager.soup.find_all('img'):
+                                                    if 'alt' in each.attrs:
+                                                        if each.attrs['alt'] == component.attrs[data] and 'src' in each.attrs:
+                                                            assiciated_data[-1]['image']=each.attrs['src']
+        desired_ls.append(assiciated_data)
+        return desired_ls
+    def find_all_domain(self):
+        domains_ls=[self.url_manager.protocol+'://'+self.url_manager.domain]
+        for desired in all_desired[:-1]:
+            if url_manager.is_valid_url(desired):
+                parse = urlparse(desired)
+                domain = parse.scheme+'://'+parse.netloc
+                if domain not in domains_ls:
+                    domains_ls.append(domain)
+    def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
+        all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
+        assiciated_attrs = all_desired[-1]
+        valid_assiciated_attrs = []
+        desired_links=[]
+        for i,attr in enumerate(all_desired[:-1]):
+            valid_attr=self.url_manager.make_valid(attr,self.url_manager.protocol+'://'+self.url_manager.domain)
+            if valid_attr:
+                desired_links.append(valid_attr)
+                valid_assiciated_attrs.append(assiciated_attrs[i])
+                valid_assiciated_attrs[-1]["link"]=valid_attr
+        desired_links.append(valid_assiciated_attrs)
+        return desired_links
+
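A minimal usage sketch for LinkManager (illustrative only). find_all_desired_links returns the matched URLs with a trailing list of associated-attribute records, so callers usually slice off the last element:

    link_mgr = LinkManager(url="https://example.com")
    hrefs = link_mgr.all_desired_links[:-1]           # absolute link URLs
    img_srcs = link_mgr.all_desired_image_links[:-1]  # absolute image URLs
    records = link_mgr.all_desired_links[-1]          # dicts with 'value', 'link' and any alt/title data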
+class CrawlManager():
+    def __init__(self,url=None,source_code=None,parse_type="html.parser"):
+        self.url=url
+        self.source_code=source_code
+        self.parse_type=parse_type
+        self.get_new_source_and_url(url)
+    def get_new_source_and_url(self,url=None):
+        if url == None:
+            url = self.url
+        self.response = self.response_manager.response
+        self.source_code=self.response_manager.source_code
+    def get_classes_and_meta_info(self):
+        tag_name, class_name_1, class_name_2, class_value = 'meta', 'class', 'property', 'og:image'
+        attrs = 'href','src'
+        unique_classes, images = discover_classes_and_meta_images(self, tag_name, class_name_1, class_name_2, class_value, attrs)
+        return unique_classes, images
+    def extract_links_from_url(self):
+        """
+        Extracts all href and src links from a given URL's source code.
+
+        Args:
+            base_url (str): The URL from which to extract links.
+
+        Returns:
+            dict: Dictionary containing image links and external links under the parent page.
+        """
+        agg_js = {'images':[],'external_links':[]}
+
+        if self.response != None:
+            attrs = 'href','src'
+            href_links,src_links='',''
+            links = [href_links,src_links]
+            for i,each in enumerate(attrs):
+                links[i]= [a[attr[i]] for a in get_find_all_with_attributes(self, attrs[i])]
+            # Convert all links to absolute links
+            absolute_links = [(url, link) for link in links[0] + links[1]]
+            # Separate images and external links
+            images = [link for link in absolute_links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+            external_links = [link for link in absolute_links if urlparse(link).netloc != urlparse(url).netloc]
+            agg_js['images']=images
+            agg_js['external_links']=external_links
+
+        return agg_js
+
+
+    def correct_xml(xml_string):
+        # Parse the XML string
+        root = ET.fromstring(xml_string)
+
+        # Loop through each <image:loc> element and correct its text if needed
+        for image_loc in root.findall(".//image:loc", namespaces={'image': 'http://www.google.com/schemas/sitemap-image/1.1'}):
+            # Replace '&' with '&amp;' in the element's text
+            if '&' in image_loc.text:
+                image_loc.text = image_loc.text.replace('&', '&amp;')
+
+        # Convert the corrected XML back to string
+        corrected_xml = ET.tostring(root, encoding='utf-8').decode('utf-8')
+        return corrected_xml
+
+
+    def determine_values(self):
+        # This is just a mockup. In a real application, you'd analyze the URL or its content.
+
+        # Assuming a blog site
+        if 'blog' in self.url:
+            if '2023' in self.url:  # Assuming it's a current year article
+                return ('weekly', '0.8')
+            else:
+                return ('monthly', '0.6')
+        elif 'contact' in self.url:
+            return ('yearly', '0.3')
+        else:  # Homepage or main categories
+            return ('weekly', '1.0')
+    def crawl(url, max_depth=3, depth=1):
+
+        if depth > max_depth:
+            return []
+
+        if url in visited:
+            return []
+
+        visited.add(url)
+
+        try:
+
+            links = [a['href'] for a in self.soup.find_all('a', href=True)]
+            valid_links = []
+
+            for link in links:
+                parsed_link = urlparse(link)
+                base_url = "{}://{}".format(parsed_link.scheme, parsed_link.netloc)
+
+                if base_url == url:  # Avoiding external URLs
+                    final_link = urljoin(url, parsed_link.path)
+                    if final_link not in valid_links:
+                        valid_links.append(final_link)
+
+            for link in valid_links:
+                crawl(link, max_depth, depth+1)
+
+            return valid_links
+
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+            return []
+
+
+    # Define or import required functions here, like get_all_website_links, determine_values,
+    # discover_classes_and_meta_images, and extract_links_from_url.
+    def get_meta_info(self):
+
+        meta_info = {}
+        # Fetch the title if available
+        title_tag = parse_title()
+        if title_tag:
+            meta_info["title"] = title_tag
+        # Fetch meta tags
+        for meta_tag in soup.find_all('meta'):
+            name = meta_tag.get('name') or meta_tag.get('property')
+            if name:
+                content = meta_tag.get('content')
+                if content:
+                    meta_info[name] = content
+
+        return meta_info
+    def generate_sitemap(self,domain):
+
+        with open('sitemap.xml', 'w', encoding='utf-8') as f:
+            string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
+
+            for url in self.all_site_links:
+                string += f'  <url>\n    <loc>{url}</loc>\n'
+                preprocess=[]
+                self.get_new_source_and_url(url=url)
+                links = extract_links_from_url(url)
+
+                for img in links['images']:
+                    if str(img).lower() not in preprocess:
+                        try:
+                            escaped_img = img.replace('&', '&amp;')
+
+                            str_write = f'    <image:image>\n      <image:loc>{escaped_img}</image:loc>\n    </image:image>\n'
+                            string += str_write
+                        except:
+                            pass
+                        preprocess.append(str(img).lower())
+                frequency, priority = determine_values(url)
+                string += f'    <changefreq>{frequency}</changefreq>\n'
+                string += f'    <priority>{priority}</priority>\n'
+                string += f'  </url>\n'
+
+            string += '</urlset>\n'
+            f.write(string)
+        # Output summary
+        print(f'Sitemap saved to sitemap.xml with {len(urls)} URLs.')
+
+        # Output class and link details
+        for url in urls:
+            print(f"\nDetails for {url}:")
+            classes, meta_img_refs = discover_classes_and_meta_images(url)
+
+            print("\nClasses with href or src attributes:")
+            for class_name in classes:
+                print(f"\t{class_name}")
+
+            print("\nMeta Image References:")
+            for img_ref in meta_img_refs:
+                print(f"\t{img_ref}")
+
+            links = extract_links_from_url(url)
+
+            print("\nImages:")
+            for img in links['images']:
+                print(f"\t{img}")
+
+            print("\nExternal Links:")
+            for ext_link in links['external_links']:
+                print(f"\t{ext_link}")
+class CrawlManagerSingleton():
+    _instance = None
+    @staticmethod
+    def get_instance(url=None,source_code=None,parse_type="html.parser"):
+        if CrawlManagerSingleton._instance is None:
+            CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
+            CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
+        return CrawlManagerSingleton._instance
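A usage sketch for the singleton accessor (illustrative only; note that CrawlManager above still references a response_manager attribute that this diff never assigns, so the call may fail as published):

    crawler = CrawlManagerSingleton.get_instance(url="https://example.com", parse_type="html.parser")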
+<<<<<<< HEAD
+=======
 from urllib.parse import urlparse, parse_qs
 import time
 import requests
@@ -99,3 +1866,4 @@ def try_request(request):
         response = None
     return response

+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)