abstract-webtools 0.1.5.81__py3-none-any.whl → 0.1.5.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
+ from ..abstract_webtools import *
+ from ..big_user_agent_list import *
+ class NetworkManager:
+     def __init__(self, user_agent_manager=None, ssl_manager=None, tls_adapter=None, user_agent=None, proxies=None, cookies=None, ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
+         if ssl_manager is None:
+             ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
+         self.ssl_manager = ssl_manager
+         if tls_adapter is None:
+             tls_adapter = TLSAdapter(ssl_manager=ssl_manager, ciphers=ciphers, certification=certification, ssl_options=ssl_options)
+         self.tls_adapter = tls_adapter
+         self.ciphers = tls_adapter.ciphers
+         self.certification = tls_adapter.certification
+         self.ssl_options = tls_adapter.ssl_options
+         self.proxies = proxies or {}  # fall back to an empty mapping when no proxies are supplied
+         self.cookies = cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
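For orientation, a minimal usage sketch of the new NetworkManager (not part of the package itself; the proxy URL is illustrative, and it assumes SSLManager is re-exported by abstract_webtools as the star imports above imply):

    from abstract_webtools import NetworkManager, SSLManager  # assumed public re-exports

    # Build the SSL layer explicitly, then hand it to the network manager.
    ssl_mgr = SSLManager(ciphers=None, ssl_options=None, certification=None)
    net_mgr = NetworkManager(ssl_manager=ssl_mgr,
                             proxies={"https": "http://127.0.0.1:8080"})  # illustrative proxy
    print(net_mgr.ciphers)  # cipher string resolved through the TLS adapter
    print(net_mgr.proxies)  # the proxies dict passed to the constructor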
@@ -0,0 +1,348 @@
+ from ..abstract_webtools import *
+ from .urlManager import *
+ from .userAgentManager import *
+ from .cipherManager import *
+ from .sslManager import *
+ from .tlsAdapter import *
+ from .networkManager import *
+ from .seleniumManager import *
+ class requestManager:
+     """
+     requestManager (formerly SafeRequest) is a class for making HTTP requests with error handling and retries.
+ 
+     Args:
+         url (str or None): The URL to make requests to (default is None).
+         url_mgr (urlManager or None): An instance of urlManager (default is None).
+         network_manager (NetworkManager or None): An instance of NetworkManager (default is None).
+         user_agent_manager (UserAgentManager or None): An instance of UserAgentManager (default is None).
+         ssl_manager (SSLManager or None): An instance of SSLManager (default is None).
+         tls_adapter (TLSAdapter or None): An instance of TLSAdapter (default is None).
+         user_agent (str or None): The user agent string to use for requests (default is None).
+         proxies (dict or None): Proxy settings for requests (default is None).
+         headers (dict or None): Additional headers for requests (default is None).
+         cookies (dict or None): Cookie settings for requests (default is None).
+         session (requests.Session or None): A custom requests session (default is None).
+         adapter (str or None): A custom adapter for requests (default is None).
+         protocol (str or None): The protocol to use for requests (default is 'https://').
+         ciphers (str or None): Cipher settings for requests (default is None).
+         auth (tuple or None): Authentication credentials (default is None).
+         login_url (str or None): The URL for authentication (default is None).
+         email (str or None): Email for authentication (default is None).
+         password (str or None): Password for authentication (default is None).
+         certification (str or None): Certification settings for requests (default is None).
+         ssl_options (str or None): SSL options for requests (default is None).
+         stream (bool): Whether to stream the response content (default is False).
+         timeout (float or None): Timeout for requests (default is None).
+         last_request_time (float or None): Timestamp of the last request (default is None).
+         max_retries (int or None): Maximum number of retries for requests (default is None).
+         request_wait_limit (float or None): Wait time between requests (default is None).
+ 
+     Methods:
+         update_url_mgr(url_mgr): Update the URL manager and reinitialize the request manager.
+         update_url(url): Update the URL and reinitialize the request manager.
+         re_initialize(): Reinitialize the request manager with the current settings.
+         authenticate(session, login_url=None, email=None, password=None, checkbox=None, dropdown=None): Authenticate and make a request.
+         fetch_response(): Fetch the response from the server.
+         initialize_session(): Initialize the requests session with custom settings.
+         process_response_data(): Process the fetched response data.
+         get_react_source_code(): Extract JavaScript and JSX source code from <script> tags.
+         get_status(url=None): Get the HTTP status code of a URL.
+         wait_between_requests(): Wait between requests based on the request_wait_limit.
+         make_request(): Make a request and handle potential errors.
+         try_request(): Try to make an HTTP request using the provided session.
+ 
+     Note:
+         - The requestManager class is designed for making HTTP requests with error handling and retries.
+         - It provides methods for authentication, response handling, and error management.
+     """
+     def __init__(self,
+                  url=None,
+                  source_code=None,
+                  url_mgr=None,
+                  network_manager=None,
+                  user_agent_manager=None,
+                  ssl_manager=None,
+                  ssl_options=None,
+                  tls_adapter=None,
+                  user_agent=None,
+                  proxies=None,
+                  headers=None,
+                  cookies=None,
+                  session=None,
+                  adapter=None,
+                  protocol=None,
+                  ciphers=None,
+                  spec_login=False,
+                  login_referer=None,
+                  login_user_agent=None,
+                  auth=None,
+                  login_url=None,
+                  email=None,
+                  password=None,
+                  checkbox=None,
+                  dropdown=None,
+                  certification=None,
+                  stream=False,
+                  timeout=None,
+                  last_request_time=None,
+                  max_retries=None,
+                  request_wait_limit=None):
+ 
+         self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+         self._url = self.url_mgr.url  # read from the resolved manager, since `url_mgr` may be None
+         self._url_mgr = self.url_mgr
+         self.user_agent = user_agent
+         self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
+         self.headers = headers or self.user_agent_manager.header or {'Accept': '*/*'}
+         self.user_agent = self.user_agent_manager.user_agent
+         self.ciphers = ciphers or CipherManager().ciphers_string
+         self.certification = certification
+         self.ssl_options = ssl_options
+         self.ssl_manager = ssl_manager or SSLManager(ciphers=self.ciphers, ssl_options=self.ssl_options, certification=self.certification)
+         self.tls_adapter = tls_adapter or TLSAdapter(ssl_manager=self.ssl_manager, certification=self.certification, ssl_options=self.ssl_manager.ssl_options)
+         self.network_manager = network_manager or NetworkManager(user_agent_manager=self.user_agent_manager, ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter, user_agent=user_agent, proxies=proxies, cookies=cookies, ciphers=ciphers, certification=certification, ssl_options=ssl_options)
+         self.stream = stream if isinstance(stream, bool) else False
+         self.tls_adapter = self.network_manager.tls_adapter
+         self.ciphers = self.network_manager.ciphers
+         self.certification = self.network_manager.certification
+         self.ssl_options = self.network_manager.ssl_options
+         self.proxies = self.network_manager.proxies
+         self.timeout = timeout
+         self.cookies = self.network_manager.cookies
+         self.session = session or requests.session()
+         self.auth = auth
+         self.spec_login = spec_login
+         self.password = password
+         self.email = email
+         self.checkbox = checkbox
+         self.dropdown = dropdown
+         self.login_url = login_url
+         self.login_user_agent = login_user_agent
+         self.login_referer = login_referer
+         self.protocol = protocol or 'https://'
+         self.initialize_session()
+         self.last_request_time = last_request_time
+         self.max_retries = max_retries or 3
+         self.request_wait_limit = request_wait_limit or 1.5
+         self._response = None
+         self.source_code = source_code or get_selenium_source(self.url)  # prefer caller-supplied source; otherwise fetch via Selenium
+         self.source_code_bytes = None
+         self.source_code_json = {}
+         self.react_source_code = []
+         self._response_data = None
+         self.process_response_data()
+     def update_url_mgr(self, url_mgr):
+         self.url_mgr = url_mgr
+         self.re_initialize()
+     def update_url(self, url):
+         self.url_mgr.update_url(url=url)
+         self.re_initialize()
+     def re_initialize(self):
+         self._response = None
+         self.make_request()
+         self.source_code = None
+         self.source_code_bytes = None
+         self.source_code_json = {}
+         self.react_source_code = []
+         self._response_data = None
+         self.process_response_data()
+     @property
+     def response(self):
+         """Lazy-loading of response."""
+         if self._response is None:
+             self._response = self.fetch_response()
+         return self._response
+     def authenticate(self, session, login_url=None, email=None, password=None, checkbox=None, dropdown=None):
+         login_urls = login_url or [self.url_mgr.url, self.url_mgr.domain, self.url_mgr.url_join(url=self.url_mgr.domain, path='login'), self.url_mgr.url_join(url=self.url_mgr.domain, path='auth')]
+         s = session
+         if not isinstance(login_urls, list):
+             login_urls = [login_urls]
+         token = None
+         for login_url in login_urls:
+             login_url_mgr = urlManager(login_url)
+             login_url = login_url_mgr.url
+             r = s.get(login_url)
+             soup = BeautifulSoup(r.content, "html.parser")
+             # Find the CSRF-protection token, if the login form exposes one
+             token_input = soup.find('input', {'name': 'token'})
+             token = token_input.get('value') if token_input else None
+             if token is not None:
+                 break
+         login_data = {}
+         if email is not None:
+             login_data['email'] = email
+         if password is not None:
+             login_data['password'] = password
+         if checkbox is not None:
+             login_data['checkbox'] = checkbox
+         if dropdown is not None:
+             login_data['dropdown'] = dropdown
+         if token is not None:
+             login_data['token'] = token
+         s.post(login_url, data=login_data)
+         return s
+ 
+     def fetch_response(self) -> Union[requests.Response, None]:
+         """Actually fetches the response from the server."""
+         # This can be extended with retry or backoff logic; its sole job
+         # is to fetch and return the response.
+         return self.try_request()
+     def spec_auth(self, session=None, email=None, password=None, login_url=None, login_referer=None, login_user_agent=None):
+         s = session or requests.session()
+         domain = self.url_mgr.url_join(self.url_mgr.get_correct_url(self.url_mgr.domain), 'login') if login_url is None else login_url
+         login_url = self.url_mgr.get_correct_url(url=domain)
+         login_referer = login_referer or self.url_mgr.url_join(url=login_url, path='?role=fast&to=&s=1&m=1&email=YOUR_EMAIL')
+         login_user_agent = login_user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0'
+         headers = {"Referer": login_referer, 'User-Agent': login_user_agent}
+         payload = {'email': email, 'pass': password}
+         page = s.get(login_url)
+         soup = BeautifulSoup(page.content, 'lxml')
+         action_url = soup.find('form')['action']
+         s.post(action_url, data=payload, headers=headers)
+         return s
+     def initialize_session(self):
+         s = self.session
+         if self.auth:
+             s = self.auth  # `auth` is expected to be a pre-authenticated session here
+         elif self.spec_login:
+             s = self.spec_auth(session=s, email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
+         elif any([self.password, self.email, self.login_url, self.checkbox, self.dropdown]):
+             s = self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)
+         s.proxies = self.proxies
+         s.cookies["cf_clearance"] = self.network_manager.cookies
+         s.headers.update(self.headers)
+         s.mount(self.protocol, self.network_manager.tls_adapter)
+         return s
+     def process_response_data(self):
+         """Processes the fetched response data."""
+         if not self.response:
+             return  # No data to process
+         if isinstance(self.response, str):
+             self.source_code = self.response
+         else:
+             self.source_code = self.response.text
+             self.source_code_bytes = self.response.content
+             if self.response.headers.get('content-type') == 'application/json':
+                 data = convert_to_json(self.source_code)
+                 if data:
+                     self.source_code_json = data.get("response", data)
+         self.get_react_source_code()
+     def get_react_source_code(self) -> list:
+         """
+         Parses the fetched source code and extracts JavaScript and JSX source code (React components).
+ 
+         Returns:
+             list: A list of strings containing JavaScript and JSX source code found in <script> tags.
+         """
+         if self.url_mgr.url is None:
+             return []
+         # Fall back to the text source when the response came from Selenium (no bytes)
+         soup = BeautifulSoup(self.source_code_bytes or self.source_code, "html.parser")
+         script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
+         for script_tag in script_tags:
+             if script_tag.string:  # skip tags with no inline source
+                 self.react_source_code.append(script_tag.string)
+ 
+     def get_status(self, url: str = None) -> int:
+         """
+         Gets the HTTP status code of the given URL.
+ 
+         Args:
+             url (str): The URL to check the status of (defaults to the managed URL).
+ 
+         Returns:
+             int: The HTTP status code of the URL, or None if the request fails.
+         """
+         url = url or self.url_mgr.url
+         try:
+             # A plain session GET is used because try_request() returns page
+             # source rather than a Response object
+             return self.session.get(url, timeout=self.timeout).status_code
+         except requests.RequestException:
+             return None
+     def wait_between_requests(self):
+         """
+         Wait between requests based on the request_wait_limit.
+         """
+         if self.last_request_time:
+             sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
+             if sleep_time > 0:
+                 logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
+                 get_sleep(sleep_time)
+ 
+     def make_request(self):
+         """
+         Make a request and handle potential errors.
+         """
+         self.wait_between_requests()
+         for _ in range(self.max_retries):
+             try:
+                 self._response = self.try_request()
+                 if self.response is not None:
+                     # Selenium page source (a str) carries no status_code; treat it as success
+                     status_code = getattr(self.response, 'status_code', 200)
+                     if status_code == 200:
+                         self.last_request_time = get_time_stamp()
+                         return self.response
+                     elif status_code == 429:
+                         logging.warning(f"Rate limited by {self.url_mgr.url}. Retrying...")
+                         get_sleep(5)  # adjust this based on the server's rate-limit reset time
+             except requests.Timeout as e:
+                 logging.error(f"Request to {self.url_mgr.url} timed out: {e}")
+             except requests.ConnectionError:
+                 logging.error(f"Connection error for URL {self.url_mgr.url}.")
+             except requests.RequestException as e:
+                 logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
+         try:
+             response = get_selenium_source(self.url_mgr.url)
+             if response:
+                 self._response = response  # write the backing field; `response` is a read-only property
+                 return self.response
+         except Exception:
+             logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries.")
+         return None
+     def try_request(self) -> Union[requests.Response, None]:
+         """
+         Tries to fetch the URL, currently via Selenium rather than the session.
+ 
+         Returns:
+             requests.Response, str, or None: The page source (or response) if the
+             fetch succeeds, or None if it fails.
+         """
+         try:
+             return get_selenium_source(self.url_mgr.url)  # self.session.get(url=self.url_mgr.url, timeout=self.timeout, stream=self.stream)
+         except Exception as e:  # Selenium raises WebDriverException, not requests exceptions
+             print(e)
+             return None
+ 
+     @property
+     def url(self):
+         return self.url_mgr.url
+ 
+     @url.setter
+     def url(self, new_url):
+         self.url_mgr.update_url(url=new_url)  # route through url_mgr so the getter reflects the change
+ class SafeRequestSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(url=None, headers: dict = None, max_retries=3, last_request_time=None, request_wait_limit=1.5):
+         if SafeRequestSingleton._instance is None:
+             SafeRequestSingleton._instance = requestManager(url, url_mgr=urlManagerSingleton, headers=headers, max_retries=max_retries, last_request_time=last_request_time, request_wait_limit=request_wait_limit)
+         elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
+             SafeRequestSingleton._instance = requestManager(url, url_mgr=urlManagerSingleton, headers=headers, max_retries=max_retries, last_request_time=last_request_time, request_wait_limit=request_wait_limit)
+         return SafeRequestSingleton._instance
+ def get_req_mgr(req_mgr=None, url=None, url_mgr=None, source_code=None):
+     if req_mgr:
+         url_mgr = req_mgr.url_mgr
+     url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+     req_mgr = req_mgr or requestManager(url_mgr=url_mgr, source_code=source_code)
+     return req_mgr
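A hedged sketch of the intended entry point, get_req_mgr, defined just above; the URL is illustrative and the import path assumes the manager modules are re-exported at package level:

    from abstract_webtools import get_req_mgr  # assumed public re-export

    req_mgr = get_req_mgr(url="https://example.com")  # builds urlManager + requestManager
    print(req_mgr.url)                       # resolved through the underlying urlManager
    print((req_mgr.source_code or "")[:80])  # page source, fetched via get_selenium_source

Note that requestManager currently routes all fetches through Selenium (see try_request above), so a working headless Chrome is required even for plain HTTP targets.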
@@ -0,0 +1,85 @@
+ import os
+ from ..abstract_webtools import *
+ from .urlManager import *
+ from urllib.parse import urlparse
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ import logging
+ import urllib3
+ 
+ # Suppress urllib3 warnings and debug logs
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ 
+ # Suppress Selenium logs
+ logging.getLogger("selenium").setLevel(logging.WARNING)
+ 
+ # Setup Chrome options
+ chrome_options = Options()
+ chrome_options.add_argument("--headless")  # Run Chrome in headless mode
+ chrome_options.add_argument("--log-level=3")  # Suppress most logs
+ 
+ class SingletonMeta(type):
+     _instances = {}
+     def __call__(cls, *args, **kwargs):
+         if cls not in cls._instances:
+             instance = super().__call__(*args, **kwargs)
+             cls._instances[cls] = instance
+         return cls._instances[cls]
+ 
+ class domainManager(metaclass=SingletonMeta):
+     # Note: SingletonMeta keys instances by class, so the first URL's domain
+     # is the one this manager serves for the life of the process.
+     def __init__(self, url):
+         if not hasattr(self, 'initialized'):  # Prevent reinitialization
+             self.initialized = True
+             parsed_url = urlparse(url)
+             self.domain = parsed_url.netloc
+             self.scheme = parsed_url.scheme
+             self.site_dir = os.path.join(os.getcwd(), self.domain)
+             os.makedirs(self.site_dir, exist_ok=True)
+             self.drivers = {}
+             self.page_type = []
+ 
+     def get_url_to_path(self, url):
+         url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == self.domain:
+             paths = parsed_url.path.split('/')
+             dir_path = self.site_dir
+             for path in paths[:-1]:
+                 dir_path = os.path.join(dir_path, path)
+                 os.makedirs(dir_path, exist_ok=True)
+             # Record the page extension (default 'html') on the first call,
+             # then reuse the last recorded type on later calls
+             self.page_type.append((os.path.splitext(paths[-1])[-1] or 'html') if len(self.page_type) == 0 else self.page_type[-1])
+             dir_path = os.path.join(dir_path, paths[-1])
+             return dir_path
+ 
+     def saved_url_check(self, url):
+         path = self.get_url_to_path(url)
+         return path
+ 
+     def get_with_netloc(self, url):
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == '':
+             url = f"{self.scheme}://{self.domain}/{url.strip()}"
+         return url
+ 
+     def get_driver(self, url):
+         if url and url not in self.drivers:
+             driver = webdriver.Chrome(options=chrome_options)
+             self.drivers[url] = driver
+             driver.get(url)
+         return self.drivers[url]
+ 
+ # Function to get Selenium page source
+ def get_selenium_source(url):
+     url_mgr = urlManager(url)
+     if url_mgr.url:
+         url = str(url_mgr.url)
+         manager = domainManager(url)
+         driver = manager.get_driver(url)
+         try:
+             # Get page source
+             page_source = driver.page_source
+             return page_source
+         finally:
+             # Don't quit the driver unless you're done with all interactions
+             pass
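Finally, a short sketch of this module's public helper, get_selenium_source; it assumes Chrome plus a matching chromedriver are on PATH, and BeautifulSoup is used only for illustration (the URL is not part of the package):

    from bs4 import BeautifulSoup

    html = get_selenium_source("https://example.com")  # one cached headless driver per URL
    if html:
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title)

Because the finally block deliberately never quits the driver, cached drivers live for the life of the process; callers fetching many URLs should expect one Chrome instance per distinct URL.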