abstract-webtools 0.1.6.136__py3-none-any.whl → 0.1.6.137__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,353 +0,0 @@
1
- from ..abstract_webtools import *
2
-
3
- from .userAgentManager import *
4
- from .cipherManager import *
5
- from .sslManager import *
6
- from .tlsAdapter import *
7
- from .networkManager import *
8
- from .seleniumManager import *
9
- from .urlManager import *
10
class requestManager:
    """
    Fetch the source of a URL and expose it in a few processed forms.

    The manager wires together the URL, user-agent, cipher, SSL, TLS and
    network helper objects, prepares a ``requests`` session (optionally
    logging in first) and then obtains the page source through
    ``get_selenium_source``.

    Args:
        url (str or None): The URL to fetch.
        source_code (str, bytes or None): Already-fetched page source. When
            given it is used as the response and no fetch is performed.
        url_mgr (urlManager or None): Existing URL manager to reuse.
        network_manager (NetworkManager or None): Existing network manager.
        user_agent_manager (UserAgentManager or None): Existing UA manager.
        ssl_manager (SSLManager or None): Existing SSL manager.
        ssl_options: SSL options forwarded to the SSL/TLS helpers.
        tls_adapter (TLSAdapter or None): Existing TLS adapter.
        user_agent (str or None): User agent string.
        proxies (dict or None): Proxy settings for the network manager.
        headers (dict or None): Headers applied to the session.
        cookies: Cookie settings forwarded to the network manager.
        session (requests.Session or None): Session to reuse.
        adapter: Accepted for backward compatibility; not used.
        protocol (str or None): Prefix the TLS adapter is mounted on
            (default ``'https://'``).
        ciphers (str or None): Cipher string.
        spec_login (bool): Log in through ``spec_auth`` instead of
            ``authenticate``.
        login_referer (str or None): Referer header used by ``spec_auth``.
        login_user_agent (str or None): User agent used by ``spec_auth``.
        auth (tuple, requests.Session or None): Credentials stored on the
            session, or an already authenticated session to use as-is.
        login_url (str, list or None): Login page URL(s).
        email (str or None): Login e-mail.
        password (str or None): Login password.
        checkbox: Optional ``checkbox`` form value for ``authenticate``.
        dropdown: Optional ``dropdown`` form value for ``authenticate``.
        certification: Certificate settings for the SSL/TLS helpers.
        stream (bool): Stored on the instance; non-bool values become False.
        timeout (float or None): Timeout used for direct session requests.
        last_request_time (float or None): Timestamp of the last request.
        max_retries (int or None): Attempts in ``make_request`` (default 3).
        request_wait_limit (float or None): Minimum seconds between
            requests (default 1.5).

    Attributes set after construction:
        source_code: Page source as text.
        source_code_bytes: Raw bytes when the response object provides them.
        source_code_json: Parsed body for ``application/json`` responses.
        react_source_code: Contents of JavaScript/JSX ``<script>`` tags.
    """

    def __init__(self,
                 url=None,
                 source_code=None,
                 url_mgr=None,
                 network_manager=None,
                 user_agent_manager=None,
                 ssl_manager=None,
                 ssl_options=None,
                 tls_adapter=None,
                 user_agent=None,
                 proxies=None,
                 headers=None,
                 cookies=None,
                 session=None,
                 adapter=None,
                 protocol=None,
                 ciphers=None,
                 spec_login=False,
                 login_referer=None,
                 login_user_agent=None,
                 auth=None,
                 login_url=None,
                 email=None,
                 password=None,
                 checkbox=None,
                 dropdown=None,
                 certification=None,
                 stream=False,
                 timeout=None,
                 last_request_time=None,
                 max_retries=None,
                 request_wait_limit=None):
        # --- URL handling -------------------------------------------------
        self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
        # Assigning ``url`` goes through the property setter (stores ``_url``).
        self.url = get_url(url=url, url_mgr=self.url_mgr)
        self._url_mgr = self.url_mgr
        self._url = self.url

        # --- user agent / headers ----------------------------------------
        self.user_agent = user_agent
        self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
        self.headers = headers or self.user_agent_manager.header or {'Accept': '*/*'}
        self.user_agent = self.user_agent_manager.user_agent

        # --- SSL / TLS / network helpers ---------------------------------
        self.ciphers = ciphers or CipherManager().ciphers_string
        self.certification = certification
        self.ssl_options = ssl_options
        self.ssl_manager = ssl_manager or SSLManager(
            ciphers=self.ciphers,
            ssl_options=self.ssl_options,
            certification=self.certification,
        )
        self.tls_adapter = tls_adapter or TLSAdapter(
            ssl_manager=self.ssl_manager,
            certification=self.certification,
            ssl_options=self.ssl_manager.ssl_options,
        )
        self.network_manager = network_manager or NetworkManager(
            user_agent_manager=self.user_agent_manager,
            ssl_manager=self.ssl_manager,
            tls_adapter=self.tls_adapter,
            user_agent=user_agent,
            proxies=proxies,
            cookies=cookies,
            ciphers=ciphers,
            certification=certification,
            ssl_options=ssl_options,
        )
        # The network manager is the source of truth for these settings.
        self.tls_adapter = self.network_manager.tls_adapter
        self.ciphers = self.network_manager.ciphers
        self.certification = self.network_manager.certification
        self.ssl_options = self.network_manager.ssl_options
        self.proxies = self.network_manager.proxies
        self.cookies = self.network_manager.cookies
        self.timeout = timeout

        # --- session and login settings ----------------------------------
        self.session = session or requests.session()
        self.auth = auth
        self.spec_login = spec_login
        self.password = password
        self.email = email
        self.checkbox = checkbox
        self.dropdown = dropdown
        self.login_url = login_url
        self.login_user_agent = login_user_agent
        self.login_referer = login_referer
        self.protocol = protocol or 'https://'
        self.stream = stream if isinstance(stream, bool) else False
        # Keep whatever session the login helpers hand back.
        self.session = self.initialize_session()

        # --- request pacing ----------------------------------------------
        self.last_request_time = last_request_time
        self.max_retries = max_retries or 3
        self.request_wait_limit = request_wait_limit or 1.5

        # --- response state ----------------------------------------------
        self._response = None
        self.source_code = None
        self.source_code_bytes = None
        self.source_code_json = {}
        self.react_source_code = []
        self._response_data = None
        # Reuse caller-supplied source instead of fetching the page again.
        if isinstance(source_code, bytes) and source_code:
            self.source_code_bytes = source_code
            self._response = source_code.decode('utf-8', errors='replace')
        elif isinstance(source_code, str) and source_code:
            self._response = source_code
        self.process_response_data()

    def update_url_mgr(self, url_mgr):
        """Swap in a new URL manager and fetch/process its URL."""
        self.url_mgr = url_mgr
        self._url_mgr = url_mgr
        self._url = self.url
        self.re_initialize()

    def update_url(self, url):
        """Point the current URL manager at ``url`` and fetch/process it."""
        self.url_mgr.update_url(url=url)
        self._url = self.url
        self.re_initialize()

    def re_initialize(self):
        """Discard the cached response, request the URL again and re-process it."""
        self._response = None
        self.source_code = None
        self.source_code_bytes = None
        self.source_code_json = {}
        self.react_source_code = []
        self._response_data = None
        self.make_request()
        self.process_response_data()

    @property
    def response(self):
        """Cached response; fetched on first access when nothing is cached."""
        if self._response is None:
            self._response = self.fetch_response()
        return self._response

    def authenticate(self, session, login_url=None, email=None, password=None, checkbox=None, dropdown=None):
        """
        Log in by posting form data to the first login page exposing a token.

        Each candidate URL is fetched in turn; the loop stops at the first
        page containing ``<input name="token">``. The form data (only the
        values that are not None) is posted to the last URL that was tried.

        Args:
            session: Session used for the GET/POST requests.
            login_url (str, list or None): Candidate URL(s). Defaults to the
                managed URL, its domain, and the domain's ``login`` and
                ``auth`` paths.
            email, password, checkbox, dropdown: Optional form values.

        Returns:
            The session that was passed in.
        """
        candidates = login_url or [
            self.url_mgr.url,
            self.url_mgr.domain,
            self.url_mgr.url_join(url=self.url_mgr.domain, path='login'),
            self.url_mgr.url_join(url=self.url_mgr.domain, path='auth'),
        ]
        if isinstance(candidates, tuple):
            candidates = list(candidates)
        elif not isinstance(candidates, list):
            candidates = [candidates]

        token = None
        target_url = None
        for candidate in candidates:
            target_url = urlManager(candidate).url
            page = session.get(target_url)
            soup = BeautifulSoup(page.content, "html.parser")
            # Find the token or any CSRF protection token
            token_input = soup.find('input', {'name': 'token'})
            token = token_input.get('value') if token_input is not None else None
            if token is not None:
                break

        fields = (
            ('email', email),
            ('password', password),
            ('checkbox', checkbox),
            ('dropdown', dropdown),
            ('token', token),
        )
        login_data = {key: value for key, value in fields if value is not None}
        session.post(target_url, data=login_data)
        return session

    def fetch_response(self) -> "Union[requests.Response, str, None]":
        """Fetch the response for the managed URL (delegates to ``try_request``)."""
        return self.try_request()

    def spec_auth(self, session=None, email=None, password=None, login_url=None, login_referer=None, login_user_agent=None):
        """
        Log in by posting ``email``/``pass`` to the login page's form action.

        Args:
            session: Session to use; a new one is created when omitted.
            email, password: Credentials sent as ``email`` and ``pass``.
            login_url (str or None): Login page; defaults to the managed
                domain's ``login`` path.
            login_referer (str or None): Referer header for the POST.
            login_user_agent (str or None): User-Agent header for the POST.

        Returns:
            The session used for the login.
        """
        from urllib.parse import urljoin

        s = session or requests.session()

        if login_url is None:
            domain = self.url_mgr.url_join(self.url_mgr.get_correct_url(self.url_mgr.domain), 'login')
        else:
            domain = login_url
        login_url = self.url_mgr.get_correct_url(url=domain)

        login_referer = login_referer or self.url_mgr.url_join(url=login_url, path='?role=fast&to=&s=1&m=1&email=YOUR_EMAIL')
        login_user_agent = login_user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0'

        headers = {"Referer": login_referer, 'User-Agent': login_user_agent}
        payload = {'email': email, 'pass': password}

        page = s.get(login_url)
        soup = BeautifulSoup(page.content, 'lxml')
        form = soup.find('form')
        action = form.get('action') if form is not None else None
        # A missing form/action posts back to the login page itself; a
        # relative action is resolved against the login page URL.
        action_url = urljoin(login_url, action) if action else login_url
        s.post(action_url, data=payload, headers=headers)
        return s

    def initialize_session(self):
        """
        Prepare the session: log in if requested, then apply proxies,
        cookies, headers and the TLS adapter.

        Returns:
            The configured session.
        """
        s = self.session
        if self.auth:
            if isinstance(self.auth, requests.Session):
                # An already authenticated session replaces the current one.
                s = self.auth
            else:
                # Credentials belong on the session, they are not a session.
                s.auth = self.auth
        elif self.spec_login:
            s = self.spec_auth(session=s, email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
        elif any([self.password, self.email, self.login_url, self.checkbox, self.dropdown]):
            s = self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)

        s.proxies = self.proxies
        cookies = self.network_manager.cookies
        if isinstance(cookies, str):
            # A bare string is used as the ``cf_clearance`` cookie value.
            s.cookies["cf_clearance"] = cookies
        elif cookies is not None:
            # A mapping or cookie jar is merged cookie by cookie.
            s.cookies.update(cookies)
        s.headers.update(self.headers)
        s.mount(self.protocol, self.network_manager.tls_adapter)
        return s

    def process_response_data(self):
        """
        Populate ``source_code``, ``source_code_bytes``, ``source_code_json``
        and ``react_source_code`` from the current response.

        Does nothing when the response is falsy.
        """
        response = self.response
        if not response:
            return  # No data to process
        if isinstance(response, str):
            self.source_code = response
        else:
            self.source_code = response.text
            self.source_code_bytes = response.content
            content_type = response.headers.get('content-type') or ''
            # Ignore parameters such as "; charset=utf-8".
            if content_type.split(';')[0].strip().lower() == 'application/json':
                data = convert_to_json(self.source_code)
                if data:
                    self.source_code_json = data.get("response", data) if isinstance(data, dict) else data

        self.get_react_source_code()

    def get_react_source_code(self) -> list:
        """
        Collect the contents of JavaScript/JSX ``<script>`` tags from the
        current source into ``react_source_code``.

        Returns:
            list: ``[]`` when no URL is set, otherwise ``react_source_code``
            with the inline script contents appended.
        """
        if self.url_mgr.url is None:
            return []
        # String responses carry no bytes, so fall back to the text source.
        markup = self.source_code_bytes if self.source_code_bytes is not None else self.source_code
        if not markup:
            return self.react_source_code
        soup = BeautifulSoup(markup, "html.parser")
        script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
        for script_tag in script_tags:
            # ``.string`` is None for scripts without inline content.
            if script_tag.string is not None:
                self.react_source_code.append(script_tag.string)
        return self.react_source_code

    def get_status(self, url: str = None) -> "Union[int, None]":
        """
        Get the HTTP status code for a URL.

        Args:
            url (str or None): URL to check. None (or the managed URL) uses
                ``try_request``; any other URL is fetched with the session.

        Returns:
            int or None: The status code, or None when the request fails or
            the response carries no status code.
        """
        if url is None or url == self.url_mgr.url:
            response = self.try_request()
        else:
            try:
                response = self.session.get(url, timeout=self.timeout)
            except requests.RequestException as e:
                logging.error(f"Request exception for URL {url}: {e}")
                return None
        return getattr(response, "status_code", None)

    def wait_between_requests(self):
        """
        Wait between requests based on the request_wait_limit.
        """
        if self.last_request_time:
            sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
            if sleep_time > 0:
                logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
                get_sleep(sleep_time)

    def make_request(self):
        """
        Request the managed URL with retries and cache the result.

        Up to ``max_retries`` attempts are made through ``try_request``. A
        response with status 200, or one without a status code, is cached
        and returned. Status 429 waits before the next attempt. When every
        attempt fails, ``get_selenium_source`` is tried once more.

        Returns:
            The response, or None when nothing could be retrieved.
        """
        self.wait_between_requests()
        target = self.url_mgr.url
        for _ in range(self.max_retries):
            try:
                response = self.try_request()
            except requests.Timeout as e:
                logging.error(f"Request to {target} timed out: {e}")
                continue
            except requests.ConnectionError:
                logging.error(f"Connection error for URL {target}.")
                continue
            except requests.RequestException as e:
                logging.error(f"Request exception for URL {target}: {e}")
                continue
            if not response:
                continue
            status = getattr(response, "status_code", None)
            if status is None or status == 200:
                self._response = response
                self.last_request_time = get_time_stamp()
                return response
            if status == 429:
                logging.warning(f"Rate limited by {target}. Retrying...")
                get_sleep(5)  # adjust this based on the server's rate limit reset time

        # Last resort after all retries failed.
        try:
            response = get_selenium_source(target)
        except Exception as e:
            logging.error(f"Failed to retrieve content from {target} after {self.max_retries} retries: {e}")
            return None
        if response:
            self._response = response
            return response
        logging.error(f"Failed to retrieve content from {target} after {self.max_retries} retries.")
        return None

    def try_request(self) -> "Union[requests.Response, str, None]":
        """
        Make one attempt to retrieve the managed URL via ``get_selenium_source``.

        Returns:
            Whatever ``get_selenium_source`` returns, or None when a
            ``requests`` exception is raised.
        """
        try:
            return get_selenium_source(self.url_mgr.url)
        except requests.exceptions.RequestException as e:
            logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
            return None

    @property
    def url(self):
        """The URL currently held by the URL manager."""
        return self.url_mgr.url

    @url.setter
    def url(self, new_url):
        # Only ``_url`` is stored; the getter keeps reading from ``url_mgr``.
        self._url = new_url
336
class SafeRequestSingleton:
    """
    Hold one shared ``SafeRequest`` instance, rebuilt when settings change.

    NOTE(review): ``SafeRequest`` and ``urlManagerSingleton`` are not defined
    in this module; they are presumably provided by the star imports -
    confirm they still exist.
    """
    _instance = None

    @staticmethod
    def get_instance(url=None, headers: dict = None, max_retries=3, last_request_time=None, request_wait_limit=1.5):
        """
        Return the shared instance, creating a new one when none exists or
        when ``url``, ``headers``, ``max_retries`` or ``request_wait_limit``
        differ from the cached instance's values.
        """
        cached = SafeRequestSingleton._instance
        needs_new = cached is None or (
            cached.url != url
            or cached.headers != headers
            or cached.max_retries != max_retries
            or cached.request_wait_limit != request_wait_limit
        )
        if needs_new:
            SafeRequestSingleton._instance = SafeRequest(
                url,
                url_mgr=urlManagerSingleton,
                headers=headers,
                max_retries=max_retries,
                last_request_time=last_request_time,
                request_wait_limit=request_wait_limit,
            )
        return SafeRequestSingleton._instance
345
def get_req_mgr(url=None, url_mgr=None, source_code=None, req_mgr=None):
    """
    Return ``req_mgr`` when it is truthy, otherwise build a ``requestManager``
    for the resolved URL and URL manager.
    """
    resolved_url = get_url(url=url, url_mgr=url_mgr)
    resolved_url_mgr = get_url_mgr(url=resolved_url, url_mgr=url_mgr)
    if req_mgr:
        return req_mgr
    return requestManager(url_mgr=resolved_url_mgr, url=resolved_url, source_code=source_code)
350
def get_source(url=None, url_mgr=None, source_code=None, req_mgr=None):
    """
    Return the ``source_code`` attribute of the request manager obtained
    from ``get_req_mgr`` for the given arguments.
    """
    manager = get_req_mgr(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
    return manager.source_code