abstract-webtools 0.1.6.27__py3-none-any.whl → 0.1.6.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstract_webtools/managers/linkManager/__init__.py (new file)
@@ -0,0 +1 @@
1
+ from .linkManager import *
abstract_webtools/managers/linkManager/linkManager.py (new file)
@@ -0,0 +1,189 @@
1
+ from ..abstract_webtools import *
2
+ from .urlManager import *
3
+ from .requestManager import *
4
+ from .soupManager import *
5
+ def get_url(url=None,url_mgr=None):
6
+ if not url and not url_mgr:
7
+ return None
8
+ if url:
9
+ url_mgr = urlManager(url)
10
+ return url_mgr.url
11
+ def get_url_mgr(url=None,url_mgr=None):
12
+ if not url and not url_mgr:
13
+ return None
14
+ if url:
15
+ url_mgr = urlManager(url)
16
+ return url_mgr
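# Illustrative only (not part of the added file): these two helpers accept either
# a raw URL string or an existing urlManager instance and normalize both cases, e.g.
#   mgr = get_url_mgr(url="https://example.com")   # builds a urlManager for the string
#   url = get_url(url_mgr=mgr)                     # returns mgr.url unchanged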
17
+ class linkManager:
18
+ """
19
+ LinkManager is a class for managing and extracting links and image links from a web page.
20
+
21
+ Args:
22
+ url (str): The URL of the web page (default is "https://example.com").
23
+ source_code (str or None): The source code of the web page (default is None).
24
+ url_mgr (UrlManager or None): An instance of UrlManager (default is None).
25
+ req_mgr (requestManager or None): An instance of requestManager (default is None).
26
+ soup_manager (SoupManager or None): An instance of SoupManager (default is None).
27
+ image_link_tags (str): HTML tags to identify image links (default is 'img').
28
+ img_link_attrs (str): HTML attributes to identify image link URLs (default is 'src').
29
+ link_tags (str): HTML tags to identify links (default is 'a').
30
+ link_attrs (str): HTML attributes to identify link URLs (default is 'href').
31
+ strict_order_tags (bool): Flag to indicate if tags and attributes should be matched strictly (default is False).
32
+ img_attr_value_desired (list or None): Desired attribute values for image links (default is None).
33
+ img_attr_value_undesired (list or None): Undesired attribute values for image links (default is None).
34
+ link_attr_value_desired (list or None): Desired attribute values for links (default is None).
35
+ link_attr_value_undesired (list or None): Undesired attribute values for links (default is None).
36
+ associated_data_attr (list): HTML attributes to associate with the extracted links (default is ["data-title", 'alt', 'title']).
37
+ get_img (list): HTML attributes used to identify associated images (default is ["data-title", 'alt', 'title']).
38
+
39
+ Methods:
40
+ re_initialize(): Reinitialize the LinkManager with the current settings.
41
+ update_url_mgr(url_mgr): Update the URL manager with a new instance.
42
+ update_url(url): Update the URL and reinitialize the LinkManager.
43
+ update_source_code(source_code): Update the source code and reinitialize the LinkManager.
44
+ update_soup_manager(soup_manager): Update the SoupManager and reinitialize the LinkManager.
45
+ update_desired(...): Update the desired settings and reinitialize the LinkManager.
46
+ find_all_desired(...): Find all desired links or image links based on the specified criteria.
47
+ find_all_domain(): Find all unique domain names in the extracted links.
48
+
49
+ Note:
50
+ - The LinkManager class helps manage and extract links and image links from web pages.
51
+ - The class provides flexibility in specifying criteria for link extraction.
52
+ """
53
+ def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
54
+ self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
55
+ self.url=self.url_mgr.url
56
+ self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
57
+ self.source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
58
+ self.soup_mgr = get_soup_mgr(req_mgr=self.req_mgr,url_mgr=self.url_mgr,source_code = source_code)
59
+
60
+ self.strict_order_tags=strict_order_tags
61
+ self.image_link_tags=image_link_tags
62
+ self.img_link_attrs=img_link_attrs
63
+ self.link_tags=link_tags
64
+ self.link_attrs=link_attrs
65
+ self.img_attr_value_desired=img_attr_value_desired
66
+ self.img_attr_value_undesired=img_attr_value_undesired
67
+ self.link_attr_value_desired=link_attr_value_desired
68
+ self.link_attr_value_undesired=link_attr_value_undesired
69
+ self.associated_data_attr=associated_data_attr
70
+ self.get_img=get_img
71
+ self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,
72
+ attr=self.img_link_attrs,
73
+ attr_value_desired=self.img_attr_value_desired,
74
+ attr_value_undesired=self.img_attr_value_undesired)
75
+ self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,
76
+ attr=self.link_attrs,
77
+ attr_value_desired=self.link_attr_value_desired,
78
+ attr_value_undesired=self.link_attr_value_undesired,
79
+ associated_data_attr=self.associated_data_attr,
80
+ get_img=get_img)
81
+ def re_initialize(self):
82
+ self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,attr=self.img_link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.img_attr_value_desired,attr_value_undesired=self.img_attr_value_undesired)
83
+ self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,attr=self.link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.link_attr_value_desired,attr_value_undesired=self.link_attr_value_undesired,associated_data_attr=self.associated_data_attr,get_img=self.get_img)
84
+ def update_url_mgr(self,url_mgr):
85
+ self.url_mgr=url_mgr
86
+ self.url=self.url_mgr.url
87
+ self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
88
+ self.soup_mgr.update_url_mgr(url_mgr=self.url_mgr)
89
+ self.source_code=self.soup_mgr.source_code
90
+ self.re_initialize()
91
+ def update_url(self,url):
92
+ self.url=url
93
+ self.url_mgr.update_url(url=self.url)
94
+ self.url=self.url_mgr.url
95
+ self.req_mgr.update_url(url=self.url)
96
+ self.soup_mgr.update_url(url=self.url)
97
+ self.source_code=self.soup_mgr.source_code
98
+ self.re_initialize()
99
+ def update_source_code(self,source_code):
100
+ self.source_code=source_code
101
+ if self.source_code != self.soup_mgr.source_code:
102
+ self.soup_mgr.update_source_code(source_code=self.source_code)
103
+ self.re_initialize()
104
+ def update_soup_manager(self,soup_manager):
105
+ self.soup_mgr=soup_manager
106
+ self.source_code=self.soup_mgr.source_code
107
+ self.re_initialize()
108
+ def update_desired(self,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,image_link_tags=None,img_link_attrs=None,link_tags=None,link_attrs=None,strict_order_tags=None,associated_data_attr=None,get_img=None):
109
+ self.strict_order_tags = strict_order_tags or self.strict_order_tags
110
+ self.img_attr_value_desired=img_attr_value_desired or self.img_attr_value_desired
111
+ self.img_attr_value_undesired=img_attr_value_undesired or self.img_attr_value_undesired
112
+ self.link_attr_value_desired=link_attr_value_desired or self.link_attr_value_desired
113
+ self.link_attr_value_undesired=link_attr_value_undesired or self.link_attr_value_undesired
114
+ self.image_link_tags=image_link_tags or self.image_link_tags
115
+ self.img_link_attrs=img_link_attrs or self.img_link_attrs
116
+ self.link_tags=link_tags or self.link_tags
117
+ self.link_attrs=link_attrs or self.link_attrs
118
+ self.associated_data_attr=associated_data_attr or self.associated_data_attr
119
+ self.get_img=get_img or self.get_img
120
+ self.re_initialize()
121
+ def find_all_desired(self,tag='img',attr='src',strict_order_tags=False,attr_value_desired=None,attr_value_undesired=None,associated_data_attr=None,get_img=None):
122
+ def make_list(obj):
123
+ if isinstance(obj,list) or obj==None:
124
+ return obj
125
+ return [obj]
126
+ def get_desired_value(attr,attr_value_desired=None,attr_value_undesired=None):
127
+ if attr_value_desired:
128
+ for value in attr_value_desired:
129
+ if value not in attr:
130
+ return False
131
+ if attr_value_undesired:
132
+ for value in attr_value_undesired:
133
+ if value in attr:
134
+ return False
135
+ return True
136
+ attr_value_desired,attr_value_undesired,associated_data_attr,tags,attribs=make_list(attr_value_desired),make_list(attr_value_undesired),make_list(associated_data_attr),make_list(tag),make_list(attr)
137
+ desired_ls = []
138
+ assiciated_data=[]
139
+ for i,tag in enumerate(tags):
140
+ attribs_list=attribs
141
+ if strict_order_tags:
142
+ if len(attribs)<=i:
143
+ attribs_list=[None]
144
+ else:
145
+ attribs_list=make_list(attribs[i])
146
+ for attr in attribs_list:
147
+ for component in self.soup_mgr.soup.find_all(tag):
148
+ if attr in component.attrs and get_desired_value(attr=component[attr],attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired):
149
+ if component[attr] not in desired_ls:
150
+ desired_ls.append(component[attr])
151
+ assiciated_data.append({"value":component[attr]})
152
+ if associated_data_attr:
153
+ for data in associated_data_attr:
154
+ if data in component.attrs:
155
+ assiciated_data[-1][data]=component.attrs[data]
156
+ if get_img and component.attrs[data]:
157
+ if data in get_img and len(component.attrs[data])!=0:
158
+ for each in self.soup_mgr.soup.find_all('img'):
159
+ if 'alt' in each.attrs:
160
+ if each.attrs['alt'] == component.attrs[data] and 'src' in each.attrs:
161
+ assiciated_data[-1]['image']=each.attrs['src']
162
+ desired_ls.append(assiciated_data)
163
+ return desired_ls
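# Illustrative only (not part of the added file): every element of the returned list
# except the last is a matched attribute value (an href/src string); the final element
# is the list of per-match metadata dicts built above. For two matched <a> tags the
# shape is roughly:
#   ['https://example.com/a', 'https://example.com/b',
#    [{'value': 'https://example.com/a', 'title': '...'}, {'value': 'https://example.com/b'}]]
# find_all_desired_links() below depends on this convention via all_desired[-1].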
164
+ def find_all_domain(self):
165
+ domain = urlparse(self.url_mgr.url).netloc
166
+ domains_ls=[self.url_mgr.url]
167
+ for url in self.all_desired_links[:-1]:
168
+ if self.url_mgr.is_valid_url(url):
169
+ parse = urlparse(url)
170
+ comp_domain = parse.netloc
171
+ if url not in domains_ls and comp_domain == domain:
172
+ domains_ls.append(url)
173
+ return domains_ls
174
+ def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
175
+ all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
176
+ assiciated_attrs = all_desired[-1]
177
+ valid_assiciated_attrs = []
178
+ desired_links=[]
179
+ for i,attr in enumerate(all_desired[:-1]):
180
+ valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
181
+ if valid_attr:
182
+ desired_links.append(valid_attr)
183
+ valid_assiciated_attrs.append(assiciated_attrs[i])
184
+ valid_assiciated_attrs[-1]["link"]=valid_attr
185
+ desired_links.append(valid_assiciated_attrs)
186
+ return desired_links
187
+
188
+
189
+
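A minimal usage sketch for the new linkManager module (illustrative, not part of the diff; it assumes the package is installed with the module path shown in the RECORD section below, plus network access and the Selenium setup that requestManager relies on):

    from abstract_webtools.managers.linkManager.linkManager import linkManager

    link_mgr = linkManager("https://example.com")         # fetches the page and pre-computes link lists
    page_links = link_mgr.all_desired_links[:-1]          # last element holds the metadata dicts
    image_links = link_mgr.all_desired_image_links[:-1]
    same_domain = link_mgr.find_all_domain()              # URLs on the same domain as the page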
abstract_webtools/managers/requestManager/__init__.py (new file)
@@ -0,0 +1 @@
1
+ from .requestManager import *
abstract_webtools/managers/requestManager/requestManager.py (new file)
@@ -0,0 +1,353 @@
1
+ from ..abstract_webtools import *
2
+
3
+ from .userAgentManager import *
4
+ from .cipherManager import *
5
+ from .sslManager import *
6
+ from .tlsAdapter import *
7
+ from .networkManager import *
8
+ from .seleniumManager import *
9
+ from .urlManager import *
10
+ class requestManager:
11
+ """
12
+ requestManager (referred to as SafeRequest in the notes below) is a class for making HTTP requests with error handling and retries.
13
+
14
+ Args:
15
+ url (str or None): The URL to make requests to (default is None).
16
+ url_mgr (urlManager or None): An instance of urlManager (default is None).
17
+ network_manager (NetworkManager or None): An instance of NetworkManager (default is None).
18
+ user_agent_manager (UserAgentManager or None): An instance of UserAgentManager (default is None).
19
+ ssl_manager (SSLManager or None): An instance of SSLManager (default is None).
20
+ tls_adapter (TLSAdapter or None): An instance of TLSAdapter (default is None).
21
+ user_agent (str or None): The user agent string to use for requests (default is None).
22
+ proxies (dict or None): Proxy settings for requests (default is None).
23
+ headers (dict or None): Additional headers for requests (default is None).
24
+ cookies (dict or None): Cookie settings for requests (default is None).
25
+ session (requests.Session or None): A custom requests session (default is None).
26
+ adapter (str or None): A custom adapter for requests (default is None).
27
+ protocol (str or None): The protocol to use for requests (default is 'https://').
28
+ ciphers (str or None): Cipher settings for requests (default is None).
29
+ auth (tuple or None): Authentication credentials (default is None).
30
+ login_url (str or None): The URL for authentication (default is None).
31
+ email (str or None): Email for authentication (default is None).
32
+ password (str or None): Password for authentication (default is None).
33
+ certification (str or None): Certification settings for requests (default is None).
34
+ ssl_options (str or None): SSL options for requests (default is None).
35
+ stream (bool): Whether to stream the response content (default is False).
36
+ timeout (float or None): Timeout for requests (default is None).
37
+ last_request_time (float or None): Timestamp of the last request (default is None).
38
+ max_retries (int or None): Maximum number of retries for requests (default is None).
39
+ request_wait_limit (float or None): Wait time between requests (default is None).
40
+
41
+ Methods:
42
+ update_url_mgr(url_mgr): Update the URL manager and reinitialize the SafeRequest.
43
+ update_url(url): Update the URL and reinitialize the SafeRequest.
44
+ re_initialize(): Reinitialize the SafeRequest with the current settings.
45
+ authenticate(s, login_url=None, email=None, password=None, checkbox=None, dropdown=None): Authenticate and make a request.
46
+ fetch_response(): Fetch the response from the server.
47
+ initialize_session(): Initialize the requests session with custom settings.
48
+ process_response_data(): Process the fetched response data.
49
+ get_react_source_code(): Extract JavaScript and JSX source code from <script> tags.
50
+ get_status(url=None): Get the HTTP status code of a URL.
51
+ wait_between_requests(): Wait between requests based on the request_wait_limit.
52
+ make_request(): Make a request and handle potential errors.
53
+ try_request(): Try to make an HTTP request using the provided session.
54
+
55
+ Note:
56
+ - The SafeRequest class is designed for making HTTP requests with error handling and retries.
57
+ - It provides methods for authentication, response handling, and error management.
58
+ """
59
+ def __init__(self,
60
+ url=None,
61
+ source_code=None,
62
+ url_mgr=None,
63
+ network_manager=None,
64
+ user_agent_manager=None,
65
+ ssl_manager=None,
66
+ ssl_options=None,
67
+ tls_adapter=None,
68
+ user_agent=None,
69
+ proxies=None,
70
+ headers=None,
71
+ cookies=None,
72
+ session=None,
73
+ adapter=None,
74
+ protocol=None,
75
+ ciphers=None,
76
+ spec_login=False,
77
+ login_referer=None,
78
+ login_user_agent=None,
79
+ auth=None,
80
+ login_url=None,
81
+ email = None,
82
+ password=None,
83
+ checkbox=None,
84
+ dropdown=None,
85
+ certification=None,
86
+ stream=False,
87
+ timeout = None,
88
+ last_request_time=None,
89
+ max_retries=None,
90
+ request_wait_limit=None):
92
+ self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
93
+ self.url=get_url(url=url,url_mgr=self.url_mgr)
94
+ self._url_mgr = self.url_mgr
95
+ self._url=self.url
96
+ self.user_agent = user_agent
97
+ self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
98
+ self.headers= headers or self.user_agent_manager.header or {'Accept': '*/*'}
99
+ self.user_agent= self.user_agent_manager.user_agent
100
+ self.ciphers=ciphers or CipherManager().ciphers_string
101
+ self.certification=certification
102
+ self.ssl_options=ssl_options
103
+ self.ssl_manager = ssl_manager or SSLManager(ciphers=self.ciphers, ssl_options=self.ssl_options, certification=self.certification)
104
+ self.tls_adapter=tls_adapter or TLSAdapter(ssl_manager=self.ssl_manager,certification=self.certification,ssl_options=self.ssl_manager.ssl_options)
105
+ self.network_manager= network_manager or NetworkManager(user_agent_manager=self.user_agent_manager,ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter,user_agent=user_agent,proxies=proxies,cookies=cookies,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
106
+ self.stream=stream
107
+ self.tls_adapter=self.network_manager.tls_adapter
108
+ self.ciphers=self.network_manager.ciphers
109
+ self.certification=self.network_manager.certification
110
+ self.ssl_options=self.network_manager.ssl_options
111
+ self.proxies=self.network_manager.proxies
112
+ self.timeout=timeout
113
+ self.cookies=self.network_manager.cookies
114
+ self.session = session or requests.session()
115
+ self.auth = auth
116
+ self.spec_login=spec_login
117
+ self.password=password
118
+ self.email = email
119
+ self.checkbox=checkbox
120
+ self.dropdown=dropdown
121
+ self.login_url=login_url
122
+ self.login_user_agent=login_user_agent
123
+ self.login_referer=login_referer
124
+ self.protocol=protocol or 'https://'
125
+
126
+ self.stream=stream if isinstance(stream,bool) else False
127
+ self.initialize_session()
128
+ self.last_request_time=last_request_time
129
+ self.max_retries = max_retries or 3
130
+ self.request_wait_limit = request_wait_limit or 1.5
131
+ self._response=None
132
+ self.source_code = get_selenium_source(self.url)
133
+ self.source_code_bytes=None
134
+ self.source_code_json = {}
135
+ self.react_source_code=[]
136
+ self._response_data = None
137
+ self.process_response_data()
138
+ def update_url_mgr(self,url_mgr):
139
+ self.url_mgr=url_mgr
140
+ self.re_initialize()
141
+ def update_url(self,url):
142
+ self.url_mgr.update_url(url=url)
143
+ self.re_initialize()
144
+ def re_initialize(self):
145
+ self._response=None
146
+ self.make_request()
147
+ self.source_code = None
148
+ self.source_code_bytes=None
149
+ self.source_code_json = {}
150
+ self.react_source_code=[]
151
+ self._response_data = None
152
+ self.process_response_data()
153
+ @property
154
+ def response(self):
155
+ """Lazy-loading of response."""
156
+ if self._response is None:
157
+ self._response = self.fetch_response()
158
+
159
+ return self._response
160
+ def authenticate(self,session, login_url=None, email=None, password=None,checkbox=None,dropdown=None):
161
+ login_urls = login_url or [self.url_mgr.url,self.url_mgr.domain,self.url_mgr.url_join(url=self.url_mgr.domain,path='login'),self.url_mgr.url_join(url=self.url_mgr.domain,path='auth')]
162
+ s = session
163
+ if not isinstance(login_urls,list):
164
+ login_urls=[login_urls]
165
+ for login_url in login_urls:
166
+ login_url_mgr = urlManager(login_url)
167
+ login_url = login_url_mgr.url
168
+
169
+ r = s.get(login_url)
170
+ soup = BeautifulSoup(r.content, "html.parser")
171
+ # Find the token or any CSRF protection token
172
+ token = soup.find('input', {'name': 'token'}).get('value') if soup.find('input', {'name': 'token'}) else None
173
+ if token != None:
174
+ break
175
+ login_data = {}
176
+ if email != None:
177
+ login_data['email']=email
178
+ if password != None:
179
+ login_data['password'] = password
180
+ if checkbox != None:
181
+ login_data['checkbox'] = checkbox
182
+ if dropdown != None:
183
+ login_data['dropdown']=dropdown
184
+ if token != None:
185
+ login_data['token'] = token
186
+ s.post(login_url, data=login_data)
187
+ return s
188
+
189
+ def fetch_response(self) -> Union[requests.Response, None]:
190
+ """Actually fetches the response from the server."""
191
+ # You can further adapt this method to use retries or other logic you had
192
+ # in your original code, but the main goal here is to fetch and return the response
193
+ return self.try_request()
194
+ def spec_auth(self, session=None, email=None, password=None, login_url=None, login_referer=None, login_user_agent=None):
195
+ s = session or requests.session()
196
+
197
+ domain = self.url_mgr.url_join(self.url_mgr.get_correct_url(self.url_mgr.domain),'login') if login_url is None else login_url
198
+ login_url = self.url_mgr.get_correct_url(url=domain)
199
+
200
+ login_referer = login_referer or self.url_mgr.url_join(url=login_url, path='?role=fast&to=&s=1&m=1&email=YOUR_EMAIL')
201
+ login_user_agent = login_user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0'
202
+
203
+ headers = {"Referer": login_referer, 'User-Agent': login_user_agent}
204
+ payload = {'email': email, 'pass': password}
205
+
206
+ page = s.get(login_url)
207
+ soup = BeautifulSoup(page.content, 'lxml')
208
+ action_url = soup.find('form')['action']
209
+ s.post(action_url, data=payload, headers=headers)
210
+ return s
211
+ def initialize_session(self):
212
+ s = self.session
213
+ if self.auth:
214
+ s= self.auth
215
+ elif self.spec_login:
216
+ s=self.spec_auth(session=s,email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
217
+ elif any([self.password, self.email, self.login_url, self.checkbox, self.dropdown]):
218
+ s=self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)
219
+ s.proxies = self.proxies
220
+ s.cookies["cf_clearance"] = self.network_manager.cookies
221
+ s.headers.update(self.headers)
222
+ s.mount(self.protocol, self.network_manager.tls_adapter)
223
+ return s
224
+ def process_response_data(self):
225
+ """Processes the fetched response data."""
226
+ if not self.response:
227
+ return # No data to process
228
+ if isinstance(self.response,str):
229
+ self.source_code = self.response
230
+ else:
231
+ self.source_code = self.response.text
232
+ self.source_code_bytes = self.response.content
233
+ if self.response.headers.get('content-type') == 'application/json':
234
+ data = convert_to_json(self.source_code)
235
+ if data:
236
+ self.source_code_json = data.get("response", data)
237
+
238
+ self.get_react_source_code()
239
+ def get_react_source_code(self) -> list:
240
+ """
241
+ Fetches the source code of the specified URL and extracts JavaScript and JSX source code (React components).
242
+
243
+ Args:
244
+ url (str): The URL to fetch the source code from.
245
+
246
+ Returns:
247
+ list: A list of strings containing JavaScript and JSX source code found in <script> tags.
248
+ """
249
+ if self.url_mgr.url is None:
250
+ return []
251
+ soup = BeautifulSoup(self.source_code_bytes or self.source_code or "", "html.parser")
252
+ script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t or 'jsx' in t))
253
+ for script_tag in script_tags:
254
+ self.react_source_code.append(script_tag.string)
255
+
256
+
257
+ def get_status(self, url:str=None) -> int:
258
+ """
259
+ Gets the HTTP status code of the given URL.
260
+
261
+ Args:
262
+ url (str): The URL to check the status of.
263
+
264
+ Returns:
265
+ int: The HTTP status code of the URL, or None if the request fails.
266
+ """
267
+ # Get the status code of the URL
268
+ response = self.try_request()
+ return getattr(response, 'status_code', None)
269
+ def wait_between_requests(self):
270
+ """
271
+ Wait between requests based on the request_wait_limit.
272
+ """
273
+ if self.last_request_time:
274
+ sleep_time = self.request_wait_limit - (get_time_stamp() - self.last_request_time)
275
+ if sleep_time > 0:
276
+ logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
277
+ get_sleep(sleep_time)
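# Worked example (illustrative only): with the default request_wait_limit of 1.5 seconds
# and a previous request made 0.4 s ago, sleep_time = 1.5 - 0.4 = 1.1, so the next request
# waits about 1.1 s; once more than 1.5 s have elapsed, sleep_time is negative and no wait occurs.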
278
+
279
+ def make_request(self):
280
+ """
281
+ Make a request and handle potential errors.
282
+ """
283
+ # Update the instance attributes if they are passed
284
+
285
+ self.wait_between_requests()
286
+ for _ in range(self.max_retries):
287
+ try:
288
+ self.try_request() # 10 seconds timeout
289
+ if self.response:
290
+ if self.response.status_code == 200:
291
+ self.last_request_time = get_time_stamp()
292
+ return self.response
293
+ elif self.response.status_code == 429:
294
+ logging.warning(f"Rate limited by {self.url_mgr.url}. Retrying...")
295
+ get_sleep(5) # adjust this based on the server's rate limit reset time
296
+ except requests.Timeout as e:
297
+ logging.error(f"Request to {cleaned_url} timed out: {e}")
298
+ except requests.ConnectionError:
299
+ logging.error(f"Connection error for URL {self.url_mgr.url}.")
300
+ except requests.Timeout:
301
+ logging.error(f"Request timeout for URL {self.url_mgr.url}.")
302
+ except requests.RequestException as e:
303
+ logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
304
+ try:
305
+ response = get_selenium_source(self.url_mgr.url)
306
+ if response:
307
+ self._response = response
308
+ return self.response
309
+ except:
310
+ logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries.")
311
+ return None
312
+ def try_request(self) -> Union[requests.Response, None]:
313
+ """
314
+ Tries to make an HTTP request to the given URL using the provided session.
315
+
316
+ Args:
317
+ timeout (int): Timeout for the request.
318
+
319
+ Returns:
320
+ requests.Response or None: The response object if the request is successful, or None if the request fails.
321
+ """
322
+ try:
323
+ return get_selenium_source(self.url_mgr.url)#self.session.get(url=self.url_mgr.url, timeout=self.timeout,stream=self.stream)
324
+ except requests.exceptions.RequestException as e:
325
+ print(e)
326
+ return None
327
+
328
+
329
+ @property
330
+ def url(self):
331
+ return self.url_mgr.url
332
+
333
+ @url.setter
334
+ def url(self, new_url):
335
+ self._url = new_url
336
+ class SafeRequestSingleton:
337
+ _instance = None
338
+ @staticmethod
339
+ def get_instance(url=None,headers:dict=None,max_retries=3,last_request_time=None,request_wait_limit=1.5):
340
+ if SafeRequestSingleton._instance is None:
341
+ SafeRequestSingleton._instance = SafeRequest(url,url_mgr=urlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
342
+ elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
343
+ SafeRequestSingleton._instance = SafeRequest(url,url_mgr=urlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
344
+ return SafeRequestSingleton._instance
345
+ def get_req_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None):
346
+ url = get_url(url=url,url_mgr=url_mgr)
347
+ url_mgr = get_url_mgr(url=url,url_mgr=url_mgr )
348
+ req_mgr = req_mgr or requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
349
+ return req_mgr
350
+ def get_source(url=None,url_mgr=None,source_code=None,req_mgr=None):
351
+ # Placeholder for actual implementation.
352
+ req_mgr = get_req_mgr(req_mgr=req_mgr,url=url,url_mgr=url_mgr,source_code=source_code)
353
+ return req_mgr.source_code
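A usage sketch for the request layer (illustrative, not part of the diff; the import path is taken from the RECORD section below, and try_request() fetches pages through get_selenium_source, so a working Selenium environment is assumed):

    from abstract_webtools.managers.requestManager.requestManager import get_req_mgr, get_source

    req_mgr = get_req_mgr(url="https://example.com")    # builds a urlManager plus requestManager
    html = req_mgr.source_code                          # text of the fetched page
    status = req_mgr.get_status()                       # HTTP status code, or None on failure
    html_only = get_source(url="https://example.com")   # shortcut when only the markup is needed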
abstract_webtools/managers/soupManager/__init__.py (new file)
@@ -0,0 +1 @@
1
+ from .soupManager import *
abstract_webtools/managers/soupManager/soupManager.py (new file)
@@ -0,0 +1,362 @@
1
+ from ..abstract_webtools import *
2
+ from .urlManager import *
3
+ from .requestManager import *
4
+ class soupManager:
5
+ """
6
+ SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
7
+
8
+ Args:
9
+ url (str or None): The URL to be parsed (default is None).
10
+ source_code (str or None): The HTML source code (default is None).
11
+ url_mgr (UrlManager or None): An instance of UrlManager (default is None).
12
+ req_mgr (requestManager or None): An instance of requestManager (default is None).
13
+ parse_type (str): The type of parser to be used by BeautifulSoup (default is "html.parser").
14
+
15
+ Methods:
16
+ re_initialize(): Reinitialize the SoupManager with the current settings.
17
+ update_url(url): Update the URL and reinitialize the SoupManager.
18
+ update_source_code(source_code): Update the source code and reinitialize the SoupManager.
19
+ update_requestManager(requestManager): Update the request manager and reinitialize the SoupManager.
20
+ update_url_mgr(url_mgr): Update the URL manager and reinitialize the SoupManager.
21
+ update_parse_type(parse_type): Update the parsing type and reinitialize the SoupManager.
22
+ all_links: A property that provides access to all discovered links.
23
+ _all_links_get(): A method to load all discovered links.
24
+ get_all_website_links(tag="a", attr="href"): Get all URLs belonging to the same website.
25
+ meta_tags: A property that provides access to all discovered meta tags.
26
+ _meta_tags_get(): A method to load all discovered meta tags.
27
+ get_meta_tags(): Get all meta tags in the source code.
28
+ find_all(element, soup=None): Find all instances of an HTML element in the source code.
29
+ get_class(class_name, soup=None): Get the specified class from the HTML source code.
30
+ has_attributes(tag, *attrs): Check if an HTML tag has the specified attributes.
31
+ get_find_all_with_attributes(*attrs): Find all HTML tags with specified attributes.
32
+ get_all_desired_soup(tag=None, attr=None, attr_value=None): Get HTML tags based on specified criteria.
33
+ extract_elements(url, tag=None, class_name=None, class_value=None): Extract portions of source code based on filters.
34
+ find_all_with_attributes(class_name=None, *attrs): Find classes with associated href or src attributes.
35
+ get_images(tag_name, class_name, class_value): Get images with specific class and attribute values.
36
+ discover_classes_and_meta_images(tag_name, class_name_1, class_name_2, class_value, attrs): Discover classes and meta images.
37
+
38
+ Note:
39
+ - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
40
+ - It provides various methods to extract data and discover elements within the source code.
41
+ """
42
+ def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None, parse_type="html.parser"):
43
+ self.soup=[]
44
+ url = get_url(url=url,url_mgr=url_mgr)
45
+ self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
46
+ self.url=self.url_mgr.url
47
+ self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
48
+ self.parse_type = parse_type
49
+ source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
50
+ if source_code:
51
+ source_code = source_code.decode('utf-8', 'ignore') if isinstance(source_code, bytes) else str(source_code)
52
+ self.source_code = source_code
53
+ self.soup= BeautifulSoup(self.source_code, self.parse_type)
54
+ self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
55
+ self.all_tags = self.all_tags_and_attribute_names.get('tags')
56
+ self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
57
+ self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
58
+
59
+ self._all_links_data = None
60
+ self._meta_tags_data = None
61
+ def re_initialize(self):
62
+ self.soup= BeautifulSoup(self.source_code, self.parse_type)
63
+ self._all_links_data = None
64
+ self._meta_tags_data = None
65
+ def update_url(self,url):
66
+ self.url_mgr.update_url(url=url)
67
+ self.url=self.url_mgr.url
68
+ self.req_mgr.update_url(url=url)
69
+ self.source_code = self.req_mgr.source_code_bytes
70
+ self.re_initialize()
71
+ def update_source_code(self,source_code):
72
+ if source_code:
73
+ source_code = source_code.decode('utf-8', 'ignore') if isinstance(source_code, bytes) else str(source_code)
74
+ self.source_code = source_code
75
+ self.re_initialize()
76
+ def update_requestManager(self,requestManager):
77
+ self.req_mgr = requestManager
78
+ self.url_mgr=self.req_mgr.url_mgr
79
+ self.url=self.url_mgr.url
80
+ self.source_code = self.req_mgr.source_code_bytes
81
+ self.re_initialize()
82
+ def update_url_mgr(self,url_mgr):
83
+ self.url_mgr=url_mgr
84
+ self.url=self.url_mgr.url
85
+ self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
86
+ self.source_code = self.req_mgr.source_code_bytes
87
+ self.re_initialize()
88
+ def update_parse_type(self,parse_type):
89
+ self.parse_type=parse_type
90
+ self.re_initialize()
91
+ @property
92
+ def all_links(self):
93
+ """This is a property that provides access to the _all_links_data attribute.
94
+ The first time it's accessed, it will load the data."""
95
+ if self._all_links_data is None:
96
+ print("Loading all links for the first time...")
97
+ self._all_links_data = self._all_links_get()
98
+ return self._all_links_data
99
+ def _all_links_get(self):
100
+ """A method that loads the data (can be replaced with whatever data loading logic you have)."""
101
+ return self.get_all_website_links()
102
+ def get_all_website_links(self,tag="a",attr="href") -> list:
103
+ """
104
+ Returns all URLs that are found on the specified URL and belong to the same website.
105
+
106
+ Args:
107
+ url (str): The URL to search for links.
108
+
109
+ Returns:
110
+ list: A list of URLs that belong to the same website as the specified URL.
111
+ """
112
+ all_urls=[self.url_mgr.url]
113
+ domain = self.url_mgr.domain
114
+ all_desired=self.find_tags_by_attributes(tag=tag,attr=attr)
115
+ for tag in all_desired:
116
+ href = tag.attrs.get(attr)
117
+ if href == "" or href is None:
118
+ # href empty tag
119
+ continue
120
+ href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
121
+ if not self.url_mgr.is_valid_url(href):
122
+ # not a valid URL
123
+ continue
124
+ if href in all_urls:
125
+ # already in the set
126
+ continue
127
+ if domain not in href:
128
+ # external link
129
+ continue
130
+ all_urls.append(href)
131
+
132
+ return all_urls
133
+
134
+
135
+ @property
136
+ def meta_tags(self):
137
+ """This is a property that provides access to the _all_links_data attribute.
138
+ The first time it's accessed, it will load the data."""
139
+ if self._meta_tags_data is None:
140
+ print("Loading all links for the first time...")
141
+ self._meta_tags_data = self._meta_tags_get()
142
+ return self._meta_tags_data
143
+ def _meta_tags_get(self):
144
+ """A method that loads the data (can be replaced with whatever data loading logic you have)."""
145
+ return self.get_meta_tags()
146
+ def get_meta_tags(self):
147
+ tags = self.find_all("meta")
148
+ for meta_tag in tags:
149
+ for attr, values in meta_tag.attrs.items():
150
+ if attr not in meta_tags:
151
+ meta_tags[attr] = []
152
+ if values not in meta_tags[attr]:
153
+ meta_tags[attr].append(values)
+ return meta_tags
154
+
155
+
156
+ def find_all(self,element,soup=None):
157
+ soup = self.soup if soup == None else soup
158
+ return soup.find_all(element)
159
+ def get_class(self,class_name,soup=None):
160
+ soup = self.soup if soup == None else soup
161
+ return soup.get(class_name)
162
+ @staticmethod
163
+ def has_attributes(tag, *attrs):
164
+ return any(tag.has_attr(attr) for attr in attrs)
165
+ def get_find_all_with_attributes(self, *attrs):
166
+ return self.soup.find_all(lambda t: self.has_attributes(t, *attrs))
167
+ def find_tags_by_attributes(self, tag: str = None, attr: str = None, attr_values: List[str] = None) ->List:
168
+ if not tag:
169
+ tags = self.soup.find_all(True) # get all tags
170
+ else:
171
+ tags = self.soup.find_all(tag) # get specific tags
172
+
173
+ extracted_tags = []
174
+ for t in tags:
175
+ if attr:
176
+ attribute_value = t.get(attr)
177
+ if not attribute_value: # skip tags without the desired attribute
178
+ continue
179
+ if attr_values and not any(value in attribute_value for value in attr_values): # skip tags without any of the desired attribute values
180
+ continue
181
+ extracted_tags.append(t)
182
+ return extracted_tags
183
+
184
+
185
+ def extract_elements(self,url:str=None, tag:str=None, class_name:str=None, class_value:str=None) -> list:
186
+ """
187
+ Extracts portions of the source code from the specified URL based on provided filters.
188
+
189
+ Args:
190
+ url (str): The URL to fetch the source code from.
191
+ element_type (str, optional): The HTML element type to filter by. Defaults to None.
192
+ attribute_name (str, optional): The attribute name to filter by. Defaults to None.
193
+ class_name (str, optional): The class name to filter by. Defaults to None.
194
+
195
+ Returns:
196
+ list: A list of strings containing portions of the source code that match the provided filters.
197
+ """
198
+ elements = []
199
+ # If no filters are provided, return the entire source code
200
+ if not tag and not class_name and not class_value:
201
+ elements.append(str(self.soup))
202
+ return elements
203
+ # Find elements based on the filters provided
204
+ if tag:
205
+ elements.extend([str(tags) for tags in self.get_all_desired(tag)])
206
+ if class_name:
207
+ elements.extend([str(tags) for tags in self.get_all_desired(tag={class_name: True})])
208
+ if class_value:
209
+ elements.extend([str(tags) for tags in self.get_all_desired(class_name=class_name)])
210
+ return elements
211
+ def find_all_with_attributes(self, class_name=None, *attrs):
212
+ """
213
+ Discovers classes in the HTML content of the provided URL
214
+ that have associated href or src attributes.
215
+
216
+ Args:
217
+ base_url (str): The URL from which to discover classes.
218
+
219
+ Returns:
220
+ set: A set of unique class names.
221
+ """
222
+
223
+
224
+ unique_classes = set()
225
+ for tag in self.get_find_all_with_attributes(*attrs):
226
+ class_list = self.get_class(class_name=class_name, soup=tag)
227
+ unique_classes.update(class_list)
228
+ return unique_classes
229
+ def get_images(self, tag_name, class_name, class_value):
230
+ images = []
231
+ for tag in self.soup.find_all(tag_name):
232
+ if class_name in tag.attrs and tag.attrs[class_name] == class_value:
233
+ content = tag.attrs.get('content', '')
234
+ if content:
235
+ images.append(content)
236
+ return images
237
+ def extract_text_sections(self) -> list:
238
+ """
239
+ Extract all sections of text from an HTML content using BeautifulSoup.
240
+
241
+ Args:
242
+ html_content (str): The HTML content to be parsed.
243
+
244
+ Returns:
245
+ list: A list containing all sections of text.
246
+ """
247
+ # Remove any script or style elements to avoid extracting JavaScript or CSS code
248
+ for script in self.soup(['script', 'style']):
249
+ script.decompose()
250
+
251
+ # Extract text from the remaining elements
252
+ text_sections = self.soup.stripped_strings
253
+ return [text for text in text_sections if text]
254
+ def discover_classes_and_meta_images(self, tag_name, class_name_1, class_name_2, class_value, attrs):
255
+ """
256
+ Discovers classes in the HTML content of the provided URL
257
+ that have associated href or src attributes. Also, fetches
258
+ image references from meta tags.
259
+
260
+ Args:
261
+ base_url (str): The URL from which to discover classes and meta images.
262
+
263
+ Returns:
264
+ tuple: A set of unique class names and a list of meta images.
265
+ """
266
+
267
+ unique_classes = self.find_all_with_attributes(class_name=class_name_1, *attrs)
268
+ images = self.get_images(tag_name=tag_name, class_name=class_name_2, class_value=class_value)
269
+ return unique_classes, images
270
+ def get_all_tags_and_attribute_names(self):
271
+ tag_names = set() # Using a set to ensure uniqueness
272
+ attribute_names = set()
273
+ get_all = self.find_tags_by_attributes()
274
+ for tag in get_all: # True matches all tags
275
+ tag_names.add(tag.name)
276
+ for attr in tag.attrs:
277
+ attribute_names.add(attr)
278
+ tag_names_list = list(tag_names)
279
+ attribute_names_list = list(attribute_names)
280
+ return {"tags":tag_names_list,"attributes":attribute_names_list}
281
+
282
+ def get_all_attribute_values(self, tags_list=None):
283
+ """
284
+ Collects all attribute values for each specified tag or all tags if none are specified.
285
+
286
+ Parameters:
287
+ - tags_list: List of specific tags to retrieve attributes from, e.g., ['script', 'img'].
288
+ If None, retrieves attributes for all tags.
289
+
290
+ Returns:
291
+ - attribute_values: Dictionary where each key is an attribute and the value is a list of unique values for that attribute.
292
+ """
293
+ attribute_values = {}
294
+ tags_list = tags_list or self.all_tags_and_attributes
295
+ # Get all tags matching tags_list criteria
296
+ for tag_name in tags_list:
297
+ for tag in self.soup.find_all(tag_name):
298
+ for attr, value in tag.attrs.items():
299
+ if attr not in attribute_values:
300
+ attribute_values[attr] = set()
301
+
302
+ # Add attribute values
303
+ if isinstance(value, list):
304
+ attribute_values[attr].update(value)
305
+ else:
306
+ attribute_values[attr].add(value)
307
+
308
+ # Convert each set to a list for consistency
309
+ for attr, values in attribute_values.items():
310
+ attribute_values[attr] = list(values)
311
+
312
+ # Capture JavaScript URLs inside <script> tags
313
+ attribute_values['script_links'] = self.get_js_links()
314
+
315
+ return attribute_values
316
+
317
+ def get_js_links(self):
318
+ """Extract URLs embedded in JavaScript within <script> tags."""
319
+ js_links = []
320
+ script_tags = self.soup.find_all('script')
321
+ for script in script_tags:
322
+ # Find URLs in the JavaScript code
323
+ urls_in_js = re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']', script.get_text())
324
+ js_links.extend(urls_in_js)
325
+ return list(set(js_links)) # Remove duplicates
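# Illustrative only: given a script body such as
#   fetch("https://api.example.com/data"); loadPart('/static/app.js');
# the regex above yields ['https://api.example.com/data', '/static/app.js'],
# i.e. quoted absolute http(s) URLs and quoted root-relative paths.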
326
+
327
+ @property
328
+ def url(self):
329
+ return self._url
330
+ @url.setter
331
+ def url(self, new_url):
332
+ self._url = new_url
333
+
334
+ class SoupManagerSingleton():
335
+ _instance = None
336
+ @staticmethod
337
+ def get_instance(url_mgr,requestManager,parse_type="html.parser",source_code=None):
338
+ if SoupManagerSingleton._instance is None:
339
+ SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
340
+ elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
341
+ SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
342
+ return SoupManagerSingleton._instance
343
+ def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,parse_type="html.parser"):
344
+ url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
345
+ url = get_url(url=url,url_mgr=url_mgr)
346
+ req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
347
+ soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
348
+ return soup_mgr
349
+ def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None,parse_type="html.parser"):
350
+ soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
351
+ return soup_mgr.get_all_attribute_values(tags_list=tags_list)
352
+ def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,parse_type="html.parser"):
353
+ if source_code or soup_mgr:
354
+ if soup_mgr:
355
+ return soup_mgr.soup
356
+ return BeautifulSoup(source_code, parse_type)
357
+ url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
358
+ url = get_url(url=url,url_mgr=url_mgr)
359
+ req_mgr = req_mgr or get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
360
+ source_code = req_mgr.source_code
361
+ soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
362
+ return soup_mgr.soup
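A usage sketch for the soup layer (illustrative, not part of the diff; the module-level helpers at the end of the file are the intended entry points, and the import path follows the RECORD section below):

    from abstract_webtools.managers.soupManager.soupManager import get_soup, get_soup_mgr

    soup = get_soup(url="https://example.com")          # BeautifulSoup object for the fetched page
    soup_mgr = get_soup_mgr(url="https://example.com")
    print(soup_mgr.all_tags[:5])                        # tag names discovered in the page
    print(soup_mgr.get_all_attribute_values(['img']))   # e.g. {'src': [...], 'alt': [...], 'script_links': [...]}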
abstract_webtools/managers/urlManager/__init__.py (new file)
@@ -0,0 +1 @@
1
+ from .urlManager import *
abstract_webtools/managers/urlManager/urlManager.py (new file)
@@ -0,0 +1,230 @@
1
+ from ..abstract_webtools import *
2
+ class urlManager:
3
+ """
4
+ urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
5
+
6
+ Args:
7
+ url (str or None): The URL to manage (default is None).
8
+ session (requests.Session): A custom requests session (default is the requests module's session).
9
+
10
+ Attributes:
11
+ session (requests.Session): The requests session used for making HTTP requests.
12
+ clean_urls (list): List of cleaned URL variations.
13
+ url (str): The current URL.
14
+ protocol (str): The protocol part of the URL (e.g., "https").
15
+ domain (str): The domain part of the URL (e.g., "example.com").
16
+ path (str): The path part of the URL (e.g., "/path/to/resource").
17
+ query (str): The query part of the URL (e.g., "?param=value").
18
+ all_urls (list): List of all URLs (not used in the provided code).
19
+
20
+ Methods:
21
+ url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
22
+ clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
23
+ get_correct_url(url): Get the correct version of the URL from possible variations.
24
+ update_url(url): Update the URL and related attributes.
25
+ get_domain(url): Get the domain name from a URL.
26
+ url_join(url, path): Join a base URL with a path.
27
+ is_valid_url(url): Check if a URL is valid.
28
+ make_valid(href, url): Make a URL valid by joining it with a base URL.
29
+ get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
30
+
31
+ Note:
32
+ - The urlManager class provides methods for managing URLs, including cleaning and validating them.
33
+ - It also includes methods for joining and validating relative URLs.
34
+ """
35
+
36
+ def __init__(self, url=None, session=None):
37
+ """
38
+ Initialize a urlManager instance.
39
+
40
+ Args:
41
+ url (str or None): The URL to manage (default is None).
42
+ session (requests.Session): A custom requests session (default is the requests module's session).
43
+ """
44
+ url = url or 'www.example.com'
45
+ self._url=url
46
+ self.url = url
47
+ self.session= session or requests
48
+ self.clean_urls = self.clean_url(url=url)
49
+ self.url = self.get_correct_url(clean_urls=self.clean_urls)
50
+ url_pieces = self.url_to_pieces(url=self.url)
51
+ self.protocol,self.domain,self.path,self.query=url_pieces
52
+ self.all_urls = []
53
+ def url_to_pieces(self, url):
54
+
55
+ protocol, domain, path, query = None, None, "", ""
+ try:
56
+ match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
57
+ if match:
58
+ protocol = match.group(1) if match.group(1) else None
59
+ domain = match.group(2) if match.group(2) else None
60
+ path = match.group(3) if match.group(3) else "" # Handle None
61
+ query = match.group(4) if match.group(4) else "" # Handle None
62
+ except:
63
+ print(f'the url {url} could not be parsed')
64
+ protocol,domain,path,query=None,None,"",""
65
+ return protocol, domain, path, query
66
+
67
+ def clean_url(self,url=None) -> list:
68
+ """
69
+ Given a URL, return a list with potential URL versions including with and without 'www.',
70
+ and with 'http://' and 'https://'.
71
+ """
72
+ url = url or self.url
73
+ urls=[]
74
+ if url:
75
+ # Remove http:// or https:// prefix
76
+ cleaned = url.replace("http://", "").replace("https://", "")
77
+ no_subdomain = cleaned.replace("www.", "", 1)
78
+
79
+ urls = [
80
+ f"https://{cleaned}",
81
+ f"http://{cleaned}",
82
+ ]
83
+
84
+ # Add variants without 'www' if it was present
85
+ if cleaned != no_subdomain:
86
+ urls.extend([
87
+ f"https://{no_subdomain}",
88
+ f"http://{no_subdomain}",
89
+ ])
90
+
91
+ # Add variants with 'www' if it wasn't present
92
+ else:
93
+ urls.extend([
94
+ f"https://www.{cleaned}",
95
+ f"http://www.{cleaned}",
96
+ ])
97
+
98
+ return urls
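# Illustrative only: clean_url("https://www.example.com") returns, in order,
#   ['https://www.example.com', 'http://www.example.com',
#    'https://example.com', 'http://example.com'],
# and get_correct_url() below probes each variant with a GET request, keeping the
# first one that does not raise a RequestException.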
99
+
100
+ def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
101
+ """
102
+ Gets the correct URL from the possible variations by trying each one with an HTTP request.
103
+
104
+ Args:
105
+ url (str): The URL to find the correct version of.
106
+ session (type(requests.Session), optional): The requests session to use for making HTTP requests.
107
+ Defaults to requests.
108
+
109
+ Returns:
110
+ str: The correct version of the URL if found, or None if none of the variations are valid.
111
+ """
112
+ self.url = url
113
+ if url==None and clean_urls != None:
114
+ if self.url:
115
+ url=self.url or clean_urls[0]
116
+ if url!=None and clean_urls==None:
117
+ clean_urls=self.clean_url(url)
118
+ elif url==None and clean_urls==None:
119
+ url=self.url
120
+ clean_urls=self.clean_urls
121
+ # Get the correct URL from the possible variations
122
+ for url in clean_urls:
123
+ try:
124
+ source = self.session.get(url)
125
+ return url
126
+ except requests.exceptions.RequestException as e:
127
+ print(e)
128
+ return None
129
+ def update_url(self,url):
130
+ # These methods seem essential for setting up the urlManager object.
131
+ self.url = url
132
+ self.clean_urls = self.clean_url()
133
+ self.correct_url = self.get_correct_url()
134
+ self.url =self.correct_url
135
+ self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
136
+ self.all_urls = []
137
+ def get_domain(self,url=None):
138
+ url = url or self.url
139
+ return urlparse(url).netloc
140
+ def url_join(self,url,path):
141
+ url = eatOuter(url,['/'])
142
+ path = eatInner(path,['/'])
143
+ slash=''
144
+ if path[0] not in ['?','&']:
145
+ slash = '/'
146
+ url = url+slash+path
147
+ return url
148
+ @property
149
+ def url(self):
150
+ return self._url
151
+ @url.setter
152
+ def url(self, new_url):
153
+ self._url = new_url
154
+ def is_valid_url(self,url=None):
155
+ """
156
+ Check if the given URL is valid.
157
+ """
158
+ url = url or self.url
159
+ parsed = urlparse(url)
160
+ return bool(parsed.netloc) and bool(parsed.scheme)
161
+
162
+ def make_valid(self,href,url=None):
163
+ def is_valid_url(url):
164
+ url = url or self.url
165
+ """
166
+ Check if the given URL is valid.
167
+ """
168
+ parsed = urlparse(url)
169
+ return bool(parsed.netloc) and bool(parsed.scheme)
170
+ if is_valid_url(href):
171
+ return href
172
+ new_link=urljoin(url,href)
173
+ if is_valid_url(new_link):
174
+ return new_link
175
+ return False
176
+
177
+ def get_relative_href(self,url,href):
178
+ # join the URL if it's relative (not an absolute link)
179
+ url = url or self.url
180
+ href = urljoin(url, href)
181
+ parsed_href = urlparse(href)
182
+ # remove URL GET parameters, URL fragments, etc.
183
+ href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
184
+ return href
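# Illustrative only:
#   get_relative_href('https://example.com/docs/', 'page.html?ref=1#top')
# joins to 'https://example.com/docs/page.html'; the query string and fragment are
# dropped, which helps keep the link lists built elsewhere in the package free of duplicates.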
185
+ def url_basename(self,url=None):
186
+ url = url or self.url
187
+ path = urllib.parse.urlparse(url).path
188
+ return path.strip('/').split('/')[-1]
189
+
190
+
191
+ def base_url(self,url=None):
192
+ url = url or self.url
193
+ return re.match(r'https?://[^?#]+/', url).group()
194
+
195
+
196
+ def urljoin(self,base, path):
197
+ if isinstance(path, bytes):
198
+ path = path.decode()
199
+ if not isinstance(path, str) or not path:
200
+ return None
201
+ if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
202
+ return path
203
+ if isinstance(base, bytes):
204
+ base = base.decode()
205
+ if not isinstance(base, str) or not re.match(
206
+ r'^(?:https?:)?//', base):
207
+ return None
208
+ return urllib.parse.urljoin(base, path)
209
+ class urlManagerSingleton:
210
+ _instance = None
211
+ @staticmethod
212
+ def get_instance(url=None,session=requests):
213
+ if urlManagerSingleton._instance is None:
214
+ urlManagerSingleton._instance = urlManager(url,session=session)
215
+ elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
216
+ urlManagerSingleton._instance = urlManager(url,session=session)
217
+ return urlManagerSingleton._instance
218
+
219
+ def get_url(url=None,url_mgr=None):
220
+ if not url and not url_mgr:
221
+ return None
222
+ if url:
223
+ url_mgr = urlManager(url)
224
+ return url_mgr.url
225
+ def get_url_mgr(url=None,url_mgr=None):
226
+ if url_mgr == None and url:
227
+ url_mgr = urlManager(url=url)
228
+ if url_mgr and url == None:
229
+ url = url_mgr.url
230
+ return url_mgr
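A usage sketch for urlManager (illustrative, not part of the diff; note that constructing it issues probe GET requests to pick a reachable variant of the URL):

    from abstract_webtools.managers.urlManager.urlManager import urlManager

    url_mgr = urlManager("example.com")                   # resolves to the first reachable variant
    print(url_mgr.url, url_mgr.domain)                    # e.g. 'https://example.com', 'example.com'
    print(url_mgr.is_valid_url("not a url"))              # False
    print(url_mgr.url_join(url_mgr.url, "path/to/page"))  # 'https://example.com/path/to/page'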
abstract_webtools/managers/videoDownloader.py
@@ -1,5 +1,8 @@
1
- from abstract_webtools import requestManager, urlManager, soupManager, requests, linkManager
2
- import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess
1
+ from .requestManager.requestManager import requestManager
2
+ from .urlManager.urlManager import urlManager
3
+ from .soupManager.soupManager import soupManager
4
+ from .linkManager.linkManager import linkManager
5
+ import threading,os,re,yt_dlp,urllib.request,m3u8_To_MP4,subprocess,requests
3
6
  from abstract_utilities import get_logFile,safe_dump_to_file
4
7
  from m3u8 import M3U8 # Install: pip install m3u8
5
8
  from urllib.parse import urljoin
abstract_webtools-*.dist-info/METADATA
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.27
3
+ Version: 0.1.6.29
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
abstract_webtools-*.dist-info/RECORD
@@ -23,11 +23,19 @@ abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQq
23
23
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
24
24
  abstract_webtools/managers/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
25
25
  abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
26
- abstract_webtools/managers/videoDownloader.py,sha256=7RGgk4-ctTED-62rug0u8ubpxReExsjhcVFq5wKc2hk,10271
26
+ abstract_webtools/managers/videoDownloader.py,sha256=RgwPZh2gVdIbToZJFC6nNh_H7hAX9xt6RfTS9xiRVxU,10389
27
27
  abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
28
+ abstract_webtools/managers/linkManager/__init__.py,sha256=NpfWNzvTLSfsIWSeLYIxPzeLHADk_grSx5rfgCeWERw,27
29
+ abstract_webtools/managers/linkManager/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
30
+ abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LGxVXEMu1hu7PIkmh89BEh5TI8,30
31
+ abstract_webtools/managers/requestManager/requestManager.py,sha256=zXD31WAYghV1OjnTQzRQnQGqZz6_J4mjHTdNLnBop_0,17343
32
+ abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
33
+ abstract_webtools/managers/soupManager/soupManager.py,sha256=-_mRCWlyzfKlF64UU53WXBmCvJ98jQ4GyHh8S8Pw3xs,17198
34
+ abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
35
+ abstract_webtools/managers/urlManager/urlManager.py,sha256=Dvf-TiSo5j_YjZS2Eq6lFfbhveneD6NA_wEE0xUXy_E,8858
28
36
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
29
- abstract_webtools-0.1.6.27.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
30
- abstract_webtools-0.1.6.27.dist-info/METADATA,sha256=UoJxhmemLf8FSnKEGv9i36HMf5xNCqpK3kb4BDiUp1s,16051
31
- abstract_webtools-0.1.6.27.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
32
- abstract_webtools-0.1.6.27.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
33
- abstract_webtools-0.1.6.27.dist-info/RECORD,,
37
+ abstract_webtools-0.1.6.29.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
38
+ abstract_webtools-0.1.6.29.dist-info/METADATA,sha256=6rXZpsWEuV75cR7_4MH63_E2cT6lydRCi9Fxv1XtH00,16051
39
+ abstract_webtools-0.1.6.29.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
40
+ abstract_webtools-0.1.6.29.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
41
+ abstract_webtools-0.1.6.29.dist-info/RECORD,,