abstract-webtools 0.1.5.8__py3-none-any.whl → 0.1.5.82__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,313 @@
+ from ..abstract_webtools import *
+ from .urlManager import *
+ from .requestManager import *
+ class soupManager:
+     """
+     soupManager is a class for managing and parsing HTML source code using BeautifulSoup.
+ 
+     Args:
+         url (str or None): The URL to be parsed (default is None).
+         source_code (str or None): The HTML source code (default is None).
+         url_mgr (urlManager or None): An instance of urlManager (default is None).
+         req_mgr (requestManager or None): An instance of the request manager (default is None).
+         parse_type (str): The parser to be used by BeautifulSoup (default is "html.parser").
+ 
+     Methods:
+         re_initialize(): Reinitialize the soupManager with the current settings.
+         update_url(url): Update the URL and reinitialize the soupManager.
+         update_source_code(source_code): Update the source code and reinitialize the soupManager.
+         update_requestManager(requestManager): Update the request manager and reinitialize the soupManager.
+         update_url_mgr(url_mgr): Update the URL manager and reinitialize the soupManager.
+         update_parse_type(parse_type): Update the parse type and reinitialize the soupManager.
+         all_links: A property that provides access to all discovered links.
+         _all_links_get(): A method to load all discovered links.
+         get_all_website_links(tag="a", attr="href"): Get all URLs belonging to the same website.
+         meta_tags: A property that provides access to all discovered meta tags.
+         _meta_tags_get(): A method to load all discovered meta tags.
+         get_meta_tags(): Get all meta tags in the source code.
+         find_all(element, soup=None): Find all instances of an HTML element in the source code.
+         get_class(class_name, soup=None): Get the specified attribute from the HTML source code.
+         has_attributes(tag, *attrs): Check if an HTML tag has any of the specified attributes.
+         get_find_all_with_attributes(*attrs): Find all HTML tags with any of the specified attributes.
+         find_tags_by_attributes(tag=None, attr=None, attr_values=None): Get HTML tags based on specified criteria.
+         extract_elements(url, tag=None, class_name=None, class_value=None): Extract portions of source code based on filters.
+         find_all_with_attributes(*attrs, class_name=None): Find classes with associated href or src attributes.
+         get_images(tag_name, class_name, class_value): Get images with specific class and attribute values.
+         discover_classes_and_meta_images(tag_name, class_name_1, class_name_2, class_value, attrs): Discover classes and meta images.
+ 
+     Note:
+         - The soupManager class is designed for parsing HTML source code using BeautifulSoup.
+         - It provides various methods to extract data and discover elements within the source code.
+     """
+     def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, parse_type="html.parser"):
+         self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+         self.url = self.url_mgr.url
+         self.req_mgr = get_req_mgr(req_mgr=req_mgr, url=self.url, url_mgr=self.url_mgr, source_code=source_code)
+         self.parse_type = parse_type
+         source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
+         # Decode bytes rather than calling str(), which would produce "b'...'" literals.
+         if isinstance(source_code, bytes):
+             source_code = source_code.decode('utf-8', errors='ignore')
+         self.source_code = source_code or ''
+         self.soup = BeautifulSoup(self.source_code, self.parse_type)
+         self._all_links_data = None
+         self._meta_tags_data = None
+     def re_initialize(self):
+         self.soup = BeautifulSoup(self.source_code, self.parse_type)
+         self._all_links_data = None
+         self._meta_tags_data = None
+     def update_url(self, url):
+         self.url_mgr.update_url(url=url)
+         self.url = self.url_mgr.url
+         self.req_mgr.update_url(url=url)
+         self.source_code = self.req_mgr.source_code_bytes
+         self.re_initialize()
+     def update_source_code(self, source_code):
+         if isinstance(source_code, bytes):
+             source_code = source_code.decode('utf-8', errors='ignore')
+         self.source_code = source_code or ''
+         self.re_initialize()
+     def update_requestManager(self, requestManager):
+         self.req_mgr = requestManager
+         self.url_mgr = self.req_mgr.url_mgr
+         self.url = self.url_mgr.url
+         self.source_code = self.req_mgr.source_code_bytes
+         self.re_initialize()
+     def update_url_mgr(self, url_mgr):
+         self.url_mgr = url_mgr
+         self.url = self.url_mgr.url
+         self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
+         self.source_code = self.req_mgr.source_code_bytes
+         self.re_initialize()
+     def update_parse_type(self, parse_type):
+         self.parse_type = parse_type
+         self.re_initialize()
+     @property
+     def all_links(self):
+         """A property that provides access to the _all_links_data attribute.
+         The first time it's accessed, it will load the data."""
+         if self._all_links_data is None:
+             print("Loading all links for the first time...")
+             self._all_links_data = self._all_links_get()
+         return self._all_links_data
+     def _all_links_get(self):
+         """A method that loads the data (can be replaced with whatever data-loading logic you have)."""
+         return self.get_all_website_links()
+     def get_all_website_links(self, tag="a", attr="href") -> list:
+         """
+         Returns all URLs found on the current page that belong to the same website.
+ 
+         Args:
+             tag (str): The tag to search for links in (default is "a").
+             attr (str): The attribute that holds the link (default is "href").
+ 
+         Returns:
+             list: A list of URLs that belong to the same website as the current URL.
+         """
+         all_urls = [self.url_mgr.url]
+         domain = self.url_mgr.domain
+         all_desired = self.find_tags_by_attributes(tag=tag, attr=attr)
+         for found_tag in all_desired:
+             href = found_tag.attrs.get(attr)
+             if href == "" or href is None:
+                 # empty href attribute
+                 continue
+             href = self.url_mgr.get_relative_href(self.url_mgr.url, href)
+             if not self.url_mgr.is_valid_url(href):
+                 # not a valid URL
+                 continue
+             if href in all_urls:
+                 # already collected
+                 continue
+             if domain not in href:
+                 # external link
+                 continue
+             all_urls.append(href)
+         return all_urls
+ 
+     @property
+     def meta_tags(self):
+         """A property that provides access to the _meta_tags_data attribute.
+         The first time it's accessed, it will load the data."""
+         if self._meta_tags_data is None:
+             print("Loading meta tags for the first time...")
+             self._meta_tags_data = self._meta_tags_get()
+         return self._meta_tags_data
+     def _meta_tags_get(self):
+         """A method that loads the data (can be replaced with whatever data-loading logic you have)."""
+         return self.get_meta_tags()
+     def get_meta_tags(self):
+         # Collect the attribute values of every <meta> tag into a dict of lists.
+         meta_tags = {}
+         for meta_tag in self.find_all("meta"):
+             for attr, values in meta_tag.attrs.items():
+                 if attr not in meta_tags:
+                     meta_tags[attr] = []
+                 if values not in meta_tags[attr]:
+                     meta_tags[attr].append(values)
+         return meta_tags
+ 
+     def find_all(self, element, soup=None):
+         soup = self.soup if soup is None else soup
+         return soup.find_all(element)
+     def get_class(self, class_name, soup=None):
+         soup = self.soup if soup is None else soup
+         return soup.get(class_name)
+     @staticmethod
+     def has_attributes(tag, *attrs):
+         return any(tag.has_attr(attr) for attr in attrs)
+     def get_find_all_with_attributes(self, *attrs):
+         return self.soup.find_all(lambda t: self.has_attributes(t, *attrs))
+     def find_tags_by_attributes(self, tag: str = None, attr: str = None, attr_values: List[str] = None) -> List:
+         if not tag:
+             tags = self.soup.find_all(True)  # get all tags
+         else:
+             tags = self.soup.find_all(tag)  # get specific tags
+         extracted_tags = []
+         for t in tags:
+             if attr:
+                 attribute_value = t.get(attr)
+                 if not attribute_value:
+                     # skip tags without the desired attribute
+                     continue
+                 if attr_values and not any(value in attribute_value for value in attr_values):
+                     # skip tags without any of the desired attribute values
+                     continue
+             extracted_tags.append(t)
+         return extracted_tags
+ 
+     def extract_elements(self, url: str = None, tag: str = None, class_name: str = None, class_value: str = None) -> list:
+         """
+         Extracts portions of the source code based on the provided filters.
+ 
+         Args:
+             url (str, optional): Unused in the current implementation. Defaults to None.
+             tag (str, optional): The HTML element type to filter by. Defaults to None.
+             class_name (str, optional): The attribute name to filter by. Defaults to None.
+             class_value (str, optional): The attribute value to filter by. Defaults to None.
+ 
+         Returns:
+             list: A list of strings containing portions of the source code that match the provided filters.
+         """
+         elements = []
+         # If no filters are provided, return the entire source code
+         if not tag and not class_name and not class_value:
+             elements.append(str(self.soup))
+             return elements
+         # Find elements based on the filters provided
+         if tag:
+             elements.extend([str(t) for t in self.soup.find_all(tag)])
+         if class_name:
+             elements.extend([str(t) for t in self.soup.find_all(attrs={class_name: True})])
+         if class_value:
+             elements.extend([str(t) for t in self.soup.find_all(attrs={class_name: class_value})])
+         return elements
+     def find_all_with_attributes(self, *attrs, class_name=None):
+         """
+         Discovers classes in the parsed HTML content
+         that have any of the given attributes (e.g. href or src).
+ 
+         Args:
+             *attrs: Attribute names that matching tags must carry.
+             class_name (str, optional): The attribute to collect from matching tags.
+ 
+         Returns:
+             set: A set of unique class names.
+         """
+         unique_classes = set()
+         for tag in self.get_find_all_with_attributes(*attrs):
+             class_list = self.get_class(class_name=class_name, soup=tag) or []
+             unique_classes.update(class_list)
+         return unique_classes
+     def get_images(self, tag_name, class_name, class_value):
+         images = []
+         for tag in self.soup.find_all(tag_name):
+             if class_name in tag.attrs and tag.attrs[class_name] == class_value:
+                 content = tag.attrs.get('content', '')
+                 if content:
+                     images.append(content)
+         return images
+     def extract_text_sections(self) -> list:
+         """
+         Extract all sections of text from the parsed HTML content using BeautifulSoup.
+ 
+         Returns:
+             list: A list containing all sections of text.
+         """
+         # Remove any script or style elements to avoid extracting JavaScript or CSS code
+         for script in self.soup(['script', 'style']):
+             script.decompose()
+         # Extract text from the remaining elements
+         text_sections = self.soup.stripped_strings
+         return [text for text in text_sections if text]
+     def discover_classes_and_meta_images(self, tag_name, class_name_1, class_name_2, class_value, attrs):
+         """
+         Discovers classes in the parsed HTML content
+         that have associated href or src attributes. Also fetches
+         image references from meta tags.
+ 
+         Args:
+             tag_name (str): The tag to inspect for meta images.
+             class_name_1 (str): The attribute to collect from tags carrying the given attrs.
+             class_name_2 (str): The attribute to match on image tags.
+             class_value (str): The attribute value to match on image tags.
+             attrs: Attribute names that matching tags must carry.
+ 
+         Returns:
+             tuple: A set of unique class names and a list of meta images.
+         """
+         unique_classes = self.find_all_with_attributes(*attrs, class_name=class_name_1)
+         images = self.get_images(tag_name=tag_name, class_name=class_name_2, class_value=class_value)
+         return unique_classes, images
+     def get_all_tags_and_attribute_names(self):
+         tag_names = set()  # Using sets to ensure uniqueness
+         attribute_names = set()
+         get_all = self.find_tags_by_attributes()
+         for tag in get_all:
+             tag_names.add(tag.name)
+             for attr in tag.attrs:
+                 attribute_names.add(attr)
+         return {"tags": list(tag_names), "attributes": list(attribute_names)}
+ 
+     def get_all_attribute_values(self):
+         attribute_values = {}
+         get_all = self.find_tags_by_attributes()
+         for tag in get_all:
+             for attr, value in tag.attrs.items():
+                 # If the attribute is not yet in the dictionary, add it with an empty set
+                 if attr not in attribute_values:
+                     attribute_values[attr] = set()
+                 # If the attribute value is a list (e.g., class), extend the set with the list
+                 if isinstance(value, list):
+                     attribute_values[attr].update(value)
+                 else:
+                     attribute_values[attr].add(value)
+         for attr, values in attribute_values.items():
+             attribute_values[attr] = list(values)
+         return attribute_values
+ 
+     @property
+     def url(self):
+         return self._url
+     @url.setter
+     def url(self, new_url):
+         self._url = new_url
+ 
+ class SoupManagerSingleton():
+     _instance = None
+     @staticmethod
+     def get_instance(url_mgr, requestManager, parse_type="html.parser", source_code=None):
+         if SoupManagerSingleton._instance is None:
+             SoupManagerSingleton._instance = soupManager(url_mgr=url_mgr, req_mgr=requestManager, parse_type=parse_type, source_code=source_code)
+         elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
+             SoupManagerSingleton._instance = soupManager(url_mgr=url_mgr, req_mgr=requestManager, parse_type=parse_type, source_code=source_code)
+         return SoupManagerSingleton._instance
+ def get_soup_mgr(url=None, url_mgr=None, source_code=None, req_mgr=None, soup_mgr=None):
+     url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+     req_mgr = get_req_mgr(req_mgr=req_mgr, url_mgr=url_mgr, source_code=source_code)
+     soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr, req_mgr=req_mgr)
+     return soup_mgr
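
A minimal usage sketch for the soupManager above. The URL is a placeholder, the import path is assumed from the package's relative imports, and construction performs a live fetch through the request manager:

    # Hypothetical usage; the URL and import path are assumptions.
    from abstract_webtools.managers.soupManager import get_soup_mgr

    soup_mgr = get_soup_mgr(url='https://example.com')
    print(soup_mgr.all_links)              # same-site links discovered on the page
    print(soup_mgr.get_meta_tags())        # dict of meta-tag attributes -> values
    print(soup_mgr.extract_text_sections()[:5])  # first few text sections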
@@ -0,0 +1,21 @@
+ from ..abstract_webtools import *
+ class SSLManager:
+     def __init__(self, ciphers=None, ssl_options=None, certification=None):
+         self.ciphers = ciphers or CipherManager().ciphers_string
+         self.ssl_options = ssl_options or self.get_default_ssl_settings()
+         self.certification = certification or ssl.CERT_REQUIRED
+         self.ssl_context = self.get_context()
+     def get_default_ssl_settings(self):
+         return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION
+     def get_context(self):
+         return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
+ 
+ class SSLManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(ciphers=None, ssl_options=None, certification=None):
+         if SSLManagerSingleton._instance is None:
+             SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
+         elif (SSLManagerSingleton._instance.ciphers != ciphers
+               or SSLManagerSingleton._instance.ssl_options != ssl_options
+               or SSLManagerSingleton._instance.certification != certification):
+             SSLManagerSingleton._instance = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
+         return SSLManagerSingleton._instance
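
As a quick illustration, SSLManager bundles a cipher string, SSL option flags, and a certificate requirement into a urllib3 SSL context. A sketch of direct use, assuming CipherManager and the ssl/ssl_ names arrive through the star import as the code implies; the cipher string is illustrative, not a recommendation:

    # Defaults disable TLSv1/TLSv1.1 and compression, and require certificates.
    mgr = SSLManager(ciphers='ECDHE+AESGCM')
    ctx = mgr.ssl_context   # an SSLContext built by urllib3's create_urllib3_context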
@@ -0,0 +1,27 @@
+ from ..abstract_webtools import *
+ class TLSAdapter(HTTPAdapter):
+     def __init__(self, ssl_manager=None, ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
+         if ssl_manager is None:
+             ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
+         self.ssl_manager = ssl_manager
+         self.ciphers = ssl_manager.ciphers
+         self.certification = ssl_manager.certification
+         self.ssl_options = ssl_manager.ssl_options
+         self.ssl_context = ssl_manager.ssl_context
+         super().__init__()
+ 
+     def init_poolmanager(self, *args, **kwargs):
+         kwargs['ssl_context'] = self.ssl_context
+         return super().init_poolmanager(*args, **kwargs)
+ 
+ class TLSAdapterSingleton:
+     _instance: Optional[TLSAdapter] = None
+ 
+     @staticmethod
+     def get_instance(ciphers: Optional[List[str]] = None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None) -> TLSAdapter:
+         if (not TLSAdapterSingleton._instance) or (
+             TLSAdapterSingleton._instance.ciphers != ciphers or
+             TLSAdapterSingleton._instance.certification != certification or
+             TLSAdapterSingleton._instance.ssl_options != ssl_options
+         ):
+             TLSAdapterSingleton._instance = TLSAdapter(ciphers=ciphers, certification=certification, ssl_options=ssl_options)
+         return TLSAdapterSingleton._instance
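
Since TLSAdapter subclasses requests' HTTPAdapter, it can be mounted on a session so every HTTPS request goes through the custom SSL context. A minimal sketch; the URL is a placeholder:

    import requests

    session = requests.Session()
    session.mount('https://', TLSAdapterSingleton.get_instance())
    response = session.get('https://example.com')  # served via the custom TLS context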
@@ -0,0 +1,225 @@
+ from ..abstract_webtools import *
+ class urlManager:
+     """
+     urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
+ 
+     Args:
+         url (str or None): The URL to manage (default is None).
+         session (requests.Session): A custom requests session (default is the requests module itself).
+ 
+     Attributes:
+         session (requests.Session): The requests session used for making HTTP requests.
+         clean_urls (list): List of cleaned URL variations.
+         url (str): The current URL.
+         protocol (str): The protocol part of the URL (e.g., "https").
+         domain (str): The domain part of the URL (e.g., "example.com").
+         path (str): The path part of the URL (e.g., "/path/to/resource").
+         query (str): The query part of the URL (e.g., "?param=value").
+         all_urls (list): List of all URLs (not used in the provided code).
+ 
+     Methods:
+         url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
+         clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
+         get_correct_url(url): Get the correct version of the URL from possible variations.
+         update_url(url): Update the URL and related attributes.
+         get_domain(url): Get the domain name from a URL.
+         url_join(url, path): Join a base URL with a path.
+         is_valid_url(url): Check if a URL is valid.
+         make_valid(href, url): Make a URL valid by joining it with a base URL.
+         get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
+ 
+     Note:
+         - The urlManager class provides methods for managing URLs, including cleaning and validating them.
+         - It also includes methods for joining and validating relative URLs.
+     """
+ 
+     def __init__(self, url=None, session=None):
+         """
+         Initialize a urlManager instance.
+ 
+         Args:
+             url (str or None): The URL to manage (default is None).
+             session (requests.Session): A custom requests session (default is the requests module itself).
+         """
+         self.url = url or 'www.example.com'
+         self.session = session or requests
+         self.clean_urls = self.clean_url(url=url)
+         self.url = self.get_correct_url(clean_urls=self.clean_urls)
+         self.protocol, self.domain, self.path, self.query = self.url_to_pieces(url=self.url)
+         self.all_urls = []
+     def url_to_pieces(self, url):
+         protocol, domain, path, query = None, None, "", ""
+         try:
+             match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
+             if match:
+                 protocol = match.group(1) if match.group(1) else None
+                 domain = match.group(2) if match.group(2) else None
+                 path = match.group(3) if match.group(3) else ""  # Handle None
+                 query = match.group(4) if match.group(4) else ""  # Handle None
+         except Exception:
+             print(f'the url {url} could not be parsed')
+             protocol, domain, path, query = None, None, "", ""
+         return protocol, domain, path, query
+ 
+     def clean_url(self, url=None) -> list:
+         """
+         Given a URL, return a list of potential URL versions with and without 'www.',
+         and with 'http://' and 'https://'.
+         """
+         if url is None:
+             url = self.url
+         urls = []
+         if url:
+             # Remove http:// or https:// prefix
+             cleaned = url.replace("http://", "").replace("https://", "")
+             no_subdomain = cleaned.replace("www.", "", 1)
+ 
+             urls = [
+                 f"https://{cleaned}",
+                 f"http://{cleaned}",
+             ]
+ 
+             # Add variants without 'www' if it was present
+             if cleaned != no_subdomain:
+                 urls.extend([
+                     f"https://{no_subdomain}",
+                     f"http://{no_subdomain}",
+                 ])
+             # Add variants with 'www' if it wasn't present
+             else:
+                 urls.extend([
+                     f"https://www.{cleaned}",
+                     f"http://www.{cleaned}",
+                 ])
+         return urls
+ 
+     def get_correct_url(self, url=None, clean_urls=None):
+         """
+         Gets the correct URL from the possible variations by trying each one with an HTTP request.
+ 
+         Args:
+             url (str, optional): The URL to find the correct version of. Defaults to the stored URL.
+             clean_urls (list, optional): Pre-computed URL variations. Defaults to the stored variations.
+ 
+         Returns:
+             str: The correct version of the URL if found, or None if none of the variations are valid.
+         """
+         if url is None:
+             url = self.url
+         if clean_urls is None:
+             clean_urls = self.clean_url(url) if url else self.clean_urls
+         # Try each variation until one responds
+         for candidate in clean_urls:
+             try:
+                 self.session.get(candidate)
+                 return candidate
+             except requests.exceptions.RequestException as e:
+                 print(e)
+         return None
+     def update_url(self, url):
+         # These methods are essential for setting up the urlManager object.
+         self.url = url
+         self.clean_urls = self.clean_url()
+         self.correct_url = self.get_correct_url()
+         self.url = self.correct_url
+         self.protocol, self.domain, self.path, self.query = self.url_to_pieces(url=self.url)
+         self.all_urls = []
+     def get_domain(self, url):
+         return urlparse(url).netloc
+     def url_join(self, url, path):
+         url = eatOuter(url, ['/'])
+         path = eatInner(path, ['/'])
+         slash = ''
+         if path and path[0] not in ['?', '&']:
+             slash = '/'
+         return url + slash + path
+     @property
+     def url(self):
+         return self._url
+     @url.setter
+     def url(self, new_url):
+         self._url = new_url
+     @staticmethod
+     def is_valid_url(url):
+         """
+         Check if the given URL is valid.
+         """
+         parsed = urlparse(url)
+         return bool(parsed.netloc) and bool(parsed.scheme)
+     @staticmethod
+     def make_valid(href, url):
+         def is_valid_url(url):
+             """
+             Check if the given URL is valid.
+             """
+             parsed = urlparse(url)
+             return bool(parsed.netloc) and bool(parsed.scheme)
+         if is_valid_url(href):
+             return href
+         new_link = urljoin(url, href)
+         if is_valid_url(new_link):
+             return new_link
+         return False
+     @staticmethod
+     def get_relative_href(url, href):
+         # join the URL if it's relative (not an absolute link)
+         href = urljoin(url, href)
+         parsed_href = urlparse(href)
+         # remove URL GET parameters, URL fragments, etc.
+         href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+         return href
+     @staticmethod
+     def url_basename(url):
+         path = urllib.parse.urlparse(url).path
+         return path.strip('/').split('/')[-1]
+ 
+     @staticmethod
+     def base_url(url):
+         return re.match(r'https?://[^?#]+/', url).group()
+ 
+     @staticmethod
+     def urljoin(base, path):
+         if isinstance(path, bytes):
+             path = path.decode()
+         if not isinstance(path, str) or not path:
+             return None
+         if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
+             return path
+         if isinstance(base, bytes):
+             base = base.decode()
+         if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
+             return None
+         return urllib.parse.urljoin(base, path)
+ class urlManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(url=None, session=requests):
+         if urlManagerSingleton._instance is None:
+             urlManagerSingleton._instance = urlManager(url, session=session)
+         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
+             urlManagerSingleton._instance = urlManager(url, session=session)
+         return urlManagerSingleton._instance
+ 
+ def get_url(url=None, url_mgr=None):
+     if not url and not url_mgr:
+         return None
+     if url:
+         url_mgr = urlManager(url)
+     return url_mgr.url
+ def get_url_mgr(url=None, url_mgr=None):
+     if not url and not url_mgr:
+         return None
+     if url:
+         url_mgr = urlManager(url)
+     return url_mgr
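
A short sketch of how the pieces above fit together. Note that constructing a urlManager probes the candidate URLs over the network through get_correct_url, so this performs live requests:

    mgr = urlManager(url='example.com')
    print(mgr.clean_urls)   # ['https://example.com', 'http://example.com',
                            #  'https://www.example.com', 'http://www.example.com']
    print(mgr.protocol, mgr.domain, mgr.path, mgr.query)
    print(mgr.url_join(mgr.url, 'path/to/page'))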
@@ -0,0 +1,42 @@
+ from ..abstract_webtools import *
+ class UserAgentManager:
+     def __init__(self, os=None, browser=None, version=None, user_agent=None):
+         self.os = os or 'Windows'
+         self.browser = browser or "Firefox"
+         self.version = version or '42.0'
+         self.user_agent = user_agent or self.get_user_agent()
+         self.header = self.user_agent_header()
+     @staticmethod
+     def user_agent_db():
+         from ..big_user_agent_list import big_user_agent_dict
+         return big_user_agent_dict
+ 
+     def get_user_agent(self):
+         ua_db = self.user_agent_db()
+ 
+         if self.os and self.os in ua_db:
+             os_db = ua_db[self.os]
+         else:
+             os_db = random.choice(list(ua_db.values()))
+ 
+         if self.browser and self.browser in os_db:
+             browser_db = os_db[self.browser]
+         else:
+             browser_db = random.choice(list(os_db.values()))
+ 
+         if self.version and self.version in browser_db:
+             return browser_db[self.version]
+         else:
+             return random.choice(list(browser_db.values()))
+ 
+     def user_agent_header(self):
+         return {"user-agent": self.user_agent}
+ 
+ class UserAgentManagerSingleton:
+     _instance = None
+     @staticmethod
+     def get_instance(user_agent=None):
+         if UserAgentManagerSingleton._instance is None:
+             UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
+         elif user_agent and UserAgentManagerSingleton._instance.user_agent != user_agent:
+             UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
+         return UserAgentManagerSingleton._instance
+ return UserAgentManagerSingleton._instance