abstract-webtools 0.1.6.136__py3-none-any.whl → 0.1.6.137__py3-none-any.whl
This diff shows the changes between two package versions as published to their respective public registries. It is provided for informational purposes only.
- abstract_webtools/main.py +0 -0
- abstract_webtools/managers/seleniumManager.py +1 -1
- abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.137.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.137.dist-info}/RECORD +5 -12
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.137.dist-info}/top_level.txt +0 -0
- abstract_webtools/__pycache__/abstract_webtools.cpython-312.pyc +0 -0
- abstract_webtools/abstract_userpit.py +0 -169
- abstract_webtools/managers/linkManager.py +0 -189
- abstract_webtools/managers/requestManager.py +0 -353
- abstract_webtools/managers/soupManager.py +0 -362
- abstract_webtools/managers/urlManager.py +0 -230
- abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4 +0 -0
- {abstract_webtools-0.1.6.136.dist-info → abstract_webtools-0.1.6.137.dist-info}/WHEEL +0 -0
abstract_webtools/managers/soupManager.py
@@ -1,362 +0,0 @@
-from ..abstract_webtools import *
-from .urlManager import *
-from .requestManager import *
-class soupManager:
-    """
-    SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
-
-    Args:
-        url (str or None): The URL to be parsed (default is None).
-        source_code (str or None): The HTML source code (default is None).
-        url_mgr (UrlManager or None): An instance of UrlManager (default is None).
-        requestManager (SafeRequest or None): An instance of SafeRequest (default is None).
-        parse_type (str): The type of parser to be used by BeautifulSoup (default is "html.parser").
-
-    Methods:
-        re_initialize(): Reinitialize the SoupManager with the current settings.
-        update_url(url): Update the URL and reinitialize the SoupManager.
-        update_source_code(source_code): Update the source code and reinitialize the SoupManager.
-        update_requestManager(requestManager): Update the request manager and reinitialize the SoupManager.
-        update_url_mgr(url_mgr): Update the URL manager and reinitialize the SoupManager.
-        update_parse_type(parse_type): Update the parsing type and reinitialize the SoupManager.
-        all_links: A property that provides access to all discovered links.
-        _all_links_get(): A method to load all discovered links.
-        get_all_website_links(tag="a", attr="href"): Get all URLs belonging to the same website.
-        meta_tags: A property that provides access to all discovered meta tags.
-        _meta_tags_get(): A method to load all discovered meta tags.
-        get_meta_tags(): Get all meta tags in the source code.
-        find_all(element, soup=None): Find all instances of an HTML element in the source code.
-        get_class(class_name, soup=None): Get the specified class from the HTML source code.
-        has_attributes(tag, *attrs): Check if an HTML tag has the specified attributes.
-        get_find_all_with_attributes(*attrs): Find all HTML tags with specified attributes.
-        get_all_desired_soup(tag=None, attr=None, attr_value=None): Get HTML tags based on specified criteria.
-        extract_elements(url, tag=None, class_name=None, class_value=None): Extract portions of source code based on filters.
-        find_all_with_attributes(class_name=None, *attrs): Find classes with associated href or src attributes.
-        get_images(tag_name, class_name, class_value): Get images with specific class and attribute values.
-        discover_classes_and_meta_images(tag_name, class_name_1, class_name_2, class_value, attrs): Discover classes and meta images.
-
-    Note:
-        - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
-        - It provides various methods to extract data and discover elements within the source code.
-    """
-    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None, parse_type="html.parser"):
-        self.soup=[]
-        url = get_url(url=url,url_mgr=url_mgr)
-        self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-        self.url=self.url_mgr.url
-        self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
-        self.parse_type = parse_type
-        source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
-        if source_code:
-            source_code = str(source_code)
-        self.source_code = source_code
-        self.soup= BeautifulSoup(self.source_code, self.parse_type)
-        self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
-        self.all_tags = self.all_tags_and_attribute_names.get('tags')
-        self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
-        self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
-
-        self._all_links_data = None
-        self._meta_tags_data = None
-    def re_initialize(self):
-        self.soup= BeautifulSoup(self.source_code, self.parse_type)
-        self._all_links_data = None
-        self._meta_tags_data = None
-    def update_url(self,url):
-        self.url_mgr.update_url(url=url)
-        self.url=self.url_mgr.url
-        self.req_mgr.update_url(url=url)
-        self.source_code = self.req_mgr.source_code_bytes
-        self.re_initialize()
-    def update_source_code(self,source_code):
-        if source_code:
-            source_code = str(source_code)
-        self.source_code = source_code
-        self.re_initialize()
-    def update_requestManager(self,requestManager):
-        self.req_mgr = requestManager
-        self.url_mgr=self.req_mgr.url_mgr
-        self.url=self.url_mgr.url
-        self.source_code = self.req_mgr.source_code_bytes
-        self.re_initialize()
-    def update_url_mgr(self,url_mgr):
-        self.url_mgr=url_mgr
-        self.url=self.url_mgr.url
-        self.req_mgr.update_url_mgr(url_mgr=self.url_mgr)
-        self.source_code = self.req_mgr.source_code_bytes
-        self.re_initialize()
-    def update_parse_type(self,parse_type):
-        self.parse_type=parse_type
-        self.re_initialize()
-    @property
-    def all_links(self):
-        """This is a property that provides access to the _all_links_data attribute.
-        The first time it's accessed, it will load the data."""
-        if self._all_links_data is None:
-            print("Loading all links for the first time...")
-            self._all_links_data = self._all_links_get()
-        return self._all_links_data
-    def _all_links_get(self):
-        """A method that loads the data (can be replaced with whatever data loading logic you have)."""
-        return self.get_all_website_links()
-    def get_all_website_links(self,tag="a",attr="href") -> list:
-        """
-        Returns all URLs that are found on the specified URL and belong to the same website.
-
-        Args:
-            url (str): The URL to search for links.
-
-        Returns:
-            list: A list of URLs that belong to the same website as the specified URL.
-        """
-        all_urls=[self.url_mgr.url]
-        domain = self.url_mgr.domain
-        all_desired=self.get_all_website_links(tag=tag,attr=attr)
-        for tag in all_desired:
-            href = tag.attrs.get(attr)
-            if href == "" or href is None:
-                # href empty tag
-                continue
-            href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
-            if not self.url_mgr.is_valid_url(href):
-                # not a valid URL
-                continue
-            if href in all_urls:
-                # already in the set
-                continue
-            if domain not in href:
-                # external link
-                continue
-            all_urls.append(href)
-
-        return all_urls
-
-
-    @property
-    def meta_tags(self):
-        """This is a property that provides access to the _all_links_data attribute.
-        The first time it's accessed, it will load the data."""
-        if self._meta_tags_data is None:
-            print("Loading all links for the first time...")
-            self._meta_tags_data = self._all_links_get()
-        return self._meta_tags_data
-    def _meta_tags_get(self):
-        """A method that loads the data (can be replaced with whatever data loading logic you have)."""
-        return self.get_meta_tags()
-    def get_meta_tags(self):
-        tags = self.find_all("meta")
-        for meta_tag in tags:
-            for attr, values in meta_tag.attrs.items():
-                if attr not in self.meta_tags:
-                    self.meta_tags[attr] = []
-                if values not in self.meta_tags[attr]:
-                    self.meta_tags[attr].append(values)
-
-
-    def find_all(self,element,soup=None):
-        soup = self.soup if soup == None else soup
-        return soup.find_all(element)
-    def get_class(self,class_name,soup=None):
-        soup = self.soup if soup == None else soup
-        return soup.get(class_name)
-    @staticmethod
-    def has_attributes(tag, *attrs):
-        return any(tag.has_attr(attr) for attr in attrs)
-    def get_find_all_with_attributes(self, *attrs):
-        return self.soup.find_all(lambda t: self.has_attributes(t, *attrs))
-    def find_tags_by_attributes(self, tag: str = None, attr: str = None, attr_values: List[str] = None) ->List:
-        if not tag:
-            tags = self.soup.find_all(True)  # get all tags
-        else:
-            tags = self.soup.find_all(tag)  # get specific tags
-
-        extracted_tags = []
-        for t in tags:
-            if attr:
-                attribute_value = t.get(attr)
-                if not attribute_value:  # skip tags without the desired attribute
-                    continue
-                if attr_values and not any(value in attribute_value for value in attr_values):  # skip tags without any of the desired attribute values
-                    continue
-            extracted_tags.append(t)
-        return extracted_tags
-
-
-    def extract_elements(self,url:str=None, tag:str=None, class_name:str=None, class_value:str=None) -> list:
-        """
-        Extracts portions of the source code from the specified URL based on provided filters.
-
-        Args:
-            url (str): The URL to fetch the source code from.
-            element_type (str, optional): The HTML element type to filter by. Defaults to None.
-            attribute_name (str, optional): The attribute name to filter by. Defaults to None.
-            class_name (str, optional): The class name to filter by. Defaults to None.
-
-        Returns:
-            list: A list of strings containing portions of the source code that match the provided filters.
-        """
-        elements = []
-        # If no filters are provided, return the entire source code
-        if not tag and not class_name and not class_value:
-            elements.append(str(self.soup))
-            return elements
-        # Find elements based on the filters provided
-        if tag:
-            elements.extend([str(tags) for tags in self.get_all_desired(tag)])
-        if class_name:
-            elements.extend([str(tags) for tags in self.get_all_desired(tag={class_name: True})])
-        if class_value:
-            elements.extend([str(tags) for tags in self.get_all_desired(class_name=class_name)])
-        return elements
-    def find_all_with_attributes(self, class_name=None, *attrs):
-        """
-        Discovers classes in the HTML content of the provided URL
-        that have associated href or src attributes.
-
-        Args:
-            base_url (str): The URL from which to discover classes.
-
-        Returns:
-            set: A set of unique class names.
-        """
-
-
-        unique_classes = set()
-        for tag in self.get_find_all_with_attributes(*attrs):
-            class_list = self.get_class(class_name=class_name, soup=tag)
-            unique_classes.update(class_list)
-        return unique_classes
-    def get_images(self, tag_name, class_name, class_value):
-        images = []
-        for tag in self.soup.find_all(tag_name):
-            if class_name in tag.attrs and tag.attrs[class_name] == class_value:
-                content = tag.attrs.get('content', '')
-                if content:
-                    images.append(content)
-        return images
-    def extract_text_sections(self) -> list:
-        """
-        Extract all sections of text from an HTML content using BeautifulSoup.
-
-        Args:
-            html_content (str): The HTML content to be parsed.
-
-        Returns:
-            list: A list containing all sections of text.
-        """
-        # Remove any script or style elements to avoid extracting JavaScript or CSS code
-        for script in self.soup(['script', 'style']):
-            script.decompose()
-
-        # Extract text from the remaining elements
-        text_sections = self.soup.stripped_strings
-        return [text for text in text_sections if text]
-    def discover_classes_and_meta_images(self, tag_name, class_name_1, class_name_2, class_value, attrs):
-        """
-        Discovers classes in the HTML content of the provided URL
-        that have associated href or src attributes. Also, fetches
-        image references from meta tags.
-
-        Args:
-            base_url (str): The URL from which to discover classes and meta images.
-
-        Returns:
-            tuple: A set of unique class names and a list of meta images.
-        """
-
-        unique_classes = self.find_all_with_attributes(class_name=class_name_1, *attrs)
-        images = self.get_images(tag_name=tag_name, class_name=class_name_2, class_value=class_value)
-        return unique_classes, images
-    def get_all_tags_and_attribute_names(self):
-        tag_names = set()  # Using a set to ensure uniqueness
-        attribute_names = set()
-        get_all = self.find_tags_by_attributes()
-        for tag in get_all:  # True matches all tags
-            tag_names.add(tag.name)
-            for attr in tag.attrs:
-                attribute_names.add(attr)
-        tag_names_list = list(tag_names)
-        attribute_names_list = list(attribute_names)
-        return {"tags":tag_names_list,"attributes":attribute_names_list}
-
-    def get_all_attribute_values(self, tags_list=None):
-        """
-        Collects all attribute values for each specified tag or all tags if none are specified.
-
-        Parameters:
-        - tags_list: List of specific tags to retrieve attributes from, e.g., ['script', 'img'].
-          If None, retrieves attributes for all tags.
-
-        Returns:
-        - attribute_values: Dictionary where each key is an attribute and the value is a list of unique values for that attribute.
-        """
-        attribute_values = {}
-        tags_list = tags_list or self.all_tags_and_attributes
-        # Get all tags matching tags_list criteria
-        for tag_name in tags_list:
-            for tag in self.soup.find_all(tag_name):
-                for attr, value in tag.attrs.items():
-                    if attr not in attribute_values:
-                        attribute_values[attr] = set()
-
-                    # Add attribute values
-                    if isinstance(value, list):
-                        attribute_values[attr].update(value)
-                    else:
-                        attribute_values[attr].add(value)
-
-        # Convert each set to a list for consistency
-        for attr, values in attribute_values.items():
-            attribute_values[attr] = list(values)
-
-        # Capture JavaScript URLs inside <script> tags
-        attribute_values['script_links'] = self.get_js_links()
-
-        return attribute_values
-
-    def get_js_links(self):
-        """Extract URLs embedded in JavaScript within <script> tags."""
-        js_links = []
-        script_tags = self.soup.find_all('script')
-        for script in script_tags:
-            # Find URLs in the JavaScript code
-            urls_in_js = re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']', script.get_text())
-            js_links.extend(urls_in_js)
-        return list(set(js_links))  # Remove duplicates
-
-    @property
-    def url(self):
-        return self._url
-    @url.setter
-    def url(self, new_url):
-        self._url = new_url
-
-class SoupManagerSingleton():
-    _instance = None
-    @staticmethod
-    def get_instance(url_mgr,requestManager,parse_type="html.parser",source_code=None):
-        if SoupManagerSingleton._instance is None:
-            SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
-        elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
-            SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
-        return SoupManagerSingleton._instance
-def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,parse_type="html.parser"):
-    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-    url = get_url(url=url,url_mgr=url_mgr)
-    req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
-    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
-    return soup_mgr
-def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None,parse_type="html.parser"):
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
-    return soup_mgr.get_all_attribute_values(tags_list=tags_list)
-def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,parse_type="html.parser"):
-    if source_code or soup_mgr:
-        if soup_mgr:
-            return soup_mgr.soup
-        return BeautifulSoup(source_code, parse_type)
-    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-    url = get_url(url=url,url_mgr=url_mgr)
-    req_mgr = req_mgr or get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
-    source_code = req_mgr.source_code
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
-    return soup_mgr.soup
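Two defects in the hunk above are worth flagging for anyone restoring this module: get_all_website_links recurses into itself (all_desired = self.get_all_website_links(...)) instead of scanning the soup, so any call overflows the stack, and the meta_tags property caches self._all_links_get() rather than self._meta_tags_get(), so it returns links, not meta tags. Below is a minimal sketch of what the link crawl appears to have intended, using plain requests and BeautifulSoup in place of the removed manager classes; the standalone helper is hypothetical, not part of the package.

# Sketch only (hypothetical helper, not part of abstract_webtools):
# same-site link collection as soupManager.get_all_website_links
# appears to have intended, with the recursive self-call replaced
# by an ordinary soup.find_all(tag) scan.
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def get_all_website_links(url, tag="a", attr="href"):
    domain = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    all_urls = [url]
    for element in soup.find_all(tag):
        href = element.attrs.get(attr)
        if not href:
            continue  # empty href attribute
        href = urljoin(url, href).split("#")[0]  # resolve relative links, drop fragments
        parsed = urlparse(href)
        if not (parsed.scheme and parsed.netloc):
            continue  # not a valid absolute URL
        if parsed.netloc != domain or href in all_urls:
            continue  # external link or duplicate
        all_urls.append(href)
    return all_urls

For example, get_all_website_links("https://example.com") returns the page's same-domain links with fragments stripped, which matches the filtering steps spelled out in the removed method's comments.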
abstract_webtools/managers/urlManager.py
@@ -1,230 +0,0 @@
-from ..abstract_webtools import *
-class urlManager:
-    """
-    urlManager is a class for managing URLs, including cleaning, validating, and finding the correct version.
-
-    Args:
-        url (str or None): The URL to manage (default is None).
-        session (requests.Session): A custom requests session (default is the requests module's session).
-
-    Attributes:
-        session (requests.Session): The requests session used for making HTTP requests.
-        clean_urls (list): List of cleaned URL variations.
-        url (str): The current URL.
-        protocol (str): The protocol part of the URL (e.g., "https").
-        domain (str): The domain part of the URL (e.g., "example.com").
-        path (str): The path part of the URL (e.g., "/path/to/resource").
-        query (str): The query part of the URL (e.g., "?param=value").
-        all_urls (list): List of all URLs (not used in the provided code).
-
-    Methods:
-        url_to_pieces(url): Split a URL into its protocol, domain, path, and query components.
-        clean_url(url): Return a list of potential URL versions with and without 'www' and 'http(s)'.
-        get_correct_url(url): Get the correct version of the URL from possible variations.
-        update_url(url): Update the URL and related attributes.
-        get_domain(url): Get the domain name from a URL.
-        url_join(url, path): Join a base URL with a path.
-        is_valid_url(url): Check if a URL is valid.
-        make_valid(href, url): Make a URL valid by joining it with a base URL.
-        get_relative_href(url, href): Get the relative href URL by joining it with a base URL.
-
-    Note:
-        - The urlManager class provides methods for managing URLs, including cleaning and validating them.
-        - It also includes methods for joining and validating relative URLs.
-    """
-
-    def __init__(self, url=None, session=None):
-        """
-        Initialize a urlManager instance.
-
-        Args:
-            url (str or None): The URL to manage (default is None).
-            session (requests.Session): A custom requests session (default is the requests module's session).
-        """
-        url = url or 'www.example.com'
-        self._url=url
-        self.url = url
-        self.session= session or requests
-        self.clean_urls = self.clean_url(url=url)
-        self.url = self.get_correct_url(clean_urls=self.clean_urls)
-        url_pieces = self.url_to_pieces(url=self.url)
-        self.protocol,self.domain,self.path,self.query=url_pieces
-        self.all_urls = []
-    def url_to_pieces(self, url):
-
-        try:
-            match = re.match(r'^(https?)?://?([^/]+)(/[^?]+)?(\?.+)?', url)
-            if match:
-                protocol = match.group(1) if match.group(1) else None
-                domain = match.group(2) if match.group(1) else None
-                path = match.group(3) if match.group(3) else ""  # Handle None
-                query = match.group(4) if match.group(4) else ""  # Handle None
-        except:
-            print(f'the url {url} was not reachable')
-            protocol,domain,path,query=None,None,"",""
-        return protocol, domain, path, query
-
-    def clean_url(self,url=None) -> list:
-        """
-        Given a URL, return a list with potential URL versions including with and without 'www.',
-        and with 'http://' and 'https://'.
-        """
-        url = url or self.url
-        urls=[]
-        if url:
-            # Remove http:// or https:// prefix
-            cleaned = url.replace("http://", "").replace("https://", "")
-            no_subdomain = cleaned.replace("www.", "", 1)
-
-            urls = [
-                f"https://{cleaned}",
-                f"http://{cleaned}",
-            ]
-
-            # Add variants without 'www' if it was present
-            if cleaned != no_subdomain:
-                urls.extend([
-                    f"https://{no_subdomain}",
-                    f"http://{no_subdomain}",
-                ])
-
-            # Add variants with 'www' if it wasn't present
-            else:
-                urls.extend([
-                    f"https://www.{cleaned}",
-                    f"http://www.{cleaned}",
-                ])
-
-        return urls
-
-    def get_correct_url(self,url=None,clean_urls=None) -> (str or None):
-        """
-        Gets the correct URL from the possible variations by trying each one with an HTTP request.
-
-        Args:
-            url (str): The URL to find the correct version of.
-            session (type(requests.Session), optional): The requests session to use for making HTTP requests.
-                Defaults to requests.
-
-        Returns:
-            str: The correct version of the URL if found, or None if none of the variations are valid.
-        """
-        self.url = url
-        if url==None and clean_urls != None:
-            if self.url:
-                url=self.url or clean_urls[0]
-        if url!=None and clean_urls==None:
-            clean_urls=self.clean_url(url)
-        elif url==None and clean_urls==None:
-            url=self.url
-            clean_urls=self.clean_urls
-        # Get the correct URL from the possible variations
-        for url in clean_urls:
-            try:
-                source = self.session.get(url)
-                return url
-            except requests.exceptions.RequestException as e:
-                print(e)
-        return None
-    def update_url(self,url):
-        # These methods seem essential for setting up the urlManager object.
-        self.url = url
-        self.clean_urls = self.clean_url()
-        self.correct_url = self.get_correct_url()
-        self.url =self.correct_url
-        self.protocol,self.domain,self.path,self.query=self.url_to_pieces(url=self.url)
-        self.all_urls = []
-    def get_domain(self,url=None):
-        url = url or self.url
-        return urlparse(url).netloc
-    def url_join(self,url,path):
-        url = eatOuter(url,['/'])
-        path = eatInner(path,['/'])
-        slash=''
-        if path[0] not in ['?','&']:
-            slash = '/'
-        url = url+slash+path
-        return url
-    @property
-    def url(self):
-        return self._url
-    @url.setter
-    def url(self, new_url):
-        self._url = new_url
-    def is_valid_url(self,url=None):
-        """
-        Check if the given URL is valid.
-        """
-        url = url or self.url
-        parsed = urlparse(url)
-        return bool(parsed.netloc) and bool(parsed.scheme)
-
-    def make_valid(self,href,url=None):
-        def is_valid_url(url):
-            url = url or self.url
-            """
-            Check if the given URL is valid.
-            """
-            parsed = urlparse(url)
-            return bool(parsed.netloc) and bool(parsed.scheme)
-        if is_valid_url(href):
-            return href
-        new_link=urljoin(url,href)
-        if is_valid_url(new_link):
-            return new_link
-        return False
-
-    def get_relative_href(self,url,href):
-        # join the URL if it's relative (not an absolute link)
-        url = url or self.url
-        href = urljoin(url, href)
-        parsed_href = urlparse(href)
-        # remove URL GET parameters, URL fragments, etc.
-        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
-        return href
-    def url_basename(self,url=None):
-        url = url or self.url
-        path = urllib.parse.urlparse(url).path
-        return path.strip('/').split('/')[-1]
-
-
-    def base_url(self,url=None):
-        url = url or self.url
-        return re.match(r'https?://[^?#]+/', url).group()
-
-
-    def urljoin(self,base, path):
-        if isinstance(path, bytes):
-            path = path.decode()
-        if not isinstance(path, str) or not path:
-            return None
-        if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
-            return path
-        if isinstance(base, bytes):
-            base = base.decode()
-        if not isinstance(base, str) or not re.match(
-                r'^(?:https?:)?//', base):
-            return None
-        return urllib.parse.urljoin(base, path)
-class urlManagerSingleton:
-    _instance = None
-    @staticmethod
-    def get_instance(url=None,session=requests):
-        if urlManagerSingleton._instance is None:
-            urlManagerSingleton._instance = urlManager(url,session=session)
-        elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
-            urlManagerSingleton._instance = urlManager(url,session=session)
-        return urlManagerSingleton._instance
-
-def get_url(url=None,url_mgr=None):
-    if not url and not url_mgr:
-        return None
-    if url:
-        url_mgr = urlManager(url)
-    return url_mgr.url
-def get_url_mgr(url=None,url_mgr=None):
-    if url_mgr == None and url:
-        url_mgr = urlManager(url=url)
-    if url_mgr and url == None:
-        url = url_mgr.url
-    return url_mgr
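For context, a minimal sketch of how the removed urlManager module was driven through its own helpers, based on the signatures in the hunk above. The import path only resolves in 0.1.6.136 and earlier, and example.com is a placeholder; note that constructing a manager issues live HTTP requests, because get_correct_url probes each clean_url variant with session.get.

# Sketch only: exercising the urlManager API removed by this hunk.
# Valid against abstract-webtools 0.1.6.136 and earlier; triggers
# network I/O on construction (get_correct_url probes each variant).
from abstract_webtools.managers.urlManager import get_url_mgr

mgr = get_url_mgr(url="example.com")           # tries https/http and www variants
print(mgr.url)                                 # first variant that answered
print(mgr.protocol, mgr.domain, mgr.path)      # pieces from url_to_pieces
print(mgr.is_valid_url("https://example.com/docs"))
print(mgr.get_relative_href(mgr.url, "/about"))  # absolute, query/fragment stripped

One quirk to keep in mind when reading url_to_pieces: domain is gated on match.group(1) rather than match.group(2), so a schemeless input yields domain=None even when the regex matched a host.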