abstract-webtools 0.1.5.86__py3-none-any.whl → 0.1.5.88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/crawlManager.py +82 -22
- abstract_webtools/managers/requestManager.py +6 -1
- abstract_webtools/managers/seleniumManager.py +4 -3
- abstract_webtools/managers/soupManager.py +53 -16
- abstract_webtools/managers/urlManager.py +4 -4
- abstract_webtools/url_grabber.py +33 -33
- {abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/RECORD +11 -11
- {abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/WHEEL +1 -1
- {abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/crawlManager.py
CHANGED
@@ -1,14 +1,17 @@
-
-
+from .soupManager import *
+class CrawlManager():
+    def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
         self.url=url
         self.source_code=source_code
         self.parse_type=parse_type
-
+        self.url_mgr = url_mgr or urlManager(url=self.url)
+        self.req_mgr = requestManager(url_mgr=self.url_mgr)
+        self.get_new_source_and_url(url)
     def get_new_source_and_url(self,url=None):
         if url == None:
             url = self.url
-        self.response = self.
-        self.source_code=self.
+        self.response = self.req_mgr.response
+        self.source_code=self.req_mgr.source_code
     def get_classes_and_meta_info():
         class_name_1,class_name_2, class_value = 'meta','class','property','og:image'
         attrs = 'href','src'
@@ -41,7 +44,36 @@ class CrawlManager:
         agg_js['external_links']=external_links
 
         return agg_js
+    def get_all_website_links(self,tag="a",attr="href") -> list:
+        """
+        Returns all URLs that are found on the specified URL and belong to the same website.
 
+        Args:
+            url (str): The URL to search for links.
+
+        Returns:
+            list: A list of URLs that belong to the same website as the specified URL.
+        """
+        all_urls=[self.url_mgr.url]
+        domain = self.url_mgr.domain
+        all_attribs = get_attribs(self.url_mgr.url)
+        for href in all_attribs.get('href',[]):
+            if href == "" or href is None:
+                # href empty tag
+                continue
+            href=self.url_mgr.get_relative_href(self.url_mgr.url,href)
+            if not self.url_mgr.is_valid_url(href):
+                # not a valid URL
+                continue
+            if href in all_urls:
+                # already in the set
+                continue
+            if domain not in href:
+                # external link
+                continue
+            all_urls.append(href)
+
+        return all_urls
 
     def correct_xml(xml_string):
         # Parse the XML string
@@ -58,21 +90,21 @@ class CrawlManager:
         return corrected_xml
 
 
-    def determine_values(self):
+    def determine_values(self,url=None):
         # This is just a mockup. In a real application, you'd analyze the URL or its content.
-
+        url = url or self.url
         # Assuming a blog site
-        if 'blog' in
-        if '2023' in
+        if 'blog' in url:
+            if '2023' in url: # Assuming it's a current year article
                 return ('weekly', '0.8')
             else:
                 return ('monthly', '0.6')
-        elif 'contact' in
+        elif 'contact' in url:
             return ('yearly', '0.3')
         else: # Homepage or main categories
             return ('weekly', '1.0')
-    def crawl(url, max_depth=3, depth=1):
-
+    def crawl(self,url, max_depth=3, depth=1):
+        visited=set()
         if depth > max_depth:
             return []
 
@@ -82,8 +114,8 @@ class CrawlManager:
         visited.add(url)
 
         try:
-
-            links = [a['href'] for a in
+            soup = get_soup(url)
+            links = [a['href'] for a in soup.find_all('a', href=True)]
             valid_links = []
 
             for link in links:
@@ -107,11 +139,24 @@ class CrawlManager:
 
     # Define or import required functions here, like get_all_website_links, determine_values,
     # discover_classes_and_meta_images, and extract_links_from_url.
-    def get_meta_info(self):
-
+    def get_meta_info(self,url=None):
+        url = url or self.url
+        soup_mgr = soupManager(url=url)
         meta_info = {}
         # Fetch the title if available
-
+        meta_tags = soup_mgr.find_all("meta")
+        url = eatAll(str(url),['',' ','\n','\t','\\','/'])
+        attribs = get_attribs(url)
+        soup = get_soup(url)
+
+        for meta_tag in meta_tags:
+            for attr, values in meta_tag.attrs.items():
+
+                if attr not in meta_tag:
+                    meta_tag[attr] = []
+                if values not in meta_tag[attr]:
+                    meta_tag[attr].append(values)
+        title_tag = soup.find_all("title")
         if title_tag:
             meta_info["title"] = title_tag
         # Fetch meta tags
@@ -128,13 +173,14 @@ class CrawlManager:
         with open('sitemap.xml', 'w', encoding='utf-8') as f:
             string = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">\n'
 
-            for url in self.
+            for url in self.get_all_website_links():
                 string += f'  <url>\n    <loc>{url}</loc>\n'
                 preprocess=[]
                 self.get_new_source_and_url(url=url)
-                links =
-
-
+                links = get_attribs(url)
+                images = [link for link in links if link.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp'))]
+
+                for img in images:
                     if str(img).lower() not in preprocess:
                         try:
                             escaped_img = img.replace('&', '&amp;')
@@ -144,7 +190,7 @@ class CrawlManager:
                         except:
                             pass
                         preprocess.append(str(img).lower())
-            frequency, priority = determine_values(url)
+                frequency, priority = self.determine_values(url)
                 string += f'    <changefreq>{frequency}</changefreq>\n'
                 string += f'    <priority>{priority}</priority>\n'
                 string += f'  </url>\n'
@@ -176,6 +222,7 @@ class CrawlManager:
         print("\nExternal Links:")
         for ext_link in links['external_links']:
             print(f"\t{ext_link}")
+
 class CrawlManagerSingleton():
     _instance = None
     @staticmethod
@@ -185,3 +232,16 @@ class CrawlManagerSingleton():
         elif parse_type != CrawlManagerSingleton._instance.parse_type or url != CrawlManagerSingleton._instance.url or source_code != CrawlManagerSingleton._instance.source_code:
             CrawlManagerSingleton._instance = CrawlManager(url=url,parse_type=parse_type,source_code=source_code)
         return CrawlManagerSingleton._instance
+def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
+    url = get_url(url=url,url_mgr=url_mgr)
+    url_mgr = get_url(url=url,url_mgr=url_mgr)
+    req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
+    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr)
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
+    crawl_mgr = CrawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+    return crawl_mgr
+def get_domain_crawl(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser",max_depth=3, depth=1):
+    crawl_mgr = get_crawl_mgr(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
+    url = get_url(url=url,url_mgr=url_mgr)
+    all_domain_links = crawl_mgr.crawl(url=url, max_depth=max_depth, depth=depth)
+    return all_domain_links
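The largest addition here is the pair of module-level helpers at the bottom, `get_crawl_mgr` and `get_domain_crawl`, which wire up the url/request/soup managers before delegating to `CrawlManager.crawl`. A minimal usage sketch against the signatures shown above (the target URL is illustrative; note the hunk assigns `url_mgr` via `get_url`, so the wrapper may not behave as intended in this release):

```python
# Sketch only: exercises the get_domain_crawl signature added in 0.1.5.88.
from abstract_webtools.managers.crawlManager import get_domain_crawl

# Collect same-domain links, following pages at most two levels deep.
links = get_domain_crawl(url="https://example.com", max_depth=2)
print(links)
```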
abstract_webtools/managers/requestManager.py
CHANGED
@@ -343,6 +343,11 @@ class SafeRequestSingleton:
 def get_req_mgr(req_mgr=None,url=None,url_mgr=None,source_code=None):
     if req_mgr:
         url_mgr = req_mgr.url_mgr
+        url = get_url(url=url,url_mgr=url_mgr)
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-    req_mgr = req_mgr or requestManager(url_mgr=url_mgr,source_code=source_code)
+    req_mgr = req_mgr or requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
     return req_mgr
+def get_source(url=None,url_mgr=None,source_code=None):
+    # Placeholder for actual implementation.
+    req_mgr = get_req_mgr(req_mgr=req_mgr,url=url,url_mgr=url_mgr,source_code=source_code)
+    return req_mgr.source_code
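As extracted, the new `get_source` helper references `req_mgr` without accepting it, so calling it as written would raise `NameError`. A sketch of the apparent intent, with a hypothetical `req_mgr=None` parameter added for illustration:

```python
# Hypothetical repair of get_source; the shipped 0.1.5.88 body omits the
# req_mgr parameter that its own call requires. This would sit inside
# requestManager.py, where get_req_mgr is defined.
def get_source(url=None, url_mgr=None, source_code=None, req_mgr=None):
    # Resolve (or reuse) a request manager, then return the fetched source.
    req_mgr = get_req_mgr(req_mgr=req_mgr, url=url, url_mgr=url_mgr,
                          source_code=source_code)
    return req_mgr.source_code
```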
abstract_webtools/managers/seleniumManager.py
CHANGED
@@ -60,9 +60,10 @@ class domainManager(metaclass=SingletonMeta):
         for path in paths[:-1]:
             dir_path = os.path.join(dir_path, path)
             os.makedirs(dir_path, exist_ok=True)
-
-
-
+        self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+
+        dir_path = os.path.join(dir_path, paths[-1])
+        return dir_path
 
     def saved_url_check(self, url):
         path = self.get_url_to_path(url)
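A note on the new `page_type` line: Python's conditional expression binds looser than `or`, so the appended value is `(extension or 'html')` on the first call and the previous entry afterwards. A standalone check with hypothetical inputs:

```python
import os

paths = ["example.com", "index"]       # hypothetical URL path split
page_type = []                         # empty on the first call
ext = os.path.splitext(paths[-1])[-1]  # '' when the last segment has no extension
# `a or b if cond else c` parses as `(a or b) if cond else c`
page_type.append((ext or 'html') if len(page_type) == 0 else page_type[-1])
print(page_type)  # ['html']
```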
abstract_webtools/managers/soupManager.py
CHANGED
@@ -51,6 +51,11 @@ class soupManager:
         source_code = str(source_code)
         self.source_code = source_code
         self.soup= BeautifulSoup(self.source_code, self.parse_type)
+        self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
+        self.all_tags = self.all_tags_and_attribute_names.get('tags')
+        self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
+        self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
+
         self._all_links_data = None
         self._meta_tags_data = None
     def re_initialize(self):
@@ -106,7 +111,7 @@ class soupManager:
         """
         all_urls=[self.url_mgr.url]
         domain = self.url_mgr.domain
-        all_desired=self.
+        all_desired=self.get_all_website_links(tag=tag,attr=attr)
         for tag in all_desired:
             href = tag.attrs.get(attr)
             if href == "" or href is None:
@@ -274,22 +279,50 @@ class soupManager:
         attribute_names_list = list(attribute_names)
         return {"tags":tag_names_list,"attributes":attribute_names_list}
 
-    def get_all_attribute_values(self):
-
-
-
-
-
-
-
-
-
-
-
+    def get_all_attribute_values(self, tag_list=None):
+        """
+        Collects all attribute values for each specified tag or all tags if none are specified.
+
+        Parameters:
+        - tag_list: List of specific tags to retrieve attributes from, e.g., ['script', 'img'].
+          If None, retrieves attributes for all tags.
+
+        Returns:
+        - attribute_values: Dictionary where each key is an attribute and the value is a list of unique values for that attribute.
+        """
+        attribute_values = {}
+        tag_list = tag_list or self.all_tags_and_attributes
+        # Get all tags matching tag_list criteria
+        for tag_name in tag_list:
+            for tag in self.soup.find_all(tag_name):
+                for attr, value in tag.attrs.items():
+                    if attr not in attribute_values:
+                        attribute_values[attr] = set()
+
+                    # Add attribute values
+                    if isinstance(value, list):
+                        attribute_values[attr].update(value)
+                    else:
+                        attribute_values[attr].add(value)
+
+        # Convert each set to a list for consistency
         for attr, values in attribute_values.items():
             attribute_values[attr] = list(values)
+
+        # Capture JavaScript URLs inside <script> tags
+        attribute_values['script_links'] = self.get_js_links()
+
         return attribute_values
+
+    def get_js_links(self):
+        """Extract URLs embedded in JavaScript within <script> tags."""
+        js_links = []
+        script_tags = self.soup.find_all('script')
+        for script in script_tags:
+            # Find URLs in the JavaScript code
+            urls_in_js = re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']', script.get_text())
+            js_links.extend(urls_in_js)
+        return list(set(js_links)) # Remove duplicates
 
     @property
     def url(self):
@@ -309,6 +342,10 @@ class SoupManagerSingleton():
         return SoupManagerSingleton._instance
 def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None):
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-
-
+    url = get_url(url=url,url_mgr=url_mgr)
+    req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
+    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
     return soup_mgr
+def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None):
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
+    return soup_mgr.get_all_attribute_values(tags_list=tags_list)
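The new `get_js_links` method pulls quoted absolute or root-relative URLs out of `<script>` bodies. A self-contained sketch of the same regex against static HTML (assumes `beautifulsoup4` is installed; note the module-level `get_all_attribute_values` wrapper passes `tags_list=` while the method it calls takes `tag_list=`, so this sketch calls the pattern directly rather than going through the wrapper):

```python
import re
from bs4 import BeautifulSoup

html = """<script>
  fetch("https://api.example.com/data");
  const logo = '/assets/logo.png';
</script>"""

soup = BeautifulSoup(html, "html.parser")
js_links = []
for script in soup.find_all("script"):
    # Same pattern the diff adds: quoted http(s) URLs or root-relative paths.
    js_links.extend(re.findall(r'["\'](https?://[^"\']+|/[^"\']+)["\']',
                               script.get_text()))
print(sorted(set(js_links)))  # ['/assets/logo.png', 'https://api.example.com/data']
```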
abstract_webtools/managers/urlManager.py
CHANGED
@@ -218,8 +218,8 @@ def get_url(url=None,url_mgr=None):
         url_mgr = urlManager(url)
     return url_mgr.url
 def get_url_mgr(url=None,url_mgr=None):
-    if
-
-    if url:
-
+    if url_mgr == None and url:
+        url_mgr = urlManager(url=url)
+    if url_mgr and url == None:
+        url = url_mgr.url
     return url_mgr
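`get_url_mgr` now back-fills whichever of `url`/`url_mgr` is missing. The resolution logic in isolation, with a stand-in `urlManager` so the sketch runs on its own (the real class also normalizes and validates the URL):

```python
# Stand-in urlManager for illustration only.
class urlManager:
    def __init__(self, url=None):
        self.url = url

def get_url_mgr(url=None, url_mgr=None):
    if url_mgr == None and url:
        url_mgr = urlManager(url=url)  # build a manager from the bare URL
    if url_mgr and url == None:
        url = url_mgr.url              # recover the URL from the manager
    return url_mgr

print(get_url_mgr(url="https://example.com").url)  # https://example.com
```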
abstract_webtools/url_grabber.py
CHANGED
@@ -25,7 +25,7 @@ def get_attrs(values):
     else:
         tags_js['attribute']=tags_js['attribute'][0]
     return tags_js
-def
+def get_user_agent_mgr(user_agent=None):
     return UserAgentManager(user_agent=user_agent)
 def get_cipher_list():
     return CipherManager().get_default_ciphers()
@@ -128,78 +128,78 @@ def process_url(window,values):
     if warn_url=='' or warn_url == None:
         update_status(window,warn,warn_url,response_code,valid)
         return False
-    temp_url=
+    temp_url=urlManager(url=warn_url).url
     if temp_url:
         valid='valid'
-        response_code =
+        response_code = requestManager(url=temp_mgr).response.status_code
         warn = 'success'
         warn_url = temp_mgr
         update_status(window,warn,warn_url,response_code,valid)
         return temp_mgr
     update_status(window,warn,warn_url,response_code,valid)
     return False
-def update_url(
+def update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
     ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
-
-    if
-
-
-    window['-URL-'].update(value=
-    window['-CIPHERS_OUTPUT-'].update(value=
-    return update_source_code(
+    request_mgr = requestManager(url_mgr=url_mgr,ciphers=ciphers,user_agent=get_user_agents()[0])
+    if request_mgr.source_code:
+        soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
+        link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
+        window['-URL-'].update(value=url_mgr.url)
+        window['-CIPHERS_OUTPUT-'].update(value=request_mgr.ciphers)
+        return update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values)
     else:
-        return
-def update_source_code(
+        return url_mgr,request_mgr,soup_mgr,link_mgr
+def update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values):
     parse_type = values['-parse_type-']
-    if parse_type !=
-
-    all_tags=
-    window['-SOURCECODE-'].update(value=
+    if parse_type != soup_mgr.parse_type:
+        soup_mgr.update_parse_type(parse_type=parse_type)
+    all_tags=soup_mgr.get_all_tags_and_attribute_names()
+    window['-SOURCECODE-'].update(value=soup_mgr.soup)
     if values['-SOUP_TAG-'] != all_tags['tags']:
         window['-SOUP_TAG-'].update(values=all_tags['tags'],value=all_tags['tags'][0])
     if values['-SOUP_ATTRIBUTE-'] != all_tags['attributes']:
         window['-SOUP_ATTRIBUTE-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
         window['-SOUP_ATTRIBUTE_1-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
         window['-SOUP_ATTRIBUTE_2-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
-    return
+    return url_mgr,request_mgr,soup_mgr,link_mgr
 def url_grabber_while(window,initial_url="www.example.com"):
     return_data=None
     url_grab = False
-
-
-
-
+    url_mgr=urlManager(url=initial_url)
+    request_mgr = requestManager(url_mgr=url_mgr)
+    soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
+    link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
     while True:
         event, values = window.read()
         if event == sg.WINDOW_CLOSED:
             break
        if event=='-GRAB_URL-' or not url_grab:
             url=values['-URL-']
-            if
-            if url !=
-
+            if urlManager(url=url).url:
+                if url != url_mgr.url or url == initial_url:
+                    url_mgr = urlManager(url=url)
 
-
-            window['-URL-'].update(value=
+                url_mgr,request_mgr,soup_mgr,link_mgr=update_url(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr,link_mgr=link_mgr,values=values)
+                window['-URL-'].update(value=url_mgr.url)
             url_grab=True
         if event == 'get soup':
             tags_js = get_attrs(values)
-            all_desired=
+            all_desired=soup_mgr.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
             window['-FIND_ALL_OUTPUT-'].update(value=all_desired)
         if event == '-CUSTOMUA-':
             window['-SOURCECODE-'].update(disabled=values['-CUSTOMUA-'])
             if not values['-CUSTOMUA-']:
-                window['-USERAGENT-'].update(value=
+                window['-USERAGENT-'].update(value=user_agent_mgr.user_agent_header)
                 window['-USERAGENT-'].update(disabled=True)
             else:
                 window['-USERAGENT-'].update(disabled=False)
         if event=='Get All Text':
-            window['-FIND_ALL_OUTPUT-'].update(value=
+            window['-FIND_ALL_OUTPUT-'].update(value=soup_mgr.extract_text_sections())
         if event == 'Action':
             parse_type = values['-parse_type-']
-            if parse_type !=
-
-            window['-SOURCECODE-'].update(value=
+            if parse_type != soup_mgr.parse_type:
+                soup_mgr.update_parse_type(parse_type=parse_type)
+                window['-SOURCECODE-'].update(value=soup_mgr.soup)
         elif event == 'Send Soup':
             return_data = values['-FIND_ALL_OUTPUT-']
             break
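The GUI loop now threads a fixed four-manager chain (`urlManager` → `requestManager` → `SoupManager` → `LinkManager`) through every event instead of rebuilding state inline. The chain reduced to its non-GUI core, a sketch assuming these managers are re-exported at the package top level as this module's star-imports suggest:

```python
from abstract_webtools import *  # assumption: managers re-exported at top level

# Manager chain as wired in url_grabber_while, minus the PySimpleGUI window.
url_mgr = urlManager(url="www.example.com")
request_mgr = requestManager(url_mgr=url_mgr)
soup_mgr = SoupManager(url_mgr=url_mgr, request_mgr=request_mgr)
link_mgr = LinkManager(url_mgr=url_mgr, request_mgr=request_mgr, soup_mgr=soup_mgr)

# update_url/update_source_code now return all four managers, so callers rebind:
# url_mgr, request_mgr, soup_mgr, link_mgr = update_url(url_mgr, request_mgr,
#                                                       soup_mgr, link_mgr, values)
```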
{abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.5.86
+Version: 0.1.5.88
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/RECORD
CHANGED
@@ -3,26 +3,26 @@ abstract_webtools/abstract_webtools.py,sha256=6pYoObMhvOnjLiw1oQaNBoX3ipr7QUJYve
 abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSjqZlp2KQON3w,131923
 abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
 abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
-abstract_webtools/url_grabber.py,sha256=
+abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
 abstract_webtools/url_grabber_new.py,sha256=Oh2Kc0gBScCo0xpopNsg8JE5lIbPuzZVKM5f5GoZmw0,3454
 abstract_webtools/managers/__init__.py,sha256=5aIpbdUsDWTrhPUAjfIKnG54OULqOKan9LBL5EIUllo,407
 abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
-abstract_webtools/managers/crawlManager.py,sha256=
+abstract_webtools/managers/crawlManager.py,sha256=9KYI949SqOqj-FS0V2VJFdXaeWoLpfYKfIXwNI6pvJ0,10604
 abstract_webtools/managers/domainManager.py,sha256=N7gvSzBEqItXdOny40mvLytBiNYYNA67qTzt0Qa2qHU,1800
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=gopQcQo50JG2D0KcyepNCIQ_1uDQEBIHBzWf4R2Wgy0,7617
 abstract_webtools/managers/linkManager.py,sha256=m6y9s8jknrTX8RtOAFKeHd4yd23G7Rgf0T7Sp7wmHUw,12180
 abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
 abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
-abstract_webtools/managers/requestManager.py,sha256=
-abstract_webtools/managers/seleniumManager.py,sha256=
-abstract_webtools/managers/soupManager.py,sha256=
+abstract_webtools/managers/requestManager.py,sha256=rqWNxhdBqXDFKBDNqsUVkPwBii7HIIeufpGJNEPzqJw,17303
+abstract_webtools/managers/seleniumManager.py,sha256=lQdqD4YqLCQSkk-Dvk_ZVrq1y-NvbLEvfZc-vJvl2Nw,3525
+abstract_webtools/managers/soupManager.py,sha256=nFHalHxxDXNI2HhDg8ffZYF36LbJ-uWxx4JnRFa3Hhw,16542
 abstract_webtools/managers/sslManager.py,sha256=C-QgQw9CW84uOE5kx2MPjC3RsLbE2JQqdwdTs0H4ecc,1370
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
-abstract_webtools/managers/urlManager.py,sha256=
+abstract_webtools/managers/urlManager.py,sha256=XqMrCM84BeWEfWtHc_8UFpT91ZtG-okzdKdCuC49vsA,8678
 abstract_webtools/managers/userAgentManager.py,sha256=33SB2p2FG7EYZl7l2iYm1U4gI9PcdkGTZHw5lg_Ogrw,1653
 abstract_webtools/managers/videoDownloader.py,sha256=6G_aLc05BTMUYUWc7iqYtHF_BaR7DnCNK_NJ-QnjsYY,10531
-abstract_webtools-0.1.5.
-abstract_webtools-0.1.5.
-abstract_webtools-0.1.5.
-abstract_webtools-0.1.5.
-abstract_webtools-0.1.5.
+abstract_webtools-0.1.5.88.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.5.88.dist-info/METADATA,sha256=9PceK41JVCeU6mPL3uP-UI1LnkuoC8iUIV1ZZOrkl2c,15858
+abstract_webtools-0.1.5.88.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+abstract_webtools-0.1.5.88.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.5.88.dist-info/RECORD,,
{abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/LICENSE
File without changes

{abstract_webtools-0.1.5.86.dist-info → abstract_webtools-0.1.5.88.dist-info}/top_level.txt
File without changes