abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.147__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/__init__.py +6 -0
- abstract_webtools/abstract_webtools.py +1768 -0
- abstract_webtools/managers/cipherManager.py +12 -13
- abstract_webtools/managers/crawlManager.py +35 -26
- abstract_webtools/managers/curlMgr.py +25 -47
- abstract_webtools/managers/meta_dump.py +27 -0
- abstract_webtools/managers/networkManager.py +48 -13
- abstract_webtools/managers/requestManager/requestManager.py +1 -1
- abstract_webtools/managers/seleneumManager.py +1 -0
- abstract_webtools/managers/seleniumManager.py +204 -82
- abstract_webtools/managers/soupManager/soupManager.py +46 -19
- abstract_webtools/managers/sslManager.py +11 -2
- abstract_webtools/managers/userAgentManager.py +31 -40
- abstract_webtools/url_grabber.py +73 -1
- abstract_webtools-0.1.6.147.dist-info/METADATA +482 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/RECORD +19 -18
- abstract_webtools-0.1.6.145.dist-info/METADATA +0 -196
- /abstract_webtools/managers/{allss//.py" → allss.py} +0 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,9 @@
|
|
1
1
|
from ...abstract_webtools import *
|
2
2
|
from ..urlManager import *
|
3
3
|
from ..requestManager import *
|
4
|
+
from bs4 import BeautifulSoup
|
5
|
+
import re, json
|
6
|
+
|
4
7
|
class soupManager:
|
5
8
|
"""
|
6
9
|
SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
|
@@ -39,25 +42,49 @@ class soupManager:
|
|
39
42
|
- The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
|
40
43
|
- It provides various methods to extract data and discover elements within the source code.
|
41
44
|
"""
|
42
|
-
|
43
|
-
|
44
|
-
url =
|
45
|
-
self.
|
46
|
-
self.
|
47
|
-
self.
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
45
|
+
|
46
|
+
def __init__(self, url=None, source_code=None, req_mgr=None, parse_type="html.parser"):
    """Initialize the soup manager.

    Args:
        url: Optional URL the source code came from (stored, not fetched).
        source_code: Raw HTML to parse; falls back to ``req_mgr.source_code``.
        req_mgr: Optional request manager supplying ``source_code``.
        parse_type: Parser name handed to BeautifulSoup (default "html.parser").
    """
    self.url = url
    self.req_mgr = req_mgr
    # Bug fix: re_initialize() reads self.parse_type, which was never stored,
    # so calling it raised AttributeError. Persist the parser name here.
    self.parse_type = parse_type
    self.source_code = (source_code or (req_mgr.source_code if req_mgr else "")) or ""
    self.soup = BeautifulSoup(self.source_code, parse_type)
|
52
|
+
def all_meta(self):
    """Describe every ``<meta>`` tag in the document.

    Returns:
        list[dict]: one dict per tag, holding whichever of the well-known
        meta attributes (name/property/http-equiv/itemprop/charset/content)
        are present and non-empty. Tags with none of them are omitted.
    """
    wanted = ("name", "property", "http-equiv", "itemprop", "charset", "content")
    rows = []
    for tag in self.soup.find_all("meta"):
        row = {key: tag.get(key) for key in wanted if tag.get(key)}
        if row:
            rows.append(row)
    return rows
def citation_dict(self):
    """Collect ``citation_*`` meta tags (Highwire-style scholarly metadata).

    Returns:
        dict[str, list[str]]: lowercase tag name -> list of content values
        (a key may repeat, e.g. one ``citation_author`` per author).
    """
    collected = {}
    for meta in self.soup.find_all("meta"):
        key = (meta.get("name") or meta.get("property") or "").lower()
        content = meta.get("content")
        if content and key.startswith("citation_"):
            collected.setdefault(key, []).append(content)
    return collected
def all_links(self):
    """Describe every ``<link>`` tag in the document.

    Returns:
        list[dict]: one dict per tag with rel/href/type/title/hreflang keys
        (missing attributes come back as None). BeautifulSoup returns ``rel``
        as a list of tokens, so it is joined into a single space-separated
        string for convenience.
    """
    described = []
    for link in self.soup.find_all("link"):
        rel = link.get("rel")
        if isinstance(rel, list):
            rel = " ".join(rel)
        described.append({
            "rel": rel,
            "href": link.get("href"),
            "type": link.get("type"),
            "title": link.get("title"),
            "hreflang": link.get("hreflang"),
        })
    return described
def all_jsonld(self):
    """Parse every JSON-LD ``<script type="application/ld+json">`` block.

    Returns:
        list: decoded JSON objects; blocks that fail to decode are kept as
        ``{"raw": <text>}`` so no structured data is silently dropped.
    """
    blocks = []
    # Case-insensitive match on the MIME type attribute.
    jsonld_type = re.compile(r"application/ld\+json", re.I)
    for script in self.soup.find_all("script", type=jsonld_type):
        txt = script.get_text(strip=True)
        try:
            blocks.append(json.loads(txt))
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; narrowed from
            # the original blanket `except Exception` so real bugs surface.
            blocks.append({"raw": txt})
    return blocks
61
88
|
def re_initialize(self):
|
62
89
|
self.soup= BeautifulSoup(self.source_code, self.parse_type)
|
63
90
|
self._all_links_data = None
|
@@ -1,12 +1,21 @@
|
|
1
1
|
from ..abstract_webtools import *
|
2
|
+
# sslManager.py
|
3
|
+
from ..abstract_webtools import * # must expose ssl, ssl_
|
4
|
+
from .cipherManager import CipherManager # be explicit, safer
|
5
|
+
|
2
6
|
class SSLManager:
    """Build a TLS context with hardened defaults for urllib3 sessions.

    Ciphers default to CipherManager's string, options to the hardened
    flag set below, and certificate handling to ``ssl.CERT_REQUIRED``.
    """

    def __init__(self, ciphers=None, ssl_options=None, certification=None):
        self.ciphers = ciphers or CipherManager().ciphers_string
        self.ssl_options = ssl_options or self.get_default_ssl_settings()
        self.certification = certification or ssl.CERT_REQUIRED
        self.ssl_context = self.get_context()

    def get_default_ssl_settings(self):
        """Return option flags disabling TLS 1.0, TLS 1.1 and TLS compression."""
        return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION

    def get_context(self):
        """Create a urllib3 SSL context from this manager's settings."""
        context = ssl_.create_urllib3_context(
            ciphers=self.ciphers,
            cert_reqs=self.certification,
            options=self.ssl_options,
        )
        return context
@@ -1,29 +1,27 @@
|
|
1
|
+
# userAgentManager.py
|
1
2
|
from ..abstract_webtools import *
|
2
3
|
import random
|
4
|
+
|
3
5
|
operating_systems = ['Macintosh','Windows','Linux']
|
4
6
|
browsers = ['Firefox','Chrome','IceDragon','Waterfox','Gecko','Safari','MetaSr']
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
if
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
return itter_list[0]
|
16
|
-
def get_browser(browser=None):
|
17
|
-
return get_itter(browser,browsers)
|
18
|
-
def get_operating_system(operating_system=None):
|
19
|
-
return get_itter(operating_system,operating_systems)
|
7
|
+
|
8
|
+
def _pick(val, options):
|
9
|
+
if not val: return options[0]
|
10
|
+
if val in options: return val
|
11
|
+
l = val.lower()
|
12
|
+
for o in options:
|
13
|
+
if l in o.lower():
|
14
|
+
return o
|
15
|
+
return options[0]
|
16
|
+
|
20
17
|
class UserAgentManager:
|
21
|
-
def __init__(self, operating_system=None, browser=None, version=None,user_agent=None):
|
22
|
-
self.operating_system =
|
23
|
-
self.browser =
|
18
|
+
def __init__(self, operating_system=None, browser=None, version=None, user_agent=None):
    """Build a user-agent profile.

    Falls back to the first known OS/browser when none matches, to
    version '42.0', and to a database-derived UA string when
    *user_agent* is not supplied.
    """
    self.operating_system = _pick(operating_system, operating_systems)
    self.browser = _pick(browser, browsers)
    self.version = version if version else '42.0'
    if user_agent:
        self.user_agent = user_agent
    else:
        self.user_agent = self.get_user_agent()
    self.header = {"user-agent": self.user_agent}
@staticmethod
|
28
26
|
def user_agent_db():
|
29
27
|
from ..big_user_agent_list import big_user_agent_dict
|
@@ -31,30 +29,23 @@ class UserAgentManager:
|
|
31
29
|
|
32
30
|
def get_user_agent(self):
    """Look up a concrete UA string from the bundled database.

    Walks OS -> browser -> version, substituting a random sibling entry
    at each level when the requested key is absent.
    """
    ua_db = self.user_agent_db()
    by_os = ua_db.get(self.operating_system) or random.choice(list(ua_db.values()))
    by_browser = by_os.get(self.browser) or random.choice(list(by_os.values()))
    try:
        return by_browser[self.version]
    except KeyError:
        return random.choice(list(by_browser.values()))
|
35
|
-
if self.operating_system and self.operating_system in ua_db:
|
36
|
-
operating_system_db = ua_db[self.operating_system]
|
37
|
-
else:
|
38
|
-
operating_system_db = random.choice(list(ua_db.values()))
|
39
|
-
|
40
|
-
if self.browser and self.browser in operating_system_db:
|
41
|
-
browser_db = operating_system_db[self.browser]
|
42
|
-
else:
|
43
|
-
browser_db = random.choice(list(operating_system_db.values()))
|
44
|
-
|
45
|
-
if self.version and self.version in browser_db:
|
46
|
-
return browser_db[self.version]
|
47
|
-
else:
|
48
|
-
return random.choice(list(browser_db.values()))
|
49
|
-
|
50
|
-
def user_agent_header(self):
|
51
|
-
return {"user-agent": self.user_agent}
|
52
38
|
class UserAgentManagerSingleton:
    """Process-wide cache of a single UserAgentManager instance."""
    _instance = None

    @staticmethod
    def get_instance(**kwargs):
        """Return the shared UserAgentManager, building it on first use.

        The cached instance is rebuilt only when a ``user_agent`` kwarg is
        passed that differs from the cached instance's user_agent.
        """
        requested_ua = kwargs.get("user_agent")
        cached = UserAgentManagerSingleton._instance
        if cached is None:
            UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
        elif requested_ua and requested_ua != cached.user_agent:
            # rebuild if user_agent explicitly changed
            UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
        return UserAgentManagerSingleton._instance
|
abstract_webtools/url_grabber.py
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
from abstract_gui import make_component,sg
|
2
2
|
import inspect
|
3
3
|
import re
|
4
|
+
from . import UserAgentManager,UrlManager,SafeRequest,SoupManager,LinkManager,CipherManager,requests,ssl,BeautifulSoup,HTTPAdapter,PoolManager,ssl_
|
5
|
+
<<<<<<< HEAD
|
6
|
+
=======
|
4
7
|
from .managers import *
|
8
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
5
9
|
window = None
|
6
10
|
|
7
11
|
def get_attrs(values):
|
@@ -25,7 +29,11 @@ def get_attrs(values):
|
|
25
29
|
else:
|
26
30
|
tags_js['attribute']=tags_js['attribute'][0]
|
27
31
|
return tags_js
|
32
|
+
<<<<<<< HEAD
|
33
|
+
def get_user_agent_manager(user_agent=None):
|
34
|
+
=======
|
28
35
|
def get_user_agent_mgr(user_agent=None):
|
36
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
29
37
|
return UserAgentManager(user_agent=user_agent)
|
30
38
|
def get_cipher_list():
    """Return CipherManager's default cipher list."""
    manager = CipherManager()
    return manager.get_default_ciphers()
|
@@ -128,17 +136,43 @@ def process_url(window,values):
|
|
128
136
|
if warn_url=='' or warn_url == None:
|
129
137
|
update_status(window,warn,warn_url,response_code,valid)
|
130
138
|
return False
|
139
|
+
temp_url=UrlManager(url=warn_url).url
|
140
|
+
if temp_url:
|
141
|
+
valid='valid'
|
142
|
+
response_code = SafeRequest(url=temp_mgr).response.status_code
|
143
|
+
<<<<<<< HEAD
|
144
|
+
=======
|
131
145
|
temp_url=urlManager(url=warn_url).url
|
132
146
|
if temp_url:
|
133
147
|
valid='valid'
|
134
148
|
response_code = requestManager(url=temp_mgr).response.status_code
|
149
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
135
150
|
warn = 'success'
|
136
151
|
warn_url = temp_mgr
|
137
152
|
update_status(window,warn,warn_url,response_code,valid)
|
138
153
|
return temp_mgr
|
139
154
|
update_status(window,warn,warn_url,response_code,valid)
|
140
155
|
return False
|
141
|
-
def update_url(
|
156
|
+
def update_url(url_manager,request_manager,soup_manager,link_manager,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
|
157
|
+
ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
|
158
|
+
request_manager = SafeRequest(url_manager=url_manager,ciphers=ciphers,user_agent=get_user_agents()[0])
|
159
|
+
if request_manager.source_code:
|
160
|
+
soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
|
161
|
+
link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
|
162
|
+
window['-URL-'].update(value=url_manager.url)
|
163
|
+
window['-CIPHERS_OUTPUT-'].update(value=request_manager.ciphers)
|
164
|
+
return update_source_code(url_manager,request_manager,soup_manager,link_manager,values)
|
165
|
+
else:
|
166
|
+
return url_manager,request_manager,soup_manager,link_manager
|
167
|
+
def update_source_code(url_manager,request_manager,soup_manager,link_manager,values):
|
168
|
+
parse_type = values['-parse_type-']
|
169
|
+
if parse_type != soup_manager.parse_type:
|
170
|
+
soup_manager.update_parse_type(parse_type=parse_type)
|
171
|
+
all_tags=soup_manager.get_all_tags_and_attribute_names()
|
172
|
+
<<<<<<< HEAD
|
173
|
+
window['-SOURCECODE-'].update(value=soup_manager.soup)
|
174
|
+
=======
|
175
|
+
window['-SOURCECODE-'].update(value=soup_manager.soupdef update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
|
142
176
|
ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
|
143
177
|
request_mgr = requestManager(url_mgr=url_mgr,ciphers=ciphers,user_agent=get_user_agents()[0])
|
144
178
|
if request_mgr.source_code:
|
@@ -155,12 +189,23 @@ def update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values):
|
|
155
189
|
soup_mgr.update_parse_type(parse_type=parse_type)
|
156
190
|
all_tags=soup_mgr.get_all_tags_and_attribute_names()
|
157
191
|
window['-SOURCECODE-'].update(value=soup_mgr.soup)
|
192
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
158
193
|
if values['-SOUP_TAG-'] != all_tags['tags']:
|
159
194
|
window['-SOUP_TAG-'].update(values=all_tags['tags'],value=all_tags['tags'][0])
|
160
195
|
if values['-SOUP_ATTRIBUTE-'] != all_tags['attributes']:
|
161
196
|
window['-SOUP_ATTRIBUTE-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
|
162
197
|
window['-SOUP_ATTRIBUTE_1-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
|
163
198
|
window['-SOUP_ATTRIBUTE_2-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
|
199
|
+
return url_manager,request_manager,soup_manager,link_manager
|
200
|
+
def url_grabber_while(window,initial_url="www.example.com"):
|
201
|
+
return_data=None
|
202
|
+
url_grab = False
|
203
|
+
url_manager=UrlManager(url=initial_url)
|
204
|
+
request_manager = SafeRequest(url_manager=url_manager)
|
205
|
+
soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
|
206
|
+
link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
|
207
|
+
<<<<<<< HEAD
|
208
|
+
=======
|
164
209
|
return url_mgr,request_mgr,soup_mgr,link_mgr
|
165
210
|
def url_grabber_while(window,initial_url="www.example.com"):
|
166
211
|
return_data=None
|
@@ -169,12 +214,25 @@ def url_grabber_while(window,initial_url="www.example.com"):
|
|
169
214
|
request_mgr = requestManager(url_mgr=url_mgr)
|
170
215
|
soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
|
171
216
|
link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
|
217
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
172
218
|
while True:
|
173
219
|
event, values = window.read()
|
174
220
|
if event == sg.WINDOW_CLOSED:
|
175
221
|
break
|
176
222
|
if event=='-GRAB_URL-' or not url_grab:
|
177
223
|
url=values['-URL-']
|
224
|
+
if UrlManager(url=url).url:
|
225
|
+
if url != url_manager.url or url == initial_url:
|
226
|
+
url_manager = UrlManager(url=url)
|
227
|
+
|
228
|
+
url_manager,request_manager,soup_manager,link_manager=update_url(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager,link_manager=link_manager,values=values)
|
229
|
+
window['-URL-'].update(value=url_manager.url)
|
230
|
+
url_grab=True
|
231
|
+
if event == 'get soup':
|
232
|
+
tags_js = get_attrs(values)
|
233
|
+
all_desired=soup_manager.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
|
234
|
+
<<<<<<< HEAD
|
235
|
+
=======
|
178
236
|
if urlManager(url=url).url:
|
179
237
|
if url != url_mgr.url or url == initial_url:
|
180
238
|
url_mgr = urlManager(url=url)
|
@@ -185,21 +243,35 @@ def url_grabber_while(window,initial_url="www.example.com"):
|
|
185
243
|
if event == 'get soup':
|
186
244
|
tags_js = get_attrs(values)
|
187
245
|
all_desired=soup_mgr.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
|
246
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
188
247
|
window['-FIND_ALL_OUTPUT-'].update(value=all_desired)
|
189
248
|
if event == '-CUSTOMUA-':
|
190
249
|
window['-SOURCECODE-'].update(disabled=values['-CUSTOMUA-'])
|
191
250
|
if not values['-CUSTOMUA-']:
|
251
|
+
window['-USERAGENT-'].update(value=user_agent_manager.user_agent_header)
|
252
|
+
<<<<<<< HEAD
|
253
|
+
=======
|
192
254
|
window['-USERAGENT-'].update(value=user_agent_mgr.user_agent_header)
|
255
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
193
256
|
window['-USERAGENT-'].update(disabled=True)
|
194
257
|
else:
|
195
258
|
window['-USERAGENT-'].update(disabled=False)
|
196
259
|
if event=='Get All Text':
|
260
|
+
window['-FIND_ALL_OUTPUT-'].update(value=soup_manager.extract_text_sections())
|
261
|
+
if event == 'Action':
|
262
|
+
parse_type = values['-parse_type-']
|
263
|
+
if parse_type != soup_manager.parse_type:
|
264
|
+
soup_manager.update_parse_type(parse_type=parse_type)
|
265
|
+
window['-SOURCECODE-'].update(value=soup_manager.soup)
|
266
|
+
<<<<<<< HEAD
|
267
|
+
=======
|
197
268
|
window['-FIND_ALL_OUTPUT-'].update(value=soup_mgr.extract_text_sections())
|
198
269
|
if event == 'Action':
|
199
270
|
parse_type = values['-parse_type-']
|
200
271
|
if parse_type != soup_mgr.parse_type:
|
201
272
|
soup_mgr.update_parse_type(parse_type=parse_type)
|
202
273
|
window['-SOURCECODE-'].update(value=soup_mgr.soup)
|
274
|
+
>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
|
203
275
|
elif event == 'Send Soup':
|
204
276
|
return_data = values['-FIND_ALL_OUTPUT-']
|
205
277
|
break
|