abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.147__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,9 @@
1
1
  from ...abstract_webtools import *
2
2
  from ..urlManager import *
3
3
  from ..requestManager import *
4
+ from bs4 import BeautifulSoup
5
+ import re, json
6
+
4
7
  class soupManager:
5
8
  """
6
9
  SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
@@ -39,25 +42,49 @@ class soupManager:
39
42
  - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
40
43
  - It provides various methods to extract data and discover elements within the source code.
41
44
  """
42
- def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup=None, parse_type="html.parser"):
43
- self.soup=[]
44
- url = get_url(url=url,url_mgr=url_mgr)
45
- self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
46
- self.url=self.url_mgr.url
47
- self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
48
- self.parse_type = parse_type
49
- source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
50
- if source_code:
51
- source_code = str(source_code)
52
- self.source_code = source_code or ''
53
- self.soup= soup or BeautifulSoup(self.source_code, self.parse_type)
54
- self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
55
- self.all_tags = self.all_tags_and_attribute_names.get('tags')
56
- self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
57
- self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
58
-
59
- self._all_links_data = None
60
- self._meta_tags_data = None
45
+
46
+ def __init__(self, url=None, source_code=None, req_mgr=None, parse_type="html.parser"):
47
+ self.url = url
48
+ self.req_mgr = req_mgr
49
+ self.source_code = (source_code or (req_mgr.source_code if req_mgr else "")) or ""
50
+ self.soup = BeautifulSoup(self.source_code, parse_type)
51
+
52
+ def all_meta(self):
53
+ out = []
54
+ for m in self.soup.find_all("meta"):
55
+ row = {}
56
+ for k in ("name","property","http-equiv","itemprop","charset","content"):
57
+ v = m.get(k)
58
+ if v: row[k] = v
59
+ if row: out.append(row)
60
+ return out
61
+
62
+ def citation_dict(self):
63
+ out = {}
64
+ for m in self.soup.find_all("meta"):
65
+ k = (m.get("name") or m.get("property") or "").lower()
66
+ if k.startswith("citation_") and m.get("content"):
67
+ out.setdefault(k, []).append(m["content"])
68
+ return out
69
+
70
+ def all_links(self):
71
+ res = []
72
+ for l in self.soup.find_all("link"):
73
+ rel = l.get("rel")
74
+ if isinstance(rel, list): rel = " ".join(rel)
75
+ res.append({
76
+ "rel": rel, "href": l.get("href"),
77
+ "type": l.get("type"), "title": l.get("title"), "hreflang": l.get("hreflang")
78
+ })
79
+ return res
80
+
81
+ def all_jsonld(self):
82
+ blocks = []
83
+ for s in self.soup.find_all("script", type=re.compile("application/ld\\+json", re.I)):
84
+ txt = s.get_text(strip=True)
85
+ try: blocks.append(json.loads(txt))
86
+ except Exception: blocks.append({"raw": txt})
87
+ return blocks
61
88
  def re_initialize(self):
62
89
  self.soup= BeautifulSoup(self.source_code, self.parse_type)
63
90
  self._all_links_data = None
@@ -1,12 +1,21 @@
1
1
  from ..abstract_webtools import *
2
+ # sslManager.py
3
+ from ..abstract_webtools import * # must expose ssl, ssl_
4
+ from .cipherManager import CipherManager # be explicit, safer
5
+
2
6
class SSLManager:
    """
    Bundle cipher, option and verification settings into an SSL context.

    Attributes set by the constructor:
        ciphers: Cipher string; defaults to ``CipherManager().ciphers_string``.
        ssl_options: Bit mask of ``ssl.OP_*`` flags.
        certification: Certificate verification mode (``ssl.CERT_*``).
        ssl_context: Context built by ``get_context()`` from the above.
    """

    def __init__(self, ciphers=None, ssl_options=None, certification=None):
        """
        Args:
            ciphers: Cipher string; a falsy value selects the CipherManager default.
            ssl_options: ``ssl.OP_*`` bit mask; None selects the defaults
                from ``get_default_ssl_settings()``.
            certification: ``ssl.CERT_*`` mode; None selects ``ssl.CERT_REQUIRED``.
        """
        self.ciphers = ciphers or CipherManager().ciphers_string
        # Compare against None rather than truthiness: ssl.CERT_NONE and an
        # options mask of 0 are both falsy yet perfectly valid explicit choices.
        if ssl_options is None:
            ssl_options = self.get_default_ssl_settings()
        if certification is None:
            certification = ssl.CERT_REQUIRED
        self.ssl_options = ssl_options
        self.certification = certification
        self.ssl_context = self.get_context()

    def get_default_ssl_settings(self):
        """Return the default option mask: no TLS 1.0, no TLS 1.1, no compression."""
        return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION

    def get_context(self):
        """Build the context via ``ssl_.create_urllib3_context`` from the stored settings."""
        return ssl_.create_urllib3_context(
            ciphers=self.ciphers,
            cert_reqs=self.certification,
            options=self.ssl_options,
        )
@@ -1,29 +1,27 @@
1
+ # userAgentManager.py
1
2
  from ..abstract_webtools import *
2
3
  import random
4
+
3
5
  operating_systems = ['Macintosh','Windows','Linux']
4
6
  browsers = ['Firefox','Chrome','IceDragon','Waterfox','Gecko','Safari','MetaSr']
5
- def get_itter(iter_input,itter_list):
6
- if not iter_input:
7
- return itter_list[0]
8
- if iter_input in itter_list:
9
- return iter_input
10
- iter_input_lower = iter_input.lower()
11
- for itter in itter_list:
12
- itter_lower = itter.lower()
13
- if iter_input_lower in itter_lower:
14
- return itter
15
- return itter_list[0]
16
- def get_browser(browser=None):
17
- return get_itter(browser,browsers)
18
- def get_operating_system(operating_system=None):
19
- return get_itter(operating_system,operating_systems)
7
+
8
+ def _pick(val, options):
9
+ if not val: return options[0]
10
+ if val in options: return val
11
+ l = val.lower()
12
+ for o in options:
13
+ if l in o.lower():
14
+ return o
15
+ return options[0]
16
+
20
17
  class UserAgentManager:
21
- def __init__(self, operating_system=None, browser=None, version=None,user_agent=None):
22
- self.operating_system = get_operating_system(operating_system=operating_system)
23
- self.browser = get_browser(browser=browser)
18
+ def __init__(self, operating_system=None, browser=None, version=None, user_agent=None):
19
+ self.operating_system = _pick(operating_system, operating_systems)
20
+ self.browser = _pick(browser, browsers)
24
21
  self.version = version or '42.0'
25
22
  self.user_agent = user_agent or self.get_user_agent()
26
- self.header = self.user_agent_header()
23
+ self.header = {"user-agent": self.user_agent}
24
+
27
25
  @staticmethod
28
26
  def user_agent_db():
29
27
  from ..big_user_agent_list import big_user_agent_dict
@@ -31,30 +29,23 @@ class UserAgentManager:
31
29
 
32
30
  def get_user_agent(self):
33
31
  ua_db = self.user_agent_db()
32
+ os_db = ua_db.get(self.operating_system) or random.choice(list(ua_db.values()))
33
+ br_db = os_db.get(self.browser) or random.choice(list(os_db.values()))
34
+ if self.version in br_db:
35
+ return br_db[self.version]
36
+ return random.choice(list(br_db.values()))
34
37
 
35
- if self.operating_system and self.operating_system in ua_db:
36
- operating_system_db = ua_db[self.operating_system]
37
- else:
38
- operating_system_db = random.choice(list(ua_db.values()))
39
-
40
- if self.browser and self.browser in operating_system_db:
41
- browser_db = operating_system_db[self.browser]
42
- else:
43
- browser_db = random.choice(list(operating_system_db.values()))
44
-
45
- if self.version and self.version in browser_db:
46
- return browser_db[self.version]
47
- else:
48
- return random.choice(list(browser_db.values()))
49
-
50
- def user_agent_header(self):
51
- return {"user-agent": self.user_agent}
52
38
class UserAgentManagerSingleton:
    """Hold one shared UserAgentManager, rebuilt when a different user agent is requested."""

    _instance = None

    @staticmethod
    def get_instance(user_agent=None, **kwargs):
        """
        Return the shared UserAgentManager, creating or replacing it as needed.

        Args:
            user_agent: Optional user-agent string. Accepted positionally or
                by keyword, so callers of the earlier ``get_instance(ua)``
                form keep working.
            **kwargs: Further keyword arguments forwarded to UserAgentManager
                whenever an instance is (re)built.

        Returns:
            The existing instance, unless none exists yet or a truthy
            ``user_agent`` differs from the one it holds; in those cases a
            new instance is built and stored.
        """
        if user_agent is not None:
            kwargs["user_agent"] = user_agent
        current = UserAgentManagerSingleton._instance
        # Rebuild only when the caller explicitly asks for a different user agent
        changed = bool(user_agent) and current is not None and user_agent != current.user_agent
        if current is None or changed:
            UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
        return UserAgentManagerSingleton._instance
@@ -1,7 +1,11 @@
1
1
  from abstract_gui import make_component,sg
2
2
  import inspect
3
3
  import re
4
+ from . import UserAgentManager,UrlManager,SafeRequest,SoupManager,LinkManager,CipherManager,requests,ssl,BeautifulSoup,HTTPAdapter,PoolManager,ssl_
5
+ <<<<<<< HEAD
6
+ =======
4
7
  from .managers import *
8
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
5
9
  window = None
6
10
 
7
11
  def get_attrs(values):
@@ -25,7 +29,11 @@ def get_attrs(values):
25
29
  else:
26
30
  tags_js['attribute']=tags_js['attribute'][0]
27
31
  return tags_js
32
def get_user_agent_mgr(user_agent=None):
    """Return a UserAgentManager built for ``user_agent`` (None selects its default)."""
    return UserAgentManager(user_agent=user_agent)


# The unresolved merge conflict carried two spellings of this function's name;
# both stay callable so code written against either side keeps working.
get_user_agent_manager = get_user_agent_mgr
30
38
  def get_cipher_list():
31
39
  return CipherManager().get_default_ciphers()
@@ -128,17 +136,43 @@ def process_url(window,values):
128
136
  if warn_url=='' or warn_url == None:
129
137
  update_status(window,warn,warn_url,response_code,valid)
130
138
  return False
139
+ temp_url=UrlManager(url=warn_url).url
140
+ if temp_url:
141
+ valid='valid'
142
+ response_code = SafeRequest(url=temp_mgr).response.status_code
143
+ <<<<<<< HEAD
144
+ =======
131
145
  temp_url=urlManager(url=warn_url).url
132
146
  if temp_url:
133
147
  valid='valid'
134
148
  response_code = requestManager(url=temp_mgr).response.status_code
149
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
135
150
  warn = 'success'
136
151
  warn_url = temp_mgr
137
152
  update_status(window,warn,warn_url,response_code,valid)
138
153
  return temp_mgr
139
154
  update_status(window,warn,warn_url,response_code,valid)
140
155
  return False
141
- def update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
156
+ def update_url(url_manager,request_manager,soup_manager,link_manager,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
157
+ ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
158
+ request_manager = SafeRequest(url_manager=url_manager,ciphers=ciphers,user_agent=get_user_agents()[0])
159
+ if request_manager.source_code:
160
+ soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
161
+ link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
162
+ window['-URL-'].update(value=url_manager.url)
163
+ window['-CIPHERS_OUTPUT-'].update(value=request_manager.ciphers)
164
+ return update_source_code(url_manager,request_manager,soup_manager,link_manager,values)
165
+ else:
166
+ return url_manager,request_manager,soup_manager,link_manager
167
+ def update_source_code(url_manager,request_manager,soup_manager,link_manager,values):
168
+ parse_type = values['-parse_type-']
169
+ if parse_type != soup_manager.parse_type:
170
+ soup_manager.update_parse_type(parse_type=parse_type)
171
+ all_tags=soup_manager.get_all_tags_and_attribute_names()
172
+ <<<<<<< HEAD
173
+ window['-SOURCECODE-'].update(value=soup_manager.soup)
174
+ =======
175
+ window['-SOURCECODE-'].update(value=soup_manager.soupdef update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
142
176
  ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
143
177
  request_mgr = requestManager(url_mgr=url_mgr,ciphers=ciphers,user_agent=get_user_agents()[0])
144
178
  if request_mgr.source_code:
@@ -155,12 +189,23 @@ def update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values):
155
189
  soup_mgr.update_parse_type(parse_type=parse_type)
156
190
  all_tags=soup_mgr.get_all_tags_and_attribute_names()
157
191
  window['-SOURCECODE-'].update(value=soup_mgr.soup)
192
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
158
193
  if values['-SOUP_TAG-'] != all_tags['tags']:
159
194
  window['-SOUP_TAG-'].update(values=all_tags['tags'],value=all_tags['tags'][0])
160
195
  if values['-SOUP_ATTRIBUTE-'] != all_tags['attributes']:
161
196
  window['-SOUP_ATTRIBUTE-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
162
197
  window['-SOUP_ATTRIBUTE_1-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
163
198
  window['-SOUP_ATTRIBUTE_2-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
199
+ return url_manager,request_manager,soup_manager,link_manager
200
+ def url_grabber_while(window,initial_url="www.example.com"):
201
+ return_data=None
202
+ url_grab = False
203
+ url_manager=UrlManager(url=initial_url)
204
+ request_manager = SafeRequest(url_manager=url_manager)
205
+ soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
206
+ link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
207
+ <<<<<<< HEAD
208
+ =======
164
209
  return url_mgr,request_mgr,soup_mgr,link_mgr
165
210
  def url_grabber_while(window,initial_url="www.example.com"):
166
211
  return_data=None
@@ -169,12 +214,25 @@ def url_grabber_while(window,initial_url="www.example.com"):
169
214
  request_mgr = requestManager(url_mgr=url_mgr)
170
215
  soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
171
216
  link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
217
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
172
218
  while True:
173
219
  event, values = window.read()
174
220
  if event == sg.WINDOW_CLOSED:
175
221
  break
176
222
  if event=='-GRAB_URL-' or not url_grab:
177
223
  url=values['-URL-']
224
+ if UrlManager(url=url).url:
225
+ if url != url_manager.url or url == initial_url:
226
+ url_manager = UrlManager(url=url)
227
+
228
+ url_manager,request_manager,soup_manager,link_manager=update_url(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager,link_manager=link_manager,values=values)
229
+ window['-URL-'].update(value=url_manager.url)
230
+ url_grab=True
231
+ if event == 'get soup':
232
+ tags_js = get_attrs(values)
233
+ all_desired=soup_manager.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
234
+ <<<<<<< HEAD
235
+ =======
178
236
  if urlManager(url=url).url:
179
237
  if url != url_mgr.url or url == initial_url:
180
238
  url_mgr = urlManager(url=url)
@@ -185,21 +243,35 @@ def url_grabber_while(window,initial_url="www.example.com"):
185
243
  if event == 'get soup':
186
244
  tags_js = get_attrs(values)
187
245
  all_desired=soup_mgr.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
246
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
188
247
  window['-FIND_ALL_OUTPUT-'].update(value=all_desired)
189
248
  if event == '-CUSTOMUA-':
190
249
  window['-SOURCECODE-'].update(disabled=values['-CUSTOMUA-'])
191
250
  if not values['-CUSTOMUA-']:
251
+ window['-USERAGENT-'].update(value=user_agent_manager.user_agent_header)
252
+ <<<<<<< HEAD
253
+ =======
192
254
  window['-USERAGENT-'].update(value=user_agent_mgr.user_agent_header)
255
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
193
256
  window['-USERAGENT-'].update(disabled=True)
194
257
  else:
195
258
  window['-USERAGENT-'].update(disabled=False)
196
259
  if event=='Get All Text':
260
+ window['-FIND_ALL_OUTPUT-'].update(value=soup_manager.extract_text_sections())
261
+ if event == 'Action':
262
+ parse_type = values['-parse_type-']
263
+ if parse_type != soup_manager.parse_type:
264
+ soup_manager.update_parse_type(parse_type=parse_type)
265
+ window['-SOURCECODE-'].update(value=soup_manager.soup)
266
+ <<<<<<< HEAD
267
+ =======
197
268
  window['-FIND_ALL_OUTPUT-'].update(value=soup_mgr.extract_text_sections())
198
269
  if event == 'Action':
199
270
  parse_type = values['-parse_type-']
200
271
  if parse_type != soup_mgr.parse_type:
201
272
  soup_mgr.update_parse_type(parse_type=parse_type)
202
273
  window['-SOURCECODE-'].update(value=soup_mgr.soup)
274
+ >>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
203
275
  elif event == 'Send Soup':
204
276
  return_data = values['-FIND_ALL_OUTPUT-']
205
277
  break