abstract-webtools 0.1.6.146__py3-none-any.whl → 0.1.6.147__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ from ..cipherManager import *
 from ..sslManager import *
 from ..tlsAdapter import *
 from ..networkManager import *
-from ..seleniumManager import *
+from ..seleneumManager import *
 from ..urlManager import *
 logging.basicConfig(level=logging.INFO)
 
@@ -239,3 +239,4 @@ def get_driver(self, url):
         key = f"{url}#{time.time()}"
         self._sessions[key] = {"driver": driver, "profile": prof}
         return driver
+seleneumManager = seleniumManager
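
This hunk pairs the import rename with a module-level alias: binding the alternate spelling `seleneumManager` to the existing class keeps both names importable. A minimal, self-contained illustration of the aliasing pattern (the class body here is a stand-in, not the real one):

# Hypothetical illustration of the aliasing pattern used above: binding a
# second name to an existing class keeps older import spellings working.
class seleniumManager:
    pass

seleneumManager = seleniumManager   # alias, exactly as in the diff
assert seleneumManager is seleniumManager  # same object, two names
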
@@ -0,0 +1,241 @@
+import os, time, re, json, logging, urllib3, requests, tempfile, shutil, socket, atexit, errno
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup  # if you prefer, keep using your parser
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from abstract_security import get_env_value
+from abstract_utilities import *
+from .urlManager import *  # your urlManager
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("selenium").setLevel(logging.WARNING)
+
+# ---- Chrome options (keep yours; add safe fallbacks) ----
+chrome_options = Options()
+_bin = get_env_value('CHROME_BINARY')
+if _bin:
+    chrome_options.binary_location = _bin
+chrome_options.add_argument("--headless=new")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--disable-dev-shm-usage")
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-software-rasterizer")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-port=9222")
+chrome_prefs = {"profile.managed_default_content_settings.images": 2}
+chrome_options.experimental_options["prefs"] = chrome_prefs
+
+MIN_HTML_BYTES = 2048  # tune: consider <2KB suspicious for real pages
+# --- NEW helpers: unique temp profile + free port + options builder ---
+
+def _free_port() -> int:
+    s = socket.socket()
+    s.bind(("127.0.0.1", 0))
+    port = s.getsockname()[1]
+    s.close()
+    return port
+
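
`_free_port` relies on the standard port-0 trick: binding to port 0 asks the kernel for a free ephemeral port. A quick sketch of the behavior; note the small inherent race, since the port is released on close() and could in principle be claimed before Chrome binds it via --remote-debugging-port:

# Quick illustration of the port-0 trick used by _free_port():
import socket

with socket.socket() as s:
    s.bind(("127.0.0.1", 0))
    print(s.getsockname()[1])  # e.g. 49731; a different free port each run
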
+def _make_profile_dir(base="/var/tmp/selenium-profiles") -> str:
+    os.makedirs(base, exist_ok=True)
+    return tempfile.mkdtemp(prefix="cw-", dir=base)
+
+def _make_chrome_options(binary_path: str | None = None,
+                         user_data_dir: str | None = None) -> tuple[Options, str]:
+    opts = Options()
+    if binary_path:
+        opts.binary_location = binary_path
+    opts.add_argument("--headless=new")
+    opts.add_argument("--no-sandbox")
+    opts.add_argument("--disable-dev-shm-usage")
+    opts.add_argument("--disable-gpu")
+    opts.add_argument("--disable-software-rasterizer")
+    opts.add_argument("--disable-extensions")
+
+    prof = user_data_dir or _make_profile_dir()
+    opts.add_argument(f"--user-data-dir={prof}")
+    opts.add_argument(f"--remote-debugging-port={_free_port()}")
+
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    opts.add_experimental_option("prefs", prefs)
+    return opts, prof
+
+
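
Each call to the builder yields an isolated Options object with its own profile directory and debugging port, which is what lets several Chrome instances coexist without the "user data directory is already in use" failure. A minimal usage sketch, assuming Chrome and chromedriver are installed (the URL and binary path are illustrative):

# Minimal usage sketch for the builder above: every call gets a fresh
# profile dir and a unique debug port, so two drivers can run side by side.
opts, prof = _make_chrome_options(binary_path=None)   # or e.g. "/usr/bin/chromium"
driver = webdriver.Chrome(options=opts)
try:
    driver.get("https://example.com")
finally:
    driver.quit()
    shutil.rmtree(prof, ignore_errors=True)  # release the user-data-dir
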
+def _looks_like_html(text_or_bytes: bytes | str) -> bool:
+    if not text_or_bytes:
+        return False
+    s = text_or_bytes if isinstance(text_or_bytes, str) else text_or_bytes.decode("utf-8", "ignore")
+    if len(s) < MIN_HTML_BYTES:
+        return False
+    lowered = s.lower()
+    return ("<html" in lowered and "</html>" in lowered) or "<body" in lowered
+
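
The heuristic rejects short or tag-free payloads; note that anything under MIN_HTML_BYTES fails even when well formed. A few illustrative cases:

# Illustrative cases for the heuristic above (assuming MIN_HTML_BYTES = 2048):
_looks_like_html("")                                   # False: empty
_looks_like_html("<html></html>")                      # False: under 2 KB
_looks_like_html("<html>" + "x" * 3000 + "</html>")    # True: long enough, has tags
_looks_like_html(b"{'json': true}" * 500)              # False: no html/body tags
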
+def _requests_fallback(url: str, headers: dict | None = None, timeout: float = 15.0):
+    """Plain requests fallback. Returns `requests.Response | None`."""
+    try:
+        sess = requests.Session()
+        sess.headers.update(headers or {"User-Agent": "Mozilla/5.0"})
+        # honor simple redirects and cert issues as needed
+        resp = sess.get(url, timeout=timeout, allow_redirects=True, verify=False)
+        return resp
+    except Exception as e:
+        logging.warning(f"requests fallback failed for {url}: {e}")
+        return None
+
+def _wait_until_ready(driver, timeout: float = 10.0):
+    """Waits for DOM readiness and presence of <body>."""
+    try:
+        WebDriverWait(driver, timeout).until(
+            lambda d: d.execute_script("return document.readyState") in ("interactive", "complete")
+        )
+    except Exception:
+        pass
+    try:
+        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+    except Exception:
+        pass
+    # small settle delay for late JS injections
+    time.sleep(0.3)
+def normalize_url(url, base_url=None):
+    manager = seleniumManager(url)
+    base_url = manager.base_url
+    if url.startswith(base_url):
+        url = url[len(base_url):]
+    normalized_url = urljoin(base_url, url.split('#')[0])
+    if not normalized_url.startswith(base_url):
+        return None
+    return normalized_url
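
normalize_url strips the fragment, resolves the path against the site's base URL, and returns None for anything that escapes the origin. A worked example of the intended behavior (URLs illustrative; note that because seleniumManager is a singleton, base_url stays fixed by the first URL the process passes in):

# Expected results, assuming the manager was first created for https://example.com:
normalize_url("https://example.com/docs/page#intro")
#   -> "https://example.com/docs/page"   (fragment stripped)
normalize_url("https://example.com/a/../b")
#   -> "https://example.com/b"           (dot segments resolved by urljoin)
# A result that no longer starts with base_url returns None.
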
+# ---- Singleton driver manager (your class; small fixes) ----
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
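
SingletonMeta caches one instance per class, so every seleniumManager(url) call after the first returns the original object; combined with the initialized guard below, later URLs never change base_url. A small demonstration of the metaclass semantics (the Config class is a stand-in):

# Demonstration of the metaclass semantics: one cached instance per class.
class Config(metaclass=SingletonMeta):
    def __init__(self, name):
        self.name = name

a = Config("first")
b = Config("second")       # __call__ returns the cached instance...
assert a is b
assert a.name == "first"   # ...so the second constructor call is ignored
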
+class seleniumManager(metaclass=SingletonMeta):
+    def __init__(self, url):
+        if getattr(self, "initialized", False):
+            return
+        self.initialized = True
+
+        p = urlparse(url)
+        self.domain = p.netloc
+        self.scheme = p.scheme or "https"
+        self.base_url = f"{self.scheme}://{self.domain}"
+
+        self.site_dir = os.path.join("/var/tmp", "cw-sites", self.domain)
+        os.makedirs(self.site_dir, exist_ok=True)
+
+        self._sessions: dict[str, dict] = {}  # key -> {"driver": ..., "profile": ...}
+        atexit.register(lambda sm=self: sm.close_all())
+
+    def get_url_to_path(self, url):
+        url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
+        p = urlparse(url)
+        if p.netloc == self.domain:
+            parts = [x for x in p.path.split('/') if x]
+            d = self.site_dir
+            for seg in parts[:-1]:
+                d = os.path.join(d, seg)
+                os.makedirs(d, exist_ok=True)
+            last = parts[-1] if parts else "index.html"
+            ext = os.path.splitext(last)[-1] or ".html"
+            if not hasattr(self, "page_type"):
+                self.page_type = []
+            self.page_type.append(ext if not self.page_type else self.page_type[-1])
+            return os.path.join(d, last)
+
+    def get_with_netloc(self, url):
+        p = urlparse(url)
+        if p.netloc == '':
+            url = f"{self.scheme}://{self.domain}/{url.strip().lstrip('/')}"
+        return url
+
+    def get_driver(self, url) -> tuple[str, webdriver.Chrome]:
+        bin_path = get_env_value('CHROME_BINARY')
+        opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+        driver = webdriver.Chrome(options=opts)
+        key = f"{url}#{time.time()}"
+        self._sessions[key] = {"driver": driver, "profile": prof}
+        return key, driver
+
+    def close_driver(self, key: str):
+        sess = self._sessions.pop(key, None)
+        if not sess: return
+        try:
+            try: sess["driver"].quit()
+            except Exception: pass
+        finally:
+            shutil.rmtree(sess.get("profile") or "", ignore_errors=True)
+
+    def close_all(self):
+        for key in list(self._sessions.keys()):
+            self.close_driver(key)
+
+
+
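
get_driver returns a (key, driver) pair so each session can be released individually: close_driver quits the browser and deletes its profile directory, and close_all is also registered via atexit as a safety net. A minimal lifecycle sketch, assuming the class above and a working Chrome install (URLs illustrative):

# Minimal session lifecycle: each get_driver() call creates a fresh Chrome
# with its own profile; close_driver() quits it and removes that profile.
manager = seleniumManager("https://example.com")
key, driver = manager.get_driver("https://example.com/page")
try:
    driver.get("https://example.com/page")
    html = driver.page_source
finally:
    manager.close_driver(key)   # or manager.close_all() for everything
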
+# ---- Hardened page-source retrieval with fallback ----
+def get_selenium_source(url, max_retries: int = 2, request_fallback: bool = True, timeout: float = 12.0):
+    url_mgr = urlManager(url)
+    if not url_mgr.url:
+        return None
+    url = str(url_mgr.url)
+
+    manager = seleniumManager(url)
+    key, driver = manager.get_driver(url)
+
+    last_exc = None
+    try:
+        for attempt in range(1, max_retries + 1):
+            try:
+                driver.get(url)
+                _wait_until_ready(driver, timeout=timeout)
+                html = driver.page_source or ""
+                if not _looks_like_html(html):
+                    html = driver.execute_script(
+                        "return document.documentElement ? document.documentElement.outerHTML : '';"
+                    ) or html
+                if _looks_like_html(html):
+                    return html
+                logging.warning(f"Selenium returned suspicious HTML (len={len(html)}) for {url} "
+                                f"[attempt {attempt}/{max_retries}]")
+            except Exception as e:
+                last_exc = e
+                logging.warning(f"Selenium attempt {attempt}/{max_retries} failed for {url}: {e}")
+            time.sleep(0.5 * attempt)
+
+        if request_fallback:
+            resp = _requests_fallback(url, headers={"User-Agent": "Mozilla/5.0"})
+            if resp is not None:
+                ctype = (resp.headers.get("content-type") or "").lower()
+                body = resp.text if hasattr(resp, "text") else (
+                    resp.content.decode("utf-8", "ignore") if hasattr(resp, "content") else ""
+                )
+                if "application/json" in ctype:
+                    try:
+                        return json.dumps(resp.json())
+                    except Exception:
+                        return body
+                return body if _looks_like_html(body) or body else None
+    finally:
+        # critical: release the user-data-dir to avoid "already in use"
+        manager.close_driver(key)
+
+    if last_exc:
+        logging.error(f"Unable to retrieve page for {url}: {last_exc}")
+    return None
+
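
The retrieval flow is: render with Selenium, retry with linear backoff, then optionally fall back to a plain requests GET; the result is HTML, a JSON string from a JSON endpoint, or None. A minimal call sketch (URL illustrative):

# Calling the hardened fetcher above: HTML on success, a JSON string when
# the fallback hits an application/json endpoint, None when everything fails.
html = get_selenium_source("https://example.com", max_retries=2)
if html:
    soup = BeautifulSoup(html, "html.parser")
    print(soup.title.string if soup.title else "no <title>")
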
+def get_driver(self, url):
+    # always new
+    bin_path = get_env_value('CHROME_BINARY')
+    opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+    driver = webdriver.Chrome(options=opts)
+    # store so close_all() can clean up
+    key = f"{url}#{time.time()}"
+    self._sessions[key] = {"driver": driver, "profile": prof}
+    return driver
@@ -1,7 +1,11 @@
 from abstract_gui import make_component,sg
 import inspect
 import re
+from . import UserAgentManager,UrlManager,SafeRequest,SoupManager,LinkManager,CipherManager,requests,ssl,BeautifulSoup,HTTPAdapter,PoolManager,ssl_
+<<<<<<< HEAD
+=======
 from .managers import *
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
 window = None
 
 def get_attrs(values):
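
The `<<<<<<< HEAD`, `=======`, and `>>>>>>> ba4baf2 (...)` lines in this and the following hunks are unresolved git merge-conflict markers that were published in the wheel; they are not valid Python, so importing this module fails at parse time. A quick illustration:

# Conflict markers are not valid Python; a module shipped with them
# raises SyntaxError as soon as it is parsed or imported.
import ast
snippet = "<<<<<<< HEAD\nx = 1\n=======\nx = 2\n>>>>>>> ba4baf2\n"
try:
    ast.parse(snippet)
except SyntaxError as exc:
    print("SyntaxError:", exc.msg)
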
@@ -25,7 +29,11 @@ def get_attrs(values):
     else:
         tags_js['attribute']=tags_js['attribute'][0]
     return tags_js
+<<<<<<< HEAD
+def get_user_agent_manager(user_agent=None):
+=======
 def get_user_agent_mgr(user_agent=None):
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
     return UserAgentManager(user_agent=user_agent)
 def get_cipher_list():
     return CipherManager().get_default_ciphers()
@@ -128,17 +136,43 @@ def process_url(window,values):
     if warn_url=='' or warn_url == None:
         update_status(window,warn,warn_url,response_code,valid)
         return False
+    temp_url=UrlManager(url=warn_url).url
+    if temp_url:
+        valid='valid'
+        response_code = SafeRequest(url=temp_mgr).response.status_code
+<<<<<<< HEAD
+=======
     temp_url=urlManager(url=warn_url).url
     if temp_url:
         valid='valid'
         response_code = requestManager(url=temp_mgr).response.status_code
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
         warn = 'success'
         warn_url = temp_mgr
         update_status(window,warn,warn_url,response_code,valid)
         return temp_mgr
     update_status(window,warn,warn_url,response_code,valid)
     return False
-def update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
+def update_url(url_manager,request_manager,soup_manager,link_manager,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
+    ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
+    request_manager = SafeRequest(url_manager=url_manager,ciphers=ciphers,user_agent=get_user_agents()[0])
+    if request_manager.source_code:
+        soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
+        link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
+        window['-URL-'].update(value=url_manager.url)
+        window['-CIPHERS_OUTPUT-'].update(value=request_manager.ciphers)
+        return update_source_code(url_manager,request_manager,soup_manager,link_manager,values)
+    else:
+        return url_manager,request_manager,soup_manager,link_manager
+def update_source_code(url_manager,request_manager,soup_manager,link_manager,values):
+    parse_type = values['-parse_type-']
+    if parse_type != soup_manager.parse_type:
+        soup_manager.update_parse_type(parse_type=parse_type)
+    all_tags=soup_manager.get_all_tags_and_attribute_names()
+<<<<<<< HEAD
+    window['-SOURCECODE-'].update(value=soup_manager.soup)
+=======
+    window['-SOURCECODE-'].update(value=soup_manager.soupdef update_url(url_mgr,request_mgr,soup_mgr,link_mgr,values,cipher_list=get_cipher_list(),user_agent=get_user_agents()[0]):
     ciphers = CipherManager(cipher_list=cipher_list).ciphers_string
     request_mgr = requestManager(url_mgr=url_mgr,ciphers=ciphers,user_agent=get_user_agents()[0])
     if request_mgr.source_code:
@@ -155,12 +189,23 @@ def update_source_code(url_mgr,request_mgr,soup_mgr,link_mgr,values):
         soup_mgr.update_parse_type(parse_type=parse_type)
     all_tags=soup_mgr.get_all_tags_and_attribute_names()
     window['-SOURCECODE-'].update(value=soup_mgr.soup)
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
     if values['-SOUP_TAG-'] != all_tags['tags']:
         window['-SOUP_TAG-'].update(values=all_tags['tags'],value=all_tags['tags'][0])
     if values['-SOUP_ATTRIBUTE-'] != all_tags['attributes']:
         window['-SOUP_ATTRIBUTE-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
         window['-SOUP_ATTRIBUTE_1-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
         window['-SOUP_ATTRIBUTE_2-'].update(values=all_tags['attributes'],value=all_tags['attributes'][0])
+    return url_manager,request_manager,soup_manager,link_manager
+def url_grabber_while(window,initial_url="www.example.com"):
+    return_data=None
+    url_grab = False
+    url_manager=UrlManager(url=initial_url)
+    request_manager = SafeRequest(url_manager=url_manager)
+    soup_manager= SoupManager(url_manager=url_manager,request_manager=request_manager)
+    link_manager= LinkManager(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager)
+<<<<<<< HEAD
+=======
     return url_mgr,request_mgr,soup_mgr,link_mgr
 def url_grabber_while(window,initial_url="www.example.com"):
     return_data=None
@@ -169,12 +214,25 @@ def url_grabber_while(window,initial_url="www.example.com"):
     request_mgr = requestManager(url_mgr=url_mgr)
     soup_mgr= SoupManager(url_mgr=url_mgr,request_mgr=request_mgr)
     link_mgr= LinkManager(url_mgr=url_mgr,request_mgr=request_mgr,soup_mgr=soup_mgr)
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
     while True:
         event, values = window.read()
         if event == sg.WINDOW_CLOSED:
             break
         if event=='-GRAB_URL-' or not url_grab:
             url=values['-URL-']
+            if UrlManager(url=url).url:
+                if url != url_manager.url or url == initial_url:
+                    url_manager = UrlManager(url=url)
+
+                url_manager,request_manager,soup_manager,link_manager=update_url(url_manager=url_manager,request_manager=request_manager,soup_manager=soup_manager,link_manager=link_manager,values=values)
+                window['-URL-'].update(value=url_manager.url)
+                url_grab=True
+        if event == 'get soup':
+            tags_js = get_attrs(values)
+            all_desired=soup_manager.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
+<<<<<<< HEAD
+=======
             if urlManager(url=url).url:
                 if url != url_mgr.url or url == initial_url:
                     url_mgr = urlManager(url=url)
@@ -185,21 +243,35 @@
         if event == 'get soup':
             tags_js = get_attrs(values)
             all_desired=soup_mgr.find_tags_by_attributes(tag=tags_js['tag'], attr=tags_js['attribute'],attr_values=tags_js['input'])
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
         window['-FIND_ALL_OUTPUT-'].update(value=all_desired)
         if event == '-CUSTOMUA-':
             window['-SOURCECODE-'].update(disabled=values['-CUSTOMUA-'])
             if not values['-CUSTOMUA-']:
+                window['-USERAGENT-'].update(value=user_agent_manager.user_agent_header)
+<<<<<<< HEAD
+=======
                 window['-USERAGENT-'].update(value=user_agent_mgr.user_agent_header)
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
                 window['-USERAGENT-'].update(disabled=True)
             else:
                 window['-USERAGENT-'].update(disabled=False)
         if event=='Get All Text':
+            window['-FIND_ALL_OUTPUT-'].update(value=soup_manager.extract_text_sections())
+        if event == 'Action':
+            parse_type = values['-parse_type-']
+            if parse_type != soup_manager.parse_type:
+                soup_manager.update_parse_type(parse_type=parse_type)
+            window['-SOURCECODE-'].update(value=soup_manager.soup)
+<<<<<<< HEAD
+=======
            window['-FIND_ALL_OUTPUT-'].update(value=soup_mgr.extract_text_sections())
         if event == 'Action':
             parse_type = values['-parse_type-']
             if parse_type != soup_mgr.parse_type:
                 soup_mgr.update_parse_type(parse_type=parse_type)
                 window['-SOURCECODE-'].update(value=soup_mgr.soup)
+>>>>>>> ba4baf2 (Deploy version 0.1.6.147 at 2025-09-07 09:40:38 UTC)
         elif event == 'Send Soup':
             return_data = values['-FIND_ALL_OUTPUT-']
             break
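
url_grabber_while follows the standard PySimpleGUI pattern: block on window.read(), dispatch on the event key, and push results back through element.update(). A stripped-down sketch of that pattern (element keys and layout are illustrative, not the package's actual window):

# Stripped-down version of the event-loop pattern used above (PySimpleGUI).
import PySimpleGUI as sg

window = sg.Window("url grabber", [[sg.Input(key='-URL-'), sg.Button('-GRAB_URL-')],
                                   [sg.Multiline(key='-SOURCECODE-')]])
while True:
    event, values = window.read()        # blocks until an event fires
    if event == sg.WINDOW_CLOSED:
        break
    if event == '-GRAB_URL-':
        window['-SOURCECODE-'].update(value=f"fetched {values['-URL-']}")
window.close()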