abstract-webtools 0.1.6.144__tar.gz → 0.1.6.145__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/setup.py +1 -1
  3. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/__init__.py +2 -1
  4. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/requestManager/requestManager.py +31 -19
  5. abstract_webtools-0.1.6.145/src/abstract_webtools/managers/seleneumManager.py +241 -0
  6. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  7. abstract_webtools-0.1.6.144/src/abstract_webtools/managers/seleneumManager.py +0 -116
  8. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/README.md +0 -0
  9. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/pyproject.toml +0 -0
  10. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/setup.cfg +0 -0
  11. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/__init__.py +0 -0
  12. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/abstract_usurpit.py +0 -0
  13. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/abstract_webtools.py +0 -0
  14. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/big_user_agent_list.py +0 -0
  15. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/domain_identifier.py +0 -0
  16. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/extention_list.py +0 -0
  17. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/find_dirs.py +0 -0
  18. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/k2s_downloader.py +0 -0
  19. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/main.py +0 -0
  20. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/allss//.py" +0 -0
  21. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/cipherManager.py +0 -0
  22. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
  23. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
  24. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/crawlManager.py +0 -0
  25. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  26. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/curlMgr.py +0 -0
  27. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/domainManager.py +0 -0
  28. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  29. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/get_test.py +0 -0
  30. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  31. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  32. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/__init__.py +0 -0
  33. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/imports.py +0 -0
  34. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py +0 -0
  35. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +0 -0
  36. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/__init__.py +0 -0
  37. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/legacy_tools.py +0 -0
  38. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  39. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/networkManager.py +0 -0
  40. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  41. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  42. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  43. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  44. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/sslManager.py +0 -0
  45. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  46. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  47. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
  48. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
  49. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  50. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  51. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  52. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/soup_gui.py +0 -0
  53. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/url_grabber.py +0 -0
  54. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools/url_grabber_new.py +0 -0
  55. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  56. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  57. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/requires.txt +0 -0
  58. {abstract_webtools-0.1.6.144 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.144
3
+ Version: 0.1.6.145
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
4
4
  long_description = fh.read()
5
5
  setuptools.setup(
6
6
  name='abstract_webtools',
7
- version='0.1.6.144',
7
+ version='0.1.6.145',
8
8
  author='putkoff',
9
9
  author_email='partners@abstractendeavors.com',
10
10
  description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -10,6 +10,7 @@ from .sslManager import *
10
10
  from .tlsAdapter import *
11
11
  from .urlManager import *
12
12
  from .userAgentManager import *
13
- from .seleniumManager import *
13
+ from .seleneumManager import *
14
14
  from .videoDownloader import *
15
15
  from .middleManager import *
16
+ seleniumManager = seleneumManager
@@ -328,10 +328,11 @@ class requestManager:
328
328
 
329
329
  def make_request(self):
330
330
  """
331
- Make a request and handle potential errors.
331
+ Make a request and handle potential errors, with retries.
332
332
  """
333
333
  if self.url_mgr.url is None:
334
334
  return None
335
+
335
336
  self.wait_between_requests()
336
337
  for _ in range(self.max_retries):
337
338
  try:
@@ -345,41 +346,52 @@ class requestManager:
345
346
  elif self._response.status_code == 429:
346
347
  logging.warning(f"Rate limited by {self.url_mgr.url}. Retrying...")
347
348
  time.sleep(5)
349
+ else:
350
+ # String/bytes from Selenium path
351
+ self.status_code = 200
352
+ return self._response
348
353
  except requests.Timeout as e:
349
354
  logging.error(f"Request to {self.url_mgr.url} timed out: {e}")
350
355
  except requests.ConnectionError:
351
356
  logging.error(f"Connection error for URL {self.url_mgr.url}.")
352
357
  except requests.RequestException as e:
353
358
  logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
354
- try:
355
- response = get_selenium_source(self.url_mgr.url)
356
- if response:
357
- self._response = response
358
- self.status_code = 200 # Assume success
359
- return self._response
360
- except Exception as e:
361
- logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries: {e}")
362
- return None
359
+
360
+ logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries")
361
+ return None
363
362
 
364
363
  def try_request(self) -> requests.Response | str | bytes | None:
365
364
  """
366
- Tries to make an HTTP request to the given URL using the provided session.
365
+ Tries Selenium first, then falls back to requests if Selenium fails.
367
366
  """
368
367
  if self.url_mgr.url is None:
369
368
  return None
369
+
370
+ # 1. Try Selenium
370
371
  try:
371
- return get_selenium_source(self.url_mgr.url) # or self.session.get(self.url_mgr.url, timeout=self.timeout, stream=self.stream)
372
+ return get_selenium_source(self.url_mgr.url)
373
+ except Exception as e:
374
+ logging.warning(f"Selenium failed for {self.url_mgr.url}, falling back to requests: {e}")
375
+
376
+ # 2. Fallback: requests
377
+ try:
378
+ resp = self.session.get(
379
+ self.url_mgr.url,
380
+ timeout=self.timeout or 10,
381
+ stream=self.stream
382
+ )
383
+ return resp
372
384
  except requests.RequestException as e:
373
- logging.error(f"Request failed: {e}")
385
+ logging.error(f"Requests fallback also failed for {self.url_mgr.url}: {e}")
374
386
  return None
375
387
 
376
- @property
377
- def url(self):
378
- return self.url_mgr.url
388
+ @property
389
+ def url(self):
390
+ return self.url_mgr.url
379
391
 
380
- @url.setter
381
- def url(self, new_url):
382
- self._url = new_url
392
+ @url.setter
393
+ def url(self, new_url):
394
+ self._url = new_url
383
395
  class SafeRequestSingleton:
384
396
  _instance = None
385
397
  @staticmethod
@@ -0,0 +1,241 @@
1
+ import os, time, re, json, logging, urllib3, requests,tempfile, shutil, socket, atexit, errno
2
+ from urllib.parse import urlparse, urljoin
3
+ from bs4 import BeautifulSoup # if you prefer, keep using your parser
4
+ from selenium import webdriver
5
+ from selenium.webdriver.chrome.options import Options
6
+ from selenium.webdriver.common.by import By
7
+ from selenium.webdriver.support.ui import WebDriverWait
8
+ from selenium.webdriver.support import expected_conditions as EC
9
+ from abstract_security import get_env_value
10
+ from abstract_utilities import *
11
+ from .urlManager import * # your urlManager
12
+
13
# Silence TLS-verification warnings and chatty third-party loggers.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
for _noisy in ("urllib3", "selenium"):
    logging.getLogger(_noisy).setLevel(logging.WARNING)

# ---- Module-level Chrome options (shared, fixed debugging port) ----
chrome_options = Options()
_bin = get_env_value('CHROME_BINARY')
if _bin:
    # Only override the binary when the environment actually names one.
    chrome_options.binary_location = _bin
for _flag in (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--disable-software-rasterizer",
    "--disable-extensions",
    "--remote-debugging-port=9222",
):
    chrome_options.add_argument(_flag)
# NOTE(review): value 2 presumably blocks image loading -- confirm against Chrome prefs.
chrome_prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.experimental_options["prefs"] = chrome_prefs

# Pages shorter than this many characters are treated as suspicious by _looks_like_html.
MIN_HTML_BYTES = 2048
33
+ # --- NEW helpers: unique temp profile + free port + options builder ---
34
+
35
+ def _free_port() -> int:
36
+ s = socket.socket()
37
+ s.bind(("127.0.0.1", 0))
38
+ port = s.getsockname()[1]
39
+ s.close()
40
+ return port
41
+
42
+ def _make_profile_dir(base="/var/tmp/selenium-profiles") -> str:
43
+ os.makedirs(base, exist_ok=True)
44
+ return tempfile.mkdtemp(prefix="cw-", dir=base)
45
+
46
def _make_chrome_options(binary_path: str | None = None,
                         user_data_dir: str | None = None) -> tuple[Options, str]:
    """Build headless Chrome options and report which profile directory they use.

    Args:
        binary_path: Chrome executable to launch; left at Selenium's default
            when falsy.
        user_data_dir: existing profile directory to reuse; a new one is
            created with ``_make_profile_dir()`` when falsy.

    Returns:
        ``(options, profile_dir)`` so the caller can delete the profile later.
    """
    opts = Options()
    if binary_path:
        opts.binary_location = binary_path

    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-software-rasterizer",
        "--disable-extensions",
    ):
        opts.add_argument(flag)

    # A per-call profile and debugging port keep separate Chrome launches
    # from sharing a user-data-dir or a port.
    prof = user_data_dir if user_data_dir else _make_profile_dir()
    opts.add_argument(f"--user-data-dir={prof}")
    opts.add_argument(f"--remote-debugging-port={_free_port()}")

    # NOTE(review): value 2 presumably blocks image loading -- confirm against Chrome prefs.
    opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    return opts, prof
65
+
66
+
67
def _looks_like_html(text_or_bytes: bytes | str) -> bool:
    """Heuristically decide whether the payload is a real HTML page.

    Empty input and payloads shorter than ``MIN_HTML_BYTES`` characters are
    rejected. Otherwise the text must contain ``<body``, or both ``<html``
    and ``</html>`` (case-insensitive). Bytes are decoded as UTF-8 with
    undecodable sequences dropped.
    """
    if not text_or_bytes:
        return False

    if isinstance(text_or_bytes, str):
        text = text_or_bytes
    else:
        text = text_or_bytes.decode("utf-8", "ignore")

    if len(text) < MIN_HTML_BYTES:
        return False

    haystack = text.lower()
    if "<body" in haystack:
        return True
    return "<html" in haystack and "</html>" in haystack
75
+
76
def _requests_fallback(url: str, headers: dict | None = None, timeout: float = 15.0):
    """Fetch *url* with plain ``requests``; return the ``Response`` or ``None``.

    Args:
        url: address to GET.
        headers: request headers; a minimal ``User-Agent`` is sent when falsy.
        timeout: seconds passed to ``Session.get``.

    Returns:
        The ``requests.Response`` on success, ``None`` if anything raised
        (the failure is logged as a warning).

    Redirects are followed and TLS certificate verification is disabled.
    """
    try:
        # Closing the session releases its pooled connections. The body is
        # already downloaded (no stream=True), so the response stays usable.
        with requests.Session() as sess:
            sess.headers.update(headers or {"User-Agent": "Mozilla/5.0"})
            return sess.get(url, timeout=timeout, allow_redirects=True, verify=False)
    except Exception as e:
        logging.warning(f"requests fallback failed for {url}: {e}")
        return None
87
+
88
def _wait_until_ready(driver, timeout: float = 10.0):
    """Best-effort wait until the page loaded in *driver* is usable.

    Waits up to *timeout* seconds for ``document.readyState`` to become
    ``interactive`` or ``complete``, then up to *timeout* seconds for a
    ``<body>`` element. Neither wait is fatal: a failure is logged at debug
    level and the function carries on, so the caller can still inspect
    whatever the page contains.
    """
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.readyState") in ("interactive", "complete")
        )
    except Exception as e:
        logging.debug(f"readyState wait did not complete within {timeout}s: {e}")
    try:
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except Exception as e:
        logging.debug(f"<body> wait did not complete within {timeout}s: {e}")
    # small settle delay for late JS injections
    time.sleep(0.3)
102
def normalize_url(url, base_url=None):
    """Resolve *url* against a base URL and keep it only if it stays on that base.

    Args:
        url: absolute or relative URL; any ``#fragment`` is dropped.
        base_url: ``scheme://host`` prefix to resolve against. When omitted
            (or empty) the ``base_url`` of the ``seleniumManager`` singleton
            is used.

    Returns:
        The normalized absolute URL, or ``None`` when the result does not
        start with the base URL.
    """
    # Honour an explicit base; only fall back to the singleton when none is
    # given. The original overwrote the argument unconditionally.
    if not base_url:
        base_url = seleniumManager(url).base_url
    # Strip an already-present base prefix so urljoin re-attaches it once.
    if url.startswith(base_url):
        url = url[len(base_url):]
    normalized_url = urljoin(base_url, url.split('#')[0])
    if not normalized_url.startswith(base_url):
        return None
    return normalized_url
111
+ # ---- Singleton driver manager (your class; small fixes) ----
112
class SingletonMeta(type):
    """Metaclass that returns one shared instance per class.

    The first call constructs the instance with the given arguments; every
    later call returns that same object and ignores its arguments.
    """

    # Maps each class using this metaclass to its single instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        created = super().__call__(*args, **kwargs)
        registry[cls] = created
        return created
119
+
120
class seleniumManager(metaclass=SingletonMeta):
    """Singleton that launches throw-away Chrome sessions and tracks them for cleanup.

    Because of ``SingletonMeta`` only the first construction runs the body of
    ``__init__``, so ``domain``, ``scheme``, ``base_url`` and ``site_dir``
    describe the first URL ever passed in. Every driver handed out by
    ``get_driver`` is recorded in ``_sessions`` together with its temporary
    profile directory, and all of them are closed at interpreter exit.
    """

    def __init__(self, url):
        # Guard against re-initialisation should __init__ be invoked again.
        if getattr(self, "initialized", False):
            return
        self.initialized = True

        p = urlparse(url)
        self.domain = p.netloc
        self.scheme = p.scheme or "https"
        self.base_url = f"{self.scheme}://{self.domain}"

        self.site_dir = os.path.join("/var/tmp", "cw-sites", self.domain)
        os.makedirs(self.site_dir, exist_ok=True)

        # key -> {"driver": ..., "profile": ...}
        self._sessions: dict[str, dict] = {}
        atexit.register(lambda sm=self: sm.close_all())

    def get_url_to_path(self, url):
        """Map a same-domain URL to a file path under ``site_dir``.

        Intermediate directories are created on disk. Returns ``None`` when
        the URL's host differs from ``self.domain``.
        """
        # presumably eatAll trims the listed characters from both ends -- verify
        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
        p = urlparse(url)
        if p.netloc != self.domain:
            return None

        parts = [x for x in p.path.split('/') if x]
        d = self.site_dir
        for seg in parts[:-1]:
            d = os.path.join(d, seg)
            os.makedirs(d, exist_ok=True)
        last = parts[-1] if parts else "index.html"
        ext = os.path.splitext(last)[-1] or ".html"
        if not hasattr(self, "page_type"):
            self.page_type = []
        # Records the first extension seen and repeats it on later calls.
        # NOTE(review): confirm this is intended.
        self.page_type.append(ext if not self.page_type else self.page_type[-1])
        return os.path.join(d, last)

    def get_with_netloc(self, url):
        """Prefix a host-less *url* with this manager's scheme and domain."""
        p = urlparse(url)
        if p.netloc == '':
            url = f"{self.scheme}://{self.domain}/{url.strip().lstrip('/')}"
        return url

    def get_driver(self, url) -> tuple[str, webdriver.Chrome]:
        """Start a new Chrome driver with its own profile; return ``(key, driver)``.

        The key identifies the session for ``close_driver``. Raises whatever
        ``webdriver.Chrome`` raises when the browser cannot be started.
        """
        bin_path = get_env_value('CHROME_BINARY')
        opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
        try:
            driver = webdriver.Chrome(options=opts)
        except Exception:
            # No session gets registered, so nothing else would ever delete
            # the profile directory that was just created.
            shutil.rmtree(prof, ignore_errors=True)
            raise
        key = f"{url}#{time.time()}"
        self._sessions[key] = {"driver": driver, "profile": prof}
        return key, driver

    def close_driver(self, key: str):
        """Quit the driver registered under *key* and delete its profile directory.

        Unknown keys are ignored. A failing ``quit()`` never prevents the
        profile directory from being removed.
        """
        sess = self._sessions.pop(key, None)
        if not sess:
            return
        try:
            sess["driver"].quit()
        except Exception:
            # Best effort: the browser may already be gone.
            pass
        finally:
            profile = sess.get("profile")
            if profile:
                shutil.rmtree(profile, ignore_errors=True)

    def close_all(self):
        """Close every session that is still registered."""
        # Iterate over a snapshot because close_driver mutates _sessions.
        for key in list(self._sessions):
            self.close_driver(key)
179
+
180
+
181
+
182
+ # ---- Hardened page-source retrieval with fallback ----
183
def get_selenium_source(url, max_retries: int = 2, request_fallback: bool = True, timeout: float = 12.0):
    """Return the page source for *url*, preferring Selenium over plain requests.

    Args:
        url: page address; it is normalised through ``urlManager`` first.
        max_retries: number of Selenium load attempts.
        request_fallback: when true, try ``_requests_fallback`` if Selenium
            produced nothing usable.
        timeout: seconds handed to ``_wait_until_ready`` on each attempt.

    Returns:
        The HTML (or, from the fallback, a JSON/text body) as ``str``, or
        ``None`` when ``urlManager`` yields no URL or nothing was retrieved.

    Raises:
        Whatever ``seleniumManager.get_driver`` raised when Chrome could not
        be started and no fallback content was obtained.
    """
    url_mgr = urlManager(url)
    if not url_mgr.url:
        return None
    url = str(url_mgr.url)

    manager = seleniumManager(url)
    key = None
    startup_exc = None
    last_exc = None
    try:
        try:
            key, driver = manager.get_driver(url)
        except Exception as e:
            # A browser that cannot start must not bypass the requests
            # fallback. Remember the error and re-raise it below only if the
            # fallback yields nothing.
            if not request_fallback:
                raise
            startup_exc = last_exc = e
            logging.warning(f"Selenium could not start for {url}, trying requests fallback: {e}")
        else:
            for attempt in range(1, max_retries + 1):
                try:
                    driver.get(url)
                    _wait_until_ready(driver, timeout=timeout)
                    html = driver.page_source or ""
                    if not _looks_like_html(html):
                        # page_source looked wrong; ask the DOM directly.
                        html = driver.execute_script(
                            "return document.documentElement ? document.documentElement.outerHTML : '';"
                        ) or html
                    if _looks_like_html(html):
                        return html
                    logging.warning(f"Selenium returned suspicious HTML (len={len(html)}) for {url} "
                                    f"[attempt {attempt}/{max_retries}]")
                except Exception as e:
                    last_exc = e
                    logging.warning(f"Selenium attempt {attempt}/{max_retries} failed for {url}: {e}")
                # Linear backoff between attempts; pointless after the last one.
                if attempt < max_retries:
                    time.sleep(0.5 * attempt)

        if request_fallback:
            resp = _requests_fallback(url, headers={"User-Agent": "Mozilla/5.0"})
            if resp is not None:
                ctype = (resp.headers.get("content-type") or "").lower()
                body = resp.text
                if "application/json" in ctype:
                    try:
                        # Re-serialise so the caller gets canonical JSON text.
                        return json.dumps(resp.json())
                    except Exception:
                        return body
                if body:
                    return body
    finally:
        # critical: release the user-data-dir to avoid "already in use"
        if key is not None:
            manager.close_driver(key)

    if startup_exc is not None:
        raise startup_exc
    if last_exc:
        logging.error(f"Unable to retrieve page for {url}: {last_exc}")
    return None
232
+
233
def get_driver(self, url):
    """Start a new Chrome driver, register it on *self* and return the driver.

    NOTE(review): this appears to be a module-level function that takes
    ``self`` and duplicates ``seleniumManager.get_driver``, except that it
    returns only the driver and not ``(key, driver)``. A session created here
    can therefore only be released through ``self.close_all()``. Confirm
    whether it is still needed.

    Raises whatever ``webdriver.Chrome`` raises when the browser cannot start.
    """
    # always new
    bin_path = get_env_value('CHROME_BINARY')
    opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
    try:
        driver = webdriver.Chrome(options=opts)
    except Exception:
        # Nothing gets registered on failure, so delete the profile
        # directory that was just created instead of leaking it.
        shutil.rmtree(prof, ignore_errors=True)
        raise
    # store so close_all() can clean up
    key = f"{url}#{time.time()}"
    self._sessions[key] = {"driver": driver, "profile": prof}
    return driver
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.144
3
+ Version: 0.1.6.145
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -1,116 +0,0 @@
1
- import os
2
- #from ..abstract_webtools import urlManager
3
- from .urlManager import *
4
- from urllib.parse import urlparse
5
- from abstract_utilities import *
6
- from selenium import webdriver
7
- from selenium.webdriver.chrome.options import Options
8
- import logging
9
- import urllib3
10
- from abstract_security import get_env_value
11
- # Suppress urllib3 warnings and debug logs
12
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
13
- logging.getLogger("urllib3").setLevel(logging.WARNING)
14
-
15
- # Suppress Selenium logs
16
- logging.getLogger("selenium").setLevel(logging.WARNING)
17
-
18
-
19
- # Setup Chrome options
20
- chrome_options = Options()
21
- chrome_options.binary_location = get_env_value('CHROME_BINARY')
22
- chrome_options.add_argument("--headless") # Run in headless mode
23
- chrome_options.add_argument("--no-sandbox")
24
- chrome_options.add_argument("--disable-dev-shm-usage")
25
- chrome_options.add_argument("--disable-gpu")
26
- chrome_options.add_argument("--disable-software-rasterizer")
27
- chrome_options.add_argument("--disable-extensions")
28
- chrome_options.add_argument("--remote-debugging-port=9222")
29
-
30
-
31
-
32
- class SingletonMeta(type):
33
- _instances = {}
34
- def __call__(cls, *args, **kwargs):
35
- if cls not in cls._instances:
36
- instance = super().__call__(*args, **kwargs)
37
- cls._instances[cls] = instance
38
- return cls._instances[cls]
39
-
40
- class seleniumManager(metaclass=SingletonMeta):
41
- def __init__(self, url):
42
- if not hasattr(self, 'initialized'): # Prevent reinitialization
43
- self.initialized = True
44
- parsed_url = urlparse(url)
45
- self.domain = parsed_url.netloc
46
- self.scheme = parsed_url.scheme
47
- self.base_url= f"{self.scheme}{self.domain}"
48
- self.site_dir = os.path.join(os.getcwd(), self.domain)
49
- os.makedirs(self.site_dir, exist_ok=True)
50
- self.drivers = {}
51
- self.page_type = []
52
-
53
- def get_url_to_path(self, url):
54
- url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
55
- parsed_url = urlparse(url)
56
- if parsed_url.netloc == self.domain:
57
- paths = parsed_url.path.split('/')
58
- dir_path = self.site_dir
59
- for path in paths[:-1]:
60
- dir_path = os.path.join(dir_path, path)
61
- os.makedirs(dir_path, exist_ok=True)
62
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
63
-
64
- dir_path = os.path.join(dir_path, paths[-1])
65
- return dir_path
66
-
67
- def saved_url_check(self, url):
68
- path = self.get_url_to_path(url)
69
- return path
70
-
71
- def get_with_netloc(self, url):
72
- parsed_url = urlparse(url)
73
- if parsed_url.netloc == '':
74
- url = f"{self.scheme}://{self.domain}/{url.strip()}"
75
- return url
76
-
77
- def get_driver(self, url):
78
- if url and url not in self.drivers:
79
- # chrome_options = Options()
80
- # chrome_options.add_argument("--headless")
81
- driver = webdriver.Chrome(options=chrome_options)
82
- self.drivers[url] = driver
83
- driver.get(url)
84
- return self.drivers[url]
85
- def normalize_url(url, base_url=None):
86
- """
87
- Normalize and resolve relative URLs, ensuring proper domain and format.
88
- """
89
- # If URL starts with the base URL repeated, remove the extra part
90
- manager = seleniumManager(url)
91
- base_url = manager.base_url
92
- if url.startswith(base_url):
93
- url = url[len(base_url):]
94
-
95
- # Resolve the URL against the base URL
96
- normalized_url = urljoin(base_url, url.split('#')[0])
97
-
98
- # Ensure only URLs belonging to the base domain are kept
99
- if not normalized_url.startswith(base_url):
100
- return None
101
-
102
- return normalized_url
103
- # Function to get Selenium page source
104
- def get_selenium_source(url):
105
- url_mgr = urlManager(url)
106
- if url_mgr.url:
107
- url = str(url_mgr.url)
108
- manager = seleniumManager(url)
109
- driver = manager.get_driver(url)
110
- try:
111
- # Get page source
112
- page_source = driver.page_source
113
- return page_source
114
- finally:
115
- # Don't quit the driver unless you're done with all interactions
116
- pass