abstract-webtools 0.1.6.143__tar.gz → 0.1.6.145__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/setup.py +1 -1
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/__init__.py +2 -1
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/requestManager/requestManager.py +31 -19
- abstract_webtools-0.1.6.145/src/abstract_webtools/managers/seleneumManager.py +241 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
- abstract_webtools-0.1.6.143/src/abstract_webtools/managers/seleneumManager.py +0 -116
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/README.md +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/abstract_usurpit.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/domain_identifier.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/extention_list.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/find_dirs.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/k2s_downloader.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/allss//.py" +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/crawlManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/curlMgr.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/get_test.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/imports.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/middleManager/src/legacy_tools.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.143
+Version: 0.1.6.145
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff

{abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.143',
+    version='0.1.6.145',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',

{abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/__init__.py
RENAMED
@@ -10,6 +10,7 @@ from .sslManager import *
 from .tlsAdapter import *
 from .urlManager import *
 from .userAgentManager import *
-from .
+from .seleneumManager import *
 from .videoDownloader import *
 from .middleManager import *
+seleniumManager = seleneumManager

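The added alias exposes the correctly spelled name seleniumManager alongside the package's misspelled seleneumManager export. A minimal usage sketch of the intended effect, assuming both names end up re-exported from abstract_webtools.managers:

    from abstract_webtools import managers

    # Both spellings should resolve to the same manager object when the
    # alias holds (assumption: the star-import re-exports these names).
    cls_a = getattr(managers, "seleneumManager")
    cls_b = getattr(managers, "seleniumManager")
    assert cls_a is cls_b
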
{abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools/managers/requestManager/requestManager.py
RENAMED
@@ -328,10 +328,11 @@ class requestManager:
 
     def make_request(self):
         """
-        Make a request and handle potential errors.
+        Make a request and handle potential errors, with retries.
         """
         if self.url_mgr.url is None:
             return None
+
         self.wait_between_requests()
         for _ in range(self.max_retries):
             try:
@@ -345,41 +346,52 @@
                 elif self._response.status_code == 429:
                     logging.warning(f"Rate limited by {self.url_mgr.url}. Retrying...")
                     time.sleep(5)
+                else:
+                    # String/bytes from Selenium path
+                    self.status_code = 200
+                    return self._response
             except requests.Timeout as e:
                 logging.error(f"Request to {self.url_mgr.url} timed out: {e}")
             except requests.ConnectionError:
                 logging.error(f"Connection error for URL {self.url_mgr.url}.")
             except requests.RequestException as e:
                 logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
-
-
-
-        self._response = response
-        self.status_code = 200  # Assume success
-        return self._response
-        except Exception as e:
-            logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries: {e}")
-            return None
+
+        logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries")
+        return None
 
     def try_request(self) -> requests.Response | str | bytes | None:
         """
-        Tries
+        Tries Selenium first, then falls back to requests if Selenium fails.
         """
         if self.url_mgr.url is None:
             return None
+
+        # 1. Try Selenium
         try:
-            return get_selenium_source(self.url_mgr.url)
+            return get_selenium_source(self.url_mgr.url)
+        except Exception as e:
+            logging.warning(f"Selenium failed for {self.url_mgr.url}, falling back to requests: {e}")
+
+        # 2. Fallback: requests
+        try:
+            resp = self.session.get(
+                self.url_mgr.url,
+                timeout=self.timeout or 10,
+                stream=self.stream
+            )
+            return resp
         except requests.RequestException as e:
-            logging.error(f"
+            logging.error(f"Requests fallback also failed for {self.url_mgr.url}: {e}")
             return None
 
-
-
-
+    @property
+    def url(self):
+        return self.url_mgr.url
 
-
-
-
+    @url.setter
+    def url(self, new_url):
+        self._url = new_url
 class SafeRequestSingleton:
     _instance = None
     @staticmethod
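
The net effect is a Selenium-first fetch with an HTTP fallback, plus an explicit success path for the string/bytes values the Selenium path returns. A minimal standalone sketch of the same pattern, assuming the package is installed (the example URL is a placeholder):

    import logging
    import requests

    def fetch(url: str, timeout: float = 10.0):
        """Selenium-first fetch with a plain-requests fallback (mirrors try_request)."""
        try:
            # Import assumed from this release's new module.
            from abstract_webtools.managers.seleneumManager import get_selenium_source
            html = get_selenium_source(url)
            if html:
                return html  # rendered page source (str)
        except Exception as e:
            logging.warning(f"Selenium failed for {url}, falling back to requests: {e}")
        try:
            return requests.get(url, timeout=timeout)  # plain requests.Response
        except requests.RequestException as e:
            logging.error(f"Requests fallback also failed for {url}: {e}")
            return None
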
abstract_webtools-0.1.6.145/src/abstract_webtools/managers/seleneumManager.py
ADDED
@@ -0,0 +1,241 @@
+import os, time, re, json, logging, urllib3, requests,tempfile, shutil, socket, atexit, errno
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup  # if you prefer, keep using your parser
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from abstract_security import get_env_value
+from abstract_utilities import *
+from .urlManager import *  # your urlManager
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("selenium").setLevel(logging.WARNING)
+
+# ---- Chrome options (keep yours; add safe fallbacks) ----
+chrome_options = Options()
+_bin = get_env_value('CHROME_BINARY')
+if _bin:
+    chrome_options.binary_location = _bin
+chrome_options.add_argument("--headless=new")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--disable-dev-shm-usage")
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-software-rasterizer")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-port=9222")
+chrome_prefs = {"profile.managed_default_content_settings.images": 2}
+chrome_options.experimental_options["prefs"] = chrome_prefs
+
+MIN_HTML_BYTES = 2048  # tune: consider <2KB suspicious for real pages
+# --- NEW helpers: unique temp profile + free port + options builder ---
+
+def _free_port() -> int:
+    s = socket.socket()
+    s.bind(("127.0.0.1", 0))
+    port = s.getsockname()[1]
+    s.close()
+    return port
+
+def _make_profile_dir(base="/var/tmp/selenium-profiles") -> str:
+    os.makedirs(base, exist_ok=True)
+    return tempfile.mkdtemp(prefix="cw-", dir=base)
+
+def _make_chrome_options(binary_path: str | None = None,
+                         user_data_dir: str | None = None) -> tuple[Options, str]:
+    opts = Options()
+    if binary_path:
+        opts.binary_location = binary_path
+    opts.add_argument("--headless=new")
+    opts.add_argument("--no-sandbox")
+    opts.add_argument("--disable-dev-shm-usage")
+    opts.add_argument("--disable-gpu")
+    opts.add_argument("--disable-software-rasterizer")
+    opts.add_argument("--disable-extensions")
+
+    prof = user_data_dir or _make_profile_dir()
+    opts.add_argument(f"--user-data-dir={prof}")
+    opts.add_argument(f"--remote-debugging-port={_free_port()}")
+
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    opts.add_experimental_option("prefs", prefs)
+    return opts, prof
+
+
+def _looks_like_html(text_or_bytes: bytes | str) -> bool:
+    if not text_or_bytes:
+        return False
+    s = text_or_bytes if isinstance(text_or_bytes, str) else text_or_bytes.decode("utf-8", "ignore")
+    if len(s) < MIN_HTML_BYTES:
+        return False
+    lowered = s.lower()
+    return ("<html" in lowered and "</html>" in lowered) or "<body" in lowered
+
+def _requests_fallback(url: str, headers: dict | None = None, timeout: float = 15.0):
+    """Plain requests fallback. Returns `requests.Response | None`."""
+    try:
+        sess = requests.Session()
+        sess.headers.update(headers or {"User-Agent": "Mozilla/5.0"})
+        # honor simple redirects and cert issues as needed
+        resp = sess.get(url, timeout=timeout, allow_redirects=True, verify=False)
+        return resp
+    except Exception as e:
+        logging.warning(f"requests fallback failed for {url}: {e}")
+        return None
+
+def _wait_until_ready(driver, timeout: float = 10.0):
+    """Waits for DOM readiness and presence of <body>."""
+    try:
+        WebDriverWait(driver, timeout).until(
+            lambda d: d.execute_script("return document.readyState") in ("interactive", "complete")
+        )
+    except Exception:
+        pass
+    try:
+        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+    except Exception:
+        pass
+    # small settle delay for late JS injections
+    time.sleep(0.3)
+def normalize_url(url, base_url=None):
+    manager = seleniumManager(url)
+    base_url = manager.base_url
+    if url.startswith(base_url):
+        url = url[len(base_url):]
+    normalized_url = urljoin(base_url, url.split('#')[0])
+    if not normalized_url.startswith(base_url):
+        return None
+    return normalized_url
+# ---- Singleton driver manager (your class; small fixes) ----
+class SingletonMeta(type):
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            instance = super().__call__(*args, **kwargs)
+            cls._instances[cls] = instance
+        return cls._instances[cls]
+
+class seleniumManager(metaclass=SingletonMeta):
+    def __init__(self, url):
+        if getattr(self, "initialized", False):
+            return
+        self.initialized = True
+
+        p = urlparse(url)
+        self.domain = p.netloc
+        self.scheme = p.scheme or "https"
+        self.base_url = f"{self.scheme}://{self.domain}"
+
+        self.site_dir = os.path.join("/var/tmp", "cw-sites", self.domain)
+        os.makedirs(self.site_dir, exist_ok=True)
+
+        self._sessions: dict[str, dict] = {}  # key -> {"driver": ..., "profile": ...}
+        atexit.register(lambda sm=self: sm.close_all())
+
+    def get_url_to_path(self, url):
+        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
+        p = urlparse(url)
+        if p.netloc == self.domain:
+            parts = [x for x in p.path.split('/') if x]
+            d = self.site_dir
+            for seg in parts[:-1]:
+                d = os.path.join(d, seg)
+                os.makedirs(d, exist_ok=True)
+            last = parts[-1] if parts else "index.html"
+            ext = os.path.splitext(last)[-1] or ".html"
+            if not hasattr(self, "page_type"):
+                self.page_type = []
+            self.page_type.append(ext if not self.page_type else self.page_type[-1])
+            return os.path.join(d, last)
+
+    def get_with_netloc(self, url):
+        p = urlparse(url)
+        if p.netloc == '':
+            url = f"{self.scheme}://{self.domain}/{url.strip().lstrip('/')}"
+        return url
+
+    def get_driver(self, url) -> tuple[str, webdriver.Chrome]:
+        bin_path = get_env_value('CHROME_BINARY')
+        opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+        driver = webdriver.Chrome(options=opts)
+        key = f"{url}#{time.time()}"
+        self._sessions[key] = {"driver": driver, "profile": prof}
+        return key, driver
+
+    def close_driver(self, key: str):
+        sess = self._sessions.pop(key, None)
+        if not sess: return
+        try:
+            try: sess["driver"].quit()
+            except Exception: pass
+        finally:
+            shutil.rmtree(sess.get("profile") or "", ignore_errors=True)
+
+    def close_all(self):
+        for key in list(self._sessions.keys()):
+            self.close_driver(key)
+
+
+
+# ---- Hardened page-source retrieval with fallback ----
+def get_selenium_source(url, max_retries: int = 2, request_fallback: bool = True, timeout: float = 12.0):
+    url_mgr = urlManager(url)
+    if not url_mgr.url:
+        return None
+    url = str(url_mgr.url)
+
+    manager = seleniumManager(url)
+    key, driver = manager.get_driver(url)
+
+    last_exc = None
+    try:
+        for attempt in range(1, max_retries + 1):
+            try:
+                driver.get(url)
+                _wait_until_ready(driver, timeout=timeout)
+                html = driver.page_source or ""
+                if not _looks_like_html(html):
+                    html = driver.execute_script(
+                        "return document.documentElement ? document.documentElement.outerHTML : '';"
+                    ) or html
+                if _looks_like_html(html):
+                    return html
+                logging.warning(f"Selenium returned suspicious HTML (len={len(html)}) for {url} "
+                                f"[attempt {attempt}/{max_retries}]")
+            except Exception as e:
+                last_exc = e
+                logging.warning(f"Selenium attempt {attempt}/{max_retries} failed for {url}: {e}")
+            time.sleep(0.5 * attempt)
+
+        if request_fallback:
+            resp = _requests_fallback(url, headers={"User-Agent": "Mozilla/5.0"})
+            if resp is not None:
+                ctype = (resp.headers.get("content-type") or "").lower()
+                body = resp.text if hasattr(resp, "text") else (
+                    resp.content.decode("utf-8", "ignore") if hasattr(resp, "content") else ""
+                )
+                if "application/json" in ctype:
+                    try:
+                        return json.dumps(resp.json())
+                    except Exception:
+                        return body
+                return body if _looks_like_html(body) or body else None
+    finally:
+        # critical: release the user-data-dir to avoid "already in use"
+        manager.close_driver(key)
+
+    if last_exc:
+        logging.error(f"Unable to retrieve page for {url}: {last_exc}")
+    return None
+
+def get_driver(self, url):
+    # always new
+    bin_path = get_env_value('CHROME_BINARY')
+    opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+    driver = webdriver.Chrome(options=opts)
+    # store so close_all() can clean up
+    key = f"{url}#{time.time()}"
+    self._sessions[key] = {"driver": driver, "profile": prof}
+    return driver
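
A short usage sketch for the new module, assuming the package is installed and the CHROME_BINARY environment variable points at a working Chrome build (the URL is a placeholder):

    from abstract_webtools.managers.seleneumManager import get_selenium_source

    # Renders the page in headless Chrome; each call gets a throwaway
    # profile directory and a random debug port, both released afterwards.
    html = get_selenium_source("https://example.com", max_retries=2, timeout=12.0)
    if html:
        print(html[:200])  # first 200 characters of the rendered source
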
{abstract_webtools-0.1.6.143 → abstract_webtools-0.1.6.145}/src/abstract_webtools.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.143
+Version: 0.1.6.145
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff

abstract_webtools-0.1.6.143/src/abstract_webtools/managers/seleneumManager.py
DELETED
@@ -1,116 +0,0 @@
-import os
-#from ..abstract_webtools import urlManager
-from .urlManager import *
-from urllib.parse import urlparse
-from abstract_utilities import *
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-import logging
-import urllib3
-from abstract_security import get_env_value
-# Suppress urllib3 warnings and debug logs
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-# Suppress Selenium logs
-logging.getLogger("selenium").setLevel(logging.WARNING)
-
-
-# Setup Chrome options
-chrome_options = Options()
-chrome_options.binary_location = get_env_value('CHROME_BINARY')
-chrome_options.add_argument("--headless")  # Run in headless mode
-chrome_options.add_argument("--no-sandbox")
-chrome_options.add_argument("--disable-dev-shm-usage")
-chrome_options.add_argument("--disable-gpu")
-chrome_options.add_argument("--disable-software-rasterizer")
-chrome_options.add_argument("--disable-extensions")
-chrome_options.add_argument("--remote-debugging-port=9222")
-
-
-
-class SingletonMeta(type):
-    _instances = {}
-    def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            instance = super().__call__(*args, **kwargs)
-            cls._instances[cls] = instance
-        return cls._instances[cls]
-
-class seleniumManager(metaclass=SingletonMeta):
-    def __init__(self, url):
-        if not hasattr(self, 'initialized'):  # Prevent reinitialization
-            self.initialized = True
-            parsed_url = urlparse(url)
-            self.domain = parsed_url.netloc
-            self.scheme = parsed_url.scheme
-            self.base_url= f"{self.scheme}{self.domain}"
-            self.site_dir = os.path.join(os.getcwd(), self.domain)
-            os.makedirs(self.site_dir, exist_ok=True)
-            self.drivers = {}
-            self.page_type = []
-
-    def get_url_to_path(self, url):
-        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == self.domain:
-            paths = parsed_url.path.split('/')
-            dir_path = self.site_dir
-            for path in paths[:-1]:
-                dir_path = os.path.join(dir_path, path)
-                os.makedirs(dir_path, exist_ok=True)
-            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
-
-            dir_path = os.path.join(dir_path, paths[-1])
-            return dir_path
-
-    def saved_url_check(self, url):
-        path = self.get_url_to_path(url)
-        return path
-
-    def get_with_netloc(self, url):
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == '':
-            url = f"{self.scheme}://{self.domain}/{url.strip()}"
-        return url
-
-    def get_driver(self, url):
-        if url and url not in self.drivers:
-            # chrome_options = Options()
-            # chrome_options.add_argument("--headless")
-            driver = webdriver.Chrome(options=chrome_options)
-            self.drivers[url] = driver
-            driver.get(url)
-        return self.drivers[url]
-def normalize_url(url, base_url=None):
-    """
-    Normalize and resolve relative URLs, ensuring proper domain and format.
-    """
-    # If URL starts with the base URL repeated, remove the extra part
-    manager = seleniumManager(url)
-    base_url = manager.base_url
-    if url.startswith(base_url):
-        url = url[len(base_url):]
-
-    # Resolve the URL against the base URL
-    normalized_url = urljoin(base_url, url.split('#')[0])
-
-    # Ensure only URLs belonging to the base domain are kept
-    if not normalized_url.startswith(base_url):
-        return None
-
-    return normalized_url
-# Function to get Selenium page source
-def get_selenium_source(url):
-    url_mgr = urlManager(url)
-    if url_mgr.url:
-        url = str(url_mgr.url)
-        manager = seleniumManager(url)
-        driver = manager.get_driver(url)
-        try:
-            # Get page source
-            page_source = driver.page_source
-            return page_source
-        finally:
-            # Don't quit the driver unless you're done with all interactions
-            pass
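
Worth noting on this rewrite: the deleted version shared one module-level chrome_options object (fixed --remote-debugging-port=9222, no --user-data-dir) and cached drivers per URL without ever quitting them, which is exactly what the new per-session profile and port isolation addresses. A minimal sketch of that isolation pattern, using only the standard library and Selenium (the profile prefix is illustrative):

    import socket
    import tempfile
    from selenium.webdriver.chrome.options import Options

    def isolated_options() -> Options:
        """Chrome options with a unique profile dir and a free debug port."""
        s = socket.socket()
        s.bind(("127.0.0.1", 0))                  # let the OS pick a free port
        port = s.getsockname()[1]
        s.close()
        profile = tempfile.mkdtemp(prefix="cw-")  # throwaway user-data-dir
        opts = Options()
        opts.add_argument("--headless=new")
        opts.add_argument(f"--user-data-dir={profile}")
        opts.add_argument(f"--remote-debugging-port={port}")
        return opts
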