abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.147__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/__init__.py +6 -0
- abstract_webtools/abstract_webtools.py +1768 -0
- abstract_webtools/managers/cipherManager.py +12 -13
- abstract_webtools/managers/crawlManager.py +35 -26
- abstract_webtools/managers/curlMgr.py +25 -47
- abstract_webtools/managers/meta_dump.py +27 -0
- abstract_webtools/managers/networkManager.py +48 -13
- abstract_webtools/managers/requestManager/requestManager.py +1 -1
- abstract_webtools/managers/seleneumManager.py +1 -0
- abstract_webtools/managers/seleniumManager.py +204 -82
- abstract_webtools/managers/soupManager/soupManager.py +46 -19
- abstract_webtools/managers/sslManager.py +11 -2
- abstract_webtools/managers/userAgentManager.py +31 -40
- abstract_webtools/url_grabber.py +73 -1
- abstract_webtools-0.1.6.147.dist-info/METADATA +482 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/RECORD +19 -18
- abstract_webtools-0.1.6.145.dist-info/METADATA +0 -196
- /abstract_webtools/managers/{allss//.py" → allss.py} +0 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.147.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/cipherManager.py
@@ -1,21 +1,20 @@
 class CipherManager:
     @staticmethod
-    def
+    def get_default_ciphers() -> list:
         return [
-            "ECDHE-RSA-AES256-GCM-SHA384",
-            "ECDHE-RSA-AES256-SHA384",
-            "ECDHE-RSA-AES256-SHA",
-            "ECDHE-RSA-AES128-GCM-SHA256",
-            "ECDHE-ECDSA-AES128-GCM-SHA256",
-            "AES256-SHA",
+            "ECDHE-RSA-AES256-GCM-SHA384","ECDHE-ECDSA-AES256-GCM-SHA384",
+            "ECDHE-RSA-AES256-SHA384","ECDHE-ECDSA-AES256-SHA384",
+            "ECDHE-RSA-AES256-SHA","ECDHE-ECDSA-AES256-SHA",
+            "ECDHE-RSA-AES128-GCM-SHA256","ECDHE-RSA-AES128-SHA256",
+            "ECDHE-ECDSA-AES128-GCM-SHA256","ECDHE-ECDSA-AES128-SHA256",
+            "AES256-SHA","AES128-SHA"
         ]
+    def __init__(self, cipher_list=None):
+        self.cipher_list = cipher_list or self.get_default_ciphers()
+        if isinstance(self.cipher_list, str):
+            self.cipher_list = [c.strip() for c in self.cipher_list.split(',') if c.strip()]
+        self.ciphers_string = ','.join(self.cipher_list) if self.cipher_list else ''
 
-    def __init__(self,cipher_list=None):
-        if cipher_list == None:
-            cipher_list=self.get_default_ciphers()
-        self.cipher_list = cipher_list
-        self.create_list()
-        self.ciphers_string = self.add_string_list()
     def add_string_list(self):
         if len(self.cipher_list)==0:
             return ''
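For orientation, a minimal usage sketch of the reworked constructor. This is illustrative only; it assumes `CipherManager` is imported from `abstract_webtools.managers.cipherManager`, and the behaviour shown (list default, comma-string splitting, joined `ciphers_string`) comes straight from the hunk above.

    # illustrative sketch, not part of the package diff
    from abstract_webtools.managers.cipherManager import CipherManager

    default = CipherManager()                                      # falls back to get_default_ciphers()
    custom = CipherManager(cipher_list="AES256-SHA, AES128-SHA")   # comma string is split and stripped
    print(default.ciphers_string)  # comma-joined cipher suite string, ready for an SSL context
    print(custom.cipher_list)      # ['AES256-SHA', 'AES128-SHA']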
abstract_webtools/managers/crawlManager.py
@@ -119,14 +119,42 @@ if __name__ == "__main__":
     generator.run()
 
 class crawlManager:
-    def __init__(self, url
+    def __init__(self, url, req_mgr, url_mgr, source_code=None, parse_type="html.parser"):
+        self.url_mgr = url_mgr
+        self.req_mgr = req_mgr
         self.url = url
-        self.source_code = source_code
         self.parse_type = parse_type
-        self.
-        self.
-        self.
-
+        self.source_code = source_code or req_mgr.source_code
+        self.soup = BeautifulSoup(self.source_code or "", parse_type)
+        self.base_netloc = urlparse(self.url).netloc
+
+    def is_internal(self, link):
+        u = urlparse(link)
+        return (not u.netloc) or (u.netloc == self.base_netloc)
+
+    def links_on_page(self):
+        out = set()
+        for a in self.soup.find_all("a", href=True):
+            out.add(urljoin(self.url, a["href"]))
+        return out
+
+    def crawl(self, start=None, max_depth=2, _depth=0, visited=None, session=None):
+        start = start or self.url
+        visited = visited or set()
+        if _depth > max_depth or start in visited:
+            return visited
+        visited.add(start)
+
+        # fetch
+        r = self.req_mgr.session.get(start, timeout=30)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, self.parse_type)
+
+        for a in soup.find_all("a", href=True):
+            link = urljoin(start, a["href"])
+            if self.is_internal(link) and link not in visited:
+                self.crawl(link, max_depth=max_depth, _depth=_depth+1, visited=visited)
+        return visited
     def get_new_source_and_url(self, url=None):
         """Fetches new source code and response for a given URL."""
         url = url
@@ -194,26 +222,7 @@ class crawlManager:
             return ('yearly', '0.3')
         return ('weekly', '1.0')
 
-
-        """Recursively crawls the site up to max_depth and returns valid internal links."""
-        visited = visited or set()
-        if depth > max_depth or url in visited:
-            return []
-
-        visited.add(url)
-        try:
-            soup = get_soup(url)
-            links = []
-            for tag in soup.find_all('a', href=True):
-                link = urljoin(url, tag['href'])
-                if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
-                    links.append(link)
-                    self.crawl(link, max_depth, depth + 1, visited)
-            return links
-        except Exception as e:
-            print(f"Error crawling {url}: {e}")
-            return []
-
+
     def get_meta_info(self, url=None):
         """Fetches metadata, including title and meta tags, from the page."""
         url = url or self.url
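A short sketch of how the reworked crawler is driven. Hedged: the request and URL manager constructors are not shown in this diff, so `req_mgr` and `url_mgr` below are assumed to come from the package's requestManager/urlManager modules; only the `crawlManager` calls mirror the new code above.

    # illustrative sketch; req_mgr and url_mgr construction is assumed, not shown in this diff
    crawler = crawlManager(url="https://example.com", req_mgr=req_mgr, url_mgr=url_mgr)
    page_links = crawler.links_on_page()    # absolute URLs found on the start page
    visited = crawler.crawl(max_depth=2)    # recursive, same-netloc crawl; returns the visited set
    print(f"visited {len(visited)} internal pages")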
abstract_webtools/managers/curlMgr.py
@@ -1,48 +1,26 @@
-import os
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
-          f'Safari/537.36" -H "Accept: */*" "{website}"'
-    )
-
-def download_site(website, destination_dir, filename):
-    os.makedirs(destination_dir, exist_ok=True)
-    os.chmod(destination_dir, 0o755)  # set directory permissions if needed
-
-    destination_path = os.path.join(destination_dir, filename)
-
-    # GET the resource
-    response = requests.get(website, headers={
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                      "AppleWebKit/537.36 (KHTML, like Gecko) "
-                      "Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "*/*"
-    }, allow_redirects=True)
-
-    # Raise an exception if the download fails
-    response.raise_for_status()
-
-    # Write content to file
+import os, subprocess, requests
+
+def curl_download(website, destination_path, user_agent=None):
+    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+    ua = user_agent or ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/91.0.4472.124 Safari/537.36")
+    subprocess.run([
+        "curl","-L","--output", destination_path,
+        "-H", f"User-Agent: {ua}",
+        "-H", "Accept: */*",
+        website
+    ], check=True)
+
+def requests_download(website, destination_path, headers=None):
+    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+    hdr = {"User-Agent": ("Mozilla/5.0 ... Chrome/91.0 Safari/537.36"),
+           "Accept": "*/*"}
+    if headers: hdr.update(headers)
+    r = requests.get(website, headers=hdr, allow_redirects=True, timeout=30)
+    r.raise_for_status()
     with open(destination_path, "wb") as f:
-        f.write(
-
-
-
+        f.write(r.content)
+
+if __name__ == "__main__":
+    pass  # no side effects
abstract_webtools/managers/meta_dump.py (new file)
@@ -0,0 +1,27 @@
+# meta_dump.py
+from abstract_webtools.managers.networkManager import NetworkManager
+from abstract_webtools.managers.userAgentManager import UserAgentManager
+from abstract_webtools.managers.soupManager.soupManager import soupManager
+import json, sys
+
+def dump_all_meta(url: str):
+    ua = UserAgentManager(browser="Chrome", operating_system="Windows")
+    net = NetworkManager(user_agent_manager=ua)
+
+    r = net.session.get(url, timeout=30)
+    r.raise_for_status()
+
+    sm = soupManager(url=url, source_code=r.text, req_mgr=net)
+    out = {
+        "url": url,
+        "title": sm.soup.title.string.strip() if sm.soup.title and sm.soup.title.string else None,
+        "meta": sm.all_meta(),
+        "citation": sm.citation_dict(),
+        "links": sm.all_links(),
+        "json_ld": sm.all_jsonld(),
+    }
+    print(json.dumps(out, indent=2, ensure_ascii=False))
+
+if __name__ == "__main__":
+    url = sys.argv[1]
+    dump_all_meta(url)
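Because the module guards its entry point, it can be invoked as a script (`python meta_dump.py <url>`, per the `sys.argv[1]` handling above) or imported directly; a minimal sketch with a placeholder URL:

    # illustrative sketch with a placeholder URL
    from abstract_webtools.managers.meta_dump import dump_all_meta

    dump_all_meta("https://example.com")   # prints JSON with url, title, meta, citation, links, json_ld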
abstract_webtools/managers/networkManager.py
@@ -1,15 +1,50 @@
+from typing import Optional, List
+import requests
 from ..abstract_webtools import *
-from
+from .sslManager import SSLManager
+from .cipherManager import CipherManager
+
+class TLSAdapter(HTTPAdapter):
+    def __init__(self, ssl_manager: SSLManager=None):
+        ssl_manager = ssl_manager or SSLManager()
+        self.ssl_context = ssl_manager.ssl_context
+        super().__init__()
+    def init_poolmanager(self, *args, **kwargs):
+        kwargs['ssl_context'] = self.ssl_context
+        return super().init_poolmanager(*args, **kwargs)
+
 class NetworkManager:
-    def __init__(self, user_agent_manager=None,ssl_manager=None,
-
-
-        self.
-
-
-
-
-
-        self.
-        self.
-
+    def __init__(self, user_agent_manager=None, ssl_manager=None, proxies=None, cookies=None,
+                 ciphers=None, certification: Optional[str]=None, ssl_options: Optional[List[str]]=None):
+        self.ua_mgr = user_agent_manager or UserAgentManager()
+        self.ssl_mgr = ssl_manager or SSLManager(
+            ciphers=ciphers or CipherManager().ciphers_string,
+            ssl_options=ssl_options,
+            certification=certification
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": self.ua_mgr.user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.9",
+            "Connection": "keep-alive"
+        })
+        adapter = TLSAdapter(self.ssl_mgr)
+        self.session.mount("https://", adapter)
+        self.session.mount("http://", HTTPAdapter())
+
+        if proxies:
+            self.session.proxies = proxies
+        if cookies:
+            if isinstance(cookies, requests.cookies.RequestsCookieJar):
+                self.session.cookies = cookies
+            elif isinstance(cookies, dict):
+                jar = requests.cookies.RequestsCookieJar()
+                for k,v in cookies.items(): jar.set(k,v)
+                self.session.cookies = jar
+            # if string: up to you—parse or ignore
+
+        # retries (optional)
+        from requests.adapters import Retry
+        self.session.adapters['https://'].max_retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
abstract_webtools/managers/seleniumManager.py
@@ -1,36 +1,114 @@
-import os
-from
-from
-from urllib.parse import urlparse
-from abstract_utilities import *
+import os, time, re, json, logging, urllib3, requests,tempfile, shutil, socket, atexit, errno
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup  # if you prefer, keep using your parser
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-import
-import
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from abstract_security import get_env_value
+from abstract_utilities import *
+from .urlManager import *  # your urlManager
 
-# Suppress urllib3 warnings and debug logs
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-# Suppress Selenium logs
 logging.getLogger("selenium").setLevel(logging.WARNING)
 
-
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-
-# Setup Chrome options
+# ---- Chrome options (keep yours; add safe fallbacks) ----
 chrome_options = Options()
-
-
+_bin = get_env_value('CHROME_BINARY')
+if _bin:
+    chrome_options.binary_location = _bin
+chrome_options.add_argument("--headless=new")
 chrome_options.add_argument("--no-sandbox")
 chrome_options.add_argument("--disable-dev-shm-usage")
 chrome_options.add_argument("--disable-gpu")
 chrome_options.add_argument("--disable-software-rasterizer")
 chrome_options.add_argument("--disable-extensions")
 chrome_options.add_argument("--remote-debugging-port=9222")
+chrome_prefs = {"profile.managed_default_content_settings.images": 2}
+chrome_options.experimental_options["prefs"] = chrome_prefs
+
+MIN_HTML_BYTES = 2048  # tune: consider <2KB suspicious for real pages
+# --- NEW helpers: unique temp profile + free port + options builder ---
+
+def _free_port() -> int:
+    s = socket.socket()
+    s.bind(("127.0.0.1", 0))
+    port = s.getsockname()[1]
+    s.close()
+    return port
+
+def _make_profile_dir(base="/var/tmp/selenium-profiles") -> str:
+    os.makedirs(base, exist_ok=True)
+    return tempfile.mkdtemp(prefix="cw-", dir=base)
+
+def _make_chrome_options(binary_path: str | None = None,
+                         user_data_dir: str | None = None) -> tuple[Options, str]:
+    opts = Options()
+    if binary_path:
+        opts.binary_location = binary_path
+    opts.add_argument("--headless=new")
+    opts.add_argument("--no-sandbox")
+    opts.add_argument("--disable-dev-shm-usage")
+    opts.add_argument("--disable-gpu")
+    opts.add_argument("--disable-software-rasterizer")
+    opts.add_argument("--disable-extensions")
+
+    prof = user_data_dir or _make_profile_dir()
+    opts.add_argument(f"--user-data-dir={prof}")
+    opts.add_argument(f"--remote-debugging-port={_free_port()}")
+
+    prefs = {"profile.managed_default_content_settings.images": 2}
+    opts.add_experimental_option("prefs", prefs)
+    return opts, prof
+
 
+def _looks_like_html(text_or_bytes: bytes | str) -> bool:
+    if not text_or_bytes:
+        return False
+    s = text_or_bytes if isinstance(text_or_bytes, str) else text_or_bytes.decode("utf-8", "ignore")
+    if len(s) < MIN_HTML_BYTES:
+        return False
+    lowered = s.lower()
+    return ("<html" in lowered and "</html>" in lowered) or "<body" in lowered
 
+def _requests_fallback(url: str, headers: dict | None = None, timeout: float = 15.0):
+    """Plain requests fallback. Returns `requests.Response | None`."""
+    try:
+        sess = requests.Session()
+        sess.headers.update(headers or {"User-Agent": "Mozilla/5.0"})
+        # honor simple redirects and cert issues as needed
+        resp = sess.get(url, timeout=timeout, allow_redirects=True, verify=False)
+        return resp
+    except Exception as e:
+        logging.warning(f"requests fallback failed for {url}: {e}")
+        return None
+
+def _wait_until_ready(driver, timeout: float = 10.0):
+    """Waits for DOM readiness and presence of <body>."""
+    try:
+        WebDriverWait(driver, timeout).until(
+            lambda d: d.execute_script("return document.readyState") in ("interactive", "complete")
+        )
+    except Exception:
+        pass
+    try:
+        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+    except Exception:
+        pass
+    # small settle delay for late JS injections
+    time.sleep(0.3)
+def normalize_url(url, base_url=None):
+    manager = seleniumManager(url)
+    base_url = manager.base_url
+    if url.startswith(base_url):
+        url = url[len(base_url):]
+    normalized_url = urljoin(base_url, url.split('#')[0])
+    if not normalized_url.startswith(base_url):
+        return None
+    return normalized_url
+# ---- Singleton driver manager (your class; small fixes) ----
 class SingletonMeta(type):
     _instances = {}
     def __call__(cls, *args, **kwargs):
@@ -41,79 +119,123 @@ class SingletonMeta(type):
 
 class seleniumManager(metaclass=SingletonMeta):
     def __init__(self, url):
-        if
-
-
-
-
-
-
-
-
-
+        if getattr(self, "initialized", False):
+            return
+        self.initialized = True
+
+        p = urlparse(url)
+        self.domain = p.netloc
+        self.scheme = p.scheme or "https"
+        self.base_url = f"{self.scheme}://{self.domain}"
+
+        self.site_dir = os.path.join("/var/tmp", "cw-sites", self.domain)
+        os.makedirs(self.site_dir, exist_ok=True)
+
+        self._sessions: dict[str, dict] = {}  # key -> {"driver": ..., "profile": ...}
+        atexit.register(lambda sm=self: sm.close_all())
+
     def get_url_to_path(self, url):
         url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
-
-        if
-
-
-        for
-
-            os.makedirs(
-
-
-
-
-
-
-        path = self.get_url_to_path(url)
-        return path
+        p = urlparse(url)
+        if p.netloc == self.domain:
+            parts = [x for x in p.path.split('/') if x]
+            d = self.site_dir
+            for seg in parts[:-1]:
+                d = os.path.join(d, seg)
+                os.makedirs(d, exist_ok=True)
+            last = parts[-1] if parts else "index.html"
+            ext = os.path.splitext(last)[-1] or ".html"
+            if not hasattr(self, "page_type"):
+                self.page_type = []
+            self.page_type.append(ext if not self.page_type else self.page_type[-1])
+            return os.path.join(d, last)
 
     def get_with_netloc(self, url):
-
-        if
-            url = f"{self.scheme}://{self.domain}/{url.strip()}"
+        p = urlparse(url)
+        if p.netloc == '':
+            url = f"{self.scheme}://{self.domain}/{url.strip().lstrip('/')}"
         return url
 
-    def get_driver(self, url):
-
-
-
-
-
-
-        return self.drivers[url]
-def normalize_url(url, base_url=None):
-    """
-    Normalize and resolve relative URLs, ensuring proper domain and format.
-    """
-    # If URL starts with the base URL repeated, remove the extra part
-    manager = seleniumManager(url)
-    base_url = manager.base_url
-    if url.startswith(base_url):
-        url = url[len(base_url):]
+    def get_driver(self, url) -> tuple[str, webdriver.Chrome]:
+        bin_path = get_env_value('CHROME_BINARY')
+        opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+        driver = webdriver.Chrome(options=opts)
+        key = f"{url}#{time.time()}"
+        self._sessions[key] = {"driver": driver, "profile": prof}
+        return key, driver
 
-
-
+    def close_driver(self, key: str):
+        sess = self._sessions.pop(key, None)
+        if not sess: return
+        try:
+            try: sess["driver"].quit()
+            except Exception: pass
+        finally:
+            shutil.rmtree(sess.get("profile") or "", ignore_errors=True)
 
-
-
-
+    def close_all(self):
+        for key in list(self._sessions.keys()):
+            self.close_driver(key)
 
-
-
-
+
+
+# ---- Hardened page-source retrieval with fallback ----
+def get_selenium_source(url, max_retries: int = 2, request_fallback: bool = True, timeout: float = 12.0):
     url_mgr = urlManager(url)
-    if url_mgr.url:
-
-
-
-
-
-
-
-
-
-
+    if not url_mgr.url:
+        return None
+    url = str(url_mgr.url)
+
+    manager = seleniumManager(url)
+    key, driver = manager.get_driver(url)
+
+    last_exc = None
+    try:
+        for attempt in range(1, max_retries + 1):
+            try:
+                driver.get(url)
+                _wait_until_ready(driver, timeout=timeout)
+                html = driver.page_source or ""
+                if not _looks_like_html(html):
+                    html = driver.execute_script(
+                        "return document.documentElement ? document.documentElement.outerHTML : '';"
+                    ) or html
+                if _looks_like_html(html):
+                    return html
+                logging.warning(f"Selenium returned suspicious HTML (len={len(html)}) for {url} "
+                                f"[attempt {attempt}/{max_retries}]")
+            except Exception as e:
+                last_exc = e
+                logging.warning(f"Selenium attempt {attempt}/{max_retries} failed for {url}: {e}")
+            time.sleep(0.5 * attempt)
+
+        if request_fallback:
+            resp = _requests_fallback(url, headers={"User-Agent": "Mozilla/5.0"})
+            if resp is not None:
+                ctype = (resp.headers.get("content-type") or "").lower()
+                body = resp.text if hasattr(resp, "text") else (
+                    resp.content.decode("utf-8", "ignore") if hasattr(resp, "content") else ""
+                )
+                if "application/json" in ctype:
+                    try:
+                        return json.dumps(resp.json())
+                    except Exception:
+                        return body
+                return body if _looks_like_html(body) or body else None
+    finally:
+        # critical: release the user-data-dir to avoid “already in use”
+        manager.close_driver(key)
+
+    if last_exc:
+        logging.error(f"Unable to retrieve page for {url}: {last_exc}")
+    return None
 
+    def get_driver(self, url):
+        # always new
+        bin_path = get_env_value('CHROME_BINARY')
+        opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
+        driver = webdriver.Chrome(options=opts)
+        # store so close_all() can clean up
+        key = f"{url}#{time.time()}"
+        self._sessions[key] = {"driver": driver, "profile": prof}
+        return driver