abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.147__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,20 @@
  class CipherManager:
      @staticmethod
-     def get_default_ciphers()-> list:
+     def get_default_ciphers() -> list:
          return [
-             "ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
-             "ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
-             "ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
-             "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
-             "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
-             "AES256-SHA", "AES128-SHA"
+             "ECDHE-RSA-AES256-GCM-SHA384","ECDHE-ECDSA-AES256-GCM-SHA384",
+             "ECDHE-RSA-AES256-SHA384","ECDHE-ECDSA-AES256-SHA384",
+             "ECDHE-RSA-AES256-SHA","ECDHE-ECDSA-AES256-SHA",
+             "ECDHE-RSA-AES128-GCM-SHA256","ECDHE-RSA-AES128-SHA256",
+             "ECDHE-ECDSA-AES128-GCM-SHA256","ECDHE-ECDSA-AES128-SHA256",
+             "AES256-SHA","AES128-SHA"
          ]
+     def __init__(self, cipher_list=None):
+         self.cipher_list = cipher_list or self.get_default_ciphers()
+         if isinstance(self.cipher_list, str):
+             self.cipher_list = [c.strip() for c in self.cipher_list.split(',') if c.strip()]
+         self.ciphers_string = ','.join(self.cipher_list) if self.cipher_list else ''

-     def __init__(self,cipher_list=None):
-         if cipher_list == None:
-             cipher_list=self.get_default_ciphers()
-         self.cipher_list = cipher_list
-         self.create_list()
-         self.ciphers_string = self.add_string_list()
      def add_string_list(self):
          if len(self.cipher_list)==0:
              return ''
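
The reworked constructor now accepts either a list of cipher names or a comma-separated string and always exposes ciphers_string. A minimal usage sketch of the class as shown in the hunk above (values illustrative):

    cm = CipherManager()                              # falls back to get_default_ciphers()
    custom = CipherManager("AES256-SHA, AES128-SHA")  # string input is split on commas and stripped
    print(cm.ciphers_string)                          # "ECDHE-RSA-AES256-GCM-SHA384,...,AES128-SHA"
    print(custom.cipher_list)                         # ['AES256-SHA', 'AES128-SHA']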
@@ -119,14 +119,42 @@ if __name__ == "__main__":
      generator.run()

  class crawlManager:
-     def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
+     def __init__(self, url, req_mgr, url_mgr, source_code=None, parse_type="html.parser"):
+         self.url_mgr = url_mgr
+         self.req_mgr = req_mgr
          self.url = url
-         self.source_code = source_code
          self.parse_type = parse_type
-         self.url_mgr = url_mgr or urlManager(url=self.url)
-         self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
-         self.get_new_source_and_url(url)
-         self.sime_map_mgr = SitemapGenerator(self.url_mgr.domain)
+         self.source_code = source_code or req_mgr.source_code
+         self.soup = BeautifulSoup(self.source_code or "", parse_type)
+         self.base_netloc = urlparse(self.url).netloc
+
+     def is_internal(self, link):
+         u = urlparse(link)
+         return (not u.netloc) or (u.netloc == self.base_netloc)
+
+     def links_on_page(self):
+         out = set()
+         for a in self.soup.find_all("a", href=True):
+             out.add(urljoin(self.url, a["href"]))
+         return out
+
+     def crawl(self, start=None, max_depth=2, _depth=0, visited=None, session=None):
+         start = start or self.url
+         visited = visited or set()
+         if _depth > max_depth or start in visited:
+             return visited
+         visited.add(start)
+
+         # fetch
+         r = self.req_mgr.session.get(start, timeout=30)
+         r.raise_for_status()
+         soup = BeautifulSoup(r.text, self.parse_type)
+
+         for a in soup.find_all("a", href=True):
+             link = urljoin(start, a["href"])
+             if self.is_internal(link) and link not in visited:
+                 self.crawl(link, max_depth=max_depth, _depth=_depth+1, visited=visited)
+         return visited

      def get_new_source_and_url(self, url=None):
          """Fetches new source code and response for a given URL."""
          url = url
@@ -194,26 +222,7 @@ class crawlManager:
              return ('yearly', '0.3')
          return ('weekly', '1.0')

-     def crawl(self, url, max_depth=3, depth=1, visited=None):
-         """Recursively crawls the site up to max_depth and returns valid internal links."""
-         visited = visited or set()
-         if depth > max_depth or url in visited:
-             return []
-
-         visited.add(url)
-         try:
-             soup = get_soup(url)
-             links = []
-             for tag in soup.find_all('a', href=True):
-                 link = urljoin(url, tag['href'])
-                 if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
-                     links.append(link)
-                     self.crawl(link, max_depth, depth + 1, visited)
-             return links
-         except Exception as e:
-             print(f"Error crawling {url}: {e}")
-             return []
-
+
      def get_meta_info(self, url=None):
          """Fetches metadata, including title and meta tags, from the page."""
          url = url or self.url
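
The crawl logic now lives on crawlManager itself and returns the visited set rather than a list. A hedged sketch of how it might be driven, assuming urlManager and requestManager expose the url, source_code, and session attributes the constructor above relies on:

    url_mgr = urlManager(url="https://example.com")             # example URL
    req_mgr = requestManager(url_mgr=url_mgr)                    # assumed wiring from this package
    crawler = crawlManager("https://example.com", req_mgr, url_mgr)
    print(crawler.links_on_page())        # absolute links found on the start page
    print(crawler.crawl(max_depth=1))     # set of internal URLs reached within one hop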
@@ -1,48 +1,26 @@
- import os
- import requests
- import os
- import subprocess
- import stat
-
- def get_site(website, destination_dir, filename):
-     # Ensure the directory exists
-     os.makedirs(destination_dir, exist_ok=True)
-
-     # Adjust directory permissions if needed (e.g. rwxr-xr-x -> 0o755)
-     os.chmod(destination_dir, 0o755)
-
-     # Construct the complete file path
-     destination_path = os.path.join(destination_dir, filename)
-
-     # Use curl to download the site
-     # The example user-agent is arbitrary; you can change it to your needs
-     os.system(
-         f'curl -L --output "{destination_path}" '
-         f'-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-         f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
-         f'Safari/537.36" -H "Accept: */*" "{website}"'
-     )
-
- def download_site(website, destination_dir, filename):
-     os.makedirs(destination_dir, exist_ok=True)
-     os.chmod(destination_dir, 0o755) # set directory permissions if needed
-
-     destination_path = os.path.join(destination_dir, filename)
-
-     # GET the resource
-     response = requests.get(website, headers={
-         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-         "AppleWebKit/537.36 (KHTML, like Gecko) "
-         "Chrome/91.0.4472.124 Safari/537.36",
-         "Accept": "*/*"
-     }, allow_redirects=True)
-
-     # Raise an exception if the download fails
-     response.raise_for_status()
-
-     # Write content to file
+ import os, subprocess, requests
+
+ def curl_download(website, destination_path, user_agent=None):
+     os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+     ua = user_agent or ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                         "AppleWebKit/537.36 (KHTML, like Gecko) "
+                         "Chrome/91.0.4472.124 Safari/537.36")
+     subprocess.run([
+         "curl","-L","--output", destination_path,
+         "-H", f"User-Agent: {ua}",
+         "-H", "Accept: */*",
+         website
+     ], check=True)
+
+ def requests_download(website, destination_path, headers=None):
+     os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+     hdr = {"User-Agent": ("Mozilla/5.0 ... Chrome/91.0 Safari/537.36"),
+            "Accept": "*/*"}
+     if headers: hdr.update(headers)
+     r = requests.get(website, headers=hdr, allow_redirects=True, timeout=30)
+     r.raise_for_status()
      with open(destination_path, "wb") as f:
-         f.write(response.content)
- website = 'https://www.pornhub.com'
- destination = '/home/computron/Documents/doge'
- get_site(website,destination,'doge')
+         f.write(r.content)
+
+ if __name__ == "__main__":
+     pass # no side effects
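
A short usage sketch for the two replacement helpers; the destination path is a placeholder, and both functions create the parent directory before writing:

    curl_download("https://example.com", "/tmp/site/index.html")       # shells out to curl via subprocess.run
    requests_download("https://example.com", "/tmp/site/index.html")   # same download done with requests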
@@ -0,0 +1,27 @@
+ # meta_dump.py
+ from abstract_webtools.managers.networkManager import NetworkManager
+ from abstract_webtools.managers.userAgentManager import UserAgentManager
+ from abstract_webtools.managers.soupManager.soupManager import soupManager
+ import json, sys
+
+ def dump_all_meta(url: str):
+     ua = UserAgentManager(browser="Chrome", operating_system="Windows")
+     net = NetworkManager(user_agent_manager=ua)
+
+     r = net.session.get(url, timeout=30)
+     r.raise_for_status()
+
+     sm = soupManager(url=url, source_code=r.text, req_mgr=net)
+     out = {
+         "url": url,
+         "title": sm.soup.title.string.strip() if sm.soup.title and sm.soup.title.string else None,
+         "meta": sm.all_meta(),
+         "citation": sm.citation_dict(),
+         "links": sm.all_links(),
+         "json_ld": sm.all_jsonld(),
+     }
+     print(json.dumps(out, indent=2, ensure_ascii=False))
+
+ if __name__ == "__main__":
+     url = sys.argv[1]
+     dump_all_meta(url)
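
Run as a script, the new meta_dump.py takes the target URL as its single command-line argument; calling the function directly is equivalent (URL is a placeholder):

    dump_all_meta("https://example.com")   # prints the JSON report to stdout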
@@ -1,15 +1,50 @@
+ from typing import Optional, List
+ import requests
  from ..abstract_webtools import *
- from ..big_user_agent_list import *
+ from .sslManager import SSLManager
+ from .cipherManager import CipherManager
+
+ class TLSAdapter(HTTPAdapter):
+     def __init__(self, ssl_manager: SSLManager=None):
+         ssl_manager = ssl_manager or SSLManager()
+         self.ssl_context = ssl_manager.ssl_context
+         super().__init__()
+     def init_poolmanager(self, *args, **kwargs):
+         kwargs['ssl_context'] = self.ssl_context
+         return super().init_poolmanager(*args, **kwargs)
+
  class NetworkManager:
-     def __init__(self, user_agent_manager=None,ssl_manager=None, tls_adapter=None,user_agent=None,proxies=None,cookies=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
-         if ssl_manager == None:
-             ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
-         self.ssl_manager=ssl_manager
-         if tls_adapter == None:
-             tls_adapter=TLSAdapter(ssl_manager=ssl_manager,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
-         self.tls_adapter=tls_adapter
-         self.ciphers=tls_adapter.ciphers
-         self.certification=tls_adapter.certification
-         self.ssl_options=tls_adapter.ssl_options
-         self.proxies=None or {}
-         self.cookies=cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
+     def __init__(self, user_agent_manager=None, ssl_manager=None, proxies=None, cookies=None,
+                  ciphers=None, certification: Optional[str]=None, ssl_options: Optional[List[str]]=None):
+         self.ua_mgr = user_agent_manager or UserAgentManager()
+         self.ssl_mgr = ssl_manager or SSLManager(
+             ciphers=ciphers or CipherManager().ciphers_string,
+             ssl_options=ssl_options,
+             certification=certification
+         )
+
+         self.session = requests.Session()
+         self.session.headers.update({
+             "User-Agent": self.ua_mgr.user_agent,
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.9",
+             "Connection": "keep-alive"
+         })
+         adapter = TLSAdapter(self.ssl_mgr)
+         self.session.mount("https://", adapter)
+         self.session.mount("http://", HTTPAdapter())
+
+         if proxies:
+             self.session.proxies = proxies
+         if cookies:
+             if isinstance(cookies, requests.cookies.RequestsCookieJar):
+                 self.session.cookies = cookies
+             elif isinstance(cookies, dict):
+                 jar = requests.cookies.RequestsCookieJar()
+                 for k,v in cookies.items(): jar.set(k,v)
+                 self.session.cookies = jar
+             # if string: up to you—parse or ignore
+
+         # retries (optional)
+         from requests.adapters import Retry
+         self.session.adapters['https://'].max_retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
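
A minimal sketch of driving the rebuilt NetworkManager; the proxy value is a placeholder, and the UA headers, TLS adapter, and retry policy all come from the constructor above:

    net = NetworkManager(proxies={"https": "http://127.0.0.1:8080"})   # proxies are optional
    resp = net.session.get("https://example.com", timeout=30)          # session already carries headers and TLS adapter
    resp.raise_for_status()
    print(resp.status_code, len(resp.text))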
@@ -4,7 +4,7 @@ from ..cipherManager import *
  from ..sslManager import *
  from ..tlsAdapter import *
  from ..networkManager import *
- from ..seleniumManager import *
+ from ..seleneumManager import *
  from ..urlManager import *
  logging.basicConfig(level=logging.INFO)

@@ -239,3 +239,4 @@ def get_driver(self, url):
      key = f"{url}#{time.time()}"
      self._sessions[key] = {"driver": driver, "profile": prof}
      return driver
+ seleneumManager = seleniumManager
@@ -1,36 +1,114 @@
- import os
- from ..abstract_webtools import *
- from .urlManager import *
- from urllib.parse import urlparse
- from abstract_utilities import *
+ import os, time, re, json, logging, urllib3, requests,tempfile, shutil, socket, atexit, errno
+ from urllib.parse import urlparse, urljoin
+ from bs4 import BeautifulSoup # if you prefer, keep using your parser
  from selenium import webdriver
  from selenium.webdriver.chrome.options import Options
- import logging
- import urllib3
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from abstract_security import get_env_value
+ from abstract_utilities import *
+ from .urlManager import * # your urlManager

- # Suppress urllib3 warnings and debug logs
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  logging.getLogger("urllib3").setLevel(logging.WARNING)
-
- # Suppress Selenium logs
  logging.getLogger("selenium").setLevel(logging.WARNING)

- import os
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
-
- # Setup Chrome options
+ # ---- Chrome options (keep yours; add safe fallbacks) ----
  chrome_options = Options()
- #chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
- chrome_options.add_argument("--headless") # Run in headless mode
+ _bin = get_env_value('CHROME_BINARY')
+ if _bin:
+     chrome_options.binary_location = _bin
+ chrome_options.add_argument("--headless=new")
  chrome_options.add_argument("--no-sandbox")
  chrome_options.add_argument("--disable-dev-shm-usage")
  chrome_options.add_argument("--disable-gpu")
  chrome_options.add_argument("--disable-software-rasterizer")
  chrome_options.add_argument("--disable-extensions")
  chrome_options.add_argument("--remote-debugging-port=9222")
+ chrome_prefs = {"profile.managed_default_content_settings.images": 2}
+ chrome_options.experimental_options["prefs"] = chrome_prefs
+
+ MIN_HTML_BYTES = 2048 # tune: consider <2KB suspicious for real pages
+ # --- NEW helpers: unique temp profile + free port + options builder ---
+
+ def _free_port() -> int:
+     s = socket.socket()
+     s.bind(("127.0.0.1", 0))
+     port = s.getsockname()[1]
+     s.close()
+     return port
+
+ def _make_profile_dir(base="/var/tmp/selenium-profiles") -> str:
+     os.makedirs(base, exist_ok=True)
+     return tempfile.mkdtemp(prefix="cw-", dir=base)
+
+ def _make_chrome_options(binary_path: str | None = None,
+                          user_data_dir: str | None = None) -> tuple[Options, str]:
+     opts = Options()
+     if binary_path:
+         opts.binary_location = binary_path
+     opts.add_argument("--headless=new")
+     opts.add_argument("--no-sandbox")
+     opts.add_argument("--disable-dev-shm-usage")
+     opts.add_argument("--disable-gpu")
+     opts.add_argument("--disable-software-rasterizer")
+     opts.add_argument("--disable-extensions")
+
+     prof = user_data_dir or _make_profile_dir()
+     opts.add_argument(f"--user-data-dir={prof}")
+     opts.add_argument(f"--remote-debugging-port={_free_port()}")
+
+     prefs = {"profile.managed_default_content_settings.images": 2}
+     opts.add_experimental_option("prefs", prefs)
+     return opts, prof
+

+ def _looks_like_html(text_or_bytes: bytes | str) -> bool:
+     if not text_or_bytes:
+         return False
+     s = text_or_bytes if isinstance(text_or_bytes, str) else text_or_bytes.decode("utf-8", "ignore")
+     if len(s) < MIN_HTML_BYTES:
+         return False
+     lowered = s.lower()
+     return ("<html" in lowered and "</html>" in lowered) or "<body" in lowered

+ def _requests_fallback(url: str, headers: dict | None = None, timeout: float = 15.0):
+     """Plain requests fallback. Returns `requests.Response | None`."""
+     try:
+         sess = requests.Session()
+         sess.headers.update(headers or {"User-Agent": "Mozilla/5.0"})
+         # honor simple redirects and cert issues as needed
+         resp = sess.get(url, timeout=timeout, allow_redirects=True, verify=False)
+         return resp
+     except Exception as e:
+         logging.warning(f"requests fallback failed for {url}: {e}")
+         return None
+
+ def _wait_until_ready(driver, timeout: float = 10.0):
+     """Waits for DOM readiness and presence of <body>."""
+     try:
+         WebDriverWait(driver, timeout).until(
+             lambda d: d.execute_script("return document.readyState") in ("interactive", "complete")
+         )
+     except Exception:
+         pass
+     try:
+         WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+     except Exception:
+         pass
+     # small settle delay for late JS injections
+     time.sleep(0.3)
+ def normalize_url(url, base_url=None):
+     manager = seleniumManager(url)
+     base_url = manager.base_url
+     if url.startswith(base_url):
+         url = url[len(base_url):]
+     normalized_url = urljoin(base_url, url.split('#')[0])
+     if not normalized_url.startswith(base_url):
+         return None
+     return normalized_url
+ # ---- Singleton driver manager (your class; small fixes) ----
  class SingletonMeta(type):
      _instances = {}
      def __call__(cls, *args, **kwargs):
@@ -41,79 +119,123 @@ class SingletonMeta(type):
41
119
 
42
120
  class seleniumManager(metaclass=SingletonMeta):
43
121
  def __init__(self, url):
44
- if not hasattr(self, 'initialized'): # Prevent reinitialization
45
- self.initialized = True
46
- parsed_url = urlparse(url)
47
- self.domain = parsed_url.netloc
48
- self.scheme = parsed_url.scheme
49
- self.base_url= f"{self.scheme}{self.domain}"
50
- self.site_dir = os.path.join(os.getcwd(), self.domain)
51
- os.makedirs(self.site_dir, exist_ok=True)
52
- self.drivers = {}
53
- self.page_type = []
54
-
122
+ if getattr(self, "initialized", False):
123
+ return
124
+ self.initialized = True
125
+
126
+ p = urlparse(url)
127
+ self.domain = p.netloc
128
+ self.scheme = p.scheme or "https"
129
+ self.base_url = f"{self.scheme}://{self.domain}"
130
+
131
+ self.site_dir = os.path.join("/var/tmp", "cw-sites", self.domain)
132
+ os.makedirs(self.site_dir, exist_ok=True)
133
+
134
+ self._sessions: dict[str, dict] = {} # key -> {"driver": ..., "profile": ...}
135
+ atexit.register(lambda sm=self: sm.close_all())
136
+
55
137
  def get_url_to_path(self, url):
56
138
  url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
57
- parsed_url = urlparse(url)
58
- if parsed_url.netloc == self.domain:
59
- paths = parsed_url.path.split('/')
60
- dir_path = self.site_dir
61
- for path in paths[:-1]:
62
- dir_path = os.path.join(dir_path, path)
63
- os.makedirs(dir_path, exist_ok=True)
64
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
65
-
66
- dir_path = os.path.join(dir_path, paths[-1])
67
- return dir_path
68
-
69
- def saved_url_check(self, url):
70
- path = self.get_url_to_path(url)
71
- return path
139
+ p = urlparse(url)
140
+ if p.netloc == self.domain:
141
+ parts = [x for x in p.path.split('/') if x]
142
+ d = self.site_dir
143
+ for seg in parts[:-1]:
144
+ d = os.path.join(d, seg)
145
+ os.makedirs(d, exist_ok=True)
146
+ last = parts[-1] if parts else "index.html"
147
+ ext = os.path.splitext(last)[-1] or ".html"
148
+ if not hasattr(self, "page_type"):
149
+ self.page_type = []
150
+ self.page_type.append(ext if not self.page_type else self.page_type[-1])
151
+ return os.path.join(d, last)
72
152
 
73
153
  def get_with_netloc(self, url):
74
- parsed_url = urlparse(url)
75
- if parsed_url.netloc == '':
76
- url = f"{self.scheme}://{self.domain}/{url.strip()}"
154
+ p = urlparse(url)
155
+ if p.netloc == '':
156
+ url = f"{self.scheme}://{self.domain}/{url.strip().lstrip('/')}"
77
157
  return url
78
158
 
79
- def get_driver(self, url):
80
- if url and url not in self.drivers:
81
- chrome_options = Options()
82
- chrome_options.add_argument("--headless")
83
- driver = webdriver.Chrome(options=chrome_options)
84
- self.drivers[url] = driver
85
- driver.get(url)
86
- return self.drivers[url]
87
- def normalize_url(url, base_url=None):
88
- """
89
- Normalize and resolve relative URLs, ensuring proper domain and format.
90
- """
91
- # If URL starts with the base URL repeated, remove the extra part
92
- manager = seleniumManager(url)
93
- base_url = manager.base_url
94
- if url.startswith(base_url):
95
- url = url[len(base_url):]
159
+ def get_driver(self, url) -> tuple[str, webdriver.Chrome]:
160
+ bin_path = get_env_value('CHROME_BINARY')
161
+ opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
162
+ driver = webdriver.Chrome(options=opts)
163
+ key = f"{url}#{time.time()}"
164
+ self._sessions[key] = {"driver": driver, "profile": prof}
165
+ return key, driver
96
166
 
97
- # Resolve the URL against the base URL
98
- normalized_url = urljoin(base_url, url.split('#')[0])
167
+ def close_driver(self, key: str):
168
+ sess = self._sessions.pop(key, None)
169
+ if not sess: return
170
+ try:
171
+ try: sess["driver"].quit()
172
+ except Exception: pass
173
+ finally:
174
+ shutil.rmtree(sess.get("profile") or "", ignore_errors=True)
99
175
 
100
- # Ensure only URLs belonging to the base domain are kept
101
- if not normalized_url.startswith(base_url):
102
- return None
176
+ def close_all(self):
177
+ for key in list(self._sessions.keys()):
178
+ self.close_driver(key)
103
179
 
104
- return normalized_url
105
- # Function to get Selenium page source
106
- def get_selenium_source(url):
180
+
181
+
182
+ # ---- Hardened page-source retrieval with fallback ----
183
+ def get_selenium_source(url, max_retries: int = 2, request_fallback: bool = True, timeout: float = 12.0):
107
184
  url_mgr = urlManager(url)
108
- if url_mgr.url:
109
- url = str(url_mgr.url)
110
- manager = seleniumManager(url)
111
- driver = manager.get_driver(url)
112
- try:
113
- # Get page source
114
- page_source = driver.page_source
115
- return page_source
116
- finally:
117
- # Don't quit the driver unless you're done with all interactions
118
- pass
185
+ if not url_mgr.url:
186
+ return None
187
+ url = str(url_mgr.url)
188
+
189
+ manager = seleniumManager(url)
190
+ key, driver = manager.get_driver(url)
191
+
192
+ last_exc = None
193
+ try:
194
+ for attempt in range(1, max_retries + 1):
195
+ try:
196
+ driver.get(url)
197
+ _wait_until_ready(driver, timeout=timeout)
198
+ html = driver.page_source or ""
199
+ if not _looks_like_html(html):
200
+ html = driver.execute_script(
201
+ "return document.documentElement ? document.documentElement.outerHTML : '';"
202
+ ) or html
203
+ if _looks_like_html(html):
204
+ return html
205
+ logging.warning(f"Selenium returned suspicious HTML (len={len(html)}) for {url} "
206
+ f"[attempt {attempt}/{max_retries}]")
207
+ except Exception as e:
208
+ last_exc = e
209
+ logging.warning(f"Selenium attempt {attempt}/{max_retries} failed for {url}: {e}")
210
+ time.sleep(0.5 * attempt)
211
+
212
+ if request_fallback:
213
+ resp = _requests_fallback(url, headers={"User-Agent": "Mozilla/5.0"})
214
+ if resp is not None:
215
+ ctype = (resp.headers.get("content-type") or "").lower()
216
+ body = resp.text if hasattr(resp, "text") else (
217
+ resp.content.decode("utf-8", "ignore") if hasattr(resp, "content") else ""
218
+ )
219
+ if "application/json" in ctype:
220
+ try:
221
+ return json.dumps(resp.json())
222
+ except Exception:
223
+ return body
224
+ return body if _looks_like_html(body) or body else None
225
+ finally:
226
+ # critical: release the user-data-dir to avoid “already in use”
227
+ manager.close_driver(key)
228
+
229
+ if last_exc:
230
+ logging.error(f"Unable to retrieve page for {url}: {last_exc}")
231
+ return None
119
232
 
233
+ def get_driver(self, url):
234
+ # always new
235
+ bin_path = get_env_value('CHROME_BINARY')
236
+ opts, prof = _make_chrome_options(binary_path=bin_path, user_data_dir=None)
237
+ driver = webdriver.Chrome(options=opts)
238
+ # store so close_all() can clean up
239
+ key = f"{url}#{time.time()}"
240
+ self._sessions[key] = {"driver": driver, "profile": prof}
241
+ return driver
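
A hedged usage sketch of the hardened retrieval path; the driver and its temporary profile are released inside the call, and None signals that both Selenium and the requests fallback failed:

    from bs4 import BeautifulSoup

    html = get_selenium_source("https://example.com", max_retries=2, timeout=12.0)
    if html:
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title.string if soup.title else "no <title> found")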