blackops-core 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blackops_core/__init__.py +9 -0
- blackops_core/auth.py +241 -0
- blackops_core/browser_crawler.py +210 -0
- blackops_core/crawler.py +357 -0
- blackops_core/dorker.py +221 -0
- blackops_core/http/__init__.py +14 -0
- blackops_core/http/_cookies.py +42 -0
- blackops_core/http/client.py +235 -0
- blackops_core/js_api_discover.py +230 -0
- blackops_core/openapi.py +395 -0
- blackops_core/passive.py +24 -0
- blackops_core/source_map.py +217 -0
- blackops_core/ssh.py +141 -0
- blackops_core/winexec.py +222 -0
- blackops_core/ws.py +204 -0
- blackops_core-0.1.5.dist-info/METADATA +33 -0
- blackops_core-0.1.5.dist-info/RECORD +19 -0
- blackops_core-0.1.5.dist-info/WHEEL +4 -0
- blackops_core-0.1.5.dist-info/licenses/LICENSE +661 -0
blackops_core/auth.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 CommonHuman-Lab
|
|
3
|
+
"""
|
|
4
|
+
blackops-core — auth.py
|
|
5
|
+
Form-based and token-based authentication helpers.
|
|
6
|
+
|
|
7
|
+
Provides:
|
|
8
|
+
- form_login() — POST credentials to an HTML login form, return session cookies/headers
|
|
9
|
+
- bearer_login() — OAuth 2.0 client credentials grant, return Authorization header
|
|
10
|
+
- extract_csrf() — pull a CSRF token from an HTML page
|
|
11
|
+
- AuthResult — carries cookies (str) + headers (dict) ready for subsequent requests
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
import urllib.parse as up
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from html.parser import HTMLParser
|
|
20
|
+
from typing import Any, Dict, List, Optional
|
|
21
|
+
|
|
22
|
+
from .http.client import HttpClient
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
_CSRF_NAMES = frozenset({
|
|
27
|
+
"csrf_token", "_token", "xsrf_token", "authenticity_token",
|
|
28
|
+
"csrfmiddlewaretoken", "_csrf", "csrf", "__requestverificationtoken",
|
|
29
|
+
})
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Public data type
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
@dataclass
class AuthResult:
    """Session material produced by the login helpers in this module.

    A default-constructed instance carries nothing and is what the helpers
    return when a login attempt fails.
    """

    # Cookie header value in ``name=value; name2=value2`` form.
    cookies: str = ""
    # Extra request headers, e.g. ``{"Authorization": "Bearer <token>"}``.
    headers: Dict[str, str] = field(default_factory=dict)

    def is_empty(self) -> bool:
        """Return True when neither cookies nor headers were captured."""
        return not (self.cookies or self.headers)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Public API
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
def form_login(
    login_url: str,
    username: str,
    password: str,
    username_field: str = "username",
    password_field: str = "password",
    extra_fields: Optional[Dict[str, str]] = None,
    client: Optional[HttpClient] = None,
    timeout: int = 15,
) -> AuthResult:
    """Submit an HTML login form and return the resulting session.

    Fetches the login page, copies the first form's pre-filled inputs
    (which is how CSRF tokens and other hidden fields are carried over),
    POSTs the credentials to the form action, then collects the cookies held
    by the client's session. If the POST response is a JSON object with a
    recognised token field, an ``Authorization: Bearer`` header is populated
    as well.

    Args:
        login_url: URL of the page that serves the login form.
        username: Value submitted in *username_field*.
        password: Value submitted in *password_field*.
        username_field: Name of the form input that receives the username.
        password_field: Name of the form input that receives the password.
        extra_fields: Additional form fields; these override parsed fields
            and the credential fields on a name clash.
        client: Existing ``HttpClient`` to reuse; a new one is created with
            *timeout* when omitted.
        timeout: Per-request timeout passed to both the GET and the POST.

    Returns:
        An ``AuthResult`` with the session cookies and, when a token was
        found, the ``Authorization`` header. An empty ``AuthResult`` is
        returned when either request raises.
    """
    c = client or HttpClient(timeout=timeout)

    try:
        resp = c._session.get(login_url, timeout=timeout)
    except Exception as exc:
        logger.warning("form_login: GET %s failed: %s", login_url, exc)
        return AuthResult()

    parser = _FormParser()
    parser.feed(resp.text)

    # Fall back to the login URL itself when the form declares no action,
    # and resolve relative actions against the page they were served from.
    action = parser.action or login_url
    if action and not action.startswith(("http://", "https://")):
        action = up.urljoin(login_url, action)

    # Start from the form's own fields so server-issued tokens are echoed
    # back, then layer credentials and caller overrides on top.
    body: Dict[str, str] = dict(parser.fields)
    body[username_field] = username
    body[password_field] = password
    if extra_fields:
        body.update(extra_fields)

    try:
        post_resp = c._session.post(action, data=body, timeout=timeout, allow_redirects=True)
    except Exception as exc:
        logger.warning("form_login: POST %s failed: %s", action, exc)
        return AuthResult()

    cookie_pairs = [f"{name}={val}" for name, val in c._session.cookies.items()]
    cookies = "; ".join(cookie_pairs)

    result_headers: Dict[str, str] = {}
    try:
        payload = post_resp.json()
    except Exception as exc:
        # A non-JSON (typically HTML) response is the normal outcome of a
        # form login, so this is informational only.
        logger.debug("form_login: response from %s is not JSON: %s", action, exc)
        payload = None

    # Only a JSON object can carry a named token field; lists, strings and
    # numbers are ignored instead of relying on an exception to skip them.
    if isinstance(payload, dict):
        for key in ("token", "access_token", "accessToken", "jwt", "id_token"):
            token = payload.get(key)
            if isinstance(token, str) and token:
                result_headers["Authorization"] = f"Bearer {token}"
                break

    result = AuthResult(cookies=cookies, headers=result_headers)
    if result.is_empty():
        logger.warning("form_login: no cookies or token obtained from %s", login_url)
    else:
        # Count the actual jar entries: a token-only login has zero cookies.
        logger.info(
            "form_login: authenticated via %s (%d cookies)", login_url, len(cookie_pairs)
        )
    return result
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def bearer_login(
    token_url: str,
    client_id: str,
    client_secret: str,
    grant_type: str = "client_credentials",
    client: Optional[HttpClient] = None,
    timeout: int = 15,
) -> AuthResult:
    """Request a bearer token from an OAuth 2.0 token endpoint.

    POSTs ``grant_type``, ``client_id`` and ``client_secret`` as form data
    and reads the token from the JSON response. Only these three fields are
    sent, so grants that need further parameters (for example the password
    grant's ``username``/``password``) are not covered by this helper.

    Args:
        token_url: URL of the token endpoint.
        client_id: Client identifier sent as ``client_id``.
        client_secret: Client secret sent as ``client_secret``.
        grant_type: Value sent as ``grant_type``.
        client: Existing ``HttpClient`` to reuse; a new one is created with
            *timeout* when omitted.
        timeout: Timeout passed to the POST request.

    Returns:
        An ``AuthResult`` whose headers contain
        ``Authorization: Bearer <token>``, or an empty ``AuthResult`` when
        the request fails, the body is not JSON, or no token field
        (``access_token``, ``token``, ``id_token``) is present.
    """
    c = client or HttpClient(timeout=timeout)
    body = {
        "grant_type": grant_type,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    try:
        resp = c._session.post(token_url, data=body, timeout=timeout)
        payload = resp.json()
    except Exception as exc:
        logger.warning("bearer_login: %s failed: %s", token_url, exc)
        return AuthResult()

    token = None
    if isinstance(payload, dict):
        token = payload.get("access_token") or payload.get("token") or payload.get("id_token")

    if not token:
        # Surface the failure: a rejected client or an error document would
        # otherwise yield an empty result with no trace in the logs.
        logger.warning(
            "bearer_login: no token in response from %s (status %s)",
            token_url,
            getattr(resp, "status_code", "unknown"),
        )
        return AuthResult()

    logger.info("bearer_login: obtained token from %s", token_url)
    return AuthResult(headers={"Authorization": f"Bearer {token}"})
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def http_auth(auth_type: str, cred: str) -> Any:
    """Return a requests-compatible auth object for Basic, Digest, or NTLM auth.

    Args:
        auth_type: ``"basic"``, ``"digest"``, or ``"ntlm"``. Matching ignores
            case and surrounding whitespace, so ``"Basic"`` and ``"NTLM"``
            are accepted too.
        cred: Credentials in ``"username:password"`` format. The password
            may itself contain colons — only the first colon is used as
            the delimiter.

    Returns:
        A ``requests.auth.HTTPBasicAuth``, ``requests.auth.HTTPDigestAuth``,
        or ``requests_ntlm.HttpNtlmAuth`` instance ready to be passed to
        ``HttpClient(auth=...)``.

    Raises:
        ValueError: Invalid *auth_type* or malformed *cred*.
        ImportError: ``auth_type="ntlm"`` requested but ``requests-ntlm`` is
            not installed (``pip install blackops-core[ntlm]``).
    """
    if not cred or ":" not in cred:
        # NOTE(review): this message echoes the raw credential string, which
        # may end up in logs or tracebacks — consider redacting it.
        raise ValueError(
            f"auth_cred must be in 'username:password' format, got {cred!r}"
        )
    user, _, password = cred.partition(":")

    # Scheme names commonly arrive from CLI flags or config files in mixed
    # case; non-string values fall through to the ValueError below.
    scheme = auth_type.strip().lower() if isinstance(auth_type, str) else auth_type

    # Imports are local so the optional dependencies are only required for
    # the scheme that is actually requested.
    if scheme == "basic":
        from requests.auth import HTTPBasicAuth
        return HTTPBasicAuth(user, password)
    if scheme == "digest":
        from requests.auth import HTTPDigestAuth
        return HTTPDigestAuth(user, password)
    if scheme == "ntlm":
        try:
            from requests_ntlm import HttpNtlmAuth  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "NTLM auth requires requests-ntlm: pip install blackops-core[ntlm]"
            ) from exc
        return HttpNtlmAuth(user, password)
    raise ValueError(
        f"Unknown auth_type {auth_type!r}. Supported values: basic, digest, ntlm"
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def extract_csrf(html: str) -> Optional[str]:
    """Return the first CSRF token found in an HTML page, or None.

    The page is run through ``_FormParser``, so only the inputs of the first
    ``<form>`` are considered. An input qualifies when its lower-cased name
    is one of ``_CSRF_NAMES`` and its value is non-empty.
    """
    form = _FormParser()
    form.feed(html)
    candidates = (
        token
        for field_name, token in form.fields.items()
        if token and field_name.lower() in _CSRF_NAMES
    )
    return next(candidates, None)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# Internal HTML form parser
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
class _FormParser(HTMLParser):
|
|
213
|
+
"""Minimal parser that extracts the first HTML form's action and fields."""
|
|
214
|
+
|
|
215
|
+
def __init__(self) -> None:
|
|
216
|
+
super().__init__()
|
|
217
|
+
self.action: str = ""
|
|
218
|
+
self.method: str = "post"
|
|
219
|
+
self.fields: Dict[str, str] = {}
|
|
220
|
+
self._in_form: bool = False
|
|
221
|
+
self._done: bool = False
|
|
222
|
+
|
|
223
|
+
def handle_starttag(self, tag: str, attrs: List) -> None:
|
|
224
|
+
if self._done:
|
|
225
|
+
return
|
|
226
|
+
a = dict(attrs)
|
|
227
|
+
if tag == "form" and not self._in_form:
|
|
228
|
+
self._in_form = True
|
|
229
|
+
self.action = a.get("action", "")
|
|
230
|
+
self.method = a.get("method", "post").lower()
|
|
231
|
+
elif tag == "input" and self._in_form:
|
|
232
|
+
name = a.get("name", "")
|
|
233
|
+
value = a.get("value") or ""
|
|
234
|
+
itype = a.get("type", "text").lower()
|
|
235
|
+
if name and itype not in ("submit", "button", "image", "reset", "file"):
|
|
236
|
+
self.fields[name] = value
|
|
237
|
+
|
|
238
|
+
def handle_endtag(self, tag: str) -> None:
|
|
239
|
+
if tag == "form" and self._in_form:
|
|
240
|
+
self._in_form = False
|
|
241
|
+
self._done = True
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (c) 2026 CommonHuman-Lab
|
|
3
|
+
"""
|
|
4
|
+
blackops-core — browser_crawler.py
|
|
5
|
+
Headless Chromium-based URL discovery for JavaScript-rendered sites.
|
|
6
|
+
|
|
7
|
+
Unlike the standard BFS crawler (which parses static HTML), this module
|
|
8
|
+
renders each page with Selenium, waits for JavaScript to complete, and
|
|
9
|
+
collects all links present in the fully-rendered DOM. Same-origin only.
|
|
10
|
+
|
|
11
|
+
Requires: selenium>=4.0
|
|
12
|
+
pip install 'blackops-core[browser]'
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import time
|
|
18
|
+
import urllib.parse as up
|
|
19
|
+
from collections import deque
|
|
20
|
+
from typing import Dict, List, Optional, Set
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
_WAIT_FIRST_PAGE = 1.0 # max seconds to wait for first page readyState=complete
|
|
25
|
+
_WAIT_SUBSEQUENT = 0.75 # max seconds to wait for subsequent pages
|
|
26
|
+
_SPA_SETTLE = 0.25 # fixed pause after readyState=complete for SPA first render
|
|
27
|
+
_POLL_INTERVAL = 0.05 # polling granularity in seconds
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _wait_for_ready(driver, timeout_s: float) -> None:
    """Block until the page reports ``readyState == 'complete'`` or time runs out.

    The state is polled every ``_POLL_INTERVAL`` seconds for at most
    ``timeout_s`` seconds (it is always checked at least once). Whatever the
    outcome, a fixed ``_SPA_SETTLE`` pause follows before returning.
    """

    def page_is_complete() -> bool:
        # A failing script call is treated as "not ready yet" rather than fatal.
        try:
            return driver.execute_script("return document.readyState") == "complete"
        except Exception:
            return False

    give_up_at = time.monotonic() + timeout_s
    while not page_is_complete():
        time_left = give_up_at - time.monotonic()
        if time_left <= 0:
            break
        time.sleep(min(_POLL_INTERVAL, time_left))

    time.sleep(_SPA_SETTLE)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def browser_crawl(
    start_url: str,
    max_pages: int = 50,
    max_depth: int = 2,
    headless: bool = True,
    cookies: str = "",
    extra_headers: Optional[Dict[str, str]] = None,
    chromium_path: str = "",
    chromedriver_path: str = "",
    spa_wait_s: float = _WAIT_SUBSEQUENT,
) -> List[str]:
    """Discover URLs by rendering pages with headless Chromium.

    Performs BFS from ``start_url``, rendering each page with Selenium and
    collecting ``<a href>`` links from the fully-rendered DOM. Only follows
    same-origin URLs (scheme and host compared case-insensitively). Returns
    the list of visited URLs in visit order; an empty list is returned when
    the driver cannot be started.

    Parameters
    ----------
    start_url:
        Seed URL to start from.
    max_pages:
        Stop after visiting this many unique pages.
    max_depth:
        Maximum BFS depth from start_url.
    headless:
        Run Chromium without a visible window (default True).
    cookies:
        Cookie string injected before the first request (``name=val; name2=val2``).
    extra_headers:
        Not injected at the driver level (Selenium has limited header support);
        reserved for future CDP-based header injection. Currently unused.
    chromium_path:
        Path to Chromium binary. Auto-detected if empty.
    chromedriver_path:
        Path to chromedriver binary. Auto-detected if empty.
    spa_wait_s:
        Maximum seconds to wait for each page after the first to finish
        loading (the first page uses ``_WAIT_FIRST_PAGE``).
    """
    try:
        driver = _setup_driver(headless, chromium_path, chromedriver_path)
    except ImportError as exc:
        logger.error(
            "browser_crawl requires selenium — pip install 'blackops-core[browser]'. %s", exc
        )
        return []
    except Exception as exc:
        logger.error("browser_crawl: failed to start Chromium driver: %s", exc)
        return []

    parsed_start = up.urlparse(start_url)
    origin = f"{parsed_start.scheme}://{parsed_start.netloc}"
    # The browser reports hrefs with a lower-cased host, so compare on a
    # lower-cased key; otherwise a start URL such as https://Example.com
    # would never match any discovered link.
    origin_key = origin.lower()

    visited: List[str] = []
    seen: Set[str] = set()
    queue: deque[tuple[str, int]] = deque([(start_url, 0)])
    seen.add(_normalise(start_url))

    try:
        # Inject cookies on the origin before any navigation: the driver has
        # to be on the target site before cookies can be set for it.
        if cookies:
            try:
                driver.get(origin)
                _wait_for_ready(driver, 0.5)
            except Exception as exc:
                logger.debug("browser_crawl: cookie injection failed: %s", exc)
            else:
                for pair in cookies.split(";"):
                    name, sep, value = pair.strip().partition("=")
                    if not sep:
                        continue
                    # Guard each cookie separately so one rejected cookie
                    # does not prevent the remaining ones from being set.
                    try:
                        driver.add_cookie({"name": name.strip(), "value": value.strip()})
                    except Exception as exc:
                        logger.debug("browser_crawl: cookie injection failed: %s", exc)

        while queue and len(visited) < max_pages:
            url, depth = queue.popleft()

            try:
                driver.get(url)
                wait = _WAIT_FIRST_PAGE if depth == 0 else spa_wait_s
                _wait_for_ready(driver, wait)
            except Exception as exc:
                logger.debug("browser_crawl: page load failed %s: %s", url, exc)
                continue

            visited.append(url)
            logger.debug("browser_crawl: visited %s (depth=%d)", url, depth)

            # Pages at the depth limit are recorded but not expanded.
            if depth >= max_depth:
                continue

            try:
                # The typeof guard matters: for SVG <a> elements ``href`` is
                # an SVGAnimatedString, and calling startsWith on it would
                # throw and discard every link on the page.
                links: List[str] = driver.execute_script(
                    "return Array.from(document.querySelectorAll('a[href]'))"
                    ".map(a => a.href)"
                    ".filter(h => typeof h === 'string' && h.startsWith('http'));"
                ) or []
            except Exception as exc:
                logger.debug("browser_crawl: link extraction failed %s: %s", url, exc)
                links = []

            for link in links:
                norm = _normalise(link)
                if norm in seen:
                    continue
                parsed = up.urlparse(link)
                if f"{parsed.scheme}://{parsed.netloc}".lower() != origin_key:
                    continue
                seen.add(norm)
                queue.append((link, depth + 1))

    except Exception as exc:
        logger.warning("browser_crawl: unexpected error: %s", exc)
    finally:
        # Always release the browser process, even after an unexpected error.
        try:
            driver.quit()
        except Exception:
            pass

    logger.info("browser_crawl: discovered %d URL(s) from %s", len(visited), start_url)
    return visited
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
# Selenium driver factory
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
def _setup_driver(headless: bool, chromium_path: str, chromedriver_path: str):
    """Build a Selenium Chrome driver with a 15-second page-load timeout.

    When ``chromium_path`` is empty the browser binary is looked up on PATH
    (``chromium``, ``chromium-browser``, ``google-chrome``, in that order);
    when ``chromedriver_path`` is empty ``chromedriver`` is looked up the
    same way. If nothing is found the corresponding Selenium default applies.
    Selenium is imported here so the module can be imported without it.
    """
    from selenium import webdriver  # noqa: PLC0415
    from selenium.webdriver.chrome.options import Options  # noqa: PLC0415
    from selenium.webdriver.chrome.service import Service  # noqa: PLC0415

    import shutil  # noqa: PLC0415

    chrome_flags = ["--headless"] if headless else []
    chrome_flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-extensions",
        "--no-first-run",
        "--blink-settings=imagesEnabled=false",
    ]
    chrome_opts = Options()
    for flag in chrome_flags:
        chrome_opts.add_argument(flag)

    # ``map`` is lazy, so the PATH lookups stop at the first hit.
    browser_bin = chromium_path or next(
        (hit for hit in map(shutil.which, ("chromium", "chromium-browser", "google-chrome")) if hit),
        "",
    )
    if browser_bin:
        chrome_opts.binary_location = browser_bin

    driver_bin = chromedriver_path or shutil.which("chromedriver")
    service = Service(driver_bin) if driver_bin else Service()

    browser = webdriver.Chrome(service=service, options=chrome_opts)
    browser.set_page_load_timeout(15)
    return browser
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _normalise(url: str) -> str:
|
|
208
|
+
"""Strip fragment for deduplication."""
|
|
209
|
+
parsed = up.urlparse(url)
|
|
210
|
+
return up.urlunparse(parsed._replace(fragment="")).rstrip("/")
|