blackops-core 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (c) 2026 CommonHuman-Lab
3
+ """
4
+ blackops-core — shared HTTP, crawling, and scanning infrastructure.
5
+ """
6
+
7
+ __version__ = "0.1.5"
8
+
9
+ __all__ = ["__version__"]
blackops_core/auth.py ADDED
@@ -0,0 +1,241 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (c) 2026 CommonHuman-Lab
3
+ """
4
+ blackops-core — auth.py
5
+ Form-based and token-based authentication helpers.
6
+
7
+ Provides:
8
+ - form_login() — POST credentials to an HTML login form, return session cookies/headers
9
+ - bearer_login() — OAuth 2.0 client credentials grant, return Authorization header
10
+ - extract_csrf() — pull a CSRF token from an HTML page
11
+ - AuthResult — carries cookies (str) + headers (dict) ready for subsequent requests
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ import urllib.parse as up
18
+ from dataclasses import dataclass, field
19
+ from html.parser import HTMLParser
20
+ from typing import Any, Dict, List, Optional
21
+
22
+ from .http.client import HttpClient
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ _CSRF_NAMES = frozenset({
27
+ "csrf_token", "_token", "xsrf_token", "authenticity_token",
28
+ "csrfmiddlewaretoken", "_csrf", "csrf", "__requestverificationtoken",
29
+ })
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Public data type
34
+ # ---------------------------------------------------------------------------
35
+
36
@dataclass
class AuthResult:
    """Container for the credentials produced by a login helper.

    Both attributes default to "nothing obtained", so a bare ``AuthResult()``
    doubles as the failure value returned by the login functions.
    """

    # Cookie header value in ``name=val; name2=val2`` form.
    cookies: str = ""
    # Extra request headers (for example ``Authorization``).
    headers: Dict[str, str] = field(default_factory=dict)

    def is_empty(self) -> bool:
        """Return True when neither cookies nor headers were captured."""
        return not (self.cookies or self.headers)
44
+
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Public API
48
+ # ---------------------------------------------------------------------------
49
+
50
def form_login(
    login_url: str,
    username: str,
    password: str,
    username_field: str = "username",
    password_field: str = "password",
    extra_fields: Optional[Dict[str, str]] = None,
    client: Optional[HttpClient] = None,
    timeout: int = 15,
) -> AuthResult:
    """Submit an HTML login form and return the resulting session.

    Fetches ``login_url``, parses the first form on the page to pick up its
    action and pre-filled fields (CSRF tokens included), POSTs the credentials
    to the form action and then collects the cookies held by the client's
    session. If the POST response is a JSON object carrying a string under
    ``token``, ``access_token``, ``accessToken``, ``jwt`` or ``id_token``, an
    ``Authorization: Bearer ...`` header is added to the result.

    Args:
        login_url: URL of the page that contains the login form.
        username: Value submitted under ``username_field``.
        password: Value submitted under ``password_field``.
        username_field: Form field name for the username.
        password_field: Form field name for the password.
        extra_fields: Additional form fields; these override parsed fields
            and the credential fields when names collide.
        client: Existing ``HttpClient`` to reuse; a new one is created with
            ``timeout`` when omitted.
        timeout: Per-request timeout passed to the GET and POST calls.

    Returns:
        An ``AuthResult`` with cookies and/or headers, or an empty
        ``AuthResult`` on network or parse failure.
    """
    c = client or HttpClient(timeout=timeout)

    try:
        resp = c._session.get(login_url, timeout=timeout)
    except Exception as exc:
        logger.warning("form_login: GET %s failed: %s", login_url, exc)
        return AuthResult()

    parser = _FormParser()
    try:
        parser.feed(resp.text)
    except Exception as exc:
        # Honour the documented contract: a page that cannot be parsed yields
        # an empty result instead of propagating the parser error.
        logger.warning("form_login: could not parse %s: %s", login_url, exc)
        return AuthResult()

    # A form without an action submits back to the page it was served from.
    action = parser.action or login_url
    if action and not action.startswith(("http://", "https://")):
        action = up.urljoin(login_url, action)

    # Start from the form's own fields so hidden tokens are echoed back.
    body: Dict[str, str] = dict(parser.fields)
    body[username_field] = username
    body[password_field] = password
    if extra_fields:
        body.update(extra_fields)

    try:
        post_resp = c._session.post(action, data=body, timeout=timeout, allow_redirects=True)
    except Exception as exc:
        logger.warning("form_login: POST %s failed: %s", action, exc)
        return AuthResult()

    cookie_pairs = [f"{name}={val}" for name, val in c._session.cookies.items()]
    cookies = "; ".join(cookie_pairs)

    result_headers: Dict[str, str] = {}
    try:
        payload = post_resp.json()
    except Exception:
        # Best effort: most login endpoints answer with HTML, not JSON.
        payload = None
    if isinstance(payload, dict):
        for key in ("token", "access_token", "accessToken", "jwt", "id_token"):
            candidate = payload.get(key)
            if isinstance(candidate, str):
                result_headers["Authorization"] = f"Bearer {candidate}"
                break

    result = AuthResult(cookies=cookies, headers=result_headers)
    if result.is_empty():
        logger.warning("form_login: no cookies or token obtained from %s", login_url)
    else:
        # Count the actual cookies; counting ";" separators reported one
        # cookie even when only a bearer token had been obtained.
        logger.info(
            "form_login: authenticated via %s (%d cookies)", login_url, len(cookie_pairs)
        )
    return result
117
+
118
+
119
def bearer_login(
    token_url: str,
    client_id: str,
    client_secret: str,
    grant_type: str = "client_credentials",
    client: Optional[HttpClient] = None,
    timeout: int = 15,
) -> AuthResult:
    """Request a bearer token from an OAuth 2.0 token endpoint.

    POSTs ``grant_type``, ``client_id`` and ``client_secret`` as form data to
    ``token_url`` and reads the token from the JSON response, trying the keys
    ``access_token``, ``token`` and ``id_token`` in that order. Only these
    three form fields are sent, so grant types that need further parameters
    (such as a resource-owner username and password) are not covered here.

    Args:
        token_url: Token endpoint URL.
        client_id: OAuth client identifier.
        client_secret: OAuth client secret.
        grant_type: Value sent as ``grant_type``.
        client: Existing ``HttpClient`` to reuse; a new one is created with
            ``timeout`` when omitted.
        timeout: Request timeout passed to the POST call.

    Returns:
        An ``AuthResult`` whose headers contain ``Authorization: Bearer
        <token>``, or an empty ``AuthResult`` when the request fails, the
        body is not a JSON object, or no token field is present.
    """
    c = client or HttpClient(timeout=timeout)
    body = {
        "grant_type": grant_type,
        "client_id": client_id,
        "client_secret": client_secret,
    }
    try:
        resp = c._session.post(token_url, data=body, timeout=timeout)
        payload = resp.json()
    except Exception as exc:
        logger.warning("bearer_login: %s failed: %s", token_url, exc)
        return AuthResult()

    token = None
    if isinstance(payload, dict):
        token = payload.get("access_token") or payload.get("token") or payload.get("id_token")
    if not token:
        # Previously this path returned silently, which made a rejected
        # client or an unexpected response shape hard to diagnose.
        logger.warning("bearer_login: no token in response from %s", token_url)
        return AuthResult()

    logger.info("bearer_login: obtained token from %s", token_url)
    return AuthResult(headers={"Authorization": f"Bearer {token}"})
148
+
149
+
150
def http_auth(auth_type: str, cred: str) -> Any:
    """Build a requests-compatible auth object for Basic, Digest, or NTLM.

    Args:
        auth_type: One of ``"basic"``, ``"digest"`` or ``"ntlm"``.
        cred: ``"username:password"``. Everything after the first colon is
            the password, so passwords may contain colons themselves.

    Returns:
        ``requests.auth.HTTPBasicAuth``, ``requests.auth.HTTPDigestAuth`` or
        ``requests_ntlm.HttpNtlmAuth``, suitable for ``HttpClient(auth=...)``.

    Raises:
        ValueError: *cred* is empty or has no colon, or *auth_type* is not
            one of the supported values.
        ImportError: ``auth_type="ntlm"`` was requested but ``requests-ntlm``
            is not installed (``pip install blackops-core[ntlm]``).
    """
    # Credentials are validated first, before the auth type is looked at.
    if not cred or ":" not in cred:
        raise ValueError(
            f"auth_cred must be in 'username:password' format, got {cred!r}"
        )
    if auth_type not in ("basic", "digest", "ntlm"):
        raise ValueError(
            f"Unknown auth_type {auth_type!r}. Supported values: basic, digest, ntlm"
        )

    user, _sep, password = cred.partition(":")

    if auth_type == "ntlm":
        # Optional dependency: imported lazily so basic/digest work without it.
        try:
            from requests_ntlm import HttpNtlmAuth  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "NTLM auth requires requests-ntlm: pip install blackops-core[ntlm]"
            ) from exc
        return HttpNtlmAuth(user, password)

    if auth_type == "basic":
        from requests.auth import HTTPBasicAuth
        auth_cls = HTTPBasicAuth
    else:
        from requests.auth import HTTPDigestAuth
        auth_cls = HTTPDigestAuth
    return auth_cls(user, password)
192
+
193
+
194
def extract_csrf(html: str) -> Optional[str]:
    """Return the first CSRF token found in *html*, or None.

    The page is run through ``_FormParser``; among the form fields it
    collects, the first one whose lower-cased name appears in ``_CSRF_NAMES``
    and whose value is non-empty is returned.
    """
    form = _FormParser()
    form.feed(html)
    tokens = (
        token
        for field_name, token in form.fields.items()
        if field_name.lower() in _CSRF_NAMES and token
    )
    return next(tokens, None)
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Internal HTML form parser
210
+ # ---------------------------------------------------------------------------
211
+
212
+ class _FormParser(HTMLParser):
213
+ """Minimal parser that extracts the first HTML form's action and fields."""
214
+
215
+ def __init__(self) -> None:
216
+ super().__init__()
217
+ self.action: str = ""
218
+ self.method: str = "post"
219
+ self.fields: Dict[str, str] = {}
220
+ self._in_form: bool = False
221
+ self._done: bool = False
222
+
223
+ def handle_starttag(self, tag: str, attrs: List) -> None:
224
+ if self._done:
225
+ return
226
+ a = dict(attrs)
227
+ if tag == "form" and not self._in_form:
228
+ self._in_form = True
229
+ self.action = a.get("action", "")
230
+ self.method = a.get("method", "post").lower()
231
+ elif tag == "input" and self._in_form:
232
+ name = a.get("name", "")
233
+ value = a.get("value") or ""
234
+ itype = a.get("type", "text").lower()
235
+ if name and itype not in ("submit", "button", "image", "reset", "file"):
236
+ self.fields[name] = value
237
+
238
+ def handle_endtag(self, tag: str) -> None:
239
+ if tag == "form" and self._in_form:
240
+ self._in_form = False
241
+ self._done = True
@@ -0,0 +1,210 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (c) 2026 CommonHuman-Lab
3
+ """
4
+ blackops-core — browser_crawler.py
5
+ Headless Chromium-based URL discovery for JavaScript-rendered sites.
6
+
7
+ Unlike the standard BFS crawler (which parses static HTML), this module
8
+ renders each page with Selenium, waits for JavaScript to complete, and
9
+ collects all links present in the fully-rendered DOM. Same-origin only.
10
+
11
+ Requires: selenium>=4.0
12
+ pip install 'blackops-core[browser]'
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+ import time
18
+ import urllib.parse as up
19
+ from collections import deque
20
+ from typing import Dict, List, Optional, Set
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ _WAIT_FIRST_PAGE = 1.0 # max seconds to wait for first page readyState=complete
25
+ _WAIT_SUBSEQUENT = 0.75 # max seconds to wait for subsequent pages
26
+ _SPA_SETTLE = 0.25 # fixed pause after readyState=complete for SPA first render
27
+ _POLL_INTERVAL = 0.05 # polling granularity in seconds
28
+
29
+
30
def _wait_for_ready(driver, timeout_s: float) -> None:
    """Wait for the page to finish loading, then pause for late rendering.

    Polls ``document.readyState`` through *driver* until it reports
    ``"complete"`` or *timeout_s* seconds have passed, sleeping at most
    ``_POLL_INTERVAL`` between checks. Script errors count as "not ready".
    Whichever way the loop ends, a fixed ``_SPA_SETTLE`` pause follows.
    """

    def _page_ready() -> bool:
        try:
            return driver.execute_script("return document.readyState") == "complete"
        except Exception:
            # The driver may be mid-navigation; treat as not ready yet.
            return False

    give_up_at = time.monotonic() + timeout_s
    while not _page_ready():
        time_left = give_up_at - time.monotonic()
        if time_left <= 0:
            break
        time.sleep(min(_POLL_INTERVAL, time_left))

    time.sleep(_SPA_SETTLE)
44
+
45
+
46
def _origin_key(url: str) -> str:
    """Return a canonical ``scheme://host[:port]`` string for origin comparison.

    Scheme and host are lower-cased, userinfo is dropped and the scheme's
    default port (80 for http, 443 for https) is omitted, so differently
    spelled URLs of the same origin compare equal.
    """
    parts = up.urlparse(url)
    scheme = parts.scheme.lower()
    host = (parts.hostname or "").lower()
    try:
        port = parts.port
    except ValueError:
        # Malformed port: fall back to comparing scheme and host only.
        port = None
    default_port = {"http": 80, "https": 443}.get(scheme)
    if port is None or port == default_port:
        return f"{scheme}://{host}"
    return f"{scheme}://{host}:{port}"


def browser_crawl(
    start_url: str,
    max_pages: int = 50,
    max_depth: int = 2,
    headless: bool = True,
    cookies: str = "",
    extra_headers: Optional[Dict[str, str]] = None,
    chromium_path: str = "",
    chromedriver_path: str = "",
    spa_wait_s: float = _WAIT_SUBSEQUENT,
) -> List[str]:
    """Discover URLs by rendering pages with headless Chromium.

    Performs BFS from ``start_url``, rendering each page with Selenium and
    collecting ``<a href>`` links from the fully-rendered DOM. Only follows
    same-origin URLs. Returns a deduplicated list of visited URLs.

    Parameters
    ----------
    start_url:
        Seed URL to start from.
    max_pages:
        Stop after visiting this many unique pages.
    max_depth:
        Maximum BFS depth from start_url.
    headless:
        Run Chromium without a visible window (default True).
    cookies:
        Cookie string injected before the first request (``name=val; name2=val2``).
        Pairs without ``=`` or with an empty name are skipped.
    extra_headers:
        Not injected at the driver level (Selenium has limited header support);
        reserved for future CDP-based header injection.
    chromium_path:
        Path to Chromium binary. Auto-detected if empty.
    chromedriver_path:
        Path to chromedriver binary. Auto-detected if empty.
    spa_wait_s:
        Maximum seconds to wait for each page after the first to reach
        ``readyState == "complete"``.

    Returns
    -------
    The URLs that were loaded successfully, in visit order. An empty list is
    returned when the driver cannot be started.
    """
    try:
        driver = _setup_driver(headless, chromium_path, chromedriver_path)
    except ImportError as exc:
        logger.error(
            "browser_crawl requires selenium — pip install 'blackops-core[browser]'. %s", exc
        )
        return []
    except Exception as exc:
        logger.error("browser_crawl: failed to start Chromium driver: %s", exc)
        return []

    parsed_start = up.urlparse(start_url)
    # ``origin`` is what gets navigated to for cookie injection; ``origin_key``
    # is the canonical form used for the same-origin test. The browser returns
    # normalised hrefs (lower-case host, default port removed), so comparing
    # against the raw seed string would reject every link for seeds such as
    # ``http://Example.com`` or ``https://host:443``.
    origin = f"{parsed_start.scheme}://{parsed_start.netloc}"
    origin_key = _origin_key(start_url)

    visited: List[str] = []
    seen: Set[str] = set()
    queue: deque[tuple[str, int]] = deque([(start_url, 0)])
    seen.add(_normalise(start_url))

    try:
        # Inject cookies on the origin before any navigation
        if cookies:
            try:
                driver.get(origin)
                _wait_for_ready(driver, 0.5)
                for pair in cookies.split(";"):
                    name, sep, value = pair.strip().partition("=")
                    name = name.strip()
                    if not sep or not name:
                        # A nameless cookie would make add_cookie raise and
                        # abort injection of the remaining cookies.
                        continue
                    driver.add_cookie({"name": name, "value": value.strip()})
            except Exception as exc:
                logger.debug("browser_crawl: cookie injection failed: %s", exc)

        while queue and len(visited) < max_pages:
            url, depth = queue.popleft()

            try:
                driver.get(url)
                wait = _WAIT_FIRST_PAGE if depth == 0 else spa_wait_s
                _wait_for_ready(driver, wait)
            except Exception as exc:
                logger.debug("browser_crawl: page load failed %s: %s", url, exc)
                continue

            visited.append(url)
            logger.debug("browser_crawl: visited %s (depth=%d)", url, depth)

            # Pages at the depth limit are recorded but not expanded.
            if depth >= max_depth:
                continue

            try:
                links: List[str] = driver.execute_script(
                    "return Array.from(document.querySelectorAll('a[href]'))"
                    ".map(a => a.href)"
                    ".filter(h => h.startsWith('http'));"
                ) or []
            except Exception:
                links = []

            for link in links:
                if not isinstance(link, str):
                    continue
                norm = _normalise(link)
                if norm in seen:
                    continue
                if _origin_key(link) != origin_key:
                    continue
                seen.add(norm)
                queue.append((link, depth + 1))

    except Exception as exc:
        logger.warning("browser_crawl: unexpected error: %s", exc)
    finally:
        # Always release the browser process, even after an error.
        try:
            driver.quit()
        except Exception:
            pass

    logger.info("browser_crawl: discovered %d URL(s) from %s", len(visited), start_url)
    return visited
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # Selenium driver factory
168
+ # ---------------------------------------------------------------------------
169
+
170
def _setup_driver(headless: bool, chromium_path: str, chromedriver_path: str):
    """Create a Selenium Chrome driver configured for crawling.

    Selenium is imported inside the function so that an ``ImportError``
    surfaces only when a driver is actually requested. When *chromium_path*
    is empty the first of ``chromium``, ``chromium-browser`` or
    ``google-chrome`` found on ``PATH`` is used; when *chromedriver_path* is
    empty ``chromedriver`` is looked up on ``PATH``. If nothing is found the
    corresponding Selenium default applies. The page-load timeout is set to
    15 seconds before the driver is returned.
    """
    import shutil

    from selenium import webdriver  # noqa: PLC0415
    from selenium.webdriver.chrome.options import Options  # noqa: PLC0415
    from selenium.webdriver.chrome.service import Service  # noqa: PLC0415

    chrome_flags = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-extensions",
        "--no-first-run",
        "--blink-settings=imagesEnabled=false",
    ]
    if headless:
        chrome_flags.insert(0, "--headless")

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    # Explicit path wins; otherwise take the first browser binary on PATH.
    browser_binary = chromium_path or next(
        filter(None, map(shutil.which, ("chromium", "chromium-browser", "google-chrome"))),
        "",
    )
    if browser_binary:
        options.binary_location = browser_binary

    driver_binary = chromedriver_path or shutil.which("chromedriver") or ""
    service = Service(driver_binary) if driver_binary else Service()

    driver = webdriver.Chrome(service=service, options=options)
    driver.set_page_load_timeout(15)
    return driver
205
+
206
+
207
+ def _normalise(url: str) -> str:
208
+ """Strip fragment for deduplication."""
209
+ parsed = up.urlparse(url)
210
+ return up.urlunparse(parsed._replace(fragment="")).rstrip("/")