commonhuman-core 0.1.0 (commonhuman_core-0.1.0-py3-none-any.whl)

@@ -0,0 +1,9 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ commonhuman-core — shared HTTP, crawling, and scanning infrastructure.
+ """
+
+ __version__ = "0.1.0"
+
+ __all__ = ["__version__"]
@@ -0,0 +1,301 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ Multi-threaded BFS web crawler.
+
+ Discovers links and HTML forms within a target origin.
+ Respects same-origin constraint, max depth, max page limits, and
+ optional URL exclusion patterns.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import urllib.parse as up
+ from collections import deque
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass, field
+ from html.parser import HTMLParser
+ from typing import Dict, List, Optional, Set, Tuple
+
+ from .http.client import HttpClient
+
+ # ---------------------------------------------------------------------------
+ # Public data types
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class FormTarget:
+     """An HTML form discovered during crawling."""
+     method: str  # "GET" | "POST"
+     params: Dict[str, str]  # {name: default_value} — injectable fields
+     action: str  # resolved absolute action URL
+     base_data: Dict[str, str] = field(default_factory=dict)  # hidden / submit fields
+
+
+ @dataclass
+ class CrawlResult:
+     """Aggregated output of a crawl run."""
+     visited_urls: List[str] = field(default_factory=list)
+     form_targets: List[FormTarget] = field(default_factory=list)
+     url_params: List[Tuple[str, List[str]]] = field(default_factory=list)
+     page_sources: Dict[str, str] = field(default_factory=dict)
+
+
+ # ---------------------------------------------------------------------------
+ # Public API
+ # ---------------------------------------------------------------------------
+
+
+ def crawl(
+     start_url: str,
+     injector: HttpClient,
+     max_pages: int = 50,
+     max_depth: int = 3,
+     threads: int = 5,
+     same_origin: bool = True,
+     exclude_patterns: Optional[List[str]] = None,
+ ) -> CrawlResult:
+     """BFS crawl from ``start_url``.
+
+     Parameters
+     ----------
+     start_url:
+         URL to begin crawling from.
+     injector:
+         An :class:`~commonhuman_core.http.HttpClient` (or subclass) used for
+         all HTTP requests.
+     max_pages:
+         Stop after visiting this many unique pages.
+     max_depth:
+         Maximum BFS depth from ``start_url``.
+     threads:
+         Thread-pool size for parallel page fetching.
+     same_origin:
+         If ``True`` (default), skip URLs that are off-origin.
+     exclude_patterns:
+         Optional list of regex strings. Any URL matching one is skipped.
+
+     Returns
+     -------
+     CrawlResult
+         Discovered pages, forms, URL parameters, and raw page sources.
+     """
+     compiled_excludes = [re.compile(p) for p in (exclude_patterns or [])]
+
+     def _is_excluded(url: str) -> bool:
+         return any(p.search(url) for p in compiled_excludes)
+
+     result: CrawlResult = CrawlResult()
+     visited: Set[str] = set()
+     queue: deque = deque()
+     queue.append((_normalise(start_url), 0))
+
+     with ThreadPoolExecutor(max_workers=threads) as pool:
+         while queue and len(visited) < max_pages:
+             batch: List[Tuple[str, int]] = []
+             while queue and len(batch) < threads * 2:
+                 url, depth = queue.popleft()
+                 norm = _normalise(url)
+                 if norm in visited:
+                     continue
+                 if same_origin and not injector.same_origin(norm, start_url):
+                     continue
+                 if _is_excluded(norm):
+                     continue
+                 visited.add(norm)
+                 batch.append((norm, depth))
+
+             if not batch:
+                 break
+
+             futures = {
+                 pool.submit(_fetch_page, url, injector): (url, depth)
+                 for url, depth in batch
+             }
+
+             for future in as_completed(futures):
+                 url, depth = futures[future]
+                 try:
+                     page = future.result()
+                 except Exception:
+                     continue
+
+                 if page is None:
+                     continue
+                 html, links, forms = page
+
+                 result.visited_urls.append(url)
+                 result.page_sources[url] = html
+
+                 params = injector.get_params(url)
+                 if params:
+                     result.url_params.append((url, params))
+
+                 for form in forms:
+                     result.form_targets.append(form)
+
+                 if depth < max_depth:
+                     for link in links:
+                         norm = _normalise(link)
+                         if norm not in visited and not _is_excluded(norm):
+                             queue.append((norm, depth + 1))
+
+     return result
+
+
+ # ---------------------------------------------------------------------------
+ # Page fetching
+ # ---------------------------------------------------------------------------
+
+
+ def _fetch_page(
+     url: str,
+     injector: HttpClient,
+ ) -> Optional[Tuple[str, List[str], List[FormTarget]]]:
+     try:
+         resp = injector.get(url)
+     except Exception:
+         return None
+
+     if resp.status_code >= 400:
+         return None
+
+     ct = resp.headers.get("content-type", "")
+     if "html" not in ct and "javascript" not in ct:
+         return None
+
+     html = resp.text
+     # Use the final URL after redirects as the base so relative links and
+     # form actions resolve correctly (critical for 301 /path → /path/ redirects).
+     effective_url = resp.url if resp.url else url
+     return html, _extract_links(html, effective_url), _extract_forms(html, effective_url)
+
+
+ # ---------------------------------------------------------------------------
+ # HTML parsers
+ # ---------------------------------------------------------------------------
+
+
+ class _LinkParser(HTMLParser):
+     def __init__(self, base_url: str) -> None:
+         super().__init__()
+         self.base_url = base_url
+         self.links: List[str] = []
+
+     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+         if tag.lower() != "a":
+             return
+         attr_dict = {k.lower(): v for k, v in attrs if v is not None}
+         href = attr_dict.get("href", "").strip()
+         if not href or href.startswith(("javascript:", "mailto:", "#")):
+             return
+         try:
+             abs_url = up.urljoin(self.base_url, href)
+             parsed = up.urlparse(abs_url)
+             self.links.append(up.urlunparse(parsed._replace(fragment="")))
+         except Exception:  # pragma: no cover
+             pass
+
+
+ class _FormParser(HTMLParser):
+     _SKIP_TYPES = {"button", "image", "reset"}
+     _SUBMIT_TYPES = {"submit"}
+     _HIDDEN_TYPES = {"hidden"}
+
+     def __init__(self, base_url: str) -> None:
+         super().__init__()
+         self.base_url = base_url
+         self.forms: List[FormTarget] = []
+         self._in_form = False
+         self._current_action = base_url
+         self._current_method = "GET"
+         self._current_params: Dict[str, str] = {}
+         self._current_base: Dict[str, str] = {}
+
+     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+         tag = tag.lower()
+         attr_dict = {k.lower(): (v or "") for k, v in attrs}
+
+         if tag == "form":
+             self._in_form = True
+             action_raw = attr_dict.get("action", "").strip()
+             try:
+                 self._current_action = (
+                     up.urljoin(self.base_url, action_raw) if action_raw else self.base_url
+                 )
+             except Exception:  # pragma: no cover
+                 self._current_action = self.base_url
+             self._current_method = (attr_dict.get("method") or "GET").upper()
+             self._current_params = {}
+             self._current_base = {}
+
+         elif self._in_form and tag == "input":
+             input_type = attr_dict.get("type", "text").lower()
+             name = attr_dict.get("name", "").strip()
+             if not name or input_type in self._SKIP_TYPES:
+                 return
+             if input_type in self._SUBMIT_TYPES:
+                 self._current_base[name] = attr_dict.get("value", "")
+             elif input_type in self._HIDDEN_TYPES:
+                 self._current_base[name] = attr_dict.get("value", "")
+             else:
+                 self._current_params[name] = attr_dict.get("value", "")
+
+         elif self._in_form and tag in ("textarea", "select"):
+             name = attr_dict.get("name", "").strip()
+             if name:
+                 self._current_params[name] = ""
+
+     def handle_endtag(self, tag: str) -> None:
+         if tag.lower() == "form" and self._in_form:
+             if self._current_params:
+                 self.forms.append(FormTarget(
+                     method=self._current_method,
+                     params=self._current_params,
+                     action=self._current_action,
+                     base_data=self._current_base,
+                 ))
+             self._in_form = False
+             self._current_params = {}
+             self._current_base = {}
+
+
+ def _extract_links(html: str, base_url: str) -> List[str]:
+     parser = _LinkParser(base_url)
+     try:
+         parser.feed(html)
+     except Exception:  # pragma: no cover
+         pass
+     return parser.links
+
+
+ def _extract_forms(html: str, base_url: str) -> List[FormTarget]:
+     parser = _FormParser(base_url)
+     try:
+         parser.feed(html)
+     except Exception:  # pragma: no cover
+         pass
+     return parser.forms
+
+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _normalise(url: str) -> str:
+     """Lowercase scheme+host, strip trailing slash and fragment."""
+     try:
+         p = up.urlparse(url)
+         return up.urlunparse((
+             p.scheme.lower(),
+             p.netloc.lower(),
+             p.path.rstrip("/") or "/",
+             p.params,
+             p.query,
+             "",
+         ))
+     except Exception:  # pragma: no cover
+         return url
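For orientation, a minimal usage sketch of the crawl API above. The hunk headers in this diff do not name files, so the `commonhuman_core.crawler` import path and the target URL are assumptions for illustration, not anything confirmed by the package:

    from commonhuman_core.http import HttpClient
    from commonhuman_core.crawler import crawl  # module path assumed; not named in the diff

    client = HttpClient(timeout=10, delay=0.1)
    result = crawl(
        "http://scan-target.local/",      # placeholder target
        injector=client,
        max_pages=20,
        max_depth=2,
        exclude_patterns=[r"/logout"],    # avoid tearing down the session mid-crawl
    )
    for form in result.form_targets:
        print(form.method, form.action, sorted(form.params))
    client.close()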
@@ -0,0 +1,13 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Public HTTP API for commonhuman-core."""
+
+ from .client import HttpClient, DEFAULT_UA
+ from ._cookies import parse_cookie_string, parse_post_data
+
+ __all__ = [
+     "HttpClient",
+     "DEFAULT_UA",
+     "parse_cookie_string",
+     "parse_post_data",
+ ]
@@ -0,0 +1,45 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Cookie string and POST body parsing helpers."""
+
+ from __future__ import annotations
+
+ import json
+ import urllib.parse as up
+ from typing import Dict
+
+
+ def parse_cookie_string(cookies: str) -> Dict[str, str]:
+     """Parse ``'name=value; name2=value2'`` or a JSON object string into a dict."""
+     cookies = cookies.strip()
+     if cookies.startswith("{"):
+         try:
+             data = json.loads(cookies)
+             if isinstance(data, dict):  # ignore non-object JSON such as lists
+                 return {str(k): str(v) for k, v in data.items()}
+         except Exception:
+             pass
+     result: Dict[str, str] = {}
+     for part in cookies.split(";"):
+         part = part.strip()
+         if "=" in part:
+             k, _, v = part.partition("=")
+             result[k.strip()] = v.strip()
+     return result
+
+
+ def parse_post_data(raw: str) -> Dict[str, str]:
+     """Parse a raw POST body — supports ``application/x-www-form-urlencoded`` and JSON.
+
+     Returns a flat ``{key: value}`` dict.
+     """
+     raw = raw.strip()
+     if raw.startswith("{"):
+         try:
+             data = json.loads(raw)
+             if isinstance(data, dict):
+                 return {str(k): str(v) for k, v in data.items()}
+         except Exception:
+             pass
+     parsed = up.parse_qs(raw, keep_blank_values=True)
+     return {k: v[0] if v else "" for k, v in parsed.items()}
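To make the two parsers' contracts concrete, a few input/output pairs; all inputs below are made up for illustration:

    from commonhuman_core.http import parse_cookie_string, parse_post_data

    parse_cookie_string("sid=abc123; theme=dark")
    # -> {"sid": "abc123", "theme": "dark"}
    parse_cookie_string('{"sid": "abc123"}')          # JSON object form
    # -> {"sid": "abc123"}

    parse_post_data("user=admin&token=")              # keep_blank_values keeps "token"
    # -> {"user": "admin", "token": ""}
    parse_post_data('{"user": "admin", "id": 7}')     # JSON values coerced to str
    # -> {"user": "admin", "id": "7"}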
@@ -0,0 +1,221 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ HttpClient — shared HTTP session for CommonHuman-Lab scanners.
+ """
+
+ from __future__ import annotations
+
+ import time
+ import urllib.parse as up
+ from typing import Any, Dict, List, Optional
+
+ import requests
+ import urllib3
+ from requests import Response
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ from ._cookies import parse_cookie_string
+
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ DEFAULT_UA = (
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+     "AppleWebKit/537.36 (KHTML, like Gecko) "
+     "Chrome/124.0.0.0 Safari/537.36"
+ )
+
+ _RATE_LIMIT_BACKOFF = 5.0  # seconds to wait on 429
+ _RATE_LIMIT_RETRIES = 2    # max retries on 429
+
+
+ class HttpClient:
+     """
+     Thin wrapper around ``requests.Session`` providing:
+
+     - Configurable proxy, headers, cookies, SSL verification
+     - Automatic retry on transient connection/read errors
+     - 429 rate-limit back-off with ``Retry-After`` header support
+     - Per-request delay (rate throttling)
+     - Request counter (for scan result reporting)
+     - Injection helpers for GET params, POST body, JSON body, path
+       segments, cookies, and custom headers
+     """
+
+     def __init__(
+         self,
+         timeout: int = 15,
+         proxy: Optional[str] = None,
+         headers: Optional[Dict[str, str]] = None,
+         cookies: Optional[str] = None,
+         verify_ssl: bool = False,
+         delay: float = 0.0,
+     ) -> None:
+         self.timeout = timeout
+         self.request_count = 0
+         self.delay = max(0.0, delay)
+
+         self._session = requests.Session()
+         self._session.verify = verify_ssl
+
+         retry = Retry(
+             total=2,
+             backoff_factor=0.3,
+             status_forcelist=(),
+             allowed_methods=["GET", "POST", "HEAD"],
+         )
+         adapter = HTTPAdapter(max_retries=retry)
+         self._session.mount("http://", adapter)
+         self._session.mount("https://", adapter)
+
+         base_headers: Dict[str, str] = {"User-Agent": DEFAULT_UA}
+         if headers:
+             base_headers.update(headers)
+         self._session.headers.update(base_headers)
+
+         if cookies:
+             self._session.cookies.update(parse_cookie_string(cookies))
+
+         if proxy:
+             self._session.proxies = {"http": proxy, "https": proxy}
+
+     # ------------------------------------------------------------------
+     # Core HTTP
+     # ------------------------------------------------------------------
+
+     def get(self, url: str, params: Optional[Dict[str, str]] = None, **kwargs) -> Response:
+         if self.delay:
+             time.sleep(self.delay)
+         self.request_count += 1
+         resp = self._session.get(url, params=params, timeout=self.timeout, **kwargs)
+         return self._handle_rate_limit(
+             resp,
+             lambda: self._session.get(url, params=params, timeout=self.timeout, **kwargs),
+         )
+
+     def post(
+         self,
+         url: str,
+         data: Optional[Dict[str, Any]] = None,
+         json_body: Optional[Any] = None,
+         **kwargs,
+     ) -> Response:
+         if self.delay:
+             time.sleep(self.delay)
+         self.request_count += 1
+         resp = self._session.post(url, data=data, json=json_body, timeout=self.timeout, **kwargs)
+         return self._handle_rate_limit(
+             resp,
+             lambda: self._session.post(
+                 url, data=data, json=json_body, timeout=self.timeout, **kwargs
+             ),
+         )
+
+     def head(self, url: str, **kwargs) -> Response:
+         if self.delay:  # apply the same throttle as get()/post()
+             time.sleep(self.delay)
+         self.request_count += 1
+         return self._session.head(url, timeout=self.timeout, allow_redirects=True, **kwargs)
+
+     def _handle_rate_limit(self, resp: Response, retry_fn) -> Response:
+         """Back off and retry when the server returns HTTP 429."""
+         for _ in range(_RATE_LIMIT_RETRIES):
+             if resp.status_code != 429:
+                 break
+             wait = _RATE_LIMIT_BACKOFF
+             retry_after = resp.headers.get("Retry-After", "")
+             if retry_after:
+                 try:
+                     wait = max(float(retry_after), _RATE_LIMIT_BACKOFF)
+                 except ValueError:  # Retry-After may be an HTTP-date; keep the default
+                     pass
+             time.sleep(wait)
+             self.request_count += 1
+             resp = retry_fn()
+         return resp
+
+     # ------------------------------------------------------------------
+     # Injection helpers
+     # ------------------------------------------------------------------
+
+     def inject_get(self, url: str, param: str, payload: str) -> Response:
+         """Replace the value of ``param`` in the URL query string with ``payload``."""
+         parsed = up.urlparse(url)
+         qs = up.parse_qs(parsed.query, keep_blank_values=True)
+         qs[param] = [payload]
+         target = up.urlunparse(parsed._replace(query=up.urlencode(qs, doseq=True)))
+         return self.get(target)
+
+     def inject_post(
+         self,
+         url: str,
+         param: str,
+         payload: str,
+         base_data: Optional[Dict[str, str]] = None,
+     ) -> Response:
+         """Replace the value of ``param`` in a POST form body with ``payload``."""
+         data = dict(base_data or {})
+         data[param] = payload
+         return self.post(url, data=data)
+
+     def inject_post_json(
+         self,
+         url: str,
+         param: str,
+         payload: str,
+         base_data: Optional[Dict[str, Any]] = None,
+     ) -> Response:
+         """Replace the value of ``param`` in a JSON POST body with ``payload``."""
+         body = dict(base_data or {})
+         body[param] = payload
+         return self.post(url, json_body=body)
+
+     def inject_path(self, url: str, segment_index: int, payload: str) -> Response:
+         """Replace the path segment at ``segment_index`` (0-based) with ``payload``.
+
+         Useful for REST-style path parameters such as ``/api/user/:id``.
+         Pass ``-1`` to append as a new trailing segment.
+         """
+         parsed = up.urlparse(url)
+         # Index only non-empty segments so 0 addresses the first real path
+         # component (a bare split("/") makes index 0 the empty leading piece).
+         parts = [p for p in parsed.path.split("/") if p]
+         if segment_index == -1:
+             parts.append(up.quote(str(payload), safe=""))
+         elif 0 <= segment_index < len(parts):
+             parts[segment_index] = up.quote(str(payload), safe="")
+         target = up.urlunparse(parsed._replace(path="/" + "/".join(parts)))
+         return self.get(target)
+
+     def inject_cookie(self, url: str, cookie_name: str, payload: str) -> Response:
+         """Override ``cookie_name`` with ``payload`` for this single request."""
+         return self.get(url, cookies={cookie_name: payload})
+
+     def inject_header(self, url: str, header_name: str, payload: str) -> Response:
+         """Send ``payload`` as the value of ``header_name`` for this single request."""
+         return self.get(url, headers={header_name: payload})
+
+     # ------------------------------------------------------------------
+     # URL utilities
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def get_params(url: str) -> List[str]:
+         """Return query parameter names from ``url``."""
+         return list(up.parse_qs(up.urlparse(url).query, keep_blank_values=True).keys())
+
+     @staticmethod
+     def get_base_url(url: str) -> str:
+         """Return ``scheme://netloc`` from ``url``."""
+         p = up.urlparse(url)
+         return f"{p.scheme}://{p.netloc}"
+
+     @staticmethod
+     def same_origin(url_a: str, url_b: str) -> bool:
+         """Return ``True`` if both URLs share the same scheme and netloc."""
+         pa, pb = up.urlparse(url_a), up.urlparse(url_b)
+         return pa.scheme == pb.scheme and pa.netloc == pb.netloc
+
+     def close(self) -> None:
+         self._session.close()
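A short sketch of the injection helpers in use, against a placeholder target and proxy; nothing below ships with the package:

    from commonhuman_core.http import HttpClient

    client = HttpClient(proxy="http://127.0.0.1:8080", delay=0.2)  # placeholder proxy

    # Query-string injection: ?q=test becomes ?q=<payload>
    client.inject_get("http://scan-target.local/search?q=test", "q", "' OR 1=1--")

    # Path injection indexes non-empty segments: for /api/user/5,
    # segments are ["api", "user", "5"], so index 2 replaces "5"
    client.inject_path("http://scan-target.local/api/user/5", 2, "1 OR 1=1")

    # Cookie and header overrides apply to a single request only
    client.inject_cookie("http://scan-target.local/", "sid", "payload")
    client.inject_header("http://scan-target.local/", "X-Forwarded-For", "127.0.0.1")

    print(client.request_count)  # total requests, including 429 back-off retries
    client.close()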
@@ -0,0 +1,24 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Passive analysis helpers for CommonHuman-Lab scanners."""
+
+ from __future__ import annotations
+
+ from typing import Optional
+
+ from requests import Response
+
+ from .http.client import HttpClient
+
+
+ def fetch_seed(injector: HttpClient, url: str) -> Optional[Response]:
+     """Fetch ``url`` once for passive analysis.
+
+     Returns the :class:`~requests.Response` on success, or ``None`` if the
+     request fails or returns a 4xx/5xx status.
+     """
+     try:
+         resp = injector.get(url)
+     except Exception:
+         return None
+     return resp if resp.status_code < 400 else None
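And the passive helper's contract in a one-liner sketch. The `commonhuman_core.passive` module name is a guess from its docstring (the diff does not name the file), and the URL is a placeholder:

    from commonhuman_core.http import HttpClient
    from commonhuman_core.passive import fetch_seed  # module path assumed; not named in the diff

    resp = fetch_seed(HttpClient(), "http://scan-target.local/")
    if resp is not None:  # None on request failure or 4xx/5xx
        print(resp.status_code, resp.headers.get("server", ""))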