commonhuman-core 0.1.0 (commonhuman_core-0.1.0-py3-none-any.whl)

@@ -0,0 +1,9 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ commonhuman-core — shared HTTP, crawling, and scanning infrastructure.
+ """
+
+ __version__ = "0.1.0"
+
+ __all__ = ["__version__"]
@@ -0,0 +1,301 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ Multi-threaded BFS web crawler.
+
+ Discovers links and HTML forms within a target origin.
+ Respects same-origin constraint, max depth, max page limits, and
+ optional URL exclusion patterns.
+ """
+
+ from __future__ import annotations
+
+ import re
+ import urllib.parse as up
+ from collections import deque
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass, field
+ from html.parser import HTMLParser
+ from typing import Dict, List, Optional, Set, Tuple
+
+ from .http.client import HttpClient
+
+ # ---------------------------------------------------------------------------
+ # Public data types
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class FormTarget:
+     """An HTML form discovered during crawling."""
+     method: str  # "GET" | "POST"
+     params: Dict[str, str]  # {name: default_value} — injectable fields
+     action: str  # resolved absolute action URL
+     base_data: Dict[str, str] = field(default_factory=dict)  # hidden / submit fields
+
+
+ @dataclass
+ class CrawlResult:
+     """Aggregated output of a crawl run."""
+     visited_urls: List[str] = field(default_factory=list)
+     form_targets: List[FormTarget] = field(default_factory=list)
+     url_params: List[Tuple[str, List[str]]] = field(default_factory=list)
+     page_sources: Dict[str, str] = field(default_factory=dict)
+
+
+ # ---------------------------------------------------------------------------
+ # Public API
+ # ---------------------------------------------------------------------------
+
+
+ def crawl(
+     start_url: str,
+     injector: HttpClient,
+     max_pages: int = 50,
+     max_depth: int = 3,
+     threads: int = 5,
+     same_origin: bool = True,
+     exclude_patterns: Optional[List[str]] = None,
+ ) -> CrawlResult:
+     """BFS crawl from ``start_url``.
+
+     Parameters
+     ----------
+     start_url:
+         URL to begin crawling from.
+     injector:
+         An :class:`~commonhuman_core.http.HttpClient` (or subclass) used for
+         all HTTP requests.
+     max_pages:
+         Stop after visiting this many unique pages.
+     max_depth:
+         Maximum BFS depth from ``start_url``.
+     threads:
+         Thread-pool size for parallel page fetching.
+     same_origin:
+         If ``True`` (default), skip URLs that are off-origin.
+     exclude_patterns:
+         Optional list of regex strings. Any URL matching one is skipped.
+
+     Returns
+     -------
+     CrawlResult
+         Discovered pages, forms, URL parameters, and raw page sources.
+     """
+     compiled_excludes = [re.compile(p) for p in (exclude_patterns or [])]
+
+     def _is_excluded(url: str) -> bool:
+         return any(p.search(url) for p in compiled_excludes)
+
+     result: CrawlResult = CrawlResult()
+     visited: Set[str] = set()
+     queue: deque = deque()
+     queue.append((_normalise(start_url), 0))
+
+     with ThreadPoolExecutor(max_workers=threads) as pool:
+         while queue and len(visited) < max_pages:
+             batch: List[Tuple[str, int]] = []
+             while queue and len(batch) < threads * 2:
+                 url, depth = queue.popleft()
+                 norm = _normalise(url)
+                 if norm in visited:
+                     continue
+                 if same_origin and not injector.same_origin(norm, start_url):
+                     continue
+                 if _is_excluded(norm):
+                     continue
+                 visited.add(norm)
+                 batch.append((norm, depth))
+
+             if not batch:
+                 break
+
+             futures = {
+                 pool.submit(_fetch_page, url, injector): (url, depth)
+                 for url, depth in batch
+             }
+
+             for future in as_completed(futures):
+                 url, depth = futures[future]
+                 try:
+                     page = future.result()
+                 except Exception:
+                     continue
+
+                 if page is None:
+                     continue
+                 html, links, forms = page
+
+                 result.visited_urls.append(url)
+                 result.page_sources[url] = html
+
+                 params = injector.get_params(url)
+                 if params:
+                     result.url_params.append((url, params))
+
+                 for form in forms:
+                     result.form_targets.append(form)
+
+                 if depth < max_depth:
+                     for link in links:
+                         norm = _normalise(link)
+                         if norm not in visited and not _is_excluded(norm):
+                             queue.append((norm, depth + 1))
+
+     return result
+
+
+ # ---------------------------------------------------------------------------
+ # Page fetching
+ # ---------------------------------------------------------------------------
+
+
+ def _fetch_page(
+     url: str,
+     injector: HttpClient,
+ ) -> Optional[Tuple[str, List[str], List[FormTarget]]]:
+     try:
+         resp = injector.get(url)
+     except Exception:
+         return None
+
+     if resp.status_code >= 400:
+         return None
+
+     ct = resp.headers.get("content-type", "")
+     if "html" not in ct and "javascript" not in ct:
+         return None
+
+     html = resp.text
+     # Use the final URL after redirects as the base so relative links and
+     # form actions resolve correctly (critical for 301 /path → /path/ redirects).
+     effective_url = resp.url if resp.url else url
+     return html, _extract_links(html, effective_url), _extract_forms(html, effective_url)
+
+
+ # ---------------------------------------------------------------------------
+ # HTML parsers
+ # ---------------------------------------------------------------------------
+
+
+ class _LinkParser(HTMLParser):
+     def __init__(self, base_url: str) -> None:
+         super().__init__()
+         self.base_url = base_url
+         self.links: List[str] = []
+
+     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+         if tag.lower() != "a":
+             return
+         attr_dict = {k.lower(): v for k, v in attrs if v is not None}
+         href = attr_dict.get("href", "").strip()
+         if not href or href.startswith(("javascript:", "mailto:", "#")):
+             return
+         try:
+             abs_url = up.urljoin(self.base_url, href)
+             parsed = up.urlparse(abs_url)
+             self.links.append(up.urlunparse(parsed._replace(fragment="")))
+         except Exception:  # pragma: no cover
+             pass
+
+
+ class _FormParser(HTMLParser):
+     _SKIP_TYPES = {"button", "image", "reset"}
+     _SUBMIT_TYPES = {"submit"}
+     _HIDDEN_TYPES = {"hidden"}
+
+     def __init__(self, base_url: str) -> None:
+         super().__init__()
+         self.base_url = base_url
+         self.forms: List[FormTarget] = []
+         self._in_form = False
+         self._current_action = base_url
+         self._current_method = "GET"
+         self._current_params: Dict[str, str] = {}
+         self._current_base: Dict[str, str] = {}
+
+     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+         tag = tag.lower()
+         attr_dict = {k.lower(): (v or "") for k, v in attrs}
+
+         if tag == "form":
+             self._in_form = True
+             action_raw = attr_dict.get("action", "").strip()
+             try:
+                 self._current_action = (
+                     up.urljoin(self.base_url, action_raw) if action_raw else self.base_url
+                 )
+             except Exception:  # pragma: no cover
+                 self._current_action = self.base_url
+             self._current_method = (attr_dict.get("method") or "GET").upper()
+             self._current_params = {}
+             self._current_base = {}
+
+         elif self._in_form and tag == "input":
+             input_type = attr_dict.get("type", "text").lower()
+             name = attr_dict.get("name", "").strip()
+             if not name or input_type in self._SKIP_TYPES:
+                 return
+             if input_type in self._SUBMIT_TYPES:
+                 self._current_base[name] = attr_dict.get("value", "")
+             elif input_type in self._HIDDEN_TYPES:
+                 self._current_base[name] = attr_dict.get("value", "")
+             else:
+                 self._current_params[name] = attr_dict.get("value", "")
+
+         elif self._in_form and tag in ("textarea", "select"):
+             name = attr_dict.get("name", "").strip()
+             if name:
+                 self._current_params[name] = ""
+
+     def handle_endtag(self, tag: str) -> None:
+         if tag.lower() == "form" and self._in_form:
+             if self._current_params:
+                 self.forms.append(FormTarget(
+                     method=self._current_method,
+                     params=self._current_params,
+                     action=self._current_action,
+                     base_data=self._current_base,
+                 ))
+             self._in_form = False
+             self._current_params = {}
+             self._current_base = {}
+
+
+ def _extract_links(html: str, base_url: str) -> List[str]:
+     parser = _LinkParser(base_url)
+     try:
+         parser.feed(html)
+     except Exception:  # pragma: no cover
+         pass
+     return parser.links
+
+
+ def _extract_forms(html: str, base_url: str) -> List[FormTarget]:
+     parser = _FormParser(base_url)
+     try:
+         parser.feed(html)
+     except Exception:  # pragma: no cover
+         pass
+     return parser.forms
+
+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _normalise(url: str) -> str:
+     """Lowercase scheme+host, strip trailing slash and fragment."""
+     try:
+         p = up.urlparse(url)
+         return up.urlunparse((
+             p.scheme.lower(),
+             p.netloc.lower(),
+             p.path.rstrip("/") or "/",
+             p.params,
+             p.query,
+             "",
+         ))
+     except Exception:  # pragma: no cover
+         return url
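For orientation, a minimal usage sketch of the crawl API above. The hunk headers in this diff do not name files, so the `commonhuman_core.crawler` import path and the target URL are assumptions for illustration, not anything confirmed by the package:

    from commonhuman_core.http import HttpClient
    from commonhuman_core.crawler import crawl  # module path assumed; not named in the diff

    client = HttpClient(timeout=10, delay=0.1)
    result = crawl(
        "http://scan-target.local/",      # placeholder target
        injector=client,
        max_pages=20,
        max_depth=2,
        exclude_patterns=[r"/logout"],    # avoid tearing down the session mid-crawl
    )
    for form in result.form_targets:
        print(form.method, form.action, sorted(form.params))
    client.close()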
@@ -0,0 +1,13 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Public HTTP API for commonhuman-core."""
+
+ from .client import HttpClient, DEFAULT_UA
+ from ._cookies import parse_cookie_string, parse_post_data
+
+ __all__ = [
+     "HttpClient",
+     "DEFAULT_UA",
+     "parse_cookie_string",
+     "parse_post_data",
+ ]
@@ -0,0 +1,45 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Cookie string and POST body parsing helpers."""
+
+ from __future__ import annotations
+
+ import json
+ import urllib.parse as up
+ from typing import Dict
+
+
+ def parse_cookie_string(cookies: str) -> Dict[str, str]:
+     """Parse ``'name=value; name2=value2'`` or a JSON object string into a dict."""
+     cookies = cookies.strip()
+     if cookies.startswith("{"):
+         try:
+             data = json.loads(cookies)
+             if isinstance(data, dict):  # ignore non-object JSON such as lists
+                 return {str(k): str(v) for k, v in data.items()}
+         except Exception:
+             pass
+     result: Dict[str, str] = {}
+     for part in cookies.split(";"):
+         part = part.strip()
+         if "=" in part:
+             k, _, v = part.partition("=")
+             result[k.strip()] = v.strip()
+     return result
+
+
+ def parse_post_data(raw: str) -> Dict[str, str]:
+     """Parse a raw POST body — supports ``application/x-www-form-urlencoded`` and JSON.
+
+     Returns a flat ``{key: value}`` dict.
+     """
+     raw = raw.strip()
+     if raw.startswith("{"):
+         try:
+             data = json.loads(raw)
+             if isinstance(data, dict):
+                 return {str(k): str(v) for k, v in data.items()}
+         except Exception:
+             pass
+     parsed = up.parse_qs(raw, keep_blank_values=True)
+     return {k: v[0] if v else "" for k, v in parsed.items()}
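To make the two parsers' contracts concrete, a few input/output pairs; all inputs below are made up for illustration:

    from commonhuman_core.http import parse_cookie_string, parse_post_data

    parse_cookie_string("sid=abc123; theme=dark")
    # -> {"sid": "abc123", "theme": "dark"}
    parse_cookie_string('{"sid": "abc123"}')          # JSON object form
    # -> {"sid": "abc123"}

    parse_post_data("user=admin&token=")              # keep_blank_values keeps "token"
    # -> {"user": "admin", "token": ""}
    parse_post_data('{"user": "admin", "id": 7}')     # JSON values coerced to str
    # -> {"user": "admin", "id": "7"}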
@@ -0,0 +1,221 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """
+ HttpClient — shared HTTP session for CommonHuman-Lab scanners.
+ """
+
+ from __future__ import annotations
+
+ import time
+ import urllib.parse as up
+ from typing import Any, Dict, List, Optional
+
+ import requests
+ import urllib3
+ from requests import Response
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ from ._cookies import parse_cookie_string
+
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ DEFAULT_UA = (
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+     "AppleWebKit/537.36 (KHTML, like Gecko) "
+     "Chrome/124.0.0.0 Safari/537.36"
+ )
+
+ _RATE_LIMIT_BACKOFF = 5.0  # seconds to wait on 429
+ _RATE_LIMIT_RETRIES = 2    # max retries on 429
+
+
+ class HttpClient:
+     """
+     Thin wrapper around ``requests.Session`` providing:
+
+     - Configurable proxy, headers, cookies, SSL verification
+     - Automatic retry on transient connection/read errors
+     - 429 rate-limit back-off with ``Retry-After`` header support
+     - Per-request delay (rate throttling)
+     - Request counter (for scan result reporting)
+     - Injection helpers for GET params, POST body, JSON body, path
+       segments, cookies, and custom headers
+     """
+
+     def __init__(
+         self,
+         timeout: int = 15,
+         proxy: Optional[str] = None,
+         headers: Optional[Dict[str, str]] = None,
+         cookies: Optional[str] = None,
+         verify_ssl: bool = False,
+         delay: float = 0.0,
+     ) -> None:
+         self.timeout = timeout
+         self.request_count = 0
+         self.delay = max(0.0, delay)
+
+         self._session = requests.Session()
+         self._session.verify = verify_ssl
+
+         retry = Retry(
+             total=2,
+             backoff_factor=0.3,
+             status_forcelist=(),
+             allowed_methods=["GET", "POST", "HEAD"],
+         )
+         adapter = HTTPAdapter(max_retries=retry)
+         self._session.mount("http://", adapter)
+         self._session.mount("https://", adapter)
+
+         base_headers: Dict[str, str] = {"User-Agent": DEFAULT_UA}
+         if headers:
+             base_headers.update(headers)
+         self._session.headers.update(base_headers)
+
+         if cookies:
+             self._session.cookies.update(parse_cookie_string(cookies))
+
+         if proxy:
+             self._session.proxies = {"http": proxy, "https": proxy}
+
+     # ------------------------------------------------------------------
+     # Core HTTP
+     # ------------------------------------------------------------------
+
+     def get(self, url: str, params: Optional[Dict[str, str]] = None, **kwargs) -> Response:
+         if self.delay:
+             time.sleep(self.delay)
+         self.request_count += 1
+         resp = self._session.get(url, params=params, timeout=self.timeout, **kwargs)
+         return self._handle_rate_limit(
+             resp,
+             lambda: self._session.get(url, params=params, timeout=self.timeout, **kwargs),
+         )
+
+     def post(
+         self,
+         url: str,
+         data: Optional[Dict[str, Any]] = None,
+         json_body: Optional[Any] = None,
+         **kwargs,
+     ) -> Response:
+         if self.delay:
+             time.sleep(self.delay)
+         self.request_count += 1
+         resp = self._session.post(url, data=data, json=json_body, timeout=self.timeout, **kwargs)
+         return self._handle_rate_limit(
+             resp,
+             lambda: self._session.post(
+                 url, data=data, json=json_body, timeout=self.timeout, **kwargs
+             ),
+         )
+
+     def head(self, url: str, **kwargs) -> Response:
+         if self.delay:  # apply the same throttle as get()/post()
+             time.sleep(self.delay)
+         self.request_count += 1
+         return self._session.head(url, timeout=self.timeout, allow_redirects=True, **kwargs)
+
+     def _handle_rate_limit(self, resp: Response, retry_fn) -> Response:
+         """Back off and retry when the server returns HTTP 429."""
+         for _ in range(_RATE_LIMIT_RETRIES):
+             if resp.status_code != 429:
+                 break
+             wait = _RATE_LIMIT_BACKOFF
+             retry_after = resp.headers.get("Retry-After", "")
+             if retry_after:
+                 try:
+                     wait = max(float(retry_after), _RATE_LIMIT_BACKOFF)
+                 except ValueError:  # Retry-After may be an HTTP-date; keep the default
+                     pass
+             time.sleep(wait)
+             self.request_count += 1
+             resp = retry_fn()
+         return resp
+
+     # ------------------------------------------------------------------
+     # Injection helpers
+     # ------------------------------------------------------------------
+
+     def inject_get(self, url: str, param: str, payload: str) -> Response:
+         """Replace the value of ``param`` in the URL query string with ``payload``."""
+         parsed = up.urlparse(url)
+         qs = up.parse_qs(parsed.query, keep_blank_values=True)
+         qs[param] = [payload]
+         target = up.urlunparse(parsed._replace(query=up.urlencode(qs, doseq=True)))
+         return self.get(target)
+
+     def inject_post(
+         self,
+         url: str,
+         param: str,
+         payload: str,
+         base_data: Optional[Dict[str, str]] = None,
+     ) -> Response:
+         """Replace the value of ``param`` in a POST form body with ``payload``."""
+         data = dict(base_data or {})
+         data[param] = payload
+         return self.post(url, data=data)
+
+     def inject_post_json(
+         self,
+         url: str,
+         param: str,
+         payload: str,
+         base_data: Optional[Dict[str, Any]] = None,
+     ) -> Response:
+         """Replace the value of ``param`` in a JSON POST body with ``payload``."""
+         body = dict(base_data or {})
+         body[param] = payload
+         return self.post(url, json_body=body)
+
+     def inject_path(self, url: str, segment_index: int, payload: str) -> Response:
+         """Replace the path segment at ``segment_index`` (0-based) with ``payload``.
+
+         Useful for REST-style path parameters such as ``/api/user/:id``.
+         Pass ``-1`` to append as a new trailing segment.
+         """
+         parsed = up.urlparse(url)
+         # Index only non-empty segments so 0 addresses the first real path
+         # component (a bare split("/") makes index 0 the empty leading piece).
+         parts = [p for p in parsed.path.split("/") if p]
+         if segment_index == -1:
+             parts.append(up.quote(str(payload), safe=""))
+         elif 0 <= segment_index < len(parts):
+             parts[segment_index] = up.quote(str(payload), safe="")
+         target = up.urlunparse(parsed._replace(path="/" + "/".join(parts)))
+         return self.get(target)
+
+     def inject_cookie(self, url: str, cookie_name: str, payload: str) -> Response:
+         """Override ``cookie_name`` with ``payload`` for this single request."""
+         return self.get(url, cookies={cookie_name: payload})
+
+     def inject_header(self, url: str, header_name: str, payload: str) -> Response:
+         """Send ``payload`` as the value of ``header_name`` for this single request."""
+         return self.get(url, headers={header_name: payload})
+
+     # ------------------------------------------------------------------
+     # URL utilities
+     # ------------------------------------------------------------------
+
+     @staticmethod
+     def get_params(url: str) -> List[str]:
+         """Return query parameter names from ``url``."""
+         return list(up.parse_qs(up.urlparse(url).query, keep_blank_values=True).keys())
+
+     @staticmethod
+     def get_base_url(url: str) -> str:
+         """Return ``scheme://netloc`` from ``url``."""
+         p = up.urlparse(url)
+         return f"{p.scheme}://{p.netloc}"
+
+     @staticmethod
+     def same_origin(url_a: str, url_b: str) -> bool:
+         """Return ``True`` if both URLs share the same scheme and netloc."""
+         pa, pb = up.urlparse(url_a), up.urlparse(url_b)
+         return pa.scheme == pb.scheme and pa.netloc == pb.netloc
+
+     def close(self) -> None:
+         self._session.close()
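A short sketch of the injection helpers in use, against a placeholder target and proxy; nothing below ships with the package:

    from commonhuman_core.http import HttpClient

    client = HttpClient(proxy="http://127.0.0.1:8080", delay=0.2)  # placeholder proxy

    # Query-string injection: ?q=test becomes ?q=<payload>
    client.inject_get("http://scan-target.local/search?q=test", "q", "' OR 1=1--")

    # Path injection indexes non-empty segments: for /api/user/5,
    # segments are ["api", "user", "5"], so index 2 replaces "5"
    client.inject_path("http://scan-target.local/api/user/5", 2, "1 OR 1=1")

    # Cookie and header overrides apply to a single request only
    client.inject_cookie("http://scan-target.local/", "sid", "payload")
    client.inject_header("http://scan-target.local/", "X-Forwarded-For", "127.0.0.1")

    print(client.request_count)  # total requests, including 429 back-off retries
    client.close()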
@@ -0,0 +1,24 @@
+ # SPDX-License-Identifier: AGPL-3.0-or-later
+ # Copyright (c) 2026 CommonHuman-Lab
+ """Passive analysis helpers for CommonHuman-Lab scanners."""
+
+ from __future__ import annotations
+
+ from typing import Optional
+
+ from requests import Response
+
+ from .http.client import HttpClient
+
+
+ def fetch_seed(injector: HttpClient, url: str) -> Optional[Response]:
+     """Fetch ``url`` once for passive analysis.
+
+     Returns the :class:`~requests.Response` on success, or ``None`` if the
+     request fails or returns a 4xx/5xx status.
+     """
+     try:
+         resp = injector.get(url)
+     except Exception:
+         return None
+     return resp if resp.status_code < 400 else None
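And the passive helper's contract in a one-liner sketch. The `commonhuman_core.passive` module name is a guess from its docstring (the diff does not name the file), and the URL is a placeholder:

    from commonhuman_core.http import HttpClient
    from commonhuman_core.passive import fetch_seed  # module path assumed; not named in the diff

    resp = fetch_seed(HttpClient(), "http://scan-target.local/")
    if resp is not None:  # None on request failure or 4xx/5xx
        print(resp.status_code, resp.headers.get("server", ""))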