browser-recon 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. browser_recon/__init__.py +3 -0
  2. browser_recon/analysis/__init__.py +16 -0
  3. browser_recon/analysis/headers.py +317 -0
  4. browser_recon/analysis/interactions.py +413 -0
  5. browser_recon/analysis/redirects.py +46 -0
  6. browser_recon/capture/__init__.py +221 -0
  7. browser_recon/capture/cdp_monitor.py +1720 -0
  8. browser_recon/capture/chrome_launcher.py +441 -0
  9. browser_recon/cli/__init__.py +1 -0
  10. browser_recon/cli/_url_helpers.py +260 -0
  11. browser_recon/cli/config.py +186 -0
  12. browser_recon/cli/login.py +218 -0
  13. browser_recon/cli/main.py +2249 -0
  14. browser_recon/cli/poll.py +382 -0
  15. browser_recon/cli/report_opener.py +29 -0
  16. browser_recon/cli/spinner.py +174 -0
  17. browser_recon/detection/__init__.py +8 -0
  18. browser_recon/detection/rules/__init__.py +16 -0
  19. browser_recon/detection/rules/anti_bot/__init__.py +240 -0
  20. browser_recon/detection/rules/auth_flow.py +214 -0
  21. browser_recon/detection/rules/rate_limit_signals.py +98 -0
  22. browser_recon/llm_eval/__init__.py +46 -0
  23. browser_recon/llm_eval/cost_tracker.py +100 -0
  24. browser_recon/llm_eval/fixtures.py +367 -0
  25. browser_recon/llm_eval/report.py +218 -0
  26. browser_recon/llm_eval/runner.py +352 -0
  27. browser_recon/llm_eval/scan_loader.py +145 -0
  28. browser_recon/models.py +788 -0
  29. browser_recon/reporting/__init__.py +7 -0
  30. browser_recon/reporting/duration.py +48 -0
  31. browser_recon/transport/__init__.py +1 -0
  32. browser_recon/transport/capture_upload.py +273 -0
  33. browser_recon/transport/uploader.py +1676 -0
  34. browser_recon/utils.py +125 -0
  35. browser_recon-0.3.0.dist-info/METADATA +22 -0
  36. browser_recon-0.3.0.dist-info/RECORD +38 -0
  37. browser_recon-0.3.0.dist-info/WHEEL +4 -0
  38. browser_recon-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,3 @@
1
+ """browser-recon: scan a URL and produce a scraping reconnaissance report."""
2
+
3
+ __version__: str = "0.3.0"
@@ -0,0 +1,16 @@
1
+ """Analysis subpackage (capture-time only).
2
+
3
+ T53.6 trimmed this package to the small set of helpers
4
+ :mod:`browser_recon.capture.cdp_monitor` needs while Chrome is live:
5
+
6
+ * :mod:`browser_recon.analysis.headers` -- header/CDN classifiers
7
+ exposed via :func:`is_cdn_url`.
8
+ * :mod:`browser_recon.analysis.interactions` -- DOM-observer +
9
+ interaction-log injection / parse helpers.
10
+ * :mod:`browser_recon.analysis.redirects` -- redirect-chain
11
+ extractor for the CDP response listener.
12
+
13
+ The full endpoint-inventory orchestrator + architecture / framework /
14
+ dependency classifiers moved to ``browser_recon_server.analysis_server``
15
+ in T53.2.
16
+ """
@@ -0,0 +1,317 @@
1
+ """Header analysis: CORS, caching, and replay header extraction.
2
+
3
+ Extracted from the existing browser-recon skill at
4
+ ``~/.claude/skills/browser-recon/scripts/analyzers/headers.py`` (task
5
+ 1.a.11). Verbatim port of ``is_cdn_url``, ``extract_cors_info`` and
6
+ ``extract_caching_info`` modulo the import-hack swap for proper
7
+ ``browser_recon.models`` / ``browser_recon.utils`` imports.
8
+
9
+ ``extract_replay_headers`` carries one Tier A behaviour fix — see
10
+ ``phase-1-analysis-spec.md`` § Audit of existing code and
11
+ ``build-sequence.md`` § Tier A — analyzers/headers.py.
12
+
13
+ Tier A — cookie-name source
14
+ ---------------------------
15
+ After PII scrubbing (``phase-1-privacy-transport-spec.md``), the
16
+ request-side ``Cookie:`` header value is stripped to an empty string.
17
+ Parsing it then yields no cookie names, leaving
18
+ ``ReplayHeaders.required_cookies`` empty for every endpoint — useless
19
+ for the synthesis layer.
20
+
21
+ The fix: ``extract_replay_headers`` now takes an optional
22
+ ``cookies: list[CapturedCookie] | None`` argument. When provided AND at
23
+ least one captured cookie's ``domain`` matches the request URL's host,
24
+ the captured-cookies inventory is the PRIMARY source for cookie names
25
+ (filtered by domain match, sorted alphabetically, deduplicated). The
26
+ header-parsing logic is retained as a fallback for callers that pass
27
+ ``cookies=None`` or for requests whose host has no matching cookie.
28
+
29
+ Domain-matching rule:
30
+ * ``request_host == cookie.domain.lstrip(".")``, OR
31
+ * ``request_host.endswith("." + cookie.domain.lstrip("."))``.
32
+
33
+ So ``.example.com`` matches ``www.example.com``, ``api.example.com`` and
34
+ ``example.com``; ``api.example.com`` (no leading dot) matches only
35
+ ``api.example.com``.
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ from urllib.parse import urlsplit
41
+
42
+ from browser_recon.models import (
43
+ CachingInfo,
44
+ CapturedCookie,
45
+ CapturedRequest,
46
+ CORSInfo,
47
+ ReplayHeaders,
48
+ )
49
+ from browser_recon.utils import base_url, is_api_request
50
+
51
# Hostnames treated as CDN / static-asset origins, shared between the
# monitor and the analyzers. Matching is exact-host or dot-suffix only —
# deliberately no loose prefix rules such as "cdn." or "media.".
CDN_DOMAINS = frozenset(
    {
        "static.licdn.com",
        "dms.licdn.com",
        "media.licdn.com",
        "cloudfront.net",
        "akamaihd.net",
        "fastly.net",
        "cdnjs.cloudflare.com",
        "unpkg.com",
        "jsdelivr.net",
        "googleapis.com",  # Google CDN (fonts, storage, etc.)
    }
)

# Infrastructure / browser-generated X- headers with no replay value;
# excluded from the "custom X- headers" bucket during replay extraction.
_STANDARD_BROWSER_X_HEADERS = {
    "x-forwarded-for",
    "x-forwarded-host",
    "x-forwarded-proto",
    "x-forwarded-port",
    "x-real-ip",
    "x-request-id",
    "x-correlation-id",
    "x-amz-date",
    "x-amz-security-token",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
}


def is_cdn_url(url: str) -> bool:
    """Return True when *url*'s host matches a known CDN/static-asset domain.

    A match is either the domain itself or any subdomain of it
    (``host == domain`` or ``host.endswith("." + domain)``).
    """
    hostname = (urlsplit(url).hostname or "").lower()
    return any(
        hostname == cdn or hostname.endswith("." + cdn) for cdn in CDN_DOMAINS
    )
92
+
93
+
94
def extract_cors_info(requests: dict[str, CapturedRequest]) -> list[CORSInfo]:
    """Summarize CORS response headers per base URL for API requests.

    Only the first request observed at each base URL contributes.
    CDN/static-asset hosts are skipped (CORS ``*`` is the default there
    and not informative for scraping), as are responses without an
    ``Access-Control-Allow-Origin`` header.
    """
    summaries: list[CORSInfo] = []
    covered: set[str] = set()

    for captured in requests.values():
        # API traffic only; CDN hosts add no signal (origin=* is expected).
        if not is_api_request(captured) or is_cdn_url(captured.url):
            continue

        lowered = {
            name.lower(): value for name, value in captured.response_headers.items()
        }
        origin_header = lowered.get("access-control-allow-origin", "")
        if not origin_header:
            continue

        bucket = base_url(captured.url)
        if bucket in covered:
            continue
        covered.add(bucket)

        summaries.append(
            CORSInfo(
                request_url=bucket,
                allow_origin=origin_header,
                allow_methods=lowered.get("access-control-allow-methods", ""),
                allow_headers=lowered.get("access-control-allow-headers", ""),
                allow_credentials=(
                    lowered.get("access-control-allow-credentials", "").lower()
                    == "true"
                ),
                # origin=* means the endpoint answers without a cooperating origin.
                externally_callable=origin_header == "*",
            )
        )

    return summaries
141
+
142
+
143
def extract_caching_info(requests: dict[str, CapturedRequest]) -> list[CachingInfo]:
    """Summarize caching-related response headers per base URL for API requests.

    Only the first request observed at each base URL contributes; requests
    carrying none of Cache-Control / ETag / Last-Modified / Vary are skipped.
    """
    out: list[CachingInfo] = []
    emitted: set[str] = set()

    for captured in requests.values():
        if not is_api_request(captured):
            continue

        lowered = {
            name.lower(): value for name, value in captured.response_headers.items()
        }
        fields = {
            "cache_control": lowered.get("cache-control", ""),
            "etag": lowered.get("etag", ""),
            "last_modified": lowered.get("last-modified", ""),
            "vary": lowered.get("vary", ""),
        }
        # No caching signal at all -> nothing worth reporting.
        if not any(fields.values()):
            continue

        bucket = base_url(captured.url)
        if bucket not in emitted:
            emitted.add(bucket)
            out.append(CachingInfo(request_url=bucket, **fields))

    return out
177
+
178
+
179
def _cookies_for_host(host: str, cookies: list[CapturedCookie]) -> list[CapturedCookie]:
    """Filter *cookies* down to those whose ``domain`` applies to *host*.

    A cookie applies when the host equals the cookie's domain (leading dot
    stripped, case-insensitive) or is a subdomain of it. Cookies with an
    empty/missing domain never match; an empty *host* (e.g. from a
    ``data:`` URL) matches nothing.
    """
    if not host:
        return []
    needle = host.lower()

    def applies(raw_domain: str | None) -> bool:
        # Leading dot is the legacy "match subdomains" marker; strip it so
        # ".example.com" and "example.com" compare the same way.
        domain = (raw_domain or "").lstrip(".").lower()
        return bool(domain) and (needle == domain or needle.endswith("." + domain))

    return [cookie for cookie in cookies if applies(cookie.domain)]
198
+
199
+
200
def extract_replay_headers(
    requests: dict[str, CapturedRequest],
    cookies: list[CapturedCookie] | None = None,
) -> list[ReplayHeaders]:
    """Per-base-URL replay-header summary for API requests.

    Per-base-URL dedup with merge: later requests at the same base
    contribute their CSRF tokens / custom X-headers (union) and their
    cookie names (appended without duplicates); ``authorization``,
    ``referer``, ``origin`` and ``x_requested_with`` keep the first
    non-empty value seen.

    Tier A cookie-name fix: when ``cookies`` is provided AND at least
    one captured cookie's ``domain`` matches the request URL's host, the
    captured-cookies inventory is the PRIMARY source for cookie names
    (sorted alphabetically, deduplicated). This is the post-PII-scrub
    safe path, since the request-side ``Cookie:`` header value is
    stripped to empty by the scrubber. When ``cookies`` is ``None`` (the
    default — back-compat with the source skill) or no captured cookies
    match the request's host, the function falls back to parsing the
    raw request ``Cookie:`` header; fallback-parsed names are likewise
    deduplicated (first occurrence wins, order preserved).
    """
    seen: dict[str, ReplayHeaders] = {}

    # Recognized CSRF token header names (lowercase).
    csrf_header_names = {"x-csrf-token", "x-xsrf-token", "x-csrftoken"}

    for req in requests.values():
        if not is_api_request(req):
            continue

        key = base_url(req.url)
        headers_lower = {k.lower(): v for k, v in req.request_headers.items()}
        # Map lowercase name -> (original-cased name, value) so output dicts
        # keep the casing the site actually sent.
        headers_original = {k.lower(): (k, v) for k, v in req.request_headers.items()}

        authorization = headers_lower.get("authorization", "")
        x_requested_with = headers_lower.get("x-requested-with", "")
        referer = headers_lower.get("referer", "")
        origin = headers_lower.get("origin", "")

        csrf_tokens: dict[str, str] = {}
        for name in csrf_header_names:
            if name in headers_lower:
                orig_name = headers_original[name][0]
                csrf_tokens[orig_name] = headers_lower[name]

        # Non-standard X- headers: skip CSRF (captured above), the
        # dedicated x-requested-with field, and routine infrastructure
        # headers that carry no replay value.
        custom_x: dict[str, str] = {}
        for lower_name, (orig_name, value) in headers_original.items():
            if not lower_name.startswith("x-"):
                continue
            if lower_name in csrf_header_names:
                continue
            if lower_name == "x-requested-with":
                continue
            if lower_name in _STANDARD_BROWSER_X_HEADERS:
                continue
            custom_x[orig_name] = value

        # ----- Tier A cookie-name source -----
        # Prefer the captured-cookie inventory (survives PII scrubbing of
        # the Cookie: header); fall back to header parsing otherwise.
        cookie_names: list[str] = []
        used_inventory = False
        if cookies is not None:
            host = (urlsplit(req.url).hostname or "").lower()
            matching = _cookies_for_host(host, cookies)
            if matching:
                # Sorted + deduped names from inventory.
                cookie_names = sorted({c.name for c in matching})
                used_inventory = True

        if not used_inventory:
            raw_cookie = headers_lower.get("cookie", "")
            if raw_cookie:
                for pair in raw_cookie.split(";"):
                    stripped = pair.strip()
                    if "=" in stripped:
                        name = stripped.split("=", 1)[0].strip()
                        # Fix: dedupe here too (first occurrence wins) so a
                        # repeated name in the raw header cannot produce
                        # duplicate entries in required_cookies — the merge
                        # path below already guarantees uniqueness.
                        if name not in cookie_names:
                            cookie_names.append(name)

        # Skip requests that contribute nothing replay-relevant.
        has_content = any(
            [
                authorization,
                csrf_tokens,
                custom_x,
                cookie_names,
                referer,
                origin,
                x_requested_with,
            ]
        )
        if not has_content:
            continue

        if key in seen:
            # Merge into the existing summary: union dicts, append new
            # cookie names, keep first non-empty scalar values.
            existing = seen[key]
            if authorization and not existing.authorization:
                existing.authorization = authorization
            existing.csrf_tokens.update(csrf_tokens)
            existing.custom_x_headers.update(custom_x)
            for name in cookie_names:
                if name not in existing.required_cookies:
                    existing.required_cookies.append(name)
            if referer and not existing.referer:
                existing.referer = referer
            if origin and not existing.origin:
                existing.origin = origin
            if x_requested_with and not existing.x_requested_with:
                existing.x_requested_with = x_requested_with
        else:
            seen[key] = ReplayHeaders(
                request_url=key,
                authorization=authorization,
                csrf_tokens=csrf_tokens,
                custom_x_headers=custom_x,
                required_cookies=cookie_names,
                referer=referer,
                origin=origin,
                x_requested_with=x_requested_with,
            )

    return list(seen.values())