browser-recon 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. browser_recon/__init__.py +3 -0
  2. browser_recon/analysis/__init__.py +16 -0
  3. browser_recon/analysis/headers.py +317 -0
  4. browser_recon/analysis/interactions.py +413 -0
  5. browser_recon/analysis/redirects.py +46 -0
  6. browser_recon/capture/__init__.py +221 -0
  7. browser_recon/capture/cdp_monitor.py +1720 -0
  8. browser_recon/capture/chrome_launcher.py +441 -0
  9. browser_recon/cli/__init__.py +1 -0
  10. browser_recon/cli/_url_helpers.py +260 -0
  11. browser_recon/cli/config.py +186 -0
  12. browser_recon/cli/login.py +218 -0
  13. browser_recon/cli/main.py +2249 -0
  14. browser_recon/cli/poll.py +382 -0
  15. browser_recon/cli/report_opener.py +29 -0
  16. browser_recon/cli/spinner.py +174 -0
  17. browser_recon/detection/__init__.py +8 -0
  18. browser_recon/detection/rules/__init__.py +16 -0
  19. browser_recon/detection/rules/anti_bot/__init__.py +240 -0
  20. browser_recon/detection/rules/auth_flow.py +214 -0
  21. browser_recon/detection/rules/rate_limit_signals.py +98 -0
  22. browser_recon/llm_eval/__init__.py +46 -0
  23. browser_recon/llm_eval/cost_tracker.py +100 -0
  24. browser_recon/llm_eval/fixtures.py +367 -0
  25. browser_recon/llm_eval/report.py +218 -0
  26. browser_recon/llm_eval/runner.py +352 -0
  27. browser_recon/llm_eval/scan_loader.py +145 -0
  28. browser_recon/models.py +788 -0
  29. browser_recon/reporting/__init__.py +7 -0
  30. browser_recon/reporting/duration.py +48 -0
  31. browser_recon/transport/__init__.py +1 -0
  32. browser_recon/transport/capture_upload.py +273 -0
  33. browser_recon/transport/uploader.py +1676 -0
  34. browser_recon/utils.py +125 -0
  35. browser_recon-0.3.0.dist-info/METADATA +22 -0
  36. browser_recon-0.3.0.dist-info/RECORD +38 -0
  37. browser_recon-0.3.0.dist-info/WHEEL +4 -0
  38. browser_recon-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,3 @@
1
+ """browser-recon: scan a URL and produce a scraping reconnaissance report."""
2
+
3
+ __version__: str = "0.3.0"
@@ -0,0 +1,16 @@
1
+ """Analysis subpackage (capture-time only).
2
+
3
+ T53.6 trimmed this package to the small set of helpers
4
+ :mod:`browser_recon.capture.cdp_monitor` needs while Chrome is live:
5
+
6
+ * :mod:`browser_recon.analysis.headers` -- header/CDN classifiers
7
+ exposed via :func:`is_cdn_url`.
8
+ * :mod:`browser_recon.analysis.interactions` -- DOM-observer +
9
+ interaction-log injection / parse helpers.
10
+ * :mod:`browser_recon.analysis.redirects` -- redirect-chain
11
+ extractor for the CDP response listener.
12
+
13
+ The full endpoint-inventory orchestrator + architecture / framework /
14
+ dependency classifiers moved to ``browser_recon_server.analysis_server``
15
+ in T53.2.
16
+ """
@@ -0,0 +1,317 @@
1
+ """Header analysis: CORS, caching, and replay header extraction.
2
+
3
+ Extracted from the existing browser-recon skill at
4
+ ``~/.claude/skills/browser-recon/scripts/analyzers/headers.py`` (task
5
+ 1.a.11). Verbatim port of ``is_cdn_url``, ``extract_cors_info`` and
6
+ ``extract_caching_info`` modulo the import-hack swap for proper
7
+ ``browser_recon.models`` / ``browser_recon.utils`` imports.
8
+
9
+ ``extract_replay_headers`` carries one Tier A behaviour fix — see
10
+ ``phase-1-analysis-spec.md`` § Audit of existing code and
11
+ ``build-sequence.md`` § Tier A — analyzers/headers.py.
12
+
13
+ Tier A — cookie-name source
14
+ ---------------------------
15
+ After PII scrubbing (``phase-1-privacy-transport-spec.md``), the
16
+ request-side ``Cookie:`` header value is stripped to an empty string.
17
+ Parsing it then yields no cookie names, leaving
18
+ ``ReplayHeaders.required_cookies`` empty for every endpoint — useless
19
+ for the synthesis layer.
20
+
21
+ The fix: ``extract_replay_headers`` now takes an optional
22
+ ``cookies: list[CapturedCookie] | None`` argument. When provided AND at
23
+ least one captured cookie's ``domain`` matches the request URL's host,
24
+ the captured-cookies inventory is the PRIMARY source for cookie names
25
+ (filtered by domain match, sorted alphabetically, deduplicated). The
26
+ header-parsing logic is retained as a fallback for callers that pass
27
+ ``cookies=None`` or for requests whose host has no matching cookie.
28
+
29
+ Domain-matching rule:
30
+ * ``request_host == cookie.domain.lstrip(".")``, OR
31
+ * ``request_host.endswith("." + cookie.domain.lstrip("."))``.
32
+
33
+ So ``.example.com`` matches ``www.example.com``, ``api.example.com`` and
34
+ ``example.com``; ``api.example.com`` (no leading dot) matches only
35
+ ``api.example.com``.
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ from urllib.parse import urlsplit
41
+
42
+ from browser_recon.models import (
43
+ CachingInfo,
44
+ CapturedCookie,
45
+ CapturedRequest,
46
+ CORSInfo,
47
+ ReplayHeaders,
48
+ )
49
+ from browser_recon.utils import base_url, is_api_request
50
+
51
# Hostnames treated as CDN / static-asset origins, shared between the
# monitor and the analyzers. Matching is exact-host or dot-suffix only —
# deliberately no loose prefix rules such as "cdn." or "media.".
CDN_DOMAINS = frozenset(
    {
        "static.licdn.com",
        "dms.licdn.com",
        "media.licdn.com",
        "cloudfront.net",
        "akamaihd.net",
        "fastly.net",
        "cdnjs.cloudflare.com",
        "unpkg.com",
        "jsdelivr.net",
        "googleapis.com",  # Google CDN (fonts, storage, etc.)
    }
)

# Infrastructure / browser-generated X- headers with no replay value;
# excluded from the "custom X- headers" bucket during replay extraction.
_STANDARD_BROWSER_X_HEADERS = {
    "x-forwarded-for",
    "x-forwarded-host",
    "x-forwarded-proto",
    "x-forwarded-port",
    "x-real-ip",
    "x-request-id",
    "x-correlation-id",
    "x-amz-date",
    "x-amz-security-token",
    "x-ratelimit-limit",
    "x-ratelimit-remaining",
    "x-ratelimit-reset",
}


def is_cdn_url(url: str) -> bool:
    """Return True when *url*'s host matches a known CDN/static-asset domain.

    A match is either the domain itself or any subdomain of it
    (``host == domain`` or ``host.endswith("." + domain)``).
    """
    hostname = (urlsplit(url).hostname or "").lower()
    return any(
        hostname == cdn or hostname.endswith("." + cdn) for cdn in CDN_DOMAINS
    )
92
+
93
+
94
def extract_cors_info(requests: dict[str, CapturedRequest]) -> list[CORSInfo]:
    """Summarize CORS response headers per base URL for API requests.

    Only the first request observed at each base URL contributes.
    CDN/static-asset hosts are skipped (CORS ``*`` is the default there
    and not informative for scraping), as are responses without an
    ``Access-Control-Allow-Origin`` header.
    """
    summaries: list[CORSInfo] = []
    covered: set[str] = set()

    for captured in requests.values():
        # API traffic only; CDN hosts add no signal (origin=* is expected).
        if not is_api_request(captured) or is_cdn_url(captured.url):
            continue

        lowered = {
            name.lower(): value for name, value in captured.response_headers.items()
        }
        origin_header = lowered.get("access-control-allow-origin", "")
        if not origin_header:
            continue

        bucket = base_url(captured.url)
        if bucket in covered:
            continue
        covered.add(bucket)

        summaries.append(
            CORSInfo(
                request_url=bucket,
                allow_origin=origin_header,
                allow_methods=lowered.get("access-control-allow-methods", ""),
                allow_headers=lowered.get("access-control-allow-headers", ""),
                allow_credentials=(
                    lowered.get("access-control-allow-credentials", "").lower()
                    == "true"
                ),
                # origin=* means the endpoint answers without a cooperating origin.
                externally_callable=origin_header == "*",
            )
        )

    return summaries
141
+
142
+
143
def extract_caching_info(requests: dict[str, CapturedRequest]) -> list[CachingInfo]:
    """Summarize caching-related response headers per base URL for API requests.

    Only the first request observed at each base URL contributes; requests
    carrying none of Cache-Control / ETag / Last-Modified / Vary are skipped.
    """
    out: list[CachingInfo] = []
    emitted: set[str] = set()

    for captured in requests.values():
        if not is_api_request(captured):
            continue

        lowered = {
            name.lower(): value for name, value in captured.response_headers.items()
        }
        fields = {
            "cache_control": lowered.get("cache-control", ""),
            "etag": lowered.get("etag", ""),
            "last_modified": lowered.get("last-modified", ""),
            "vary": lowered.get("vary", ""),
        }
        # No caching signal at all -> nothing worth reporting.
        if not any(fields.values()):
            continue

        bucket = base_url(captured.url)
        if bucket not in emitted:
            emitted.add(bucket)
            out.append(CachingInfo(request_url=bucket, **fields))

    return out
177
+
178
+
179
def _cookies_for_host(host: str, cookies: list[CapturedCookie]) -> list[CapturedCookie]:
    """Filter *cookies* down to those whose ``domain`` applies to *host*.

    A cookie applies when the host equals the cookie's domain (leading dot
    stripped, case-insensitive) or is a subdomain of it. Cookies with an
    empty/missing domain never match; an empty *host* (e.g. from a
    ``data:`` URL) matches nothing.
    """
    if not host:
        return []
    needle = host.lower()

    def applies(raw_domain: str | None) -> bool:
        # Leading dot is the legacy "match subdomains" marker; strip it so
        # ".example.com" and "example.com" compare the same way.
        domain = (raw_domain or "").lstrip(".").lower()
        return bool(domain) and (needle == domain or needle.endswith("." + domain))

    return [cookie for cookie in cookies if applies(cookie.domain)]
198
+
199
+
200
def extract_replay_headers(
    requests: dict[str, CapturedRequest],
    cookies: list[CapturedCookie] | None = None,
) -> list[ReplayHeaders]:
    """Per-base-URL replay-header summary for API requests.

    Per-base-URL dedup with merge: later requests at the same base
    contribute their CSRF tokens / custom X-headers (union) and their
    cookie names (appended without duplicates); ``authorization``,
    ``referer``, ``origin`` and ``x_requested_with`` keep the first
    non-empty value seen.

    Tier A cookie-name fix: when ``cookies`` is provided AND at least
    one captured cookie's ``domain`` matches the request URL's host, the
    captured-cookies inventory is the PRIMARY source for cookie names
    (sorted alphabetically, deduplicated). This is the post-PII-scrub
    safe path, since the request-side ``Cookie:`` header value is
    stripped to empty by the scrubber. When ``cookies`` is ``None`` (the
    default — back-compat with the source skill) or no captured cookies
    match the request's host, the function falls back to parsing the
    raw request ``Cookie:`` header; fallback-parsed names are likewise
    deduplicated (first occurrence wins, order preserved).
    """
    seen: dict[str, ReplayHeaders] = {}

    # Recognized CSRF token header names (lowercase).
    csrf_header_names = {"x-csrf-token", "x-xsrf-token", "x-csrftoken"}

    for req in requests.values():
        if not is_api_request(req):
            continue

        key = base_url(req.url)
        headers_lower = {k.lower(): v for k, v in req.request_headers.items()}
        # Map lowercase name -> (original-cased name, value) so output dicts
        # keep the casing the site actually sent.
        headers_original = {k.lower(): (k, v) for k, v in req.request_headers.items()}

        authorization = headers_lower.get("authorization", "")
        x_requested_with = headers_lower.get("x-requested-with", "")
        referer = headers_lower.get("referer", "")
        origin = headers_lower.get("origin", "")

        csrf_tokens: dict[str, str] = {}
        for name in csrf_header_names:
            if name in headers_lower:
                orig_name = headers_original[name][0]
                csrf_tokens[orig_name] = headers_lower[name]

        # Non-standard X- headers: skip CSRF (captured above), the
        # dedicated x-requested-with field, and routine infrastructure
        # headers that carry no replay value.
        custom_x: dict[str, str] = {}
        for lower_name, (orig_name, value) in headers_original.items():
            if not lower_name.startswith("x-"):
                continue
            if lower_name in csrf_header_names:
                continue
            if lower_name == "x-requested-with":
                continue
            if lower_name in _STANDARD_BROWSER_X_HEADERS:
                continue
            custom_x[orig_name] = value

        # ----- Tier A cookie-name source -----
        # Prefer the captured-cookie inventory (survives PII scrubbing of
        # the Cookie: header); fall back to header parsing otherwise.
        cookie_names: list[str] = []
        used_inventory = False
        if cookies is not None:
            host = (urlsplit(req.url).hostname or "").lower()
            matching = _cookies_for_host(host, cookies)
            if matching:
                # Sorted + deduped names from inventory.
                cookie_names = sorted({c.name for c in matching})
                used_inventory = True

        if not used_inventory:
            raw_cookie = headers_lower.get("cookie", "")
            if raw_cookie:
                for pair in raw_cookie.split(";"):
                    stripped = pair.strip()
                    if "=" in stripped:
                        name = stripped.split("=", 1)[0].strip()
                        # Fix: dedupe here too (first occurrence wins) so a
                        # repeated name in the raw header cannot produce
                        # duplicate entries in required_cookies — the merge
                        # path below already guarantees uniqueness.
                        if name not in cookie_names:
                            cookie_names.append(name)

        # Skip requests that contribute nothing replay-relevant.
        has_content = any(
            [
                authorization,
                csrf_tokens,
                custom_x,
                cookie_names,
                referer,
                origin,
                x_requested_with,
            ]
        )
        if not has_content:
            continue

        if key in seen:
            # Merge into the existing summary: union dicts, append new
            # cookie names, keep first non-empty scalar values.
            existing = seen[key]
            if authorization and not existing.authorization:
                existing.authorization = authorization
            existing.csrf_tokens.update(csrf_tokens)
            existing.custom_x_headers.update(custom_x)
            for name in cookie_names:
                if name not in existing.required_cookies:
                    existing.required_cookies.append(name)
            if referer and not existing.referer:
                existing.referer = referer
            if origin and not existing.origin:
                existing.origin = origin
            if x_requested_with and not existing.x_requested_with:
                existing.x_requested_with = x_requested_with
        else:
            seen[key] = ReplayHeaders(
                request_url=key,
                authorization=authorization,
                csrf_tokens=csrf_tokens,
                custom_x_headers=custom_x,
                required_cookies=cookie_names,
                referer=referer,
                origin=origin,
                x_requested_with=x_requested_with,
            )

    return list(seen.values())