websec-validator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. websec_validator/__init__.py +14 -0
  2. websec_validator/briefing.py +218 -0
  3. websec_validator/calibration.json +75 -0
  4. websec_validator/calibration.py +226 -0
  5. websec_validator/cli.py +395 -0
  6. websec_validator/constitution.py +81 -0
  7. websec_validator/corpus.json +49 -0
  8. websec_validator/dynamic.py +249 -0
  9. websec_validator/extractors/__init__.py +56 -0
  10. websec_validator/extractors/auth.py +77 -0
  11. websec_validator/extractors/authz.py +130 -0
  12. websec_validator/extractors/base.py +101 -0
  13. websec_validator/extractors/client_exposure.py +48 -0
  14. websec_validator/extractors/graphql.py +71 -0
  15. websec_validator/extractors/iac_ci.py +65 -0
  16. websec_validator/extractors/integrations.py +55 -0
  17. websec_validator/extractors/routes.py +215 -0
  18. websec_validator/extractors/schemas.py +75 -0
  19. websec_validator/extractors/stack.py +80 -0
  20. websec_validator/extractors/surface.py +86 -0
  21. websec_validator/extractors/tenant.py +33 -0
  22. websec_validator/findings.py +199 -0
  23. websec_validator/probes.py +79 -0
  24. websec_validator/proof.py +96 -0
  25. websec_validator/recon.py +28 -0
  26. websec_validator/report.py +114 -0
  27. websec_validator/scanners.py +248 -0
  28. websec_validator/templates/probes/bola-cross-tenant.sh +192 -0
  29. websec_validator/templates/probes/bola-write-verbs.py +147 -0
  30. websec_validator/templates/probes/compare-roles.sh +69 -0
  31. websec_validator/templates/probes/dlp-bypass-offline.py +149 -0
  32. websec_validator/templates/probes/hs256-brute-force.py +90 -0
  33. websec_validator/templates/probes/jwt-attacks.sh +161 -0
  34. websec_validator/templates/probes/mass-assignment.py +201 -0
  35. websec_validator/templates/probes/race-conditions.py +144 -0
  36. websec_validator/templates/probes/rate-limit-burst.sh +136 -0
  37. websec_validator/templates/probes/s3-assess.sh +120 -0
  38. websec_validator/templates/probes/ssrf-probes.sh +189 -0
  39. websec_validator/templates/probes/webhook-forgery.py +113 -0
  40. websec_validator/templates/reports/FINDINGS-SUMMARY.md.template +75 -0
  41. websec_validator/templates/reports/access-control-matrix.md.template +65 -0
  42. websec_validator/templates/reports/findings-triage.md.template +28 -0
  43. websec_validator/templates/reports/pentest-handover-brief.md.template +121 -0
  44. websec_validator/templates/reports/per-tool-FINDINGS.md.template +37 -0
  45. websec_validator-0.2.0.dist-info/METADATA +232 -0
  46. websec_validator-0.2.0.dist-info/RECORD +50 -0
  47. websec_validator-0.2.0.dist-info/WHEEL +5 -0
  48. websec_validator-0.2.0.dist-info/entry_points.txt +2 -0
  49. websec_validator-0.2.0.dist-info/licenses/LICENSE +21 -0
  50. websec_validator-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,249 @@
1
+ """Dynamic phase (v1) — authenticated, READ-ONLY cross-tenant BOLA against a live target.
2
+
3
+ This closes the loop: the static recon found the group-scoped routes + the tenant
4
+ key; here we mint two real role tokens and check whether one tenant can read
5
+ another tenant's data. v1 is **GET-only** (no mutation) so it is safe to run
6
+ against a shared test environment. Write-verb BOLA / mass-assignment come later,
7
+ explicitly gated.
8
+
9
+ Config (JSON):
10
+ {
11
+ "target": "https://host",
12
+ "login_path": "/api/auth/login",
13
+ "token_json_path": "tokens.accessToken",
14
+ "user_json_path": "user",
15
+ "tenant_field": "groupIds", # field on the user object holding tenant id(s)
16
+ "tenant_path_param": "groupId", # the {param} in routes that is the tenant boundary
17
+ "roles": { "agentA": {"email": "..", "password": ".."},
18
+ "agentB": {"email": "..", "password": ".."} }
19
+ }
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ import re
26
+ import urllib.error
27
+ import urllib.request
28
+ from pathlib import Path
29
+
30
+
31
+ def _dig(d: dict, dotted: str):
32
+ cur = d
33
+ for part in dotted.split("."):
34
+ if not isinstance(cur, dict):
35
+ return None
36
+ cur = cur.get(part)
37
+ return cur
38
+
39
+
40
+ def _request(method: str, url: str, token: str | None, timeout: int = 20, data: bytes | None = None):
41
+ headers = {"Accept": "application/json"}
42
+ if token:
43
+ headers["Authorization"] = f"Bearer {token}"
44
+ if data is not None:
45
+ headers["Content-Type"] = "application/json"
46
+ req = urllib.request.Request(url, method=method, headers=headers, data=data)
47
+ try:
48
+ r = urllib.request.urlopen(req, timeout=timeout)
49
+ return r.status, r.read(4000).decode(errors="replace")
50
+ except urllib.error.HTTPError as e:
51
+ return e.code, e.read(1000).decode(errors="replace")
52
+ except Exception as e:
53
+ return None, f"{type(e).__name__}: {e}"
54
+
55
+
56
def is_localhost(target: str) -> bool:
    """Return True when ``target``'s host is this machine.

    Accepts the name ``localhost``, the unspecified address ``0.0.0.0`` and any
    loopback IP literal (all of 127.0.0.0/8 and ``::1``). Any other host name,
    a URL without a host, or an unparsable URL yields False.
    """
    import ipaddress
    import urllib.parse

    try:
        host = urllib.parse.urlparse(target).hostname or ""
    except ValueError:
        # e.g. an unbalanced "[" in the netloc; not a usable target at all
        return False
    if host in ("localhost", "0.0.0.0"):
        return True
    try:
        return ipaddress.ip_address(host).is_loopback
    except ValueError:
        # a DNS name (or empty host) is never assumed to be local
        return False
59
+
60
+
61
def mint(cfg: dict, role: str) -> dict:
    """Log in one configured role and return its token and tenant.

    Args:
        cfg: Dynamic-phase config; reads ``roles``, ``target`` and the optional
            ``login_path``, ``token_json_path``, ``user_json_path`` and
            ``tenant_field`` keys.
        role: Key into ``cfg["roles"]``.

    Returns:
        ``{"token", "tenant", "email", "role"}`` on success (values may be
        ``None`` when absent from the login response), or ``{"error": "..."}``
        when the login request or JSON decoding fails.

    Raises:
        KeyError: if ``role`` or its credentials are missing from ``cfg``.
    """
    creds = cfg["roles"][role]
    body = json.dumps({"email": creds["email"], "password": creds["password"]}).encode()
    try:
        req = urllib.request.Request(cfg["target"] + cfg.get("login_path", "/api/auth/login"),
                                     data=body, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=20) as resp:
            d = json.load(resp)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}

    token = _dig(d, cfg.get("token_json_path", "tokens.accessToken"))
    user = _dig(d, cfg.get("user_json_path", "user"))
    if not isinstance(user, dict):
        user = {}

    # The tenant field may hold a list of ids or a single id. Indexing a bare
    # string with [0] would silently return its first character.
    raw = user.get(cfg.get("tenant_field", "groupIds"))
    if isinstance(raw, (list, tuple)):
        tenant = raw[0] if raw else None
    elif isinstance(raw, (str, int)) and not isinstance(raw, bool):
        tenant = raw or None
    else:
        tenant = None
    return {"token": token, "tenant": tenant,
            "email": user.get("email"), "role": user.get("role")}
76
+
77
+
78
+ def _tenant_only_get_endpoints(facts: dict, param: str) -> list:
79
+ """GET endpoints whose ONLY path param is the tenant param — clean cross-tenant
80
+ list targets that need no other fixture id."""
81
+ out = []
82
+ brace = re.compile(r"\{([^}]+)\}")
83
+ for e in (facts.get("routes") or {}).get("endpoints", []):
84
+ if e.get("method") != "GET":
85
+ continue
86
+ params = brace.findall(e.get("path", ""))
87
+ if params == [param]:
88
+ out.append(e["path"])
89
+ return sorted(set(out))
90
+
91
+
92
def cross_tenant_bola(cfg: dict, facts: dict) -> dict:
    """Check, GET-only, whether each agent can read the other agent's tenant data.

    Logs in ``agentA`` and ``agentB``, then for every GET endpoint whose only
    path parameter is the tenant parameter requests it with the *other*
    tenant's id, in both directions.

    Returns:
        A report dict (``results``, ``leaks``, ``blocked``, ``summary``, ...),
        or ``{"error": ...}`` when both tokens could not be minted or the two
        agents do not belong to two distinct tenants.

    Verdicts: 401/403/404 -> ``blocked``; 200/206 with a non-empty body ->
    ``LEAK``; 200/206 with an empty collection -> ``blocked-empty``; anything
    else (other codes, transport errors) -> ``investigate``.
    """
    from urllib.parse import quote

    param = cfg.get("tenant_path_param", "groupId")
    a, b = mint(cfg, "agentA"), mint(cfg, "agentB")
    if not a.get("token") or not b.get("token"):
        return {"error": "could not mint both agent tokens", "agentA": a.get("error"), "agentB": b.get("error")}
    if a.get("tenant") == b.get("tenant") or not (a.get("tenant") and b.get("tenant")):
        return {"error": f"agents are not in two distinct tenants (A={a.get('tenant')}, B={b.get('tenant')})"}

    endpoints = _tenant_only_get_endpoints(facts, param)
    placeholder = "{" + param + "}"
    results = []
    for path in endpoints:
        # attacker A tries to read B's tenant data, and vice-versa
        for atk, vic, direction in ((a, b, "A→B"), (b, a, "B→A")):
            # Tenant ids may be numeric in the login response and may contain
            # URL-significant characters, so stringify and percent-encode them.
            tenant_id = quote(str(vic["tenant"]), safe="")
            url = cfg["target"] + path.replace(placeholder, tenant_id)
            code, body = _request("GET", url, atk["token"])
            if code in (401, 403, 404):
                verdict = "blocked"
            elif code in (200, 206) and body and body.strip() not in ("[]", "{}", '{"data":[]}'):
                verdict = "LEAK"
            elif code in (200, 206):
                verdict = "blocked-empty"  # 200 but no cross-tenant data returned
            else:
                verdict = "investigate"
            results.append({"path": path, "direction": direction, "status": code, "verdict": verdict})

    blocked = sum(1 for r in results if r["verdict"].startswith("blocked"))
    leaks = [r for r in results if r["verdict"] == "LEAK"]

    # Only claim isolation when every check was actually blocked; an empty run
    # or inconclusive responses must not read as a clean bill of health.
    if leaks:
        tail = f" — {len(leaks)} LEAK(S)!"
    elif not results:
        tail = " — nothing tested (no tenant-scoped GET endpoints found)"
    elif blocked == len(results):
        tail = " — all isolated"
    else:
        tail = f" — {len(results) - blocked} inconclusive, investigate"

    return {
        "target": cfg["target"],
        "tenant_param": param,
        "agentA": {"email": a.get("email"), "tenant": a.get("tenant")},
        "agentB": {"email": b.get("email"), "tenant": b.get("tenant")},
        "endpoints_tested": len(endpoints),
        "checks": len(results),
        "blocked": blocked,
        "leaks": leaks,
        "results": results,
        "summary": f"{blocked}/{len(results)} cross-tenant GET reads blocked" + tail,
    }
132
+
133
+
134
# "Read-only" is not the same as harmless: a GET on any of these path fragments
# kicks off real work (cron ticks, scraping, content generation, seeding,
# sending, uploads), so the dynamic probes must never request them.
SIDE_EFFECTING = re.compile(
    r"/cron|/seed|generate|regenerate|/trigger|/sync|/send|/run\b|social-image|"
    r"sponsor-post|upload|/refresh|/rebuild|/process|/dispatch|/import|/export|/scrape(?![\w-])",
    flags=re.IGNORECASE,
)
139
+
140
+
141
def unauth_reachability(target: str, facts: dict, max_endpoints: int = 50) -> dict:
    """GET each plain data-read endpoint with no credentials and classify the reply.

    Only GET routes are considered. Paths containing an unfilled ``{param}``
    and paths matching ``SIDE_EFFECTING`` are never requested; the latter are
    listed under ``skipped_side_effecting``. At most ``max_endpoints`` paths
    (in sorted order) are probed. Only the status and the size of the received
    body prefix are recorded, never the body itself.

    Returns:
        Report dict with ``results`` (one entry per probed path),
        ``open_no_auth`` (entries that answered 200/206 with content) and a
        human-readable ``summary``.
    """
    endpoints = (facts.get("routes") or {}).get("endpoints", [])
    get_paths = [e.get("path", "") for e in endpoints if e.get("method") == "GET"]
    risky = {p for p in get_paths if SIDE_EFFECTING.search(p)}
    eps = sorted({p for p in get_paths if "{" not in p and p not in risky})[:max_endpoints]

    results = []
    for path in eps:
        code, body = _request("GET", target + path, token=None, timeout=15)
        # On a transport failure `body` holds the error text, not a response,
        # so it must not be reported as received content.
        n = len(body) if code is not None and isinstance(body, str) else 0
        # NOTE(review): _request goes through urllib's default opener, which
        # follows redirects for GET, so a page that redirects to a login form
        # will usually surface here as a 200 rather than a 3xx — confirm hits.
        if code in (401, 403):
            verdict = "protected"
        elif code in (301, 302, 307, 308):
            verdict = "redirect (likely to login)"
        elif code in (200, 206) and n > 2:
            verdict = "OPEN-no-auth"
        elif code in (200, 206):
            verdict = "open-empty"
        elif code == 404:
            verdict = "404"
        else:
            verdict = f"http-{code}"
        results.append({"path": path, "status": code, "bytes": n, "verdict": verdict})

    openish = [r for r in results if r["verdict"] == "OPEN-no-auth"]
    failed = sum(1 for r in results if r["status"] is None)

    # "all gated" is only justified when something was tested and every
    # request actually got an HTTP answer.
    if openish:
        tail = " — review whether these should be public"
    elif not results:
        tail = " — nothing tested"
    elif failed:
        tail = f" — {failed} request(s) failed, gating NOT confirmed"
    else:
        tail = " — all gated"

    return {
        "target": target,
        "mode": "STRICT read-only · unauthenticated · GET-only · side-effecting paths skipped",
        "tested": len(results),
        "skipped_side_effecting": sorted(risky),
        "open_no_auth": openish,
        "results": results,
        "summary": f"{len(openish)}/{len(results)} data-read GET endpoints reachable WITHOUT auth" + tail,
    }
183
+
184
+
185
# HTTP methods treated as state-changing by the write-endpoint probe.
WRITE_VERBS = set(("POST", "PUT", "PATCH", "DELETE"))
186
+
187
+
188
def write_auth_enforcement(target: str, facts: dict, max_endpoints: int = 80) -> dict:
    """LOCALHOST-ONLY. Probe whether each write endpoint enforces authentication.

    Every POST/PUT/PATCH/DELETE route (except ``SIDE_EFFECTING`` paths) is sent
    UNAUTHENTICATED with an empty ``{}`` body and ``websec-nonexistent-id`` in
    place of each path parameter. At most ``max_endpoints`` routes are probed.

    Verdicts:
        401/403 -> ``auth-enforced`` (good);
        any 2xx -> ``EXECUTED-UNAUTH`` (critical);
        400/404/405/409/415/422/500 -> ``no-auth-gate (reached
        handler/validation)`` (auth likely missing — verify);
        anything else, including transport failure -> ``http-<code>``.

    The empty body and dummy ids are meant to make validation reject the
    request before any real mutation happens.

    NOTE(review): this function does not itself check that ``target`` is
    local — presumably the caller gates on that; confirm.
    """
    routes = (facts.get("routes") or {}).get("endpoints", [])
    eps = sorted({
        (e["method"], e.get("path", ""))
        for e in routes
        if e.get("method") in WRITE_VERBS and not SIDE_EFFECTING.search(e.get("path", ""))
    })[:max_endpoints]

    results = []
    for method, path in eps:
        url = target + re.sub(r"\{[^}]+\}", "websec-nonexistent-id", path)
        code, _ = _request(method, url, token=None, data=b"{}")
        if code in (401, 403):
            verdict = "auth-enforced"
        elif code is not None and 200 <= code < 300:
            # Any success status (incl. 202 Accepted, 203, 205...) means the
            # write was accepted without credentials.
            verdict = "EXECUTED-UNAUTH"
        elif code in (400, 422, 404, 405, 409, 415, 500):
            verdict = "no-auth-gate (reached handler/validation)"
        else:
            verdict = f"http-{code}"
        results.append({"method": method, "path": path, "status": code, "verdict": verdict})

    # "missing" deliberately includes the executed ones: both lack an auth gate.
    missing = [r for r in results if r["verdict"] != "auth-enforced" and not r["verdict"].startswith("http-")]
    executed = [r for r in results if r["verdict"] == "EXECUTED-UNAUTH"]
    enforced = sum(1 for r in results if r["verdict"] == "auth-enforced")
    return {
        "note": "Heuristic: a protected route returns 401/403 BEFORE validation; a 400/404 unauth means "
                "the request reached the handler with no auth gate. VERIFY each — but inconsistency vs "
                "sibling routes is high-signal. Empty body + dummy ids keep this non-destructive.",
        "tested": len(results),
        "auth_enforced": enforced,
        "no_auth_gate": missing,
        "executed_unauth": executed,
        "results": results,
        "summary": f"{enforced}/{len(results)} write endpoints enforce auth · "
                   f"{len(missing)} reached with no auth gate · {len(executed)} executed unauthenticated",
    }
231
+
232
+
233
def run_unauth(target: str, facts_path: Path, outdir: Path, probe_writes: bool = False) -> dict:
    """Run the unauthenticated probes and persist their findings.

    Args:
        target: Base URL of the live application.
        facts_path: JSON file holding the static-recon facts.
        outdir: Directory for ``dynamic-unauth-findings.json`` (created if
            missing); a plain string path is accepted as well.
        probe_writes: Also run the unauthenticated write-verb probe. That
            probe is documented as localhost-only; no such check is made here.

    Returns:
        The findings dict that was written to disk.
    """
    facts = json.loads(Path(facts_path).read_text(encoding="utf-8"))
    res = {"unauth_reachability": unauth_reachability(target, facts)}
    if probe_writes:
        res["write_auth_enforcement"] = write_auth_enforcement(target, facts)
    out = Path(outdir)  # same leniency as facts_path: tolerate str arguments
    out.mkdir(parents=True, exist_ok=True)
    (out / "dynamic-unauth-findings.json").write_text(json.dumps(res, indent=2), encoding="utf-8")
    return res
241
+
242
+
243
def run_dynamic(config_path: Path, facts_path: Path, outdir: Path) -> dict:
    """Run the authenticated cross-tenant BOLA check and persist its findings.

    Args:
        config_path: JSON config file (target, login paths, role credentials).
        facts_path: JSON file holding the static-recon facts.
        outdir: Directory for ``dynamic-findings.json`` (created if missing);
            a plain string path is accepted as well.

    Returns:
        The findings dict that was written to disk.
    """
    cfg = json.loads(Path(config_path).read_text(encoding="utf-8"))
    facts = json.loads(Path(facts_path).read_text(encoding="utf-8"))
    res = {"cross_tenant_bola": cross_tenant_bola(cfg, facts)}
    out = Path(outdir)  # same leniency as the other path arguments
    out.mkdir(parents=True, exist_ok=True)
    (out / "dynamic-findings.json").write_text(json.dumps(res, indent=2), encoding="utf-8")
    return res
@@ -0,0 +1,56 @@
1
+ """Extractor registry + the run_all driver.
2
+
3
+ Order matters: stack runs first (later extractors read facts['stack']), then the
4
+ surface/authz extractors. Adding a new dimension = drop a module here and append
5
+ it to REGISTRY — that's the whole extension model.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ from .auth import AuthExtractor
13
+ from .authz import AuthzExtractor
14
+ from .base import Extractor, RepoContext
15
+ from .client_exposure import ClientExposureExtractor
16
+ from .graphql import GraphQLExtractor
17
+ from .iac_ci import IacCiExtractor
18
+ from .integrations import IntegrationsExtractor
19
+ from .routes import RoutesExtractor
20
+ from .schemas import SchemasExtractor
21
+ from .stack import StackExtractor
22
+ from .surface import SurfaceExtractor
23
+ from .tenant import TenantExtractor
24
+
25
# Execution order is significant: StackExtractor goes first because later
# extractors read facts['stack'], and AuthzExtractor must come after
# RoutesExtractor because it reads facts['routes'].
REGISTRY: list[Extractor] = [
    extractor_cls()
    for extractor_cls in (
        StackExtractor,
        RoutesExtractor,
        AuthExtractor,
        AuthzExtractor,
        TenantExtractor,
        SurfaceExtractor,
        SchemasExtractor,
        IacCiExtractor,
        ClientExposureExtractor,
        GraphQLExtractor,
        IntegrationsExtractor,
    )
]
40
+
41
+
42
def run_all(root: Path, version: str) -> dict:
    """Build one RepoContext for ``root`` and run every registered extractor.

    Each extractor's result is stored under its ``name``. An extractor that
    raises is recorded as ``{"error": "<Type>: <message>"}`` so a single
    failure never aborts the run.
    """
    ctx = RepoContext(root)
    facts: dict = dict(
        tool="websec-validator",
        version=version,
        target=str(root.resolve()),
        files_scanned=len(ctx.code_files),
    )
    for extractor in REGISTRY:
        try:
            outcome = extractor.extract(ctx, facts)
        except Exception as exc:  # one extractor must never sink the whole run
            outcome = {"error": f"{type(exc).__name__}: {exc}"}
        facts[extractor.name] = outcome
    return facts
@@ -0,0 +1,77 @@
1
+ """Auth model extractor — scheme, login surface, guards.
2
+
3
+ Uses framework + route signals (e.g. a NextAuth catch-all route is a dead
4
+ giveaway) before falling back to grep, so it doesn't coin-flip between bearer and
5
+ cookie the way naive signal-counting does.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from .base import Extractor, RepoContext
13
+
14
# Source-text signals for each auth scheme. All are case-insensitive except
# PASSPORT, which is compiled without flags.
JWT_LIBS = re.compile(
    r"jsonwebtoken|\bjose\b|\bPyJWT\b|import\s+jwt\b|get_jwt_identity|"
    r"jwt\.(?:sign|verify|encode|decode)|jwtVerify|flask_jwt|@?jwt_required|token_required",
    re.I,
)
PASSPORT = re.compile(r"\bpassport\b|passport-jwt|passport-local")
SESSION = re.compile(
    r"express-session|cookie-session|iron-session|flask\.session|request\.session|getServerSession|getToken",
    re.I,
)
APIKEY = re.compile(r"x-api-key|api[_-]?key|apikey", re.I)
# Names of guard helpers/decorators; a file matching this is listed as a guard file.
GUARDS = re.compile(
    r"requireAuth|requirePermission|requireRole|isAuthenticated|@login_required|@require|ensureAuth|withAuth|getServerSession|verifyToken|authMiddleware|@roles_required|can\(|ability\.",
    re.I,
)
20
+
21
+
22
class AuthExtractor(Extractor):
    """Detect which authentication schemes a repo uses and pick a primary one."""

    name = "auth"
    category = "authn"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Count per-scheme signals across all code files and rank the schemes.

        Reads ``facts['stack']['frameworks']`` and
        ``facts['routes']['targeting']['auth_endpoints']`` when present. The
        primary scheme is the first detected in the fixed priority
        nextauth > jwt > session > passport > api-key, so a JWT app that also
        wires Passport for SSO is reported as JWT.
        """
        frameworks = set((facts.get("stack") or {}).get("frameworks", []))
        targeting = (facts.get("routes") or {}).get("targeting") or {}
        auth_eps = targeting.get("auth_endpoints", [])

        # one counter per scheme: number of files in which its pattern appears
        patterns = {"jwt": JWT_LIBS, "passport": PASSPORT, "session": SESSION, "api_key": APIKEY}
        hits = dict.fromkeys(patterns, 0)
        guard_files = []
        for _path, rel, text in ctx.iter_code():
            for key, rx in patterns.items():
                if rx.search(text):
                    hits[key] += 1
            # keep at most 25 example files that mention a guard
            if len(guard_files) < 25 and GUARDS.search(text):
                guard_files.append(rel)

        nextauth = "nextauth" in frameworks or any("nextauth" in e.lower() for e in auth_eps)

        # (signal, label, token location), listed in priority order
        ranked = (
            (nextauth, "nextauth (session JWT in cookie)", "cookie"),
            (hits["jwt"], "jwt (bearer)", "bearer"),
            (hits["session"], "session-cookie", "cookie"),
            (hits["passport"], "passport (often SSO/OAuth strategies)", "cookie-or-bearer"),
            (hits["api_key"], "api-key", "header"),
        )
        present = [(label, location) for signal, label, location in ranked if signal]
        detected = [label for label, _location in present]
        if present:
            primary, token_location = present[0]
        else:
            primary, token_location = "unknown", "unknown"

        return {
            "scheme": primary,
            "schemes_detected": detected,
            "token_location": token_location,
            "login_endpoints": auth_eps,
            "guard_files": guard_files,
            "signal_counts": dict(hits),
            "note": "AGENT: confirm the PRIMARY auth flow + how a test token is minted before the JWT/auth "
                    "probes. Multiple schemes often mean primary bearer/session + secondary SSO (passport).",
        }
@@ -0,0 +1,130 @@
1
+ """Authorization extractor — the access-control map (who can reach what).
2
+
3
+ Per your methodology this is the highest-value test. For each endpoint we decide
4
+ whether a guard protects it, using three signals:
5
+ 1. a guard pattern in the handler's own file (incl. `router.use(authenticate)`),
6
+ 2. coverage by a Next.js middleware matcher,
7
+ 3. a GLOBAL auth middleware (`app.use(authenticate)`) — when present, routes are
8
+ protected by default and "no visible guard" becomes a *verify* signal, not an
9
+ alarm (this is what inflated the count on the Express monorepo).
10
+
11
+ File-level heuristic → results are HINTS the agent confirms. The high-signal
12
+ output is write endpoints with no visible guard that also don't look public.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from pathlib import Path
19
+
20
+ from .base import Extractor, RepoContext
21
+
22
# HTTP methods that change state.
WRITE_VERBS = set(("POST", "PUT", "PATCH", "DELETE"))

# Anything that looks like an auth/authz guard inside a handler file, including
# router-level `.use(<something auth-ish>)` registrations.
GUARD = re.compile(
    r"requireAuth|requirePermission|requireRole|requireGroupAccess|isAuthenticated|"
    r"@login_required|@jwt_required|@permission_required|@roles_required|ensureAuth|"
    r"withAuth|getServerSession|getToken\s*\(|verifyToken|authMiddleware|@UseGuards|"
    r"@Roles\b|Depends\s*\(\s*(?:get_current_user|oauth2_scheme|require_)|Security\s*\(|"
    r"PermissionRequired|LoginRequired|passport\.authenticate|"
    r"\.use\s*\(\s*[\w.]*(?:[Aa]uth|[Vv]erifyToken|[Rr]equire|[Gg]uard|jwt)\w*",
    re.I,
)

# An app-wide auth middleware registered without a path argument: everything
# mounted after it is protected by default.
GLOBAL_AUTH = re.compile(
    r"app\.use\s*\(\s*[\w.]*(?:authenticate|requireAuth|authMiddleware|verifyToken|"
    r"isAuthenticated|jwtMiddleware|ensureAuth)\w*\s*\)",
    re.I,
)

# Path segments that usually denote intentionally public endpoints.
PUBLIC_HINT = re.compile(
    r"/(login|logout|register|signup|signin|health|healthz|ping|status|webhooks?|"
    r"public|\.well-known|robots|favicon|sitemap|callback|refresh|csrf|metrics)\b",
    re.I,
)

# Role names in source; each alternative has exactly one capture group.
ROLE = re.compile(
    r"@Roles\s*\(([^)]*)\)|allowedRoles\s*=\s*\[([^\]]*)\]|"
    r"\b(?:role|roles)\b\s*[!=]==?\s*['\"]([\w:.-]+)['\"]|"
    r"has_?[Rr]ole\s*\(\s*['\"]([\w:.-]+)['\"]|"
    r"authorizeRoles\s*\(([^)]*)\)|permission_required\s*\(\s*['\"]([\w:.-]+)['\"]"
)
46
+
47
+
48
def _parse_next_middleware(ctx: RepoContext) -> dict:
    """Locate a Next.js middleware file and extract its matcher patterns.

    The first of ``middleware.ts|js`` / ``src/middleware.ts|js`` that has
    content is used. Both matcher spellings are understood: an array
    (``matcher: ['/a', '/b']``) and a single string (``matcher: '/a'``).

    Returns:
        ``{"present": True, "file", "matchers", "role_checks"}`` for the file
        that was found, otherwise ``{"present": False, "matchers": []}``.
    """
    for cand in ("middleware.ts", "middleware.js", "src/middleware.ts", "src/middleware.js"):
        txt = ctx.manifest(cand)
        if not txt:
            continue
        array_form = re.search(r"matcher\s*:\s*\[([^\]]*)\]", txt)
        if array_form:
            patterns = re.findall(r"['\"]([^'\"]+)['\"]", array_form.group(1))
        else:
            # Next.js also accepts one bare string instead of an array.
            string_form = re.search(r"matcher\s*:\s*['\"]([^'\"]+)['\"]", txt)
            patterns = [string_form.group(1)] if string_form else []
        roles = [m for grp in ROLE.findall(txt) for m in grp if m]
        return {"present": True, "file": cand, "matchers": patterns, "role_checks": roles}
    return {"present": False, "matchers": []}
58
+
59
+
60
+ def _matcher_covers(path: str, matchers: list) -> bool:
61
+ for m in matchers:
62
+ base = m.split(":")[0].split("(")[0].rstrip("/*")
63
+ if base and path.startswith(base):
64
+ return True
65
+ if m.startswith("/(") or m == "/:path*":
66
+ return True
67
+ return False
68
+
69
+
70
def _collect_roles(text: str, roles: set) -> None:
    """Add every role name that ``ROLE`` finds in ``text`` to ``roles`` in place.

    Captures may be comma-separated lists; each item is stripped of
    whitespace and quotes, and kept only if non-empty and under 40 characters.
    """
    captures = (grp for match in ROLE.findall(text or "") for grp in match if grp)
    for capture in captures:
        for piece in capture.split(","):
            value = piece.strip().strip("'\" ")
            if value and len(value) < 40:
                roles.add(value)
79
+
80
+
81
class AuthzExtractor(Extractor):
    """Build the access-control map: which endpoints show a visible guard."""

    name = "authz"
    category = "authz"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Classify every endpoint in ``facts['routes']['endpoints']``.

        An endpoint counts as guarded when its handler file matches ``GUARD``
        or a Next.js middleware matcher covers its path. Endpoints whose file
        text is unavailable are counted as ``unknown``. Unguarded write
        endpoints that do not look public are listed separately. The
        global-auth flag only changes the wording of ``note``.
        """
        endpoints = (facts.get("routes") or {}).get("endpoints", [])
        mw = _parse_next_middleware(ctx)
        matchers = mw.get("matchers", [])

        global_auth = any(GLOBAL_AUTH.search(src) for _path, _rel, src in ctx.iter_code())
        roles: set = set(mw.get("role_checks", []))
        tally = {"with_visible_guard": 0, "no_visible_guard": 0, "unknown": 0}
        no_guard_writes = []
        egs = []

        for ep in endpoints:
            route = ep.get("path", "")
            cp = ep.get("code_path", "")
            if cp:
                text = ctx.text(Path(cp))
                relcp = ctx.rel(Path(cp))
            else:
                text = relcp = ""
            _collect_roles(text, roles)

            guarded = bool(text and GUARD.search(text)) or _matcher_covers(route, matchers)
            looks_public = bool(PUBLIC_HINT.search(route))
            egs.append({"method": ep.get("method"), "path": ep.get("path"), "code_path": relcp,
                        "guarded": bool(guarded), "analyzed": bool(text),
                        "public_hint": looks_public})

            if guarded:
                tally["with_visible_guard"] += 1
                continue
            if not text:
                tally["unknown"] += 1
                continue
            tally["no_visible_guard"] += 1
            if ep.get("method") in WRITE_VERBS and not looks_public:
                no_guard_writes.append(f"{ep['method']} {ep['path']} ({relcp or '?'})")

        if global_auth:
            note = ("A GLOBAL auth middleware (`app.use(<auth>)`) was detected — most routes are likely "
                    "protected by default. The list below is write endpoints with NO guard visible in their "
                    "own handler file; they MAY be covered globally. Verify each is either covered or an "
                    "intentional public exemption — don't assume they're vulnerable.")
        else:
            note = ("No global auth middleware detected. Write endpoints with no visible guard are "
                    "high-signal missing-authz leads — verify each.")

        return {
            "global_auth_middleware": global_auth,
            "next_middleware": mw,
            "roles_detected": sorted(r for r in roles if r),
            "guard_summary": tally,
            "endpoint_guards": egs[:400],
            "write_endpoints_without_visible_guard": sorted(set(no_guard_writes))[:60],
            "note": note,
        }
@@ -0,0 +1,101 @@
1
+ """Extractor framework — the backbone of the recon engine.
2
+
3
+ Each extractor reads a shared, walked-once RepoContext and returns its slice of
4
+ FACTS. Extractors are deterministic (no LLM, no network to the target) and
5
+ degrade gracefully — a missing tool or unrecognized framework yields partial
6
+ facts, never a crash. This is what lets the engine scale to a big monorepo and
7
+ still say something useful.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
# Directory names that are never scanned: VCS data, dependency and build
# output, caches, this tool's own output, and agent/editor/worktree folders
# that are not part of the target app.
SKIP_DIRS = {
    ".git", "node_modules", "dist", "build", ".next", ".nuxt", "venv",
    ".venv", "__pycache__", ".mypy_cache", ".pytest_cache", "coverage",
    ".turbo", "out", "target", ".gradle", "vendor", "site-packages",
    ".terraform", "security", ".websec-out", "websec-out", ".cache",
    ".svelte-kit", "storybook-static", ".serverless",
    ".wolf", ".claude", ".worktrees", ".idea", ".vscode", ".agent", ".agents",
}
# File suffixes (lower-case) that count as code.
CODE_EXT = {
    ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".py", ".go", ".rb",
    ".java", ".php", ".prisma",
}
MAX_FILES = 12000        # cap used by the repo walk
MAX_BYTES = 2_000_000    # files larger than this are read as empty text
25
+
26
+
27
class RepoContext:
    """Walk the tree once; cache file text; serve cheap queries to every extractor."""

    def __init__(self, root: Path):
        self.root = root
        self._text: dict[Path, str] = {}      # path -> cached file text
        self.code_files: list[Path] = []      # filled once by _walk()
        self.stack: dict = {}                 # filled by StackExtractor, read by the rest
        self._walk()

    def _walk(self) -> None:
        """Collect up to MAX_FILES code files under root, pruning SKIP_DIRS.

        Skipped directories are removed from the traversal itself, so large
        trees such as ``node_modules`` or ``.git`` are never descended into.
        Names are compared relative to the scan root only, so a repo that
        lives *under* a directory named like a skip-dir is still scanned.
        """
        import os

        for dirpath, dirnames, filenames in os.walk(self.root):
            # in-place edit tells os.walk not to descend; sorted for stable order
            dirnames[:] = sorted(d for d in dirnames if d not in SKIP_DIRS)
            for fname in sorted(filenames):
                if fname in SKIP_DIRS:
                    continue
                if len(self.code_files) >= MAX_FILES:
                    return
                p = Path(dirpath) / fname
                if p.suffix.lower() in CODE_EXT:
                    self.code_files.append(p)

    def rel(self, p: Path) -> str:
        """Return ``p`` relative to root as a string, or ``p`` unchanged if it is outside root."""
        try:
            return str(p.relative_to(self.root))
        except ValueError:
            return str(p)

    def text(self, p: Path) -> str:
        """Return the cached text of ``p``; ``""`` if unreadable or larger than MAX_BYTES."""
        if p not in self._text:
            try:
                if p.stat().st_size > MAX_BYTES:
                    self._text[p] = ""
                else:
                    # explicit UTF-8 so results don't depend on the OS locale
                    self._text[p] = p.read_text(encoding="utf-8", errors="ignore")
            except Exception:
                self._text[p] = ""
        return self._text[p]

    def iter_code(self):
        """Yield (path, relpath, text) for every collected code file."""
        for p in self.code_files:
            yield p, self.rel(p), self.text(p)

    def manifest(self, name: str) -> str:
        """Return the text of root/name, or ``""`` if it is missing or unreadable."""
        f = self.root / name
        try:
            return f.read_text(encoding="utf-8", errors="ignore") if f.is_file() else ""
        except Exception:
            return ""

    def glob(self, pattern: str, limit: int = 2000) -> list[Path]:
        """rglob filtered against SKIP_DIRS (for file-based framework detection).

        Returns at most ``limit`` matching paths.
        """
        out = []
        for p in self.root.rglob(pattern):
            if any(part in SKIP_DIRS for part in p.relative_to(self.root).parts):
                continue
            out.append(p)
            if len(out) >= limit:
                break
        return out

    def exists(self, *names: str) -> bool:
        """Return True if at least one of ``names`` exists directly under root."""
        return any((self.root / n).exists() for n in names)
90
+
91
+
92
class Extractor:
    """Common base for all extractors.

    A subclass overrides the ``name`` and ``category`` class attributes and
    provides its own ``extract()``.
    """

    name: str = "extractor"
    category: str = "misc"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:  # pragma: no cover
        """Produce this extractor's slice of FACTS.

        ``facts`` already contains the results of the extractors that ran
        earlier, so an implementation may branch on them. The base
        implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError
@@ -0,0 +1,48 @@
1
+ """Client-side exposure extractor — secrets that leak into the browser bundle.
2
+
3
+ The Next.js/Vite footgun: any `NEXT_PUBLIC_*` / `VITE_*` var is inlined into the
4
+ client bundle, and a server-only secret referenced from a client component ships
5
+ to every visitor. Cheap static scan, high signal.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ from .base import Extractor, RepoContext
13
+
14
# Env-var names carrying a prefix that bundlers inline into client code.
PUBLIC_ENV = re.compile(
    r"\b(NEXT_PUBLIC_\w+|VITE_\w+|REACT_APP_\w+|GATSBY_\w+|EXPO_PUBLIC_\w+|PUBLIC_\w{2,})\b"
)
# Name fragments suggesting the value is a credential (case-insensitive).
SECRETISH = re.compile(
    r"SECRET|PRIVATE|TOKEN|PASSWORD|PASSWD|API_?KEY|ACCESS_?KEY|CLIENT_SECRET|CREDENTIAL",
    re.I,
)
# `process.env.<NAME>` reads whose NAME looks like a secret; captures NAME.
SERVER_SECRET = re.compile(
    r"process\.env\.([A-Z0-9_]*(?:SECRET|PRIVATE|TOKEN|PASSWORD|API_?KEY|ACCESS_?KEY)[A-Z0-9_]*)"
)
17
+
18
+
19
class ClientExposureExtractor(Extractor):
    """Find secrets that are likely to end up in the browser bundle."""

    name = "client_exposure"
    category = "exposure"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Scan all code files for client-exposed environment variables.

        Reports:
          * every public-prefixed env var name (capped at 40 in the output);
          * public-prefixed vars whose name also looks secret;
          * secret-looking ``process.env.X`` reads in files whose first 200
            characters contain ``use client``;
          * whether a Next.js config enables production browser source maps.
        """
        public_vars: set = set()
        public_secret_leaks: set = set()       # public-prefixed AND secret-named → ships to client
        server_secret_in_client: set = set()   # server secret referenced from a 'use client' file

        for _p, rel, text in ctx.iter_code():
            for v in PUBLIC_ENV.findall(text):
                public_vars.add(v)
                if SECRETISH.search(v):
                    public_secret_leaks.add(f"{v} ({rel})")
            # A single substring test covers the bare, single-quoted and
            # double-quoted spellings of the directive.
            if "use client" in text[:200]:
                for s in SERVER_SECRET.findall(text):
                    server_secret_in_client.add(f"{s} ({rel})")

        nextcfg = "".join(ctx.manifest(cfg_name)
                          for cfg_name in ("next.config.js", "next.config.mjs", "next.config.ts"))
        # Tolerate any spacing around the colon (formatters differ).
        sourcemaps = re.search(r"productionBrowserSourceMaps\s*:\s*true\b", nextcfg) is not None

        return {
            "public_env_vars": sorted(public_vars)[:40],
            "public_secret_leaks": sorted(public_secret_leaks),  # HIGH if non-empty
            "server_secret_in_client_component": sorted(server_secret_in_client),  # HIGH if non-empty
            "production_source_maps": sourcemaps,
            "note": "public_secret_leaks and server_secret_in_client_component ship secrets to the browser — "
                    "treat as HIGH and confirm. Plain NEXT_PUBLIC_* without secret-ish names are usually fine.",
        }