PyPI - websec-validator - Versions diffs - 0.2.0__py3-none-any.whl - Mend

websec-validator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

websec_validator/__init__.py +14 -0
websec_validator/briefing.py +218 -0
websec_validator/calibration.json +75 -0
websec_validator/calibration.py +226 -0
websec_validator/cli.py +395 -0
websec_validator/constitution.py +81 -0
websec_validator/corpus.json +49 -0
websec_validator/dynamic.py +249 -0
websec_validator/extractors/__init__.py +56 -0
websec_validator/extractors/auth.py +77 -0
websec_validator/extractors/authz.py +130 -0
websec_validator/extractors/base.py +101 -0
websec_validator/extractors/client_exposure.py +48 -0
websec_validator/extractors/graphql.py +71 -0
websec_validator/extractors/iac_ci.py +65 -0
websec_validator/extractors/integrations.py +55 -0
websec_validator/extractors/routes.py +215 -0
websec_validator/extractors/schemas.py +75 -0
websec_validator/extractors/stack.py +80 -0
websec_validator/extractors/surface.py +86 -0
websec_validator/extractors/tenant.py +33 -0
websec_validator/findings.py +199 -0
websec_validator/probes.py +79 -0
websec_validator/proof.py +96 -0
websec_validator/recon.py +28 -0
websec_validator/report.py +114 -0
websec_validator/scanners.py +248 -0
websec_validator/templates/probes/bola-cross-tenant.sh +192 -0
websec_validator/templates/probes/bola-write-verbs.py +147 -0
websec_validator/templates/probes/compare-roles.sh +69 -0
websec_validator/templates/probes/dlp-bypass-offline.py +149 -0
websec_validator/templates/probes/hs256-brute-force.py +90 -0
websec_validator/templates/probes/jwt-attacks.sh +161 -0
websec_validator/templates/probes/mass-assignment.py +201 -0
websec_validator/templates/probes/race-conditions.py +144 -0
websec_validator/templates/probes/rate-limit-burst.sh +136 -0
websec_validator/templates/probes/s3-assess.sh +120 -0
websec_validator/templates/probes/ssrf-probes.sh +189 -0
websec_validator/templates/probes/webhook-forgery.py +113 -0
websec_validator/templates/reports/FINDINGS-SUMMARY.md.template +75 -0
websec_validator/templates/reports/access-control-matrix.md.template +65 -0
websec_validator/templates/reports/findings-triage.md.template +28 -0
websec_validator/templates/reports/pentest-handover-brief.md.template +121 -0
websec_validator/templates/reports/per-tool-FINDINGS.md.template +37 -0
websec_validator-0.2.0.dist-info/METADATA +232 -0
websec_validator-0.2.0.dist-info/RECORD +50 -0
websec_validator-0.2.0.dist-info/WHEEL +5 -0
websec_validator-0.2.0.dist-info/entry_points.txt +2 -0
websec_validator-0.2.0.dist-info/licenses/LICENSE +21 -0
websec_validator-0.2.0.dist-info/top_level.txt +1 -0

websec_validator/dynamic.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""Dynamic phase (v1) — authenticated, READ-ONLY cross-tenant BOLA against a live target.
+This closes the loop: the static recon found the group-scoped routes + the tenant
+key; here we mint two real role tokens and check whether one tenant can read
+another tenant's data. v1 is **GET-only** (no mutation) so it is safe to run
+against a shared test environment. Write-verb BOLA / mass-assignment come later,
+explicitly gated.
+Config (JSON):
+{
+  "target": "https://host",
+  "login_path": "/api/auth/login",
+  "token_json_path": "tokens.accessToken",
+  "user_json_path": "user",
+  "tenant_field": "groupIds",          # field on the user object holding tenant id(s)
+  "tenant_path_param": "groupId",       # the {param} in routes that is the tenant boundary
+  "roles": { "agentA": {"email": "..", "password": ".."},
+             "agentB": {"email": "..", "password": ".."} }
+}
+"""
+from __future__ import annotations
+import json
+import re
+import urllib.error
+import urllib.request
+from pathlib import Path
+def _dig(d: dict, dotted: str):
+    cur = d
+    for part in dotted.split("."):
+        if not isinstance(cur, dict):
+            return None
+        cur = cur.get(part)
+    return cur
+def _request(method: str, url: str, token: str | None, timeout: int = 20, data: bytes | None = None):
+    headers = {"Accept": "application/json"}
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    if data is not None:
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(url, method=method, headers=headers, data=data)
+    try:
+        r = urllib.request.urlopen(req, timeout=timeout)
+        return r.status, r.read(4000).decode(errors="replace")
+    except urllib.error.HTTPError as e:
+        return e.code, e.read(1000).decode(errors="replace")
+    except Exception as e:
+        return None, f"{type(e).__name__}: {e}"
def is_localhost(target: str) -> bool:
    """Return True when the URL's host is one of the fixed local names.

    Only the literal hosts ``localhost``, ``127.0.0.1``, ``::1`` and ``0.0.0.0``
    count; a target with no parseable hostname is treated as not local.
    """
    from urllib.parse import urlparse

    host = urlparse(target).hostname
    if not host:
        host = ""
    return host in ("localhost", "127.0.0.1", "::1", "0.0.0.0")
def mint(cfg: dict, role: str) -> dict:
    """Log in as one configured role and return its token and tenant.

    Args:
        cfg: Dynamic-phase config (``target``, ``roles`` and the optional
            ``login_path`` / ``token_json_path`` / ``user_json_path`` /
            ``tenant_field`` keys).
        role: Key into ``cfg["roles"]`` holding ``email`` and ``password``.

    Returns:
        ``{"token", "tenant", "email", "role"}`` on success. On any failure to
        build or perform the login (unknown role, malformed target, network or
        HTTP error, non-JSON response) ``{"error": "<ExceptionType>: <message>"}``.
    """
    try:
        creds = cfg["roles"][role]
        body = json.dumps({"email": creds["email"], "password": creds["password"]}).encode()
        req = urllib.request.Request(cfg["target"] + cfg.get("login_path", "/api/auth/login"),
                                     data=body, headers={"Content-Type": "application/json"})
        # Context manager so the login response's socket is always released.
        with urllib.request.urlopen(req, timeout=20) as resp:
            d = json.load(resp)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}
    token = _dig(d, cfg.get("token_json_path", "tokens.accessToken"))
    user = _dig(d, cfg.get("user_json_path", "user"))
    if not isinstance(user, dict):
        user = {}  # unexpected shape: report empty identity instead of crashing
    raw_tenant = user.get(cfg.get("tenant_field", "groupIds"))
    if isinstance(raw_tenant, (list, tuple)):
        tenant = raw_tenant[0] if raw_tenant else None
    else:
        # The field may hold a single id rather than a list. Indexing a string
        # would return only its first character, and an int cannot be indexed.
        tenant = raw_tenant or None
    return {"token": token, "tenant": tenant,
            "email": user.get("email"), "role": user.get("role")}
+def _tenant_only_get_endpoints(facts: dict, param: str) -> list:
+    """GET endpoints whose ONLY path param is the tenant param — clean cross-tenant
+    list targets that need no other fixture id."""
+    out = []
+    brace = re.compile(r"\{([^}]+)\}")
+    for e in (facts.get("routes") or {}).get("endpoints", []):
+        if e.get("method") != "GET":
+            continue
+        params = brace.findall(e.get("path", ""))
+        if params == [param]:
+            out.append(e["path"])
+    return sorted(set(out))
def cross_tenant_bola(cfg: dict, facts: dict) -> dict:
    """Check, read-only, whether one tenant can GET another tenant's data.

    Logs in as ``agentA`` and ``agentB``, then for every GET endpoint whose only
    path parameter is the tenant parameter, requests the OTHER agent's tenant
    with each agent's token (both directions).

    Args:
        cfg: Dynamic-phase config; ``tenant_path_param`` names the placeholder
            (default ``"groupId"``) and ``target`` is the base URL.
        facts: Static recon facts providing ``routes.endpoints``.

    Returns:
        A report dict with per-check ``results`` (verdicts ``blocked``,
        ``blocked-empty``, ``LEAK`` or ``investigate``), counts and a summary;
        or ``{"error": ...}`` when the two tokens cannot be minted or the
        agents are not in two distinct tenants.
    """
    from urllib.parse import quote

    param = cfg.get("tenant_path_param", "groupId")
    a, b = mint(cfg, "agentA"), mint(cfg, "agentB")
    if not a.get("token") or not b.get("token"):
        return {"error": "could not mint both agent tokens", "agentA": a.get("error"), "agentB": b.get("error")}
    if a.get("tenant") == b.get("tenant") or not (a.get("tenant") and b.get("tenant")):
        return {"error": f"agents are not in two distinct tenants (A={a.get('tenant')}, B={b.get('tenant')})"}
    endpoints = _tenant_only_get_endpoints(facts, param)
    placeholder = "{" + param + "}"
    # Bodies that mean "200 but nothing returned", compared with whitespace removed
    # so that `{"data": []}` and pretty-printed variants are recognised too.
    empty_bodies = ("[]", "{}", '{"data":[]}')
    results = []
    for path in endpoints:
        # attacker A tries to read B's tenant data, and vice-versa
        for atk, vic, direction in ((a, b, "A→B"), (b, a, "B→A")):
            # Tenant ids may be ints or contain URL-reserved characters:
            # stringify and percent-encode before splicing into the path.
            tenant_segment = quote(str(vic["tenant"]), safe="")
            url = cfg["target"] + path.replace(placeholder, tenant_segment)
            code, body = _request("GET", url, atk["token"])
            if code in (401, 403, 404):
                verdict = "blocked"
            elif code in (200, 206):
                compact = "".join(body.split()) if body else ""
                if compact and compact not in empty_bodies:
                    verdict = "LEAK"
                else:
                    verdict = "blocked-empty"   # 200 but no cross-tenant data returned
            else:
                verdict = "investigate"
            results.append({"path": path, "direction": direction, "status": code, "verdict": verdict})
    blocked = sum(1 for r in results if r["verdict"].startswith("blocked"))
    leaks = [r for r in results if r["verdict"] == "LEAK"]
    return {
        "target": cfg["target"],
        "tenant_param": param,
        "agentA": {"email": a.get("email"), "tenant": a.get("tenant")},
        "agentB": {"email": b.get("email"), "tenant": b.get("tenant")},
        "endpoints_tested": len(endpoints),
        "checks": len(results),
        "blocked": blocked,
        "leaks": leaks,
        "results": results,
        "summary": f"{blocked}/{len(results)} cross-tenant GET reads blocked" + (f" — {len(leaks)} LEAK(S)!" if leaks else " — all isolated"),
    }
# Paths matching this pattern are never probed, even with GET: hitting them can
# kick off real work (cron ticks, scraping, content generation, seeding, sending,
# uploads).
SIDE_EFFECTING = re.compile(
    r"/cron|/seed|generate|regenerate|/trigger|/sync|/send|/run\b|social-image|"
    r"sponsor-post|upload|/refresh|/rebuild|/process|/dispatch|/import|/export|/scrape(?![\w-])", re.I)


def unauth_reachability(target: str, facts: dict, max_endpoints: int = 50) -> dict:
    """Probe which plain data-read GET endpoints answer without any auth.

    Only GET routes with no ``{param}`` placeholder and no side-effecting path
    are requested, capped at ``max_endpoints`` after de-duplication and sorting.
    Each result records the status, the length of the returned text and a
    verdict; response bodies are never stored.

    Returns a report dict with the tested results, the side-effecting GET paths
    that were skipped, the endpoints judged open, and a one-line summary.
    """
    endpoints = (facts.get("routes") or {}).get("endpoints", [])

    candidates = set()
    for ep in endpoints:
        route = ep.get("path", "")
        if ep.get("method") == "GET" and "{" not in route and not SIDE_EFFECTING.search(route):
            candidates.add(route)
    probe_paths = sorted(candidates)[:max_endpoints]

    skipped = [ep.get("path") for ep in endpoints
               if ep.get("method") == "GET" and SIDE_EFFECTING.search(ep.get("path", ""))]

    def classify(status, size):
        """Map a status code and body length to the report verdict."""
        if status in (401, 403):
            return "protected"
        # NOTE(review): 3xx statuses only reach this point if `_request` does not
        # follow redirects itself — confirm.
        if status in (301, 302, 307, 308):
            return "redirect (likely to login)"
        if status in (200, 206):
            return "OPEN-no-auth" if size > 2 else "open-empty"
        if status == 404:
            return "404"
        return f"http-{status}"

    results = []
    for route in probe_paths:
        status, text = _request("GET", target + route, token=None, timeout=15)
        size = len(text) if isinstance(text, str) else 0
        results.append({"path": route, "status": status, "bytes": size,
                        "verdict": classify(status, size)})

    open_hits = [row for row in results if row["verdict"] == "OPEN-no-auth"]
    tail = " — review whether these should be public" if open_hits else " — all gated"
    return {
        "target": target,
        "mode": "STRICT read-only · unauthenticated · GET-only · side-effecting paths skipped",
        "tested": len(results),
        "skipped_side_effecting": sorted(set(skipped)),
        "open_no_auth": open_hits,
        "results": results,
        "summary": f"{len(open_hits)}/{len(results)} data-read GET endpoints reachable WITHOUT auth" + tail,
    }
WRITE_VERBS = {"POST", "PUT", "PATCH", "DELETE"}


def write_auth_enforcement(target: str, facts: dict, max_endpoints: int = 80) -> dict:
    """LOCALHOST-ONLY heuristic: does each write endpoint enforce auth?

    Every non-side-effecting POST/PUT/PATCH/DELETE route (de-duplicated, sorted,
    capped at ``max_endpoints``) is requested UNAUTHENTICATED with a ``{}`` body
    and ``websec-nonexistent-id`` substituted for each path placeholder.

    Status interpretation:
      * 401/403 -> ``auth-enforced``
      * 200/201/204 -> ``EXECUTED-UNAUTH``
      * 400/404/405/409/415/422/500 -> ``no-auth-gate (reached handler/validation)``
      * anything else -> ``http-<status>``

    ``no_auth_gate`` in the report holds every result that is neither
    ``auth-enforced`` nor ``http-*`` (so it also contains the executed ones).
    This function does not itself check that ``target`` is local.
    """
    routes = facts.get("routes") or {}
    write_targets = {
        (ep["method"], ep.get("path", ""))
        for ep in routes.get("endpoints", [])
        if ep.get("method") in WRITE_VERBS and not SIDE_EFFECTING.search(ep.get("path", ""))
    }
    ordered = sorted(write_targets)[:max_endpoints]

    # status code -> verdict lookup; unknown statuses fall through to http-<code>
    verdict_for = {}
    for status in (401, 403):
        verdict_for[status] = "auth-enforced"
    for status in (200, 201, 204):
        verdict_for[status] = "EXECUTED-UNAUTH"
    for status in (400, 422, 404, 405, 409, 415, 500):
        verdict_for[status] = "no-auth-gate (reached handler/validation)"

    results = []
    for verb, route in ordered:
        probe_url = target + re.sub(r"\{[^}]+\}", "websec-nonexistent-id", route)
        status, _body = _request(verb, probe_url, token=None, data=b"{}")
        verdict = verdict_for.get(status, f"http-{status}")
        results.append({"method": verb, "path": route, "status": status, "verdict": verdict})

    ungated = []
    executed = []
    enforced = 0
    for row in results:
        label = row["verdict"]
        if label == "auth-enforced":
            enforced += 1
        elif not label.startswith("http-"):
            ungated.append(row)
        if label == "EXECUTED-UNAUTH":
            executed.append(row)

    return {
        "note": "Heuristic: a protected route returns 401/403 BEFORE validation; a 400/404 unauth means "
                "the request reached the handler with no auth gate. VERIFY each — but inconsistency vs "
                "sibling routes is high-signal. Empty body + dummy ids keep this non-destructive.",
        "tested": len(results),
        "auth_enforced": enforced,
        "no_auth_gate": ungated,
        "executed_unauth": executed,
        "results": results,
        "summary": f"{enforced}/{len(results)} write endpoints enforce auth · "
                   f"{len(ungated)} reached with no auth gate · {len(executed)} executed unauthenticated",
    }
def run_unauth(target: str, facts_path: Path, outdir: Path, probe_writes: bool = False) -> dict:
    """Run the unauthenticated probes and write ``dynamic-unauth-findings.json``.

    Args:
        target: Base URL of the application under test.
        facts_path: Path to the recon facts JSON file.
        outdir: Directory for the findings file; created if missing.
        probe_writes: When True, also run the write-verb auth-enforcement probe.

    Returns:
        The findings dict that was written to disk.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    facts = json.loads(Path(facts_path).read_text(encoding="utf-8"))
    res = {"unauth_reachability": unauth_reachability(target, facts)}
    if probe_writes:
        res["write_auth_enforcement"] = write_auth_enforcement(target, facts)
    outdir = Path(outdir)  # accept str like facts_path does
    outdir.mkdir(parents=True, exist_ok=True)
    (outdir / "dynamic-unauth-findings.json").write_text(json.dumps(res, indent=2), encoding="utf-8")
    return res
def run_dynamic(config_path: Path, facts_path: Path, outdir: Path) -> dict:
    """Run the authenticated cross-tenant check and write ``dynamic-findings.json``.

    Args:
        config_path: Path to the dynamic-phase JSON config (target, roles, ...).
        facts_path: Path to the recon facts JSON file.
        outdir: Directory for the findings file; created if missing.

    Returns:
        The findings dict that was written to disk.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    cfg = json.loads(Path(config_path).read_text(encoding="utf-8"))
    facts = json.loads(Path(facts_path).read_text(encoding="utf-8"))
    res = {"cross_tenant_bola": cross_tenant_bola(cfg, facts)}
    outdir = Path(outdir)  # accept str like the other path arguments
    outdir.mkdir(parents=True, exist_ok=True)
    (outdir / "dynamic-findings.json").write_text(json.dumps(res, indent=2), encoding="utf-8")
    return res

websec_validator/extractors/__init__.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Extractor registry + the run_all driver.
+Order matters: stack runs first (later extractors read facts['stack']), then the
+surface/authz extractors. Adding a new dimension = drop a module here and append
+it to REGISTRY — that's the whole extension model.
+"""
+from __future__ import annotations
+from pathlib import Path
+from .auth import AuthExtractor
+from .authz import AuthzExtractor
+from .base import Extractor, RepoContext
+from .client_exposure import ClientExposureExtractor
+from .graphql import GraphQLExtractor
+from .iac_ci import IacCiExtractor
+from .integrations import IntegrationsExtractor
+from .routes import RoutesExtractor
+from .schemas import SchemasExtractor
+from .stack import StackExtractor
+from .surface import SurfaceExtractor
+from .tenant import TenantExtractor
# Execution order is significant: StackExtractor goes first because later
# extractors read facts['stack'], and AuthzExtractor must follow RoutesExtractor
# because it reads facts['routes'].
REGISTRY: list[Extractor] = [
    StackExtractor(),
    RoutesExtractor(),
    AuthExtractor(),
    AuthzExtractor(),
    TenantExtractor(),
    SurfaceExtractor(),
    SchemasExtractor(),
    IacCiExtractor(),
    ClientExposureExtractor(),
    GraphQLExtractor(),
    IntegrationsExtractor(),
]


def run_all(root: Path, version: str) -> dict:
    """Build one RepoContext for ``root`` and run every registered extractor.

    The returned FACTS dict starts with run metadata (tool, version, resolved
    target path, number of code files) and gains one entry per extractor keyed
    by the extractor's ``name``. An extractor that raises contributes
    ``{"error": "<ExceptionType>: <message>"}`` instead of aborting the run.
    """
    ctx = RepoContext(root)
    facts: dict = {}
    facts["tool"] = "websec-validator"
    facts["version"] = version
    facts["target"] = str(root.resolve())
    facts["files_scanned"] = len(ctx.code_files)

    for extractor in REGISTRY:
        try:
            section = extractor.extract(ctx, facts)
        except Exception as exc:  # one extractor must never sink the whole run
            section = {"error": f"{type(exc).__name__}: {exc}"}
        facts[extractor.name] = section
    return facts

websec_validator/extractors/auth.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""Auth model extractor — scheme, login surface, guards.
+Uses framework + route signals (e.g. a NextAuth catch-all route is a dead
+giveaway) before falling back to grep, so it doesn't coin-flip between bearer and
+cookie the way naive signal-counting does.
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
# Per-scheme source signals. Each pattern is searched once per code file.
JWT_LIBS = re.compile(r"jsonwebtoken|\bjose\b|\bPyJWT\b|import\s+jwt\b|get_jwt_identity|"
                      r"jwt\.(?:sign|verify|encode|decode)|jwtVerify|flask_jwt|@?jwt_required|token_required", re.I)
PASSPORT = re.compile(r"\bpassport\b|passport-jwt|passport-local")
SESSION = re.compile(r"express-session|cookie-session|iron-session|flask\.session|request\.session|getServerSession|getToken", re.I)
APIKEY = re.compile(r"x-api-key|api[_-]?key|apikey", re.I)
GUARDS = re.compile(r"requireAuth|requirePermission|requireRole|isAuthenticated|@login_required|@require|ensureAuth|withAuth|getServerSession|verifyToken|authMiddleware|@roles_required|can\(|ability\.", re.I)


class AuthExtractor(Extractor):
    """Describe the authentication model: schemes seen, primary scheme, guards."""

    name = "auth"
    category = "authn"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return the auth slice of FACTS.

        Counts files matching each scheme pattern, collects up to 25 files that
        contain a guard pattern, and picks the primary scheme by fixed priority:
        nextauth > jwt > session > passport > api-key.
        """
        frameworks = set((facts.get("stack") or {}).get("frameworks", []))
        routes = facts.get("routes") or {}
        auth_eps = (routes.get("targeting") or {}).get("auth_endpoints", [])

        # file counts per scheme; key order is the order reported in signal_counts
        hits = {"jwt": 0, "passport": 0, "session": 0, "api_key": 0}
        probes = (("jwt", JWT_LIBS), ("passport", PASSPORT), ("session", SESSION), ("api_key", APIKEY))
        guard_files = []
        for _path, rel, text in ctx.iter_code():
            for key, pattern in probes:
                if pattern.search(text):
                    hits[key] += 1
            if len(guard_files) < 25 and GUARDS.search(text):
                guard_files.append(rel)

        # A NextAuth framework hit or a nextauth-looking auth route settles it
        # before any grep counts are consulted.
        nextauth = "nextauth" in frameworks or any("nextauth" in e.lower() for e in auth_eps)

        # Every detected scheme is listed, in priority order, so the first entry
        # is the primary one (a JWT app that also wires Passport reads as jwt).
        labelled = (
            (nextauth, "nextauth (session JWT in cookie)"),
            (hits["jwt"], "jwt (bearer)"),
            (hits["session"], "session-cookie"),
            (hits["passport"], "passport (often SSO/OAuth strategies)"),
            (hits["api_key"], "api-key"),
        )
        detected = [label for present, label in labelled if present]
        primary = detected[0] if detected else "unknown"

        if primary.startswith(("nextauth", "session")):
            token_location = "cookie"
        elif primary.startswith("jwt"):
            token_location = "bearer"
        elif primary.startswith("api-key"):
            token_location = "header"
        elif primary.startswith("passport"):
            token_location = "cookie-or-bearer"
        else:
            token_location = "unknown"

        return {
            "scheme": primary,
            "schemes_detected": detected,
            "token_location": token_location,
            "login_endpoints": auth_eps,
            "guard_files": guard_files,
            "signal_counts": dict(hits),
            "note": "AGENT: confirm the PRIMARY auth flow + how a test token is minted before the JWT/auth "
                    "probes. Multiple schemes often mean primary bearer/session + secondary SSO (passport).",
        }

websec_validator/extractors/authz.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""Authorization extractor — the access-control map (who can reach what).
+Per your methodology this is the highest-value test. For each endpoint we decide
+whether a guard protects it, using three signals:
+  1. a guard pattern in the handler's own file (incl. `router.use(authenticate)`),
+  2. coverage by a Next.js middleware matcher,
+  3. a GLOBAL auth middleware (`app.use(authenticate)`) — when present, routes are
+     protected by default and "no visible guard" becomes a *verify* signal, not an
+     alarm (this is what inflated the count on the Express monorepo).
+File-level heuristic → results are HINTS the agent confirms. The high-signal
+output is write endpoints with no visible guard that also don't look public.
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+from .base import Extractor, RepoContext
+WRITE_VERBS = {"POST", "PUT", "PATCH", "DELETE"}
+GUARD = re.compile(
+    r"requireAuth|requirePermission|requireRole|requireGroupAccess|isAuthenticated|"
+    r"@login_required|@jwt_required|@permission_required|@roles_required|ensureAuth|"
+    r"withAuth|getServerSession|getToken\s*\(|verifyToken|authMiddleware|@UseGuards|"
+    r"@Roles\b|Depends\s*\(\s*(?:get_current_user|oauth2_scheme|require_)|Security\s*\(|"
+    r"PermissionRequired|LoginRequired|passport\.authenticate|"
+    r"\.use\s*\(\s*[\w.]*(?:[Aa]uth|[Vv]erifyToken|[Rr]equire|[Gg]uard|jwt)\w*", re.I)
+# a global, path-less auth middleware → everything downstream is protected by default
+GLOBAL_AUTH = re.compile(
+    r"app\.use\s*\(\s*[\w.]*(?:authenticate|requireAuth|authMiddleware|verifyToken|"
+    r"isAuthenticated|jwtMiddleware|ensureAuth)\w*\s*\)", re.I)
+PUBLIC_HINT = re.compile(
+    r"/(login|logout|register|signup|signin|health|healthz|ping|status|webhooks?|"
+    r"public|\.well-known|robots|favicon|sitemap|callback|refresh|csrf|metrics)\b", re.I)
+ROLE = re.compile(
+    r"@Roles\s*\(([^)]*)\)|allowedRoles\s*=\s*\[([^\]]*)\]|"
+    r"\b(?:role|roles)\b\s*[!=]==?\s*['\"]([\w:.-]+)['\"]|"
+    r"has_?[Rr]ole\s*\(\s*['\"]([\w:.-]+)['\"]|"
+    r"authorizeRoles\s*\(([^)]*)\)|permission_required\s*\(\s*['\"]([\w:.-]+)['\"]")
+def _parse_next_middleware(ctx: RepoContext) -> dict:
+    for cand in ("middleware.ts", "middleware.js", "src/middleware.ts", "src/middleware.js"):
+        txt = ctx.manifest(cand)
+        if not txt:
+            continue
+        matchers = re.findall(r"matcher\s*:\s*\[([^\]]*)\]", txt)
+        patterns = re.findall(r"['\"]([^'\"]+)['\"]", matchers[0]) if matchers else []
+        roles = [m for grp in ROLE.findall(txt) for m in grp if m]
+        return {"present": True, "file": cand, "matchers": patterns, "role_checks": roles}
+    return {"present": False, "matchers": []}
+def _matcher_covers(path: str, matchers: list) -> bool:
+    for m in matchers:
+        base = m.split(":")[0].split("(")[0].rstrip("/*")
+        if base and path.startswith(base):
+            return True
+        if m.startswith("/(") or m == "/:path*":
+            return True
+    return False
def _collect_roles(text: str, roles: set) -> None:
    """Add every role name matched by ROLE in ``text`` to ``roles`` (in place).

    A capture may hold a comma-separated list; each item is stripped of
    whitespace and quotes and kept only if non-empty and shorter than 40 chars.
    """
    for groups in ROLE.findall(text or ""):
        for captured in filter(None, groups):
            cleaned = (piece.strip().strip("'\" ") for piece in captured.split(","))
            roles.update(name for name in cleaned if name and len(name) < 40)
class AuthzExtractor(Extractor):
    """Build the access-control map: which endpoints show a guard and which don't."""

    name = "authz"
    category = "authz"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return the authz slice of FACTS.

        An endpoint counts as guarded when its handler file matches GUARD or a
        Next.js middleware matcher covers its path. Endpoints whose handler text
        is unavailable are counted as unknown. Unguarded write endpoints that do
        not look public are listed separately as leads to verify.
        """
        endpoints = (facts.get("routes") or {}).get("endpoints", [])
        mw = _parse_next_middleware(ctx)
        global_auth = any(GLOBAL_AUTH.search(src) for _path, _rel, src in ctx.iter_code())
        roles: set = set(mw.get("role_checks", []))

        tally = {"with_visible_guard": 0, "no_visible_guard": 0, "unknown": 0}
        unguarded_writes = []
        per_endpoint = []
        for ep in endpoints:
            code_path = ep.get("code_path", "")
            if code_path:
                source = ctx.text(Path(code_path))
            else:
                source = ""
            _collect_roles(source, roles)

            guarded = bool(source and GUARD.search(source)) or _matcher_covers(ep.get("path", ""), mw.get("matchers", []))
            rel_path = ctx.rel(Path(code_path)) if code_path else ""
            looks_public = bool(PUBLIC_HINT.search(ep.get("path", "")))
            per_endpoint.append({"method": ep.get("method"), "path": ep.get("path"), "code_path": rel_path,
                                 "guarded": bool(guarded), "analyzed": bool(source),
                                 "public_hint": looks_public})

            if guarded:
                tally["with_visible_guard"] += 1
                continue
            if not source:
                # no handler text to inspect: neither guarded nor provably unguarded
                tally["unknown"] += 1
                continue
            tally["no_visible_guard"] += 1
            if ep.get("method") in WRITE_VERBS and not looks_public:
                unguarded_writes.append(f"{ep['method']} {ep['path']}  ({rel_path or '?'})")

        if global_auth:
            note = ("A GLOBAL auth middleware (`app.use(<auth>)`) was detected — most routes are likely "
                    "protected by default. The list below is write endpoints with NO guard visible in their "
                    "own handler file; they MAY be covered globally. Verify each is either covered or an "
                    "intentional public exemption — don't assume they're vulnerable.")
        else:
            note = ("No global auth middleware detected. Write endpoints with no visible guard are "
                    "high-signal missing-authz leads — verify each.")

        return {
            "global_auth_middleware": global_auth,
            "next_middleware": mw,
            "roles_detected": sorted(r for r in roles if r),
            "guard_summary": tally,
            "endpoint_guards": per_endpoint[:400],
            "write_endpoints_without_visible_guard": sorted(set(unguarded_writes))[:60],
            "note": note,
        }

websec_validator/extractors/base.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Extractor framework — the backbone of the recon engine.
+Each extractor reads a shared, walked-once RepoContext and returns its slice of
+FACTS. Extractors are deterministic (no LLM, no network to the target) and
+degrade gracefully — a missing tool or unrecognized framework yields partial
+facts, never a crash. This is what lets the engine scale to a big monorepo and
+still say something useful.
+"""
+from __future__ import annotations
+from pathlib import Path
# Directory names that are never part of the target application.
SKIP_DIRS = {".git", "node_modules", "dist", "build", ".next", ".nuxt", "venv",
             ".venv", "__pycache__", ".mypy_cache", ".pytest_cache", "coverage",
             ".turbo", "out", "target", ".gradle", "vendor", "site-packages",
             ".terraform", "security", ".websec-out", "websec-out", ".cache",
             ".svelte-kit", "storybook-static", ".serverless",
             # agent tooling + editor dirs + worktree copies — not the target app
             ".wolf", ".claude", ".worktrees", ".idea", ".vscode", ".agent", ".agents"}
# File suffixes (lower-cased) treated as code.
CODE_EXT = {".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".py", ".go", ".rb",
            ".java", ".php", ".prisma"}
MAX_FILES = 12000       # cap on the number of code files collected per scan
MAX_BYTES = 2_000_000   # files larger than this are served as empty text


class RepoContext:
    """Walk the tree once; cache file text; serve cheap queries to every extractor."""

    def __init__(self, root: Path):
        """Index the code files under ``root`` (the walk happens immediately)."""
        self.root = root
        self._text: dict[Path, str] = {}
        self.code_files: list[Path] = []
        self.stack: dict = {}          # filled by StackExtractor, read by the rest
        self._walk()

    def _walk(self) -> None:
        """Collect up to MAX_FILES code files, never descending into SKIP_DIRS.

        Skip-dir names are matched only against directories found BELOW the scan
        root, so a repo that itself lives under e.g. ``~/.cache`` or ``build/``
        is still scanned.
        """
        import os

        for dirpath, dirnames, filenames in os.walk(self.root):
            # Prune in place: os.walk then skips these subtrees entirely rather
            # than listing every file in node_modules/.git just to discard it.
            dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
            for fname in filenames:
                if len(self.code_files) >= MAX_FILES:
                    return
                candidate = Path(dirpath, fname)
                if candidate.suffix.lower() in CODE_EXT:
                    self.code_files.append(candidate)

    def rel(self, p: Path) -> str:
        """Return ``p`` relative to the scan root, or ``p`` unchanged if outside it."""
        try:
            return str(p.relative_to(self.root))
        except ValueError:
            return str(p)

    def text(self, p: Path) -> str:
        """Return the cached text of ``p``.

        Files larger than MAX_BYTES and files that cannot be read yield ``""``;
        undecodable bytes are ignored.
        """
        if p not in self._text:
            try:
                self._text[p] = "" if p.stat().st_size > MAX_BYTES else p.read_text(errors="ignore")
            except Exception:
                self._text[p] = ""
        return self._text[p]

    def iter_code(self):
        """Yield (path, relpath, text) for every collected code file."""
        for p in self.code_files:
            yield p, self.rel(p), self.text(p)

    def manifest(self, name: str) -> str:
        """Return the text of ``root/name`` if it is a readable file, else ``""``."""
        f = self.root / name
        try:
            return f.read_text(errors="ignore") if f.is_file() else ""
        except Exception:
            return ""

    def glob(self, pattern: str, limit: int = 2000) -> list[Path]:
        """rglob filtered against SKIP_DIRS (for file-based framework detection).

        Returns at most ``limit`` matches.
        """
        out = []
        for p in self.root.rglob(pattern):
            if any(part in SKIP_DIRS for part in p.relative_to(self.root).parts):
                continue
            out.append(p)
            if len(out) >= limit:
                break
        return out

    def exists(self, *names: str) -> bool:
        """Return True if any of ``names`` exists directly under the scan root."""
        return any((self.root / n).exists() for n in names)
class Extractor:
    """Common base for all extractors.

    A subclass overrides the ``name`` and ``category`` class attributes and
    provides ``extract()``.
    """

    name: str = "extractor"
    category: str = "misc"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:  # pragma: no cover
        """Produce this extractor's slice of FACTS.

        ``facts`` already contains the results of the extractors that ran
        earlier (stack runs first), so an implementation may branch on them.
        The base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError

websec_validator/extractors/client_exposure.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Client-side exposure extractor — secrets that leak into the browser bundle.
+The Next.js/Vite footgun: any `NEXT_PUBLIC_*` / `VITE_*` var is inlined into the
+client bundle, and a server-only secret referenced from a client component ships
+to every visitor. Cheap static scan, high signal.
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
# Env-var names with a prefix that bundlers inline into client code.
PUBLIC_ENV = re.compile(r"\b(NEXT_PUBLIC_\w+|VITE_\w+|REACT_APP_\w+|GATSBY_\w+|EXPO_PUBLIC_\w+|PUBLIC_\w{2,})\b")
# Name fragments that suggest the variable holds a secret.
SECRETISH = re.compile(r"SECRET|PRIVATE|TOKEN|PASSWORD|PASSWD|API_?KEY|ACCESS_?KEY|CLIENT_SECRET|CREDENTIAL", re.I)
# process.env.<NAME> references whose NAME looks like a server-side secret.
SERVER_SECRET = re.compile(r"process\.env\.([A-Z0-9_]*(?:SECRET|PRIVATE|TOKEN|PASSWORD|API_?KEY|ACCESS_?KEY)[A-Z0-9_]*)")


class ClientExposureExtractor(Extractor):
    """Find secrets that may end up in the browser bundle."""

    name = "client_exposure"
    category = "exposure"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return the client-exposure slice of FACTS.

        Reports public-prefixed env vars, the subset whose names look secret,
        secret-looking ``process.env`` references inside files whose first 200
        characters contain ``use client``, and whether a Next.js config enables
        production browser source maps.
        """
        public_vars: set = set()
        leaked_public: set = set()    # public-prefixed AND secret-named → ships to client
        leaked_server: set = set()    # server secret referenced from a 'use client' file

        for _path, rel, text in ctx.iter_code():
            for var in PUBLIC_ENV.findall(text):
                public_vars.add(var)
                if SECRETISH.search(var):
                    leaked_public.add(f"{var}  ({rel})")
            # The bare phrase also matches the single- and double-quoted directive.
            if "use client" in text[:200]:
                leaked_server.update(f"{secret}  ({rel})" for secret in SERVER_SECRET.findall(text))

        next_config = "".join(ctx.manifest(cfg_name) for cfg_name in
                              ("next.config.js", "next.config.mjs", "next.config.ts"))
        sourcemaps = "productionBrowserSourceMaps: true" in next_config

        return {
            "public_env_vars": sorted(public_vars)[:40],
            "public_secret_leaks": sorted(leaked_public),     # HIGH if non-empty
            "server_secret_in_client_component": sorted(leaked_server),  # HIGH if non-empty
            "production_source_maps": sourcemaps,
            "note": "public_secret_leaks and server_secret_in_client_component ship secrets to the browser — "
                    "treat as HIGH and confirm. Plain NEXT_PUBLIC_* without secret-ish names are usually fine.",
        }