PyPI - websec-validator - Versions diffs - 0.2.0__py3-none-any.whl - Mend

websec-validator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

websec_validator/__init__.py +14 -0
websec_validator/briefing.py +218 -0
websec_validator/calibration.json +75 -0
websec_validator/calibration.py +226 -0
websec_validator/cli.py +395 -0
websec_validator/constitution.py +81 -0
websec_validator/corpus.json +49 -0
websec_validator/dynamic.py +249 -0
websec_validator/extractors/__init__.py +56 -0
websec_validator/extractors/auth.py +77 -0
websec_validator/extractors/authz.py +130 -0
websec_validator/extractors/base.py +101 -0
websec_validator/extractors/client_exposure.py +48 -0
websec_validator/extractors/graphql.py +71 -0
websec_validator/extractors/iac_ci.py +65 -0
websec_validator/extractors/integrations.py +55 -0
websec_validator/extractors/routes.py +215 -0
websec_validator/extractors/schemas.py +75 -0
websec_validator/extractors/stack.py +80 -0
websec_validator/extractors/surface.py +86 -0
websec_validator/extractors/tenant.py +33 -0
websec_validator/findings.py +199 -0
websec_validator/probes.py +79 -0
websec_validator/proof.py +96 -0
websec_validator/recon.py +28 -0
websec_validator/report.py +114 -0
websec_validator/scanners.py +248 -0
websec_validator/templates/probes/bola-cross-tenant.sh +192 -0
websec_validator/templates/probes/bola-write-verbs.py +147 -0
websec_validator/templates/probes/compare-roles.sh +69 -0
websec_validator/templates/probes/dlp-bypass-offline.py +149 -0
websec_validator/templates/probes/hs256-brute-force.py +90 -0
websec_validator/templates/probes/jwt-attacks.sh +161 -0
websec_validator/templates/probes/mass-assignment.py +201 -0
websec_validator/templates/probes/race-conditions.py +144 -0
websec_validator/templates/probes/rate-limit-burst.sh +136 -0
websec_validator/templates/probes/s3-assess.sh +120 -0
websec_validator/templates/probes/ssrf-probes.sh +189 -0
websec_validator/templates/probes/webhook-forgery.py +113 -0
websec_validator/templates/reports/FINDINGS-SUMMARY.md.template +75 -0
websec_validator/templates/reports/access-control-matrix.md.template +65 -0
websec_validator/templates/reports/findings-triage.md.template +28 -0
websec_validator/templates/reports/pentest-handover-brief.md.template +121 -0
websec_validator/templates/reports/per-tool-FINDINGS.md.template +37 -0
websec_validator-0.2.0.dist-info/METADATA +232 -0
websec_validator-0.2.0.dist-info/RECORD +50 -0
websec_validator-0.2.0.dist-info/WHEEL +5 -0
websec_validator-0.2.0.dist-info/entry_points.txt +2 -0
websec_validator-0.2.0.dist-info/licenses/LICENSE +21 -0
websec_validator-0.2.0.dist-info/top_level.txt +1 -0

websec_validator/extractors/graphql.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""GraphQL surface extractor.
+GraphQL is its own attack surface (introspection schema-dump, alias/depth DoS,
+GET-method mutations). Noir collapses a GraphQL server to one `POST /graphql`
+endpoint, so we add the detail: is introspection on, is the playground exposed,
+is there any depth/complexity limiting. Only emits when GraphQL is present.
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
+# Markers of GraphQL schema/server code in JS/TS and Python sources (case-insensitive).
+# GraphQLExtractor only inspects a file's settings once this pattern matches it.
+SCHEMA_CODE = re.compile(
+    r"makeExecutableSchema|buildSchema|new ApolloServer|createYoga|type-graphql|"
+    r"@Resolver|@ObjectType|gql`|type\s+Query\b|type\s+Mutation\b|strawberry\.|"
+    r"graphene\.|ariadne|mercurius", re.I)
+# Explicit `introspection: true` option (case-sensitive).
+INTROSPECTION_ON = re.compile(r"introspection\s*:\s*true")
+# Explicit opt-out: `introspection: false` or a NoSchemaIntrospection / NoIntrospection rule name.
+INTROSPECTION_OFF = re.compile(r"introspection\s*:\s*false|NoSchemaIntrospection|NoIntrospection")
+# Options / plugin names that turn on an interactive playground or landing page.
+PLAYGROUND = re.compile(r"playground\s*:\s*true|graphiql\s*:\s*true|LandingPageGraphQLPlayground|LandingPageLocalDefault")
+# Names of query depth / cost limiting packages and rules.
+LIMITING = re.compile(r"graphql-depth-limit|depthLimit|costAnalysis|graphql-cost-analysis|"
+                      r"createComplexityLimitRule|query-complexity|graphql-armor")
+class GraphQLExtractor(Extractor):
+    """Describe the GraphQL surface of a repo: introspection, playground, query limiting.
+    Returns `{"present": False}` unless the stack facts list a GraphQL framework,
+    schema files exist, or a routed endpoint path contains "graphql" — and, after
+    the code scan, unless schema-building code, schema files or such an endpoint
+    was actually found.
+    """
+    name = "graphql"
+    category = "surface"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return the GraphQL facts dict (endpoints, schema files, settings, findings)."""
+        stack_facts = facts.get("stack") or {}
+        route_facts = facts.get("routes") or {}
+        frameworks = set(stack_facts.get("frameworks", []))
+        schema_paths = ctx.glob("**/*.graphql", 60) + ctx.glob("**/*.gql", 60)
+        schema_files = [ctx.rel(path) for path in schema_paths]
+        endpoints = []
+        for ep in route_facts.get("endpoints", []):
+            if "graphql" in ep.get("path", "").lower():
+                endpoints.append(ep)
+        declared = bool({"graphql", "apollo-graphql"} & frameworks)
+        if not (declared or schema_files or endpoints):
+            return {"present": False}
+        introspection = "unknown"
+        playground = False
+        limiting = False
+        code_hit = False
+        for _path, _rel, text in ctx.iter_code():
+            # Settings only count when they sit in a file that builds a schema/server.
+            if not SCHEMA_CODE.search(text):
+                continue
+            code_hit = True
+            if INTROSPECTION_ON.search(text):
+                introspection = "enabled"
+            elif introspection != "enabled" and INTROSPECTION_OFF.search(text):
+                # An explicit "on" seen in any file wins over an "off" seen elsewhere.
+                introspection = "disabled"
+            playground = playground or bool(PLAYGROUND.search(text))
+            limiting = limiting or bool(LIMITING.search(text))
+        if not (code_hit or schema_files or endpoints):
+            return {"present": False}
+        findings = []
+        introspection_detail = "schema-dump exposure — disable in prod / add NoSchemaIntrospection"
+        if introspection == "enabled":
+            findings.append({"severity": "HIGH", "issue": "introspection ENABLED",
+                             "detail": introspection_detail})
+        elif introspection == "unknown":
+            findings.append({"severity": "MEDIUM", "issue": "introspection not explicitly disabled",
+                             "detail": introspection_detail})
+        if playground:
+            findings.append({"severity": "MEDIUM", "issue": "GraphQL playground/landing page enabled",
+                             "detail": "disable in production"})
+        if not limiting:
+            findings.append({"severity": "MEDIUM", "issue": "no query depth/complexity limiting detected",
+                             "detail": "alias/deep-query DoS — add depth+cost limits (e.g. graphql-armor)"})
+        routed = [f"{ep['method']} {ep['path']}" for ep in endpoints]
+        if not routed:
+            routed = ["(server detected; endpoint not routed by Noir)"]
+        return {"present": True,
+                "endpoints": routed,
+                "schema_files": schema_files[:20],
+                "introspection": introspection,
+                "playground_enabled": playground,
+                "query_limiting_detected": limiting,
+                "findings": findings,
+                "maps_to_probe": "graphql-cop (run externally against the /graphql endpoint)"}

websec_validator/extractors/iac_ci.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""IaC + CI/CD extractor — the pipeline + infra attack surface.
+The commonly-missed P0 surface: GitHub Actions script injection via untrusted
+context, third-party actions pinned to mutable tags, Dockerfiles running as root,
+and committed Terraform state. Pure static globbing — no tools required (zizmor /
+Checkov can be layered later for depth).
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
+# untrusted GitHub Actions contexts an attacker can control
+# The pattern has no capturing group, so findall() yields the complete
+# `${{ github.… }}` expression text for every hit.
+UNTRUSTED = re.compile(
+    r"\$\{\{\s*github\.(?:head_ref|event\.(?:pull_request|issue|comment|review|"
+    r"head_commit|workflow_run)[^}]*|event\.[^}]*\.(?:title|body|name|email|ref|label|message)[^}]*)\s*\}\}")
+# `uses: <action>@<ref>` — group 1 is the action path, group 2 the ref it is pinned to.
+USES = re.compile(r"uses:\s*([^\s@#]+)@([^\s#'\"]+)")
+# A ref made of exactly 40 lowercase hex digits, i.e. a full commit SHA.
+SHA40 = re.compile(r"^[0-9a-f]{40}$")
+class IacCiExtractor(Extractor):
+    """Static checks over GitHub Actions workflows, Dockerfiles and Terraform state files."""
+    name = "iac_ci"
+    category = "infra"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Collect pipeline/infra findings.
+        Returns a dict with `findings` (one dict per issue carrying severity, kind,
+        file and detail), `by_severity` (count per severity label) and
+        `workflows_scanned` (number of workflow files globbed).
+        """
+        findings = []
+        # --- GitHub Actions ---
+        # Glob once: the same list drives the scan and the workflows_scanned count.
+        workflows = ctx.glob(".github/workflows/*.yml") + ctx.glob(".github/workflows/*.yaml")
+        for wf in workflows:
+            rel, text = ctx.rel(wf), ctx.text(wf)
+            # UNTRUSTED matches the whole `${{ github.… }}` expression, so strip the
+            # `${{` / `}}` delimiters rather than prefixing "github." a second time.
+            contexts = sorted({m.group(0)[3:-2].strip() for m in UNTRUSTED.finditer(text)})
+            if contexts:
+                findings.append({"severity": "HIGH", "kind": "gha-script-injection", "file": rel,
+                                 "detail": "untrusted context in workflow (dangerous if used in a run: step) — "
+                                           + ", ".join(contexts[:4])})
+            # Local actions (./path) live in the repo itself; everything else should be SHA-pinned.
+            unpinned = sorted({f"{a}@{r}" for a, r in USES.findall(text)
+                               if not SHA40.match(r) and not a.startswith("./")})
+            if unpinned:
+                findings.append({"severity": "MEDIUM", "kind": "gha-unpinned-action", "file": rel,
+                                 "detail": "actions pinned to a mutable tag (pin to a commit SHA): "
+                                           + ", ".join(unpinned[:6])})
+        # --- Dockerfiles ---
+        for df in ctx.glob("**/Dockerfile") + ctx.glob("**/Dockerfile.*"):
+            rel, text = ctx.rel(df), ctx.text(df)
+            users = re.findall(r"^\s*USER\s+(\S+)", text, re.M)
+            # The last USER instruction wins. It may be written `user:group`, so only
+            # the part before the colon decides whether the container runs as root.
+            final_user = users[-1].split(":", 1)[0].lower() if users else ""
+            if final_user in ("", "root", "0"):
+                findings.append({"severity": "MEDIUM", "kind": "docker-root",
+                                 "file": rel, "detail": "container runs as root (add a non-root USER)"})
+            if "HEALTHCHECK" not in text:
+                findings.append({"severity": "LOW", "kind": "docker-no-healthcheck",
+                                 "file": rel, "detail": "no HEALTHCHECK defined"})
+        # --- Terraform state committed ---
+        for tf in ctx.glob("**/*.tfstate")[:5]:
+            findings.append({"severity": "HIGH", "kind": "terraform-state-committed", "file": ctx.rel(tf),
+                             "detail": "tfstate may contain plaintext secrets (DB passwords, keys) — must not be committed"})
+        by_sev: dict = {}
+        for f in findings:
+            by_sev[f["severity"]] = by_sev.get(f["severity"], 0) + 1
+        return {"findings": findings, "by_severity": by_sev,
+                "workflows_scanned": len(workflows)}

websec_validator/extractors/integrations.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""Integrations + webhooks extractor.
+Inbound webhooks that don't verify a signature are a forgery/replay surface;
+each outbound third-party SDK is a trust boundary + secret-handling surface.
+Reads the route inventory to find webhook endpoints, then checks each handler
+file for signature-verification code.
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+from .base import Extractor, RepoContext
+# Route paths that look like inbound webhook / callback receivers (case-insensitive).
+WEBHOOK_PATH = re.compile(r"webhook|/hook|/callback|/inbound", re.I)
+# Evidence of signature checking in a handler file (case-insensitive).
+# NOTE(review): the final bare `signature` alternative matches any file that merely
+# mentions the word, which subsumes the `…Signature` alternatives before it — confirm
+# this looseness is intended.
+SIG_VERIFY = re.compile(
+    r"createHmac|\bhmac\b|timingSafeEqual|verif\w*[Ss]ignature|X-Hub-Signature|"
+    r"X-Signature|Stripe-Signature|\bsvix\b|constant_time_compare|compare_digest|"
+    r"verifyWebhook|signature", re.I)
+# Dependency-name fragment -> display label for the third-party service it indicates.
+SDKS = {"stripe": "Stripe", "twilio": "Twilio", "@sendgrid": "SendGrid", "messagebird": "MessageBird/Bird",
+        "@slack": "Slack", "openai": "OpenAI", "@anthropic": "Anthropic", "octokit": "GitHub",
+        "plaid": "Plaid", "@aws-sdk": "AWS", "aws-sdk": "AWS", "firebase": "Firebase",
+        "mailgun": "Mailgun", "@sentry": "Sentry", "paypal": "PayPal", "squareup": "Square",
+        "@google-cloud": "GCP", "appsync": "AppSync", "wpapi": "WordPress", "@wordpress": "WordPress"}
+class IntegrationsExtractor(Extractor):
+    """List webhook endpoints, flag handlers with no signature check, and name detected SDKs."""
+    name = "integrations"
+    category = "surface"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return webhook endpoints, the unverified subset, and detected integrations."""
+        route_facts = facts.get("routes") or {}
+        webhook_eps = []
+        for ep in route_facts.get("endpoints", []):
+            if WEBHOOK_PATH.search(ep.get("path", "")):
+                webhook_eps.append(ep)
+        unverified = set()
+        for ep in webhook_eps:
+            code_path = ep.get("code_path", "")
+            if code_path:
+                handler = Path(code_path)
+                source = ctx.text(handler)
+                if source and SIG_VERIFY.search(source):
+                    continue  # handler file shows signature-verification code
+                location = ctx.rel(handler)
+            else:
+                # No handler file recorded for this endpoint, so nothing can be checked.
+                location = "?"
+            unverified.add(f"{ep['method']} {ep['path']}  ({location})")
+        # One lower-cased blob of all dependency manifests, searched by substring.
+        node_manifests = ctx.glob("**/package.json", 80)
+        blob = " ".join(ctx.text(p) for p in node_manifests).lower()
+        py_manifests = ctx.glob("**/requirements*.txt", 40) + ctx.glob("**/pyproject.toml", 40)
+        blob += " ".join(ctx.text(p) for p in py_manifests).lower()
+        labels = set()
+        for dep, label in SDKS.items():
+            if dep.lower() in blob:
+                labels.add(label)
+        return {
+            "webhook_endpoints": [f"{ep['method']} {ep['path']}" for ep in webhook_eps],
+            "webhooks_without_sig_verification": sorted(unverified),   # HIGH if non-empty
+            "third_party_integrations": sorted(labels),
+            "note": "Webhooks with no signature-verification code in their handler = forgery/replay risk "
+                    "(run webhook-forgery; verify against your middleware). Each integration is an outbound "
+                    "trust + secret-handling surface (SSRF, secret leakage, supply-chain).",
+        }

websec_validator/extractors/routes.py ADDED Viewed

@@ -0,0 +1,215 @@
+"""Route / endpoint extractor — the spine of the attack surface.
+Primary engine: **OWASP Noir** (owasp-noir/noir) — 50+ frameworks, real parsing
+(Next.js App Router, Express, NestJS, Flask, FastAPI, Django, Rails, Go...),
+emits method + path + typed params + code path. We shell out to it and parse its
+JSON. If Noir isn't installed we fall back to a framework-aware regex pass so the
+tool still produces something — but Noir is strongly preferred and the briefing
+says so when it's missing.
+We then DERIVE the high-value targeting signals that make probes precise:
+  - write endpoints           → BOLA-write / mass-assignment targets
+  - path-param endpoints      → IDOR / BOLA enumeration targets
+  - url/domain-ish params     → SSRF candidates
+  - redirect-ish params       → open-redirect candidates
+  - file-upload params        → upload / path-traversal candidates
+  - auth endpoints            → login surface
+"""
+from __future__ import annotations
+import json
+import re
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from .base import Extractor, RepoContext
+# HTTP verbs treated as state-changing; routes using them are listed as write endpoints.
+WRITE_VERBS = {"POST", "PUT", "PATCH", "DELETE"}
+# Comma-separated test/story file globs handed to Noir via --exclude-path.
+EXCLUDE_GLOBS = "*.test.ts,*.test.tsx,*.spec.ts,*.test.js,*.spec.js,*_test.go,*_test.py,test_*.py,*.stories.tsx"
+# param-name heuristics → attack class
+# Each pattern must match the whole parameter name (optional trailing "s"), case-insensitively.
+SSRF_NAMES = re.compile(r"^(url|uri|link|domain|host|endpoint|webhook|feed|rss|image|img|src|proxy|fetch|target|origin|site|address)s?$", re.I)
+REDIRECT_NAMES = re.compile(r"^(redirect|redirect_?uri|next|return|return_?url|callback|continue|dest|destination|goto)s?$", re.I)
+# Only consulted for parameters whose location is "form" (see _derive).
+TRAVERSAL_NAMES = re.compile(r"^(file|filename|filepath|path|dir|folder|template|name|key|attachment|download|doc)s?$", re.I)
+# Substrings marking a path that still holds a base URL or an unresolved template placeholder.
+TEMPLATED = ("BASE_URL", "localhost", "127.0.0.1", "${", "{{")
+# A `*.ext` glob inside a path, e.g. a static-asset route such as /*.png.
+ASSET_GLOB = re.compile(r"\*\.\w+")
+def _clean_path(p: str) -> str:
+    """Normalize framework-specific path parameters to `{name}` form.
+    Handles Flask/Django `<name>` and `<conv:name>`, Starlette/FastAPI
+    `{name:conv}`, Express `:name` and splat `*name`. Paths already using
+    `{name}` pass through unchanged.
+    """
+    # Angle-bracket params first: otherwise the Express rule below would rewrite the
+    # `:id` inside `<int:id>` and leave `<int{id}>`.
+    p = re.sub(r"<(?:[^<>:]+:)?(\w+)>", r"{\1}", p)   # Flask/Django <int:id> -> {id}
+    # Drop the converter from typed brace params for the same reason.
+    p = re.sub(r"\{(\w+):[^{}]*\}", r"{\1}", p)        # Starlette {p:path}  -> {p}
+    p = re.sub(r":(\w+)", r"{\1}", p)                  # Express :id         -> {id}
+    p = re.sub(r"\*(\w+)", r"{\1}", p)                 # splat *key          -> {key}
+    return p
+def _is_noise(path: str) -> bool:
+    """Return True for paths that should be dropped from the route inventory."""
+    # Empty, or not rooted at "/".
+    if not (path and path.startswith("/")):
+        return True
+    # Still carries a base-URL or template placeholder marker.
+    for marker in TEMPLATED:
+        if marker in path:
+            return True
+    # static-asset glob route (/*.png)
+    return ASSET_GLOB.search(path) is not None
+def _noir_scan(root: Path) -> list | None:
+    """Run Noir → list of endpoint dicts, or None if Noir unavailable/failed.
+    None is returned when the `noir` binary is not on PATH, when the run raises or
+    times out, when Noir leaves the output file empty or unparseable, or when the
+    parsed document does not contain a list of endpoints. An empty list means Noir
+    ran and reported no endpoints.
+    """
+    if not shutil.which("noir"):
+        return None
+    # Reserve a path for Noir's JSON output. The handle is closed immediately and
+    # the file is removed in the `finally` below.
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tf:
+        out = Path(tf.name)
+    try:
+        subprocess.run(
+            ["noir", "scan", str(root), "-f", "json", "-o", str(out),
+             "--exclude-path", EXCLUDE_GLOBS, "--no-log", "--no-color"],
+            capture_output=True, text=True, timeout=300)
+        # The temp file always exists, so existence proves nothing. An empty file
+        # means Noir wrote no report, which is a failure and must not be reported
+        # as "zero endpoints" (that would suppress the regex fallback).
+        raw = out.read_text(encoding="utf-8").strip()
+        if not raw:
+            return None
+        data = json.loads(raw)
+        endpoints = data.get("endpoints", []) if isinstance(data, dict) else data
+        return endpoints if isinstance(endpoints, list) else None
+    except (OSError, subprocess.SubprocessError, ValueError):
+        # Launch/IO failure, timeout, or malformed JSON / undecodable bytes.
+        return None
+    finally:
+        try:
+            out.unlink()
+        except OSError:
+            pass  # best-effort cleanup of the temp file
+def _normalize_noir(eps: list) -> list:
+    """Convert raw Noir endpoint dicts into route rows, de-duplicated on (method, path).
+    Endpoints flagged `internal` and paths rejected by `_is_noise` are skipped.
+    """
+    normalized = []
+    emitted = set()
+    for ep in eps:
+        if ep.get("internal"):
+            continue
+        raw_path = ep.get("url") or ep.get("path") or ""
+        # Noir keeps Django <int:pk> / <str:name> notation — normalize to {pk}/{name}
+        path = _clean_path(re.sub(r"<(?:[\w]+:)?([\w]+)>", r"{\1}", raw_path))
+        if _is_noise(path):
+            continue
+        method = (ep.get("method") or "GET").upper()   # missing/empty method counts as GET
+        key = (method, path)
+        if key in emitted:
+            continue
+        emitted.add(key)
+        params = []
+        for param in ep.get("params") or []:
+            params.append({"name": param.get("name", ""), "where": param.get("param_type", "")})
+        details = ep.get("details", {}) or {}
+        # Only the first recorded code path is kept; `[{}]` stands in when none is given.
+        code_paths = details.get("code_paths") or [{}]
+        normalized.append({
+            "method": method,
+            "path": path,
+            "params": params,
+            "technology": details.get("technology", ""),
+            "code_path": code_paths[0].get("path", ""),
+            "source": "noir",
+        })
+    return normalized
+# ---- regex fallback (only when Noir is absent) ---------------------------------------------
+def _fallback(ctx: RepoContext) -> list:
+    """Collect routes from both regex passes, then clean, filter and de-duplicate them."""
+    candidates = _fallback_next_app_router(ctx) + _fallback_regex(ctx)
+    # Keyed on (method, path); setdefault keeps the first row seen for each key and
+    # dict ordering preserves discovery order.
+    unique: dict = {}
+    for row in candidates:
+        row["path"] = _clean_path(row["path"])
+        if _is_noise(row["path"]):
+            continue
+        unique.setdefault((row["method"], row["path"]), row)
+    return list(unique.values())
+def _fallback_next_app_router(ctx: RepoContext) -> list:
+    """Derive routes from Next.js `route.ts` / `route.js` / `route.tsx` handler files.
+    The URL path comes from the directory segments below `app/` (or `pages/`,
+    optionally under `src/`); one row is emitted per exported HTTP-verb handler.
+    """
+    rows = []
+    # Handlers may be declared as `export [async] function GET` or `export const GET = …`.
+    method_rx = re.compile(
+        r"export\s+(?:(?:async\s+)?function|const)\s+(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS)\b")
+    for p in ctx.glob("**/route.ts") + ctx.glob("**/route.js") + ctx.glob("**/route.tsx"):
+        rel = ctx.rel(p)
+        # The directory part is optional so that `app/route.ts` maps to "/".
+        m = re.search(r"(?:^|/)(?:src/)?(?:app|pages)/(?:(.*)/)?route\.[tj]sx?$", rel)
+        if not m:
+            continue
+        seg = m.group(1) or ""
+        seg = re.sub(r"\(([^)]+)\)/?", "", seg)                   # route groups (group)
+        # Optional catch-all must be rewritten before the single-bracket rules, or it
+        # ends up as `{{slug}}`, which the noise filter treats as a template placeholder.
+        seg = re.sub(r"\[\[\.\.\.([^\]]+)\]\]", r"{\1}", seg)     # [[...slug]]
+        seg = re.sub(r"\[\.\.\.([^\]]+)\]", r"{\1}", seg)         # [...slug]
+        seg = re.sub(r"\[([^\]]+)\]", r"{\1}", seg)               # [id]
+        path = "/" + seg.strip("/")
+        for verb in method_rx.findall(ctx.text(p)):
+            rows.append({"method": verb, "path": path, "params": [],
+                         "technology": "js_nextjs", "code_path": rel, "source": "fallback"})
+    return rows
+def _fallback_regex(ctx: RepoContext) -> list:
+    """Regex route discovery for Express calls and FastAPI-/Flask-style decorators.
+    Every file yielded by `ctx.iter_code()` is scanned; each hit becomes a row with
+    an upper-cased method, the raw path string and the file it was found in. A
+    Flask `@x.route(...)` without a `methods=[...]` list counts as GET.
+    """
+    rows = []
+    express = re.compile(r"\b(?:router|app)\.(get|post|put|patch|delete)\s*\(\s*['\"`]([^'\"`]+)")
+    # `[^)]*?` keeps the optional methods=[...] lookup inside this decorator's own
+    # argument list. An unbounded `.*` would run on to the last methods=[...] in the
+    # file, swallowing the routes in between and attaching the wrong verbs.
+    flask = re.compile(r"@\w+\.route\s*\(\s*['\"]([^'\"]+)['\"](?:[^)]*?methods\s*=\s*\[([^\]]*)\])?")
+    fastapi = re.compile(r"@\w+\.(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)")
+    for _p, rel, text in ctx.iter_code():
+        for verb, path in express.findall(text):
+            rows.append({"method": verb.upper(), "path": path, "params": [],
+                         "technology": "express", "code_path": rel, "source": "fallback"})
+        for verb, path in fastapi.findall(text):
+            rows.append({"method": verb.upper(), "path": path, "params": [],
+                         "technology": "fastapi", "code_path": rel, "source": "fallback"})
+        for path, methods in flask.findall(text):
+            # `methods` is the raw text between the brackets; pull out the quoted verbs.
+            for verb in (re.findall(r"['\"](\w+)['\"]", methods) or ["GET"]):
+                rows.append({"method": verb.upper(), "path": path, "params": [],
+                             "technology": "flask", "code_path": rel, "source": "fallback"})
+    return rows
+def _derive(routes: list) -> dict:
+    """Group routes into per-attack-class target lists (each sorted and de-duplicated)."""
+    # Insertion order here fixes the key order of the returned dict.
+    buckets: dict = {
+        "write_endpoints": set(),
+        "idor_candidates": set(),
+        "ssrf_candidates": set(),
+        "open_redirect_candidates": set(),
+        "upload_candidates": set(),
+        "auth_endpoints": set(),
+    }
+    for route in routes:
+        method, path, params = route["method"], route["path"], route["params"]
+        sig = f"{method} {path}"
+        if method in WRITE_VERBS:
+            buckets["write_endpoints"].add(sig)
+        # A templated segment or a declared path parameter marks an object-reference target.
+        if "{" in path or any(param["where"] == "path" for param in params):
+            buckets["idor_candidates"].add(sig)
+        if re.search(r"/(login|signin|sign-in|auth|token|session|oauth)\b", path, re.I):
+            buckets["auth_endpoints"].add(sig)
+        for param in params:
+            pname = param["name"]
+            # First matching class wins; traversal names only count for form parameters.
+            if SSRF_NAMES.match(pname):
+                buckets["ssrf_candidates"].add(f"{sig}  (param: {pname})")
+            elif REDIRECT_NAMES.match(pname):
+                buckets["open_redirect_candidates"].add(f"{sig}  (param: {pname})")
+            elif param["where"] == "form" and TRAVERSAL_NAMES.match(pname):
+                buckets["upload_candidates"].add(f"{sig}  (param: {pname})")
+    return {label: sorted(items) for label, items in buckets.items()}
+class RoutesExtractor(Extractor):
+    """Build the route inventory with Noir, or with the regex fallback when Noir yields None."""
+    name = "routes"
+    category = "surface"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return engine label, route count, per-method/per-technology tallies, rows and targeting."""
+        noir_endpoints = _noir_scan(ctx.root)
+        if noir_endpoints is None:
+            routes = _fallback(ctx)
+            engine = "regex-fallback (install OWASP Noir for full coverage: brew install noir)"
+        else:
+            routes = _normalize_noir(noir_endpoints)
+            engine = "noir"
+        by_method: dict = {}
+        by_tech: dict = {}
+        for route in routes:
+            for tally, key in ((by_method, route["method"]), (by_tech, route["technology"])):
+                tally[key] = tally.get(key, 0) + 1
+        return {
+            "engine": engine,
+            "count": len(routes),
+            "by_method": by_method,
+            "by_technology": by_tech,
+            "endpoints": routes,
+            "targeting": _derive(routes),
+        }

websec_validator/extractors/schemas.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Schema / entity extractor — the data model + its sensitive fields.
+Borrowed from DocGuard's multilang model scanners. Finds ORM/schema models
+(Pydantic, SQLAlchemy, Django, Prisma, Mongoose, TypeORM, Zod, Sequelize) and the
+**sensitive field names** they use (role, isAdmin, groupId, passwordHash, …). That
+turns mass-assignment / BOPLA probes from a generic guess into "try injecting THIS
+app's privileged fields", and surfaces the object-ownership/tenant fields BOLA
+depends on.
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
+# (label, pattern) pairs; group 1 of each pattern captures the declared model/entity name.
+DECLS = [
+    ("pydantic", re.compile(r"class\s+(\w+)\s*\([^)]*BaseModel")),
+    ("sqlalchemy", re.compile(r"class\s+(\w+)\s*\([^)]*\bBase\b[^)]*\)")),
+    ("django", re.compile(r"class\s+(\w+)\s*\([^)]*models\.Model")),
+    ("prisma", re.compile(r"\bmodel\s+(\w+)\s*\{")),
+    ("mongoose", re.compile(r"\b(\w+)\s*=\s*(?:new\s+)?(?:mongoose\.)?Schema\s*\(")),
+    ("typeorm", re.compile(r"@Entity\([^)]*\)\s*(?:export\s+)?class\s+(\w+)")),
+    ("zod", re.compile(r"\b(\w+)\s*=\s*z\.object\s*\(")),
+    ("sequelize", re.compile(r"sequelize\.define\s*\(\s*['\"](\w+)['\"]")),
+]
+# Whole-identifier match (case-insensitive) for privilege, ownership, credential and billing style field names.
+SENSITIVE = re.compile(
+    r"^(roles?|is_?admin|admin|permissions?|scopes?|password|password_?hash|pwd|"
+    r"owner|owner_?id|user_?id|group_?id|tenant_?id|org_?id|organization_?id|account_?id|"
+    r"balance|credits?|is_?verified|verified|status|plan|tier|enabled|active|api_?key|"
+    r"secret|token|email_?verified|stripe_?customer|subscription)$", re.I)
+# Relative paths that look like model/schema/entity files even without a recognised declaration.
+MODELISH_PATH = re.compile(r"/models?/|/schemas?/|/entit|\.prisma$|\.model\.|\.entity\.", re.I)
+# Any identifier-shaped word; used to tokenize a model file before the SENSITIVE test.
+IDENT = re.compile(r"\b([A-Za-z_]\w*)\b")
+class SchemasExtractor(Extractor):
+    """Find ORM/schema model declarations and the sensitive field names used near them."""
+    name = "schemas"
+    category = "data"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return detected ORMs, de-duplicated entities (first 60) and sensitive field names."""
+        orms: set = set()
+        entities: list = []
+        sensitive: set = set()
+        for _p, rel, text in ctx.iter_code():
+            declares_model = False
+            for label, rx in DECLS:
+                for match in rx.finditer(text):
+                    declares_model = True
+                    orms.add(label)
+                    # Raw entity collection stops at 80 entries (duplicates included).
+                    if match.groups() and match.group(1) and len(entities) < 80:
+                        entities.append({"name": match.group(1), "type": label, "file": rel})
+            # Identifiers are harvested from files that declare a model or sit on a model-like path.
+            if declares_model or MODELISH_PATH.search(rel):
+                sensitive.update(word for word in IDENT.findall(text) if SENSITIVE.match(word))
+        # de-dup entities by (name,type), keeping the first occurrence
+        unique: dict = {}
+        for entity in entities:
+            unique.setdefault((entity["name"], entity["type"]), entity)
+        ents = list(unique.values())
+        return {
+            "orms": sorted(orms),
+            "entity_count": len(ents),
+            "entities": ents[:60],
+            "sensitive_fields": sorted(sensitive),
+            "note": "Mass-assignment/BOPLA probes should try injecting these app-specific privileged "
+                    "fields into update/create payloads; ownership/tenant fields here are what BOLA must isolate.",
+        }

websec_validator/extractors/stack.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Stack extractor — languages, frameworks, package managers, datastores.
+Monorepo-aware: aggregates every package.json / Python manifest in the tree
+(node_modules excluded by SKIP_DIRS), so a backend/ service's Express + DynamoDB
+deps are seen even when the repo root is just a workspace shell. Runs first; its
+result is stashed on ctx.stack for later extractors.
+"""
+from __future__ import annotations
+from .base import Extractor, RepoContext
+# npm dependency name -> framework label; matched against package.json text as a quoted
+# key (`"dep"`) or as a quoted scope/package prefix (`"dep/`).
+NODE_FRAMEWORKS = {"express": "express", "fastify": "fastify", "koa": "koa",
+                   "@nestjs/core": "nestjs", "next": "next", "@hapi/hapi": "hapi",
+                   "next-auth": "nextauth", "@remix-run": "remix", "svelte": "sveltekit",
+                   "@apollo/server": "apollo-graphql", "graphql": "graphql"}
+# Python package name -> framework label; matched as a plain substring of the manifest text.
+PY_FRAMEWORKS = {"fastapi": "fastapi", "flask": "flask", "django": "django",
+                 "starlette": "starlette", "sanic": "sanic", "tornado": "tornado",
+                 "aiohttp": "aiohttp"}
+# Dependency name -> datastore label; shared by the Node and the Python passes.
+DATASTORES = {"pg": "postgres", "postgres": "postgres", "mysql": "mysql",
+              "mysql2": "mysql", "mongodb": "mongo", "mongoose": "mongo",
+              "@aws-sdk/client-dynamodb": "dynamodb", "@aws-sdk/lib-dynamodb": "dynamodb",
+              "dynamodb": "dynamodb", "redis": "redis", "ioredis": "redis",
+              "sqlite": "sqlite", "prisma": "prisma(sql)", "sequelize": "sql-orm",
+              "typeorm": "sql-orm", "drizzle-orm": "sql-orm", "sqlalchemy": "sql-orm",
+              "psycopg2": "postgres", "pymongo": "mongo", "boto3": "aws"}
+class StackExtractor(Extractor):
+    """Inventory languages, frameworks, package managers and datastores from manifests."""
+    name = "stack"
+    category = "inventory"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return the stack summary dict and also store it on `ctx.stack`."""
+        langs: set = set()
+        frameworks: set = set()
+        managers: set = set()
+        datastores: set = set()
+        # --- Node: all package.json files, matched on quoted dependency keys ---
+        pkgs = ctx.glob("**/package.json", 120)
+        node_text = " ".join(map(ctx.text, pkgs))
+        if node_text:
+            langs.add("node")
+            managers.add("npm")
+            frameworks.update(label for dep, label in NODE_FRAMEWORKS.items()
+                              if f'"{dep}"' in node_text or f'"{dep}/' in node_text)
+            datastores.update(label for dep, label in DATASTORES.items()
+                              if f'"{dep}"' in node_text)
+            if '"typescript"' in node_text or ctx.glob("**/tsconfig.json", 1):
+                langs.add("typescript")
+        for lockfile, manager in (("**/pnpm-lock.yaml", "pnpm"), ("**/yarn.lock", "yarn")):
+            if ctx.glob(lockfile, 1):
+                managers.add(manager)
+        # --- Python: requirements / pyproject / setup.py / Pipfile, lower-cased ---
+        py_manifests = []
+        for pattern in ("**/requirements*.txt", "**/pyproject.toml", "**/setup.py", "**/Pipfile"):
+            py_manifests += ctx.glob(pattern, 80)
+        py_text = " ".join(map(ctx.text, py_manifests)).lower()
+        if py_text.strip():
+            langs.add("python")
+            managers.add("pip")
+            # NOTE(review): plain substring matching — a short key such as "pg" also
+            # matches inside unrelated words (e.g. "upgrade"); confirm this is acceptable.
+            frameworks.update(label for dep, label in PY_FRAMEWORKS.items() if dep in py_text)
+            datastores.update(label for dep, label in DATASTORES.items() if dep in py_text)
+        for marker, lang in (("**/go.mod", "go"), ("**/Gemfile", "ruby")):
+            if ctx.glob(marker, 1):
+                langs.add(lang)
+        is_monorepo = len(pkgs) > 1 or ctx.exists("pnpm-workspace.yaml", "lerna.json", "nx.json", "turbo.json")
+        result = {
+            "languages": sorted(langs),
+            "frameworks": sorted(frameworks),
+            "package_managers": sorted(managers),
+            "datastores": sorted(datastores),
+            "monorepo": is_monorepo,
+            "services": len(pkgs),   # one "service" per package.json found
+        }
+        ctx.stack = result
+        return result

websec_validator/extractors/surface.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Attack-surface extractor — code-level dangerous sinks, user-control-aware.
+Each signature embeds a user-input marker (`req.`/`request.`/string concat/
+template interpolation/format) so a match means "dangerous op fed by something
+that looks attacker-influenced", not merely "this function is used anywhere".
+Signatures derived from the recon-engine research. Each class maps to the probe
+that exercises it, so the briefing can point probes at the right files.
+"""
+from __future__ import annotations
+import re
+from .base import Extractor, RepoContext
+# user-controlled markers (kept loose on purpose)
+# Alternatives: req./request. access, string concatenation, JS template interpolation,
+# Python f-string / %-format / .format(), and Next.js-style searchParams/nextUrl/params[.
+_U = r"(?:req\.|request\.|\+|`[^`]*\$\{|f['\"]|%\s*[\(%]|\.format\s*\(|searchParams|nextUrl|params\[)"
+# class -> (probe it feeds, gating, compiled regex)
+#   gating: None | "sql" | "nosql"  (datastore-dependent classes)
+SINKS = {
+    "ssrf": ("ssrf-probes", None, re.compile(
+        r"(?:axios|got|node-fetch|superagent|needle|httpx|urllib\.request)\b.*\b" + _U
+        + r"|\bfetch\s*\(\s*" + _U + r"|requests\.(?:get|post|put|request)\s*\(\s*" + _U)),
+    "command-injection": ("ssrf-probes", None, re.compile(
+        r"(?:child_process\.exec|\bexecSync|\bexec|\bspawn|os\.system|subprocess\.(?:run|call|check_output|Popen))\s*\([^)]*"
+        + _U + r"|shell\s*=\s*True")),
+    "sql-injection": ("bola-write-verbs", "sql", re.compile(
+        r"(?:\.query|\.execute|\.raw|cursor\.execute|sequelize\.query|knex\.raw)\s*\([^)]*(?:\$\{|\+|%\s*[\(%]|\.format\s*\(|f['\"])")),
+    "nosql-injection": ("bola-write-verbs", "nosql", re.compile(
+        r"\.(?:find|findOne|update|updateOne|deleteOne|aggregate)\s*\(\s*(?:req\.|request\.)|\$where")),
+    "path-traversal": ("bola-write-verbs", None, re.compile(
+        r"(?:fs\.(?:readFile|writeFile|createReadStream|unlink|readdir)|sendFile|os\.path\.join|\bopen|path\.(?:join|resolve))\s*\([^)]*"
+        + _U)),
+    # `\bTemplate` covers a bare `Template(...)` call. The earlier `Template\s*\(`
+    # alternative consumed the paren itself and then required a second one, so it
+    # could only ever match `Template((`.
+    "ssti": ("ssrf-probes", None, re.compile(
+        r"(?:render_template_string|renderString|nunjucks\.renderString|ejs\.render|pug\.compile|Handlebars\.compile|new\s+Template|\bTemplate)\s*\([^)]*"
+        + _U)),
+    # The setHeader form takes the header name as its first argument, so it gets its
+    # own opener (`res.setHeader('Location',`) instead of sharing the `\s*\(` suffix,
+    # which made it unmatchable.
+    "open-redirect": ("bola-write-verbs", None, re.compile(
+        r"(?:(?:res\.redirect|HttpResponseRedirect|RedirectResponse|return\s+redirect)\s*\(|res\.setHeader\s*\(\s*['\"]Location['\"]\s*,)[^)]*"
+        + _U)),
+    # The remaining classes flag the risky call itself; several carry no user-input marker.
+    "insecure-deserialization": ("bola-write-verbs", None, re.compile(
+        r"pickle\.loads?\s*\(|cPickle\.loads?\s*\(|yaml\.load\s*\((?![^)]*Loader)|node-serialize.*unserialize\s*\(|\bunserialize\s*\(")),
+    "xxe": ("ssrf-probes", None, re.compile(
+        r"libxmljs\.parseXml\s*\(|lxml\.etree\.(?:parse|fromstring|XML)\s*\(|xml\.etree\.ElementTree\.(?:parse|fromstring)\s*\(|new\s+DOMParser")),
+    "prototype-pollution": ("mass-assignment", None, re.compile(
+        r"(?:_\.merge|_\.mergeWith|_\.defaultsDeep|Object\.assign)\s*\([^)]*(?:req\.|request\.)|\.update\s*\([^)]*request\.(?:json|get_json|form)")),
+    "redos": ("ssrf-probes", None, re.compile(
+        r"new\s+RegExp\s*\([^)]*(?:req\.|request\.|\+)|re\.(?:compile|match|search|fullmatch)\s*\([^,)]*(?:request\.|f['\"])")),
+    "eval-injection": ("bola-write-verbs", None, re.compile(
+        r"\beval\s*\([^)]*" + _U + r"|new\s+Function\s*\([^)]*" + _U)),
+}
+class SurfaceExtractor(Extractor):
+    """Count, per sink class, the code files that match that class's dangerous-sink pattern."""
+    name = "surface"
+    category = "sinks"
+    def extract(self, ctx: RepoContext, facts: dict) -> dict:
+        """Return per-class sink hits (probe, file count, up to 60 files) and the datastore class."""
+        stack_facts = facts.get("stack") or {}
+        datastores = set(stack_facts.get("datastores", []))
+        has_sql = any("sql" in d or d in ("postgres", "mysql", "sqlite") for d in datastores)
+        has_nosql = any(d in ("mongo", "dynamodb") for d in datastores)
+        # Gating depends only on the detected datastores, so resolve it once up front.
+        enabled = {None: True, "sql": has_sql, "nosql": has_nosql}
+        active = [(cls, rx) for cls, (_probe, gate, rx) in SINKS.items() if enabled.get(gate, True)]
+        found: dict = {cls: [] for cls in SINKS}
+        counts: dict = dict.fromkeys(SINKS, 0)
+        for _p, rel, text in ctx.iter_code():
+            for cls, rx in active:
+                if not rx.search(text):
+                    continue
+                counts[cls] += 1          # one count per matching file
+                if len(found[cls]) < 60:  # the file list is capped; the count is not
+                    found[cls].append(rel)
+        hit_classes = [cls for cls in SINKS if counts[cls]]
+        if has_sql:
+            datastore_class = "sql"
+        elif has_nosql:
+            datastore_class = "nosql"
+        else:
+            datastore_class = "unknown"
+        return {
+            "sinks": {cls: {"probe": SINKS[cls][0], "count": counts[cls], "files": found[cls]}
+                      for cls in hit_classes},
+            "sink_counts": {cls: counts[cls] for cls in hit_classes},
+            "datastore_class": datastore_class,
+            "note": "Each sink hit is user-input-gated (req./request./concat/interp), so these are "
+                    "higher-confidence leads. Cross-reference the files with routes.targeting to pick "
+                    "the endpoint to probe. On a NoSQL/JSON API, SQLi alerts from generic scanners are "
+                    "usually false positives.",
+        }