websec-validator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. websec_validator/__init__.py +14 -0
  2. websec_validator/briefing.py +218 -0
  3. websec_validator/calibration.json +75 -0
  4. websec_validator/calibration.py +226 -0
  5. websec_validator/cli.py +395 -0
  6. websec_validator/constitution.py +81 -0
  7. websec_validator/corpus.json +49 -0
  8. websec_validator/dynamic.py +249 -0
  9. websec_validator/extractors/__init__.py +56 -0
  10. websec_validator/extractors/auth.py +77 -0
  11. websec_validator/extractors/authz.py +130 -0
  12. websec_validator/extractors/base.py +101 -0
  13. websec_validator/extractors/client_exposure.py +48 -0
  14. websec_validator/extractors/graphql.py +71 -0
  15. websec_validator/extractors/iac_ci.py +65 -0
  16. websec_validator/extractors/integrations.py +55 -0
  17. websec_validator/extractors/routes.py +215 -0
  18. websec_validator/extractors/schemas.py +75 -0
  19. websec_validator/extractors/stack.py +80 -0
  20. websec_validator/extractors/surface.py +86 -0
  21. websec_validator/extractors/tenant.py +33 -0
  22. websec_validator/findings.py +199 -0
  23. websec_validator/probes.py +79 -0
  24. websec_validator/proof.py +96 -0
  25. websec_validator/recon.py +28 -0
  26. websec_validator/report.py +114 -0
  27. websec_validator/scanners.py +248 -0
  28. websec_validator/templates/probes/bola-cross-tenant.sh +192 -0
  29. websec_validator/templates/probes/bola-write-verbs.py +147 -0
  30. websec_validator/templates/probes/compare-roles.sh +69 -0
  31. websec_validator/templates/probes/dlp-bypass-offline.py +149 -0
  32. websec_validator/templates/probes/hs256-brute-force.py +90 -0
  33. websec_validator/templates/probes/jwt-attacks.sh +161 -0
  34. websec_validator/templates/probes/mass-assignment.py +201 -0
  35. websec_validator/templates/probes/race-conditions.py +144 -0
  36. websec_validator/templates/probes/rate-limit-burst.sh +136 -0
  37. websec_validator/templates/probes/s3-assess.sh +120 -0
  38. websec_validator/templates/probes/ssrf-probes.sh +189 -0
  39. websec_validator/templates/probes/webhook-forgery.py +113 -0
  40. websec_validator/templates/reports/FINDINGS-SUMMARY.md.template +75 -0
  41. websec_validator/templates/reports/access-control-matrix.md.template +65 -0
  42. websec_validator/templates/reports/findings-triage.md.template +28 -0
  43. websec_validator/templates/reports/pentest-handover-brief.md.template +121 -0
  44. websec_validator/templates/reports/per-tool-FINDINGS.md.template +37 -0
  45. websec_validator-0.2.0.dist-info/METADATA +232 -0
  46. websec_validator-0.2.0.dist-info/RECORD +50 -0
  47. websec_validator-0.2.0.dist-info/WHEEL +5 -0
  48. websec_validator-0.2.0.dist-info/entry_points.txt +2 -0
  49. websec_validator-0.2.0.dist-info/licenses/LICENSE +21 -0
  50. websec_validator-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,71 @@
1
+ """GraphQL surface extractor.
2
+
3
+ GraphQL is its own attack surface (introspection schema-dump, alias/depth DoS,
4
+ GET-method mutations). Noir collapses a GraphQL server to one `POST /graphql`
5
+ endpoint, so we add the detail: is introspection on, is the playground exposed,
6
+ is there any depth/complexity limiting. Only emits when GraphQL is present.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+
13
+ from .base import Extractor, RepoContext
14
+
15
# Markers that a file defines or serves a GraphQL schema (JS/TS and Python servers).
SCHEMA_CODE = re.compile("|".join((
    r"makeExecutableSchema", r"buildSchema", r"new ApolloServer", r"createYoga",
    r"type-graphql", r"@Resolver", r"@ObjectType", r"gql`",
    r"type\s+Query\b", r"type\s+Mutation\b",
    r"strawberry\.", r"graphene\.", r"ariadne", r"mercurius",
)), re.I)

# Explicit introspection switches (case-sensitive, config-object style).
INTROSPECTION_ON = re.compile(r"introspection\s*:\s*true")
INTROSPECTION_OFF = re.compile("|".join((
    r"introspection\s*:\s*false", r"NoSchemaIntrospection", r"NoIntrospection",
)))

# Interactive IDE / landing page left enabled.
PLAYGROUND = re.compile("|".join((
    r"playground\s*:\s*true", r"graphiql\s*:\s*true",
    r"LandingPageGraphQLPlayground", r"LandingPageLocalDefault",
)))

# Any sign of depth / cost / complexity limiting.
LIMITING = re.compile("|".join((
    r"graphql-depth-limit", r"depthLimit", r"costAnalysis", r"graphql-cost-analysis",
    r"createComplexityLimitRule", r"query-complexity", r"graphql-armor",
)))
24
+
25
+
26
class GraphQLExtractor(Extractor):
    """Describe GraphQL exposure: introspection, playground and query limiting.

    Emits ``{"present": False}`` unless the stack facts, schema files, routed
    endpoints or the code itself indicate a GraphQL server.
    """

    name = "graphql"
    category = "surface"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return the GraphQL surface summary for the repository.

        ``facts`` is read for ``stack.frameworks`` and ``routes.endpoints``;
        both are optional.
        """
        stack_facts = facts.get("stack") or {}
        route_facts = facts.get("routes") or {}
        frameworks = set(stack_facts.get("frameworks", []))
        schema_paths = ctx.glob("**/*.graphql", 60) + ctx.glob("**/*.gql", 60)
        schema_files = [ctx.rel(path) for path in schema_paths]
        endpoints = []
        for ep in route_facts.get("endpoints", []):
            if "graphql" in ep.get("path", "").lower():
                endpoints.append(ep)

        # Cheap gate: skip the code scan when nothing hints at GraphQL.
        framework_hit = bool({"graphql", "apollo-graphql"} & frameworks)
        if not (framework_hit or schema_files or endpoints):
            return {"present": False}

        saw_enabled = saw_disabled = False
        playground = limiting = code_hit = False
        for _path, _rel, source in ctx.iter_code():
            code_hit = code_hit or bool(SCHEMA_CODE.search(source))
            if INTROSPECTION_ON.search(source):
                saw_enabled = True
            elif INTROSPECTION_OFF.search(source):
                saw_disabled = True
            playground = playground or bool(PLAYGROUND.search(source))
            limiting = limiting or bool(LIMITING.search(source))

        # A framework dependency alone is not enough; require real evidence.
        if not (code_hit or schema_files or endpoints):
            return {"present": False}

        # An explicit "true" anywhere outranks any "false".
        if saw_enabled:
            introspection = "enabled"
        elif saw_disabled:
            introspection = "disabled"
        else:
            introspection = "unknown"

        findings = []
        if introspection != "disabled":
            is_on = introspection == "enabled"
            findings.append({
                "severity": "HIGH" if is_on else "MEDIUM",
                "issue": "introspection " + ("ENABLED" if is_on else "not explicitly disabled"),
                "detail": "schema-dump exposure — disable in prod / add NoSchemaIntrospection",
            })
        if playground:
            findings.append({
                "severity": "MEDIUM",
                "issue": "GraphQL playground/landing page enabled",
                "detail": "disable in production",
            })
        if not limiting:
            findings.append({
                "severity": "MEDIUM",
                "issue": "no query depth/complexity limiting detected",
                "detail": "alias/deep-query DoS — add depth+cost limits (e.g. graphql-armor)",
            })

        routed = [f"{ep['method']} {ep['path']}" for ep in endpoints]
        return {
            "present": True,
            "endpoints": routed or ["(server detected; endpoint not routed by Noir)"],
            "schema_files": schema_files[:20],
            "introspection": introspection,
            "playground_enabled": playground,
            "query_limiting_detected": limiting,
            "findings": findings,
            "maps_to_probe": "graphql-cop (run externally against the /graphql endpoint)",
        }
@@ -0,0 +1,65 @@
1
+ """IaC + CI/CD extractor — the pipeline + infra attack surface.
2
+
3
+ The commonly-missed P0 surface: GitHub Actions script injection via untrusted
4
+ context, third-party actions pinned to mutable tags, Dockerfiles running as root,
5
+ and committed Terraform state. Pure static globbing — no tools required (zizmor /
6
+ Checkov can be layered later for depth).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+
13
+ from .base import Extractor, RepoContext
14
+
15
# GitHub Actions expression contexts whose value an attacker can control.
_EVENT_OBJECTS = "pull_request|issue|comment|review|head_commit|workflow_run"
_EVENT_FIELDS = "title|body|name|email|ref|label|message"
UNTRUSTED = re.compile(
    r"\$\{\{\s*github\.(?:head_ref"
    r"|event\.(?:" + _EVENT_OBJECTS + r")[^}]*"
    r"|event\.[^}]*\.(?:" + _EVENT_FIELDS + r")[^}]*)\s*\}\}"
)

# `uses: owner/repo@ref` -> (action, ref)
USES = re.compile(r"uses:\s*([^\s@#]+)@([^\s#'\"]+)")

# A full 40-hex commit SHA, i.e. an immutable pin.
SHA40 = re.compile(r"^[0-9a-f]{40}$")
21
+
22
+
23
class IacCiExtractor(Extractor):
    """Static checks over GitHub Actions workflows, Dockerfiles and Terraform state."""

    name = "iac_ci"
    category = "infra"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return pipeline/infra findings plus a per-severity tally.

        ``facts`` is accepted for interface parity and is not read here.
        Result keys: ``findings`` (list of dicts), ``by_severity`` (counts) and
        ``workflows_scanned`` (number of workflow files globbed).
        """
        findings = []

        # --- GitHub Actions ---
        # Globbed once and reused for the final count.
        workflows = ctx.glob(".github/workflows/*.yml") + ctx.glob(".github/workflows/*.yaml")
        for wf in workflows:
            rel, text = ctx.rel(wf), ctx.text(wf)
            # UNTRUSTED matches the whole `${{ github.… }}` expression; strip the
            # `${{` / `}}` delimiters so the report shows `github.head_ref` rather
            # than a doubly-prefixed `github.${{ github.head_ref }}`.
            contexts = sorted({m.group(0)[3:-2].strip() for m in UNTRUSTED.finditer(text)})
            if contexts:
                findings.append({"severity": "HIGH", "kind": "gha-script-injection", "file": rel,
                                 "detail": "untrusted context in workflow (dangerous if used in a run: step) — "
                                           + ", ".join(contexts[:4])})
            unpinned = sorted({f"{a}@{r}" for a, r in USES.findall(text)
                               if not SHA40.match(r) and not a.startswith("./")})
            if unpinned:
                findings.append({"severity": "MEDIUM", "kind": "gha-unpinned-action", "file": rel,
                                 "detail": "actions pinned to a mutable tag (pin to a commit SHA): "
                                           + ", ".join(unpinned[:6])})

        # --- Dockerfiles ---
        for df in ctx.glob("**/Dockerfile") + ctx.glob("**/Dockerfile.*"):
            rel, text = ctx.rel(df), ctx.text(df)
            users = re.findall(r"^\s*USER\s+(\S+)", text, re.M)
            # The last USER instruction wins; `USER user:group` is allowed, so
            # compare only the user part (`root:root` and `0:0` are still root).
            final_user = users[-1].split(":", 1)[0].lower() if users else ""
            if final_user in ("", "root", "0"):
                findings.append({"severity": "MEDIUM", "kind": "docker-root",
                                 "file": rel, "detail": "container runs as root (add a non-root USER)"})
            if "HEALTHCHECK" not in text:
                findings.append({"severity": "LOW", "kind": "docker-no-healthcheck",
                                 "file": rel, "detail": "no HEALTHCHECK defined"})

        # --- Terraform state committed ---
        for tf in ctx.glob("**/*.tfstate")[:5]:
            findings.append({"severity": "HIGH", "kind": "terraform-state-committed", "file": ctx.rel(tf),
                             "detail": "tfstate may contain plaintext secrets (DB passwords, keys) — must not be committed"})

        by_sev: dict = {}
        for f in findings:
            by_sev[f["severity"]] = by_sev.get(f["severity"], 0) + 1
        return {"findings": findings, "by_severity": by_sev,
                "workflows_scanned": len(workflows)}
@@ -0,0 +1,55 @@
1
+ """Integrations + webhooks extractor.
2
+
3
+ Inbound webhooks that don't verify a signature are a forgery/replay surface;
4
+ each outbound third-party SDK is a trust boundary + secret-handling surface.
5
+ Reads the route inventory to find webhook endpoints, then checks each handler
6
+ file for signature-verification code.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from pathlib import Path
13
+
14
+ from .base import Extractor, RepoContext
15
+
16
# Route paths that look like inbound webhook / callback receivers.
WEBHOOK_PATH = re.compile(r"webhook|/hook|/callback|/inbound", re.I)

# Any hint that a handler file verifies a request signature.
SIG_VERIFY = re.compile("|".join((
    r"createHmac", r"\bhmac\b", r"timingSafeEqual", r"verif\w*[Ss]ignature",
    r"X-Hub-Signature", r"X-Signature", r"Stripe-Signature", r"\bsvix\b",
    r"constant_time_compare", r"compare_digest", r"verifyWebhook", r"signature",
)), re.I)

# Dependency-name fragment -> human-readable integration label.
SDKS = {
    "stripe": "Stripe",
    "twilio": "Twilio",
    "@sendgrid": "SendGrid",
    "messagebird": "MessageBird/Bird",
    "@slack": "Slack",
    "openai": "OpenAI",
    "@anthropic": "Anthropic",
    "octokit": "GitHub",
    "plaid": "Plaid",
    "@aws-sdk": "AWS",
    "aws-sdk": "AWS",
    "firebase": "Firebase",
    "mailgun": "Mailgun",
    "@sentry": "Sentry",
    "paypal": "PayPal",
    "squareup": "Square",
    "@google-cloud": "GCP",
    "appsync": "AppSync",
    "wpapi": "WordPress",
    "@wordpress": "WordPress",
}
27
+
28
+
29
class IntegrationsExtractor(Extractor):
    """Inventory inbound webhooks and outbound third-party SDKs."""

    name = "integrations"
    category = "surface"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return webhook endpoints, those lacking signature checks, and detected SDKs.

        Reads ``facts["routes"]["endpoints"]`` (optional). A webhook counts as
        unverified when its handler file is unknown, unreadable/empty, or has no
        match for ``SIG_VERIFY``.
        """
        endpoints = (facts.get("routes") or {}).get("endpoints", [])
        webhook_eps = [e for e in endpoints if WEBHOOK_PATH.search(e.get("path", ""))]

        unverified = set()
        for e in webhook_eps:
            cp = e.get("code_path", "")
            # NOTE(review): assumes ctx.text / ctx.rel accept the code_path as
            # recorded by the routes extractor — confirm against RepoContext.
            text = ctx.text(Path(cp)) if cp else ""
            if not (text and SIG_VERIFY.search(text)):
                where = ctx.rel(Path(cp)) if cp else "?"
                unverified.add(f"{e['method']} {e['path']} ({where})")

        # Join every manifest with a separator so the tail of one file can never
        # fuse with the head of the next and fake (or hide) a dependency name.
        manifests = (ctx.glob("**/package.json", 80)
                     + ctx.glob("**/requirements*.txt", 40)
                     + ctx.glob("**/pyproject.toml", 40))
        blob = " ".join(ctx.text(p) for p in manifests).lower()
        detected = sorted({label for dep, label in SDKS.items() if dep.lower() in blob})

        return {
            "webhook_endpoints": [f"{e['method']} {e['path']}" for e in webhook_eps],
            "webhooks_without_sig_verification": sorted(unverified),  # HIGH if non-empty
            "third_party_integrations": detected,
            "note": "Webhooks with no signature-verification code in their handler = forgery/replay risk "
                    "(run webhook-forgery; verify against your middleware). Each integration is an outbound "
                    "trust + secret-handling surface (SSRF, secret leakage, supply-chain).",
        }
@@ -0,0 +1,215 @@
1
+ """Route / endpoint extractor — the spine of the attack surface.
2
+
3
+ Primary engine: **OWASP Noir** (owasp-noir/noir) — 50+ frameworks, real parsing
4
+ (Next.js App Router, Express, NestJS, Flask, FastAPI, Django, Rails, Go...),
5
+ emits method + path + typed params + code path. We shell out to it and parse its
6
+ JSON. If Noir isn't installed we fall back to a framework-aware regex pass so the
7
+ tool still produces something — but Noir is strongly preferred and the briefing
8
+ says so when it's missing.
9
+
10
+ We then DERIVE the high-value targeting signals that make probes precise:
11
+ - write endpoints → BOLA-write / mass-assignment targets
12
+ - path-param endpoints → IDOR / BOLA enumeration targets
13
+ - url/domain-ish params → SSRF candidates
14
+ - redirect-ish params → open-redirect candidates
15
+ - file-upload params → upload / path-traversal candidates
16
+ - auth endpoints → login surface
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import re
23
+ import shutil
24
+ import subprocess
25
+ import tempfile
26
+ from pathlib import Path
27
+
28
+ from .base import Extractor, RepoContext
29
+
30
# HTTP methods that mutate state.
WRITE_VERBS = {"POST", "PUT", "PATCH", "DELETE"}

# Comma-separated glob list handed to Noir's --exclude-path (tests and stories).
EXCLUDE_GLOBS = ",".join((
    "*.test.ts", "*.test.tsx", "*.spec.ts", "*.test.js", "*.spec.js",
    "*_test.go", "*_test.py", "test_*.py", "*.stories.tsx",
))


def _param_names(*names: str) -> "re.Pattern[str]":
    """Compile a case-insensitive whole-name matcher with an optional plural ``s``."""
    return re.compile("^(" + "|".join(names) + ")s?$", re.I)


# param-name heuristics → attack class
SSRF_NAMES = _param_names(
    "url", "uri", "link", "domain", "host", "endpoint", "webhook", "feed", "rss",
    "image", "img", "src", "proxy", "fetch", "target", "origin", "site", "address",
)
REDIRECT_NAMES = _param_names(
    "redirect", "redirect_?uri", "next", "return", "return_?url", "callback",
    "continue", "dest", "destination", "goto",
)
TRAVERSAL_NAMES = _param_names(
    "file", "filename", "filepath", "path", "dir", "folder", "template", "name",
    "key", "attachment", "download", "doc",
)

# Substrings marking a "route" as a templated or absolute URL, not a server path.
TEMPLATED = ("BASE_URL", "localhost", "127.0.0.1", "${", "{{")
# Static-asset glob routes such as /*.png.
ASSET_GLOB = re.compile(r"\*\.\w+")
40
+
41
+
42
+ def _clean_path(p: str) -> str:
43
+ p = re.sub(r":(\w+)", r"{\1}", p) # Express :id -> {id}
44
+ p = re.sub(r"\*(\w+)", r"{\1}", p) # splat *key -> {key}
45
+ return p
46
+
47
+
48
def _is_noise(path: str) -> bool:
    """Return True for values that are not real server-relative routes.

    Rejected: empty or non-absolute paths, anything containing a ``TEMPLATED``
    marker, and static-asset glob routes (``/*.png``).
    """
    if not path:
        return True
    if not path.startswith("/"):
        return True
    for marker in TEMPLATED:
        if marker in path:
            return True
    return ASSET_GLOB.search(path) is not None
54
+
55
+
56
def _noir_scan(root: Path) -> list | None:
    """Run Noir → list of endpoint dicts, or None if Noir unavailable/failed.

    ``None`` tells the caller to use the regex fallback; an empty list means
    Noir ran and reported no endpoints.
    """
    if not shutil.which("noir"):
        return None
    # delete=False: only the path is needed; Noir writes to it after the handle closes.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tf:
        out = Path(tf.name)
    try:
        subprocess.run(
            ["noir", "scan", str(root), "-f", "json", "-o", str(out),
             "--exclude-path", EXCLUDE_GLOBS, "--no-log", "--no-color"],
            capture_output=True, text=True, timeout=300)
        raw = out.read_text(encoding="utf-8").strip()
        if not raw:
            # The temp file always exists because it was created above, so an
            # existence check cannot detect failure. An empty file means Noir
            # wrote nothing: report failure rather than "zero endpoints".
            return None
        data = json.loads(raw)
        if isinstance(data, dict):
            return data.get("endpoints", [])
        return data if isinstance(data, list) else None
    except (OSError, subprocess.SubprocessError, ValueError):
        # Launch failure, timeout, unreadable file, or malformed JSON.
        return None
    finally:
        try:
            out.unlink()
        except OSError:
            pass  # best-effort cleanup of the temp file
78
+
79
+
80
def _normalize_noir(eps: list) -> list:
    """Convert raw Noir endpoint dicts into this module's route rows.

    Skips endpoints flagged ``internal``, noise paths and repeated
    ``(method, path)`` pairs; the first occurrence wins.
    """
    normalized = []
    emitted = set()
    for raw in eps:
        if raw.get("internal"):
            continue
        route = raw.get("url") or raw.get("path") or ""
        # Noir keeps Django <int:pk> / <str:name> notation — normalize to {pk}/{name}
        route = _clean_path(re.sub(r"<(?:[\w]+:)?([\w]+)>", r"{\1}", route))
        if _is_noise(route):
            continue
        verb = (raw.get("method") or "GET").upper()
        key = (verb, route)
        if key in emitted:
            continue
        emitted.add(key)

        details = raw.get("details", {}) or {}
        code_paths = details.get("code_paths") or [{}]
        normalized.append({
            "method": verb,
            "path": route,
            "params": [
                {"name": param.get("name", ""), "where": param.get("param_type", "")}
                for param in (raw.get("params") or [])
            ],
            "technology": details.get("technology", ""),
            "code_path": code_paths[0].get("path", ""),
            "source": "noir",
        })
    return normalized
107
+
108
+
109
+ # ---- regex fallback (only when Noir is absent) ---------------------------------------------
110
+
111
def _fallback(ctx: RepoContext) -> list:
    """Collect routes with the regex passes, cleaned and de-duplicated.

    The Next.js pass runs before the generic regex pass, and the first row seen
    for a ``(method, path)`` pair is the one kept.
    """
    candidates = _fallback_next_app_router(ctx) + _fallback_regex(ctx)
    unique: dict = {}
    for row in candidates:
        row["path"] = _clean_path(row["path"])
        if _is_noise(row["path"]):
            continue
        # setdefault keeps the first row per key; dicts preserve insertion order.
        unique.setdefault((row["method"], row["path"]), row)
    return list(unique.values())
126
+
127
+
128
def _fallback_next_app_router(ctx: RepoContext) -> list:
    """Derive routes from Next.js App Router ``route.(ts|js|tsx)`` files.

    The URL path comes from the directory layout under ``app/`` (or ``pages/``):
    route groups ``(group)`` are dropped and dynamic segments become ``{name}``.
    One row is emitted per exported HTTP-method handler found in the file.
    """
    rows = []
    # Handlers may be declared as `export async function GET` or `export const GET = ...`.
    method_rx = re.compile(
        r"export\s+(?:(?:async\s+)?function|const)\s+(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS)\b")
    for p in ctx.glob("**/route.ts") + ctx.glob("**/route.js") + ctx.glob("**/route.tsx"):
        rel = ctx.rel(p)
        # The directory part is optional so a root-level app/route.ts maps to "/".
        m = re.search(r"(?:^|/)(?:src/)?(?:app|pages)/(?:(.*)/)?route\.[tj]sx?$", rel)
        if not m:
            continue
        seg = m.group(1) or ""
        seg = re.sub(r"\(([^)]+)\)/?", "", seg)                  # route groups (group)
        # Optional catch-all must go first: handled by the two rules below it
        # would become "{{slug}}", which the noise filter treats as templated.
        seg = re.sub(r"\[\[\.\.\.([^\]]+)\]\]", r"{\1}", seg)    # [[...slug]]
        seg = re.sub(r"\[\.\.\.([^\]]+)\]", r"{\1}", seg)        # [...slug]
        seg = re.sub(r"\[([^\]]+)\]", r"{\1}", seg)              # [id]
        path = "/" + seg.strip("/")
        for verb in method_rx.findall(ctx.text(p)):
            rows.append({"method": verb, "path": path, "params": [],
                         "technology": "js_nextjs", "code_path": rel, "source": "fallback"})
    return rows
145
+
146
+
147
def _fallback_regex(ctx: RepoContext) -> list:
    """Regex-scan code for Express, FastAPI and Flask route declarations.

    This is a heuristic, not a parser: it only sees literal string paths and,
    for Flask, a ``methods=`` list or tuple inside the same decorator call.
    """
    rows = []
    # (?<!@) keeps Python decorators (@app.get) out of the Express bucket, so
    # FastAPI routes are not also recorded as "express".
    express = re.compile(r"(?<!@)\b(?:router|app)\.(get|post|put|patch|delete)\s*\(\s*['\"`]([^'\"`]+)")
    # [^)]*? confines the methods= lookup to this decorator's own parentheses.
    # A greedy DOTALL ".*" would run to the last methods= in the file, swallow
    # the routes in between and attach another route's verbs to this path.
    flask = re.compile(
        r"@\w+\.route\s*\(\s*['\"]([^'\"]+)['\"](?:[^)]*?methods\s*=\s*[\[(]([^\])]*)[\])])?")
    fastapi = re.compile(r"@\w+\.(get|post|put|patch|delete)\s*\(\s*['\"]([^'\"]+)")
    for _p, rel, text in ctx.iter_code():
        for verb, path in express.findall(text):
            rows.append({"method": verb.upper(), "path": path, "params": [],
                         "technology": "express", "code_path": rel, "source": "fallback"})
        for verb, path in fastapi.findall(text):
            rows.append({"method": verb.upper(), "path": path, "params": [],
                         "technology": "fastapi", "code_path": rel, "source": "fallback"})
        for path, methods in flask.findall(text):
            # Flask defaults to GET when no methods= argument is given.
            for verb in (re.findall(r"['\"](\w+)['\"]", methods) or ["GET"]):
                rows.append({"method": verb.upper(), "path": path, "params": [],
                             "technology": "flask", "code_path": rel, "source": "fallback"})
    return rows
164
+
165
+
166
def _derive(routes: list) -> dict:
    """Bucket routes into per-attack-class target lists for the probes.

    Every returned list is de-duplicated and sorted.
    """
    write_hits, idor_hits, auth_hits = set(), set(), set()
    ssrf_hits, redirect_hits, upload_hits = set(), set(), set()
    auth_rx = re.compile(r"/(login|signin|sign-in|auth|token|session|oauth)\b", re.I)

    for route in routes:
        label = f"{route['method']} {route['path']}"
        if route["method"] in WRITE_VERBS:
            write_hits.add(label)
        if "{" in route["path"] or any(p["where"] == "path" for p in route["params"]):
            idor_hits.add(label)
        if auth_rx.search(route["path"]):
            auth_hits.add(label)
        for param in route["params"]:
            pname = param["name"]
            tagged = f"{label} (param: {pname})"
            # First matching class wins; upload candidates must be form params.
            if SSRF_NAMES.match(pname):
                ssrf_hits.add(tagged)
            elif REDIRECT_NAMES.match(pname):
                redirect_hits.add(tagged)
            elif param["where"] == "form" and TRAVERSAL_NAMES.match(pname):
                upload_hits.add(tagged)

    return {
        "write_endpoints": sorted(write_hits),
        "idor_candidates": sorted(idor_hits),
        "ssrf_candidates": sorted(ssrf_hits),
        "open_redirect_candidates": sorted(redirect_hits),
        "upload_candidates": sorted(upload_hits),
        "auth_endpoints": sorted(auth_hits),
    }
189
+
190
+
191
class RoutesExtractor(Extractor):
    """Build the endpoint inventory, via Noir when available, else regex fallback."""

    name = "routes"
    category = "surface"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return the route list with per-method/per-technology tallies and targeting."""
        noir_eps = _noir_scan(ctx.root)
        if noir_eps is None:
            routes = _fallback(ctx)
            engine = "regex-fallback (install OWASP Noir for full coverage: brew install noir)"
        else:
            routes = _normalize_noir(noir_eps)
            engine = "noir"

        by_method: dict = {}
        by_tech: dict = {}
        for route in routes:
            for tally, key in ((by_method, route["method"]), (by_tech, route["technology"])):
                tally[key] = tally.get(key, 0) + 1

        return {
            "engine": engine,
            "count": len(routes),
            "by_method": by_method,
            "by_technology": by_tech,
            "endpoints": routes,
            "targeting": _derive(routes),
        }
@@ -0,0 +1,75 @@
1
+ """Schema / entity extractor — the data model + its sensitive fields.
2
+
3
+ Borrowed from DocGuard's multilang model scanners. Finds ORM/schema models
4
+ (Pydantic, SQLAlchemy, Django, Prisma, Mongoose, TypeORM, Zod, Sequelize) and the
5
+ **sensitive field names** they use (role, isAdmin, groupId, passwordHash, …). That
6
+ turns mass-assignment / BOPLA probes from a generic guess into "try injecting THIS
7
+ app's privileged fields", and surfaces the object-ownership/tenant fields BOLA
8
+ depends on.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+
15
+ from .base import Extractor, RepoContext
16
+
17
# (label, pattern) pairs; group 1 of every pattern captures the model/entity name.
_DECL_PATTERNS = (
    ("pydantic", r"class\s+(\w+)\s*\([^)]*BaseModel"),
    ("sqlalchemy", r"class\s+(\w+)\s*\([^)]*\bBase\b[^)]*\)"),
    ("django", r"class\s+(\w+)\s*\([^)]*models\.Model"),
    ("prisma", r"\bmodel\s+(\w+)\s*\{"),
    ("mongoose", r"\b(\w+)\s*=\s*(?:new\s+)?(?:mongoose\.)?Schema\s*\("),
    ("typeorm", r"@Entity\([^)]*\)\s*(?:export\s+)?class\s+(\w+)"),
    ("zod", r"\b(\w+)\s*=\s*z\.object\s*\("),
    ("sequelize", r"sequelize\.define\s*\(\s*['\"](\w+)['\"]"),
)
DECLS = [(label, re.compile(pattern)) for label, pattern in _DECL_PATTERNS]

# Whole-identifier match for privilege, ownership, credential and billing-like fields.
SENSITIVE = re.compile(
    "^("
    + "|".join((
        r"roles?", r"is_?admin", r"admin", r"permissions?", r"scopes?",
        r"password", r"password_?hash", r"pwd",
        r"owner", r"owner_?id", r"user_?id", r"group_?id", r"tenant_?id",
        r"org_?id", r"organization_?id", r"account_?id",
        r"balance", r"credits?", r"is_?verified", r"verified", r"status",
        r"plan", r"tier", r"enabled", r"active", r"api_?key",
        r"secret", r"token", r"email_?verified", r"stripe_?customer", r"subscription",
    ))
    + ")$",
    re.I,
)

# Paths that conventionally hold model/schema definitions.
MODELISH_PATH = re.compile(r"/models?/|/schemas?/|/entit|\.prisma$|\.model\.|\.entity\.", re.I)
# Bare identifiers in source text.
IDENT = re.compile(r"\b([A-Za-z_]\w*)\b")
36
+
37
+
38
class SchemasExtractor(Extractor):
    """Find ORM/schema models and the sensitive field names used around them."""

    name = "schemas"
    category = "data"

    # Maximum number of entity records returned; entity_count is not capped.
    _MAX_ENTITIES = 60

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return detected ORMs, unique entities and sensitive identifiers.

        ``facts`` is accepted for interface parity and is not read here.
        Entities are de-duplicated by ``(name, type)`` as they are found, so
        repeated declarations no longer use up the result cap and
        ``entity_count`` reports every unique entity seen.
        """
        orms: set = set()
        sensitive: set = set()
        seen: set = set()
        ents: list = []

        for _p, rel, text in ctx.iter_code():
            is_model_file = bool(MODELISH_PATH.search(rel))
            for label, rx in DECLS:
                for m in rx.finditer(text):
                    orms.add(label)
                    is_model_file = True
                    # Guard against a pattern without a capture group.
                    name = m.group(1) if m.groups() else None
                    key = (name, label)
                    if not name or key in seen:
                        continue
                    seen.add(key)
                    if len(ents) < self._MAX_ENTITIES:
                        ents.append({"name": name, "type": label, "file": rel})
            if is_model_file:
                # Every identifier in a model-ish file is checked, not only
                # declared fields: deliberately loose, favouring recall.
                sensitive.update(w for w in IDENT.findall(text) if SENSITIVE.match(w))

        return {
            "orms": sorted(orms),
            "entity_count": len(seen),
            "entities": ents,
            "sensitive_fields": sorted(sensitive),
            "note": "Mass-assignment/BOPLA probes should try injecting these app-specific privileged "
                    "fields into update/create payloads; ownership/tenant fields here are what BOLA must isolate.",
        }
@@ -0,0 +1,80 @@
1
+ """Stack extractor — languages, frameworks, package managers, datastores.
2
+
3
+ Monorepo-aware: aggregates every package.json / Python manifest in the tree
4
+ (node_modules excluded by SKIP_DIRS), so a backend/ service's Express + DynamoDB
5
+ deps are seen even when the repo root is just a workspace shell. Runs first; its
6
+ result is stashed on ctx.stack for later extractors.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .base import Extractor, RepoContext
12
+
13
# npm package name (or scope prefix) -> framework label
NODE_FRAMEWORKS = {
    "express": "express",
    "fastify": "fastify",
    "koa": "koa",
    "@nestjs/core": "nestjs",
    "next": "next",
    "@hapi/hapi": "hapi",
    "next-auth": "nextauth",
    "@remix-run": "remix",
    "svelte": "sveltekit",
    "@apollo/server": "apollo-graphql",
    "graphql": "graphql",
}

# Python distribution name -> framework label
PY_FRAMEWORKS = {
    "fastapi": "fastapi",
    "flask": "flask",
    "django": "django",
    "starlette": "starlette",
    "sanic": "sanic",
    "tornado": "tornado",
    "aiohttp": "aiohttp",
}

# Driver / ORM / SDK name -> datastore label (shared by the Node and Python passes)
DATASTORES = {
    "pg": "postgres",
    "postgres": "postgres",
    "mysql": "mysql",
    "mysql2": "mysql",
    "mongodb": "mongo",
    "mongoose": "mongo",
    "@aws-sdk/client-dynamodb": "dynamodb",
    "@aws-sdk/lib-dynamodb": "dynamodb",
    "dynamodb": "dynamodb",
    "redis": "redis",
    "ioredis": "redis",
    "sqlite": "sqlite",
    "prisma": "prisma(sql)",
    "sequelize": "sql-orm",
    "typeorm": "sql-orm",
    "drizzle-orm": "sql-orm",
    "sqlalchemy": "sql-orm",
    "psycopg2": "postgres",
    "pymongo": "mongo",
    "boto3": "aws",
}
27
+
28
+
29
class StackExtractor(Extractor):
    """Inventory languages, frameworks, package managers and datastores."""

    name = "stack"
    category = "inventory"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Aggregate every manifest in the tree and stash the result on ``ctx.stack``.

        ``facts`` is accepted for interface parity and is not read here.
        """
        langs: set = set()
        frameworks: set = set()
        managers: set = set()
        datastores: set = set()

        # --- Node: exact quoted package names inside any package.json ---
        pkgs = ctx.glob("**/package.json", 120)
        node_text = " ".join(ctx.text(pkg) for pkg in pkgs)
        if node_text:
            langs.add("node")
            managers.add("npm")
            frameworks.update(
                label for dep, label in NODE_FRAMEWORKS.items()
                # `"dep/` also catches scoped sub-packages such as @remix-run/node
                if f'"{dep}"' in node_text or f'"{dep}/' in node_text
            )
            datastores.update(
                label for dep, label in DATASTORES.items() if f'"{dep}"' in node_text
            )
            if '"typescript"' in node_text or ctx.glob("**/tsconfig.json", 1):
                langs.add("typescript")
            if ctx.glob("**/pnpm-lock.yaml", 1):
                managers.add("pnpm")
            if ctx.glob("**/yarn.lock", 1):
                managers.add("yarn")

        # --- Python: loose substring match over all manifests, lower-cased ---
        py_manifests = (ctx.glob("**/requirements*.txt", 80) + ctx.glob("**/pyproject.toml", 80)
                        + ctx.glob("**/setup.py", 80) + ctx.glob("**/Pipfile", 80))
        py_text = " ".join(ctx.text(manifest) for manifest in py_manifests).lower()
        if py_text.strip():
            langs.add("python")
            managers.add("pip")
            frameworks.update(label for dep, label in PY_FRAMEWORKS.items() if dep in py_text)
            datastores.update(label for dep, label in DATASTORES.items() if dep in py_text)

        # --- Other ecosystems: presence of the manifest is enough ---
        if ctx.glob("**/go.mod", 1):
            langs.add("go")
        if ctx.glob("**/Gemfile", 1):
            langs.add("ruby")

        is_monorepo = len(pkgs) > 1 or ctx.exists(
            "pnpm-workspace.yaml", "lerna.json", "nx.json", "turbo.json")
        result = {
            "languages": sorted(langs),
            "frameworks": sorted(frameworks),
            "package_managers": sorted(managers),
            "datastores": sorted(datastores),
            "monorepo": is_monorepo,
            "services": len(pkgs),
        }
        ctx.stack = result
        return result
@@ -0,0 +1,86 @@
1
+ """Attack-surface extractor — code-level dangerous sinks, user-control-aware.
2
+
3
+ Each signature embeds a user-input marker (`req.`/`request.`/string concat/
4
+ template interpolation/format) so a match means "dangerous op fed by something
5
+ that looks attacker-influenced", not merely "this function is used anywhere".
6
+ Signatures derived from the recon-engine research. Each class maps to the probe
7
+ that exercises it, so the briefing can point probes at the right files.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+
14
+ from .base import Extractor, RepoContext
15
+
16
# user-controlled markers (kept loose on purpose)
_U = r"(?:req\.|request\.|\+|`[^`]*\$\{|f['\"]|%\s*[\(%]|\.format\s*\(|searchParams|nextUrl|params\[)"

# class -> (probe it feeds, gating, compiled regex)
# gating: None | "sql" | "nosql"  (datastore-dependent classes)
SINKS = {
    "ssrf": ("ssrf-probes", None, re.compile(
        r"(?:axios|got|node-fetch|superagent|needle|httpx|urllib\.request)\b.*\b" + _U
        + r"|\bfetch\s*\(\s*" + _U + r"|requests\.(?:get|post|put|request)\s*\(\s*" + _U)),
    "command-injection": ("ssrf-probes", None, re.compile(
        r"(?:child_process\.exec|\bexecSync|\bexec|\bspawn|os\.system|subprocess\.(?:run|call|check_output|Popen))\s*\([^)]*"
        + _U + r"|shell\s*=\s*True")),
    "sql-injection": ("bola-write-verbs", "sql", re.compile(
        r"(?:\.query|\.execute|\.raw|cursor\.execute|sequelize\.query|knex\.raw)\s*\([^)]*(?:\$\{|\+|%\s*[\(%]|\.format\s*\(|f['\"])")),
    "nosql-injection": ("bola-write-verbs", "nosql", re.compile(
        r"\.(?:find|findOne|update|updateOne|deleteOne|aggregate)\s*\(\s*(?:req\.|request\.)|\$where")),
    "path-traversal": ("bola-write-verbs", None, re.compile(
        r"(?:fs\.(?:readFile|writeFile|createReadStream|unlink|readdir)|sendFile|os\.path\.join|\bopen|path\.(?:join|resolve))\s*\([^)]*"
        + _U)),
    # `Template` covers both `new Template(` and a bare `Template(` call. The old
    # `Template\s*\(` alternative was followed by the shared `\s*\(`, so it could
    # only ever match `Template((`.
    "ssti": ("ssrf-probes", None, re.compile(
        r"(?:render_template_string|renderString|nunjucks\.renderString|ejs\.render|pug\.compile|Handlebars\.compile|Template)\s*\([^)]*"
        + _U)),
    # The setHeader alternative consumes its own opening parenthesis, so it sits
    # outside the group that adds the shared `\s*\(`; before, it was unreachable.
    "open-redirect": ("bola-write-verbs", None, re.compile(
        r"(?:(?:res\.redirect|HttpResponseRedirect|RedirectResponse|return\s+redirect)\s*\(|res\.setHeader\s*\(\s*['\"]Location)[^)]*"
        + _U)),
    "insecure-deserialization": ("bola-write-verbs", None, re.compile(
        r"pickle\.loads?\s*\(|cPickle\.loads?\s*\(|yaml\.load\s*\((?![^)]*Loader)|node-serialize.*unserialize\s*\(|\bunserialize\s*\(")),
    "xxe": ("ssrf-probes", None, re.compile(
        r"libxmljs\.parseXml\s*\(|lxml\.etree\.(?:parse|fromstring|XML)\s*\(|xml\.etree\.ElementTree\.(?:parse|fromstring)\s*\(|new\s+DOMParser")),
    "prototype-pollution": ("mass-assignment", None, re.compile(
        r"(?:_\.merge|_\.mergeWith|_\.defaultsDeep|Object\.assign)\s*\([^)]*(?:req\.|request\.)|\.update\s*\([^)]*request\.(?:json|get_json|form)")),
    "redos": ("ssrf-probes", None, re.compile(
        r"new\s+RegExp\s*\([^)]*(?:req\.|request\.|\+)|re\.(?:compile|match|search|fullmatch)\s*\([^,)]*(?:request\.|f['\"])")),
    "eval-injection": ("bola-write-verbs", None, re.compile(
        r"\beval\s*\([^)]*" + _U + r"|new\s+Function\s*\([^)]*" + _U)),
}
52
+
53
+
54
class SurfaceExtractor(Extractor):
    """Locate files containing dangerous sinks that look fed by user input."""

    name = "surface"
    category = "sinks"

    def extract(self, ctx: RepoContext, facts: dict) -> dict:
        """Return per-class sink hits (one count per matching file, not per match).

        SQL / NoSQL classes are evaluated only when ``facts["stack"]["datastores"]``
        names a datastore of that family.
        """
        datastores = set((facts.get("stack") or {}).get("datastores", []))
        has_sql = any("sql" in d or d in ("postgres", "mysql", "sqlite") for d in datastores)
        has_nosql = any(d in ("mongo", "dynamodb") for d in datastores)

        # Resolve the gates once instead of once per file.
        gate_open = {None: True, "sql": has_sql, "nosql": has_nosql}
        active = [(cls, rx) for cls, (_probe, gate, rx) in SINKS.items()
                  if gate_open.get(gate, True)]

        files: dict = {cls: [] for cls in SINKS}
        counts: dict = dict.fromkeys(SINKS, 0)
        for _path, rel, source in ctx.iter_code():
            for cls, rx in active:
                if not rx.search(source):
                    continue
                counts[cls] += 1
                if len(files[cls]) < 60:  # cap the listed files, keep counting
                    files[cls].append(rel)

        hit_classes = [cls for cls in SINKS if counts[cls]]
        if has_sql:
            datastore_class = "sql"
        elif has_nosql:
            datastore_class = "nosql"
        else:
            datastore_class = "unknown"

        return {
            "sinks": {cls: {"probe": SINKS[cls][0], "count": counts[cls], "files": files[cls]}
                      for cls in hit_classes},
            "sink_counts": {cls: counts[cls] for cls in hit_classes},
            "datastore_class": datastore_class,
            "note": "Each sink hit is user-input-gated (req./request./concat/interp), so these are "
                    "higher-confidence leads. Cross-reference the files with routes.targeting to pick "
                    "the endpoint to probe. On a NoSQL/JSON API, SQLi alerts from generic scanners are "
                    "usually false positives.",
        }