websec-validator 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. websec_validator/__init__.py +14 -0
  2. websec_validator/briefing.py +218 -0
  3. websec_validator/calibration.json +75 -0
  4. websec_validator/calibration.py +226 -0
  5. websec_validator/cli.py +395 -0
  6. websec_validator/constitution.py +81 -0
  7. websec_validator/corpus.json +49 -0
  8. websec_validator/dynamic.py +249 -0
  9. websec_validator/extractors/__init__.py +56 -0
  10. websec_validator/extractors/auth.py +77 -0
  11. websec_validator/extractors/authz.py +130 -0
  12. websec_validator/extractors/base.py +101 -0
  13. websec_validator/extractors/client_exposure.py +48 -0
  14. websec_validator/extractors/graphql.py +71 -0
  15. websec_validator/extractors/iac_ci.py +65 -0
  16. websec_validator/extractors/integrations.py +55 -0
  17. websec_validator/extractors/routes.py +215 -0
  18. websec_validator/extractors/schemas.py +75 -0
  19. websec_validator/extractors/stack.py +80 -0
  20. websec_validator/extractors/surface.py +86 -0
  21. websec_validator/extractors/tenant.py +33 -0
  22. websec_validator/findings.py +199 -0
  23. websec_validator/probes.py +79 -0
  24. websec_validator/proof.py +96 -0
  25. websec_validator/recon.py +28 -0
  26. websec_validator/report.py +114 -0
  27. websec_validator/scanners.py +248 -0
  28. websec_validator/templates/probes/bola-cross-tenant.sh +192 -0
  29. websec_validator/templates/probes/bola-write-verbs.py +147 -0
  30. websec_validator/templates/probes/compare-roles.sh +69 -0
  31. websec_validator/templates/probes/dlp-bypass-offline.py +149 -0
  32. websec_validator/templates/probes/hs256-brute-force.py +90 -0
  33. websec_validator/templates/probes/jwt-attacks.sh +161 -0
  34. websec_validator/templates/probes/mass-assignment.py +201 -0
  35. websec_validator/templates/probes/race-conditions.py +144 -0
  36. websec_validator/templates/probes/rate-limit-burst.sh +136 -0
  37. websec_validator/templates/probes/s3-assess.sh +120 -0
  38. websec_validator/templates/probes/ssrf-probes.sh +189 -0
  39. websec_validator/templates/probes/webhook-forgery.py +113 -0
  40. websec_validator/templates/reports/FINDINGS-SUMMARY.md.template +75 -0
  41. websec_validator/templates/reports/access-control-matrix.md.template +65 -0
  42. websec_validator/templates/reports/findings-triage.md.template +28 -0
  43. websec_validator/templates/reports/pentest-handover-brief.md.template +121 -0
  44. websec_validator/templates/reports/per-tool-FINDINGS.md.template +37 -0
  45. websec_validator-0.2.0.dist-info/METADATA +232 -0
  46. websec_validator-0.2.0.dist-info/RECORD +50 -0
  47. websec_validator-0.2.0.dist-info/WHEEL +5 -0
  48. websec_validator-0.2.0.dist-info/entry_points.txt +2 -0
  49. websec_validator-0.2.0.dist-info/licenses/LICENSE +21 -0
  50. websec_validator-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
1
+ """websec-validator — local-first security recon that briefs an AI coding agent.
2
+
3
+ The tool does the deterministic half (read the repo, run the scanners it finds,
4
+ stage the probe library tailored to what it discovered) and emits three artifacts:
5
+
6
+ 1. findings.json — de-duplicated static scanner results
7
+ 2. FACTS.json — stack, routes, auth-model candidates, attack surface
8
+ 3. AGENT-BRIEFING.md — marching orders + staged probe scripts for your AI agent
9
+
10
+ It never calls an LLM, never runs a server, and never needs a running instance of
11
+ the target app. Running the probes and applying fixes is the agent + human's job.
12
+ """
13
+
14
# Keep in sync with the distribution version: this wheel and its dist-info are 0.2.0.
__version__ = "0.2.0"
@@ -0,0 +1,218 @@
1
+ """Render AGENT-BRIEFING.md — the marching orders for the AI coding agent.
2
+
3
+ Now driven by the full recon facts: it leads with the *targeting* (which exact
4
+ endpoints are SSRF/IDOR/upload candidates), because that's what turns a generic
5
+ probe into a precise one.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+
11
def _bullets(items, empty="_(none)_", cap=40):
    """Format *items* as a markdown bullet list.

    Returns *empty* when there is nothing to list. At most *cap* entries are
    shown; when more exist, a trailing bullet states how many were left out.
    """
    entries = list(items or [])
    if not entries:
        return empty
    rendered = "\n".join(f"- {entry}" for entry in entries[:cap])
    hidden = len(entries) - cap
    if hidden > 0:
        rendered = f"{rendered}\n- _…and {hidden} more (see FACTS.json)_"
    return rendered
20
+
21
+
22
def _section(title, items):
    """Return a bold *title* with the item count, followed by the bulleted items."""
    count = len(items or [])
    body = _bullets(items)
    return f"**{title}** ({count}):\n{body}\n"
24
+
25
+
26
def render(facts: dict, scanners: dict, scan_results: list, probe_manifest: list,
           unified: dict | None = None) -> str:
    """Build the AGENT-BRIEFING.md markdown text from the recon output.

    Args:
        facts: Recon facts. The sections read are ``stack``, ``auth``, ``routes``,
            ``tenant``, ``surface``, ``authz``, ``iac_ci``, ``client_exposure``,
            ``graphql``, ``integrations`` and ``schemas``, plus the ``target`` and
            ``version`` values. A missing section falls back to an empty default.
        scanners: Mapping with ``available`` and ``missing`` scanner descriptors.
        scan_results: Per-scanner run results; shown in section 4 only when
            ``unified`` is falsy.
        probe_manifest: Staged probe descriptors; entries without an
            ``attack_class`` key are left out of section 5.
        unified: Optional de-duplicated findings summary. When truthy it replaces
            the per-scanner lines in section 4.

    Returns:
        The complete briefing as a single markdown string.
    """
    stack = facts.get("stack", {})
    auth = facts.get("auth", {})
    routes = facts.get("routes", {})
    targeting = routes.get("targeting", {})
    tenant = facts.get("tenant", {})
    surface = facts.get("surface", {})
    authz = facts.get("authz", {})
    client = facts.get("client_exposure", {})
    gql = facts.get("graphql", {})
    integ = facts.get("integrations", {})
    schemas = facts.get("schemas", {})

    sink_summary = ", ".join(
        [f"{kind} ({count})" for kind, count in surface.get("sink_counts", {}).items()]
    ) or "_none_"

    # --- access control -------------------------------------------------
    guard_summary = authz.get("guard_summary", {})
    global_auth = authz.get("global_auth_middleware", False)
    roles_str = ", ".join(f"`{role}`" for role in authz.get("roles_detected", [])) or "_none detected_"
    unguarded_writes = authz.get("write_endpoints_without_visible_guard", [])
    if unguarded_writes:
        unprot_section = _section(
            "Write endpoints with NO guard visible in their handler file (verify)", unguarded_writes)
    else:
        unprot_section = "_Every write endpoint has a visible guard or looks public — still spot-check._"
    middleware = authz.get("next_middleware", {})
    if middleware.get("present"):
        mw_line = f"Next.js middleware `{middleware.get('file')}` gates matchers: {middleware.get('matchers')}"
    else:
        mw_line = "_No Next.js middleware.ts found — auth is per-handler._"

    # --- config / CI / client-side ---------------------------------------
    iac_findings = facts.get("iac_ci", {}).get("findings", [])
    # Only the first 20 pipeline/IaC findings are listed; the heading shows the full count.
    iac_lines = "\n".join(
        f"- **{item['severity']}** `{item['kind']}` — `{item['file']}` — {item['detail']}"
        for item in iac_findings[:20]) or "_none_"
    client_leaks = client.get("public_secret_leaks", []) + client.get("server_secret_in_client_component", [])
    client_section = _bullets(client_leaks) if client_leaks else "_none detected_"

    if not gql.get("present"):
        gql_line = "_no GraphQL detected_"
    else:
        gql_issues = "; ".join(
            f"{issue['severity']} {issue['issue']}" for issue in gql.get("findings", [])) or "no obvious issues"
        gql_line = f"{', '.join(gql.get('endpoints', []))} · introspection={gql.get('introspection')} · {gql_issues}"

    integ_line = ", ".join(integ.get("third_party_integrations", [])) or "none detected"
    unverified_hooks = integ.get("webhooks_without_sig_verification", [])
    if unverified_hooks:
        wh_line = _section("⚠ Webhooks with NO signature-verification in their handler (verify)", unverified_hooks)
    else:
        wh_line = f"_{len(integ.get('webhook_endpoints', []))} webhook endpoint(s); signature code present or none found_"

    # --- static findings --------------------------------------------------
    avail = ", ".join(tool["name"] for tool in scanners.get("available", [])) or "none on PATH"
    missing = "\n".join(
        f"- **{tool['name']}** ({tool['category']}) — `{tool.get('install','')}`"
        for tool in scanners.get("missing", [])) or "_all relevant scanners present_"
    if scan_results:
        scan_lines = "\n".join(
            (f"- **{res.get('name')}** → {res.get('findings','?')} finding(s) (`{res.get('output','')}`)"
             if "findings" in res else f"- **{res.get('name')}** → {res.get('status','?')}")
            for res in scan_results)
    else:
        scan_lines = "_Detected but not executed — run `websec run <repo> --scan`._"

    if unified:
        top_lines = "\n".join(
            f"- **{top['severity']}** [{top['category']}] {top['title']} — `{top['file']}` ({'+'.join(top['tools'])})"
            for top in unified.get("top", [])) or "_no findings_"
        findings_block = (
            f"**{unified['total']} de-duplicated findings** "
            f"({unified['cross_tool_or_dup_merged']} cross-tool/duplicate merged) · "
            f"by severity {unified['by_severity']} · by category {unified['by_category']}\n\n"
            f"Top findings (full list in `findings.json`):\n{top_lines}")
    else:
        findings_block = scan_lines

    # --- probes and endpoint inventory -------------------------------------
    probe_lines = "\n".join(
        f"- **{probe['key']}** — {probe.get('attack_class','')} \n"
        f" `{probe['file']}` · _supply:_ {probe.get('agent_must_supply','')}"
        for probe in probe_manifest if "attack_class" in probe)
    inventory = _bullets(
        [f"`{ep['method']:6}` {ep['path']}" for ep in routes.get("endpoints", [])], cap=80)

    # --- values interpolated into the template -----------------------------
    target = facts.get('target', '')
    version = facts.get('version', '')
    endpoint_count = routes.get('count', 0)
    languages = ", ".join(stack.get("languages", [])) or "?"
    frameworks = ", ".join(stack.get("frameworks", [])) or "?"
    datastores = ", ".join(stack.get("datastores", [])) or "?"
    monorepo = stack.get("monorepo", False)
    auth_scheme = auth.get("scheme", "?")
    token_location = auth.get("token_location", "?")
    guard_file_count = len(auth.get("guard_files", []))
    route_engine = routes.get("engine", "?")
    by_method = routes.get("by_method", {})
    tenant_block = _bullets(
        [f"`{cand['key']}` — {cand['occurrences']}×" for cand in tenant.get("candidates", [])],
        "_no common tenant key found — confirm whether this app is multi-tenant; if not, skip cross-tenant probes_")
    tenant_note = tenant.get("note", "")
    targeting_block = "\n".join(
        _section(title, targeting.get(key))
        for title, key in (
            ("IDOR / BOLA candidates — endpoints with a path/object id", "idor_candidates"),
            ("SSRF candidates — endpoints taking a url/domain-ish param", "ssrf_candidates"),
            ("Open-redirect candidates", "open_redirect_candidates"),
            ("File-upload candidates — path-traversal / content-type", "upload_candidates"),
            ("Write endpoints — mass-assignment / BOLA-write", "write_endpoints"),
            ("Auth endpoints", "auth_endpoints"),
        ))
    sensitive_fields = ", ".join(schemas.get("sensitive_fields", [])) or "_none detected_"
    orms = ", ".join(schemas.get("orms", [])) or "?"
    guarded = guard_summary.get("with_visible_guard", 0)
    not_guarded = guard_summary.get("no_visible_guard", 0)
    guard_unknown = guard_summary.get("unknown", 0)
    authz_note = authz.get("note", "")
    iac_count = len(iac_findings)
    source_maps = client.get("production_source_maps", False)
    datastore_class = surface.get("datastore_class", "?")

    return f"""# AGENT BRIEFING — security pass for `{target}`

> Generated by **websec-validator v{version}** — deterministic recon, no LLM.
> **You are an AI coding agent.** The tool did the reliable half (mapping the surface + staging
> the probes). You + the human do the reasoning, the running, and the fixing.

| Lane | Owns |
|---|---|
| 🔧 tool (done) | recon → {endpoint_count} endpoints, scanner findings, staged probes |
| 🤖 you | confirm auth/tenant model, finalize + run the probes at the targets below, triage, fix |
| 🧑 human | running TEST instance + test accounts; review every diff |

⚠️ Static findings + recon need **no running app**. The probes need a **live test instance + test
credentials** — ask the human, never fabricate, never hit production.

---

## 1. What this app is (detected)

- **Languages:** {languages} · **Frameworks:** {frameworks}
- **Datastores:** {datastores} · **Monorepo:** {monorepo}
- **Auth scheme:** `{auth_scheme}` (token in {token_location}) · guard files: {guard_file_count}
- **Route engine:** {route_engine} · **{endpoint_count} endpoints** · by method: {by_method}

## 2. ★ Tenant boundary (confirm first — highest value, easiest to get wrong)

{tenant_block}

{tenant_note}

## 3. ★ Attack surface & targeting (point the probes HERE)

{targeting_block}
**Code-level sinks** (cross-reference with the above): {sink_summary}

**Mass-assignment targets** — this app's privileged model fields (try injecting these into create/update payloads): {sensitive_fields} · ORMs: {orms}

## 3b. ★ Access control (who can reach what — your #1 test)

Guard coverage (file-level heuristic): {guarded} with visible guard · {not_guarded} none visible · {guard_unknown} unknown. Global auth middleware: **{global_auth}**. Roles in code: {roles_str}

{authz_note}

{unprot_section}
{mw_line}

## 3c. Config, CI/CD & client-side risks

**Pipeline / IaC** ({iac_count} finding(s)):
{iac_lines}

**Client-side secret exposure** (ships to the browser if real): {client_section}
Production source maps exposed: {source_maps}

**GraphQL surface:** {gql_line}

**Third-party integrations:** {integ_line}
{wh_line}

## 4. Static findings (no running app needed)

Scanners available: {avail}

{findings_block}

Install for fuller coverage:
{missing}

## 5. Tailored probes (staged — drafts you finalize against §2–§3)

{probe_lines}

Keep these in the repo after you run them — re-running after a fix proves "still blocked, now safer."

## 6. How to work this — verify with a debate, then fix

The findings ledger (`findings-ledger.json` / REPORT.md) comes pre-ranked with a **confidence**
(HIGH = dynamically confirmed; MEDIUM/LOW = hypothesis). Each finding also carries a **calibrated**
estimate — `calibrated.p` (measured real-vuln rate for that attack-class/confidence bucket on a
labeled vuln corpus), `calibrated.ci` (95% interval), `calibrated.n` (sample size), `calibrated.basis`.
**A wide CI or `basis: prior (uncalibrated)` means thin data — lean on the debate, not the number.**
The rates skew optimistic (the corpus is deliberately vulnerable); to be conservative, threshold on the
CI lower bound. **The calibration self-improves:** every `websec dynamic` run folds its *confirmed*
results (a write that executed unauthenticated = real; one that's auth-enforced = a recon false positive)
into a local overlay, so these numbers personalize to your apps the more you run it. **Verify before you
report** — especially MEDIUM/LOW — by running a 4-role debate per finding (this is the FP killer):

- **Advocate** — argue it's real; cite the evidence chain + the CWE / OWASP-API.
- **Challenger** — try hard to *refute* it: false positive? intended-public? unreachable? guarded by a
pattern the static scan missed? (default to skepticism)
- **Mediator** — decide: confirmed / false-positive / needs-data. You may override the tool.
- **Explainer** — write the survivor up: exact `curl` repro, real impact, and the fix.

**Generate probes the same way** — a Positive perspective (intended behavior holds) + Negative
(bypass / injection / error) + Edge (boundary / concurrency / unusual input), then a Critic dedupes
them into one runnable suite. More perspectives = broader coverage.

**Verify the constitution** (`CONSTITUTION.md`): every ⬜ line is a Given/When/Then to confirm with a
probe — flip it to ✅ holds or 🔴 VIOLATED.

Order: static triage (on a {datastore_class} datastore, injection alerts are usually FPs) →
confirm the auth/tenant model → run §3-targeted probes (low-priv, then cross-tenant; record PASS counts
like "14/14 blocked") → fix what fails → re-run to confirm. **Human reviews every diff; never run
destructive or production probes without explicit authorization.**

## 7. Hand back

What was tested, what held (PASS counts), what's open (repro + fix), which probes are now regression tests. Cite `FACTS.json` + `scanners/`.

---

## Appendix A — full endpoint inventory

{inventory}
"""
@@ -0,0 +1,75 @@
1
+ {
2
+ "meta": {
3
+ "corpus": [
4
+ "VAmPI",
5
+ "NodeGoat",
6
+ "DVGA"
7
+ ],
8
+ "n_total": 59,
9
+ "method": "binomial proportion + Wilson 95% CI",
10
+ "min_n": 5,
11
+ "unmatched_rule": "unmatched finding = false positive",
12
+ "researched_classes": [
13
+ "command-injection",
14
+ "graphql",
15
+ "missing-auth",
16
+ "sqli",
17
+ "ssti"
18
+ ],
19
+ "caveat": "indicative \u2014 calibrated on a deliberately-vulnerable app corpus; skews optimistic on clean production code"
20
+ },
21
+ "by_class_label": {
22
+ "command-injection|LOW": {
23
+ "n": 1,
24
+ "k": 1,
25
+ "p": 1.0,
26
+ "ci": [
27
+ 0.207,
28
+ 1.0
29
+ ]
30
+ },
31
+ "graphql|MEDIUM": {
32
+ "n": 2,
33
+ "k": 2,
34
+ "p": 1.0,
35
+ "ci": [
36
+ 0.342,
37
+ 1.0
38
+ ]
39
+ },
40
+ "missing-auth|MEDIUM": {
41
+ "n": 41,
42
+ "k": 27,
43
+ "p": 0.659,
44
+ "ci": [
45
+ 0.505,
46
+ 0.784
47
+ ]
48
+ }
49
+ },
50
+ "by_label": {
51
+ "LOW": {
52
+ "n": 8,
53
+ "k": 1,
54
+ "p": 0.125,
55
+ "ci": [
56
+ 0.022,
57
+ 0.471
58
+ ]
59
+ },
60
+ "MEDIUM": {
61
+ "n": 51,
62
+ "k": 29,
63
+ "p": 0.569,
64
+ "ci": [
65
+ 0.433,
66
+ 0.695
67
+ ]
68
+ }
69
+ },
70
+ "prior": {
71
+ "HIGH": 0.85,
72
+ "MEDIUM": 0.5,
73
+ "LOW": 0.25
74
+ }
75
+ }
@@ -0,0 +1,226 @@
1
+ """Calibrated confidence (CJE) — turn the rule-based HIGH/MEDIUM/LOW labels into
2
+ *measured* real-rates with honest confidence intervals.
3
+
4
+ WHAT THIS IS (honest scope): run the recon ledger against a labeled vuln-app corpus,
5
+ count how often each (attack_class, label) bucket actually corresponds to a real,
6
+ documented vulnerability, and express it as an observed rate + a **Wilson score
7
+ interval**. With a small corpus the INTERVAL is the headline — a wide CI means
8
+ "grounded, but not enough data to be sure yet." The numbers tighten as the corpus grows.
9
+
10
+ WHAT THIS IS NOT: calibrated on *deliberately-vulnerable* apps, so the rates skew
11
+ OPTIMISTIC for normal/clean code (real repos have a far lower base rate of true vulns).
12
+ Every per-finding estimate carries the sample size `n` and a `basis` so the consumer
13
+ can see how much to trust it; a finding that doesn't match a documented vuln is counted
14
+ as a false positive (the corpus is well-documented, so unlisted findings are noise).
15
+
16
+ No ML, no deps — binomial proportion + Wilson interval (stdlib `math`). The cell
17
+ structure upgrades cleanly to isotonic regression if a large labeled set ever exists.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ import math
24
+ import os
25
+ from importlib import resources
26
+ from pathlib import Path
27
+
28
# z for a 95% two-sided interval.
Z95 = 1.959963984540054
# A cell needs at least this many samples to be used (else fall back a tier).
MIN_N = 5
# Uncalibrated fallback prior — used ONLY when we have no data; always labeled as such.
PRIOR = {"HIGH": 0.85, "MEDIUM": 0.5, "LOW": 0.25}
CAVEAT = ("indicative — calibrated on a deliberately-vulnerable app corpus; "
          "skews optimistic on clean production code")

# Self-improving LOCAL overlay: user-global, gitignored (lives outside any repo), never
# shipped. It accrues *confirmed* labels from your own dynamic runs (and optional hand-labels)
# and is merged on top of the shipped public table so the numbers personalize to YOUR apps.
#
# The home directory is resolved only when WEBSEC_CALIBRATION_HOME is unset or empty, so an
# explicit override still works where Path.home() cannot be determined, and an empty
# override no longer places the overlay in the current working directory.
_CALIBRATION_HOME = os.environ.get("WEBSEC_CALIBRATION_HOME")
LOCAL_PATH = (
    Path(_CALIBRATION_HOME) if _CALIBRATION_HOME
    else Path.home() / ".cache" / "websec-validator"
) / "calibration-local.json"
40
+
41
+
42
def wilson(k: int, n: int, z: float = Z95) -> tuple:
    """Wilson score interval for k successes out of n trials.

    Returns ``(lower, upper)`` with both bounds clamped to [0, 1]; the default
    ``z`` gives a 95% interval. Wilson is used instead of the normal
    approximation because it stays sane at small n and extreme p. With
    ``n <= 0`` there is no information, so the result is ``(0.0, 1.0)``.
    """
    if n <= 0:
        return (0.0, 1.0)
    z_sq = z * z
    p_hat = k / n
    denom = 1 + z_sq / n
    midpoint = (p_hat + z_sq / (2 * n)) / denom
    spread = (z / denom) * math.sqrt(p_hat * (1 - p_hat) / n + z_sq / (4 * n * n))
    lower = max(0.0, midpoint - spread)
    upper = min(1.0, midpoint + spread)
    return (lower, upper)
55
+
56
+
57
def _cell(k: int, n: int) -> dict:
    """Build one calibration cell: counts, observed rate and rounded Wilson interval.

    ``p`` is ``k / n`` rounded to three decimals, or ``None`` when ``n`` is zero.
    """
    lower, upper = wilson(k, n)
    rate = round(k / n, 3) if n else None
    return {"n": n, "k": k, "p": rate, "ci": [round(lower, 3), round(upper, 3)]}
61
+
62
+
63
def is_real(attack_class: str, location: str, truth: list) -> bool:
    """Return True when the finding matches a documented truth entry.

    A truth entry matches when its ``class`` equals *attack_class* and its
    ``location_contains`` is empty, ``"*"``, or a case-insensitive substring of
    *location*. Anything that matches no entry is treated as a false positive
    (the conservative rule: on a well-documented vuln app, an unlisted finding
    is noise).
    """
    haystack = (location or "").lower()

    def _location_hits(entry) -> bool:
        needle = (entry.get("location_contains") or "").lower()
        return needle in ("", "*") or needle in haystack

    return any(
        _location_hits(entry)
        for entry in (truth or [])
        if entry.get("class") == attack_class
    )
77
+
78
+
79
def fit(labeled: list, corpus_names: list, researched_classes: set | None = None) -> dict:
    """Build the calibration table from labeled findings.

    Args:
        labeled: List of ``{attack_class, confidence, is_real}`` records.
        corpus_names: Names of the corpus apps, stored in the table's ``meta``.
        researched_classes: Classes for which the corpus has actual ground truth.
            Per-class cells are published ONLY for these — a class that was never
            researched would otherwise emit a misleading p=0 (every finding
            auto-counted as a false positive). Those findings still count in the
            per-label aggregate. ``None`` publishes every per-class cell.

    Returns:
        A dict with ``meta``, ``by_class_label``, ``by_label`` and ``prior``.
    """
    class_tally: dict = {}   # "class|label" -> [real count, total count]
    label_tally: dict = {}   # label -> [real count, total count]
    for record in labeled:
        bucket = f"{record['attack_class']}|{record['confidence']}"
        class_pair = class_tally.setdefault(bucket, [0, 0])
        label_pair = label_tally.setdefault(record["confidence"], [0, 0])
        class_pair[1] += 1
        label_pair[1] += 1
        if record["is_real"]:
            class_pair[0] += 1
            label_pair[0] += 1

    cells = {}
    for bucket, (real, total) in sorted(class_tally.items()):
        cells[bucket] = _cell(real, total)
    if researched_classes is not None:
        allowed = set(researched_classes)
        cells = {bucket: cell for bucket, cell in cells.items()
                 if bucket.split("|", 1)[0] in allowed}

    by_label = {}
    for label, (real, total) in sorted(label_tally.items()):
        by_label[label] = _cell(real, total)

    researched = sorted(researched_classes) if researched_classes is not None else None
    meta = {
        "corpus": corpus_names,
        "n_total": len(labeled),
        "method": "binomial proportion + Wilson 95% CI",
        "min_n": MIN_N,
        "unmatched_rule": "unmatched finding = false positive",
        "researched_classes": researched,
        "caveat": CAVEAT,
    }
    return {"meta": meta, "by_class_label": cells, "by_label": by_label, "prior": PRIOR}
112
+
113
+
114
def load_shipped() -> dict | None:
    """Read the packaged ``calibration.json`` from ``websec_validator``.

    Best-effort: any failure to locate, read or parse the resource yields None.
    """
    try:
        packaged = resources.files("websec_validator").joinpath("calibration.json")
        raw = packaged.read_text()
        return json.loads(raw)
    except Exception:
        return None
121
+
122
+
123
def load_local() -> dict | None:
    """Load the user-global self-improving overlay from ``LOCAL_PATH``.

    Best-effort: returns None when the file is absent, unreadable or not valid
    JSON. It also returns None when the document parses but is not a JSON
    object (for example a list), because callers use the result as a dict and
    would otherwise fail on ``.get``.
    """
    try:
        if not LOCAL_PATH.is_file():
            return None
        overlay = json.loads(LOCAL_PATH.read_text())
    except Exception:
        # Deliberate best-effort: calibration is non-critical, so a broken overlay is ignored.
        return None
    return overlay if isinstance(overlay, dict) else None
131
+
132
+
133
def _merge(shipped: dict | None, local: dict | None) -> dict | None:
    """Combine the shipped table with the local overlay.

    Cell counts (``k`` and ``n``) are SUMMED per key and the Wilson interval is
    recomputed from the totals. Local samples are confirmed, so they are not
    filtered. Returns None when both inputs are empty; otherwise a new table
    (the shipped input is copied, not mutated).
    """
    if not shipped and not local:
        return None

    if shipped:
        # JSON round-trip gives an independent deep copy of the shipped table.
        table = json.loads(json.dumps(shipped))
    else:
        table = {"meta": {"caveat": CAVEAT}, "by_class_label": {}, "by_label": {}}
    table.setdefault("prior", PRIOR)
    table.setdefault("meta", {})
    if not local:
        return table

    for group in ("by_class_label", "by_label"):
        combined = dict(table.get(group, {}))
        for key, local_cell in (local.get(group, {}) or {}).items():
            shipped_cell = combined.get(key, {})
            total_k = shipped_cell.get("k", 0) + local_cell.get("k", 0)
            total_n = shipped_cell.get("n", 0) + local_cell.get("n", 0)
            combined[key] = _cell(total_k, total_n)
        table[group] = combined

    local_samples = (local.get("meta", {}) or {}).get("samples", 0)
    meta = table["meta"]
    meta["personalized"] = True
    meta["local_samples"] = local_samples
    meta["caveat"] = (meta.get("caveat", CAVEAT)
                      + f" · +{local_samples} confirmed local sample(s) folded in (personalized to your apps)")
    return table
155
+
156
+
157
def load() -> dict | None:
    """Return the calibration the runtime uses: shipped table merged with the LOCAL overlay."""
    shipped = load_shipped()
    overlay = load_local()
    return _merge(shipped, overlay)
160
+
161
+
162
def record_samples(labeled: list, runs: int = 1) -> dict | None:
    """Fold confirmed labeled samples into the LOCAL overlay file.

    Args:
        labeled: List of ``{attack_class, confidence, is_real}`` records.
        runs: Number of runs these samples came from; added to ``meta.runs``.

    Returns:
        The updated overlay, or None if there was nothing to record or the
        update failed. Never raises — calibration is non-critical.

    An existing overlay that lacks ``meta``, or whose cells lack ``n``/``k``
    (for example a hand-written file), is repaired in place instead of
    silently dropping the new samples. The file is written to a temporary
    sibling and then renamed, so an interrupted write cannot truncate the
    accumulated overlay.
    """
    if not labeled:
        return None
    try:
        local = load_local()
        if not isinstance(local, dict):
            local = {}
        meta = local.get("meta")
        if not isinstance(meta, dict):
            meta = local["meta"] = {"source": "local self-improving overlay"}
        for record in labeled:
            confidence = record["confidence"]
            buckets = (("by_class_label", f"{record['attack_class']}|{confidence}"),
                       ("by_label", confidence))
            hit = 1 if record.get("is_real") else 0
            for group, key in buckets:
                cell = local.setdefault(group, {}).setdefault(key, {})
                cell["n"] = cell.get("n", 0) + 1
                cell["k"] = cell.get("k", 0) + hit
        meta["samples"] = meta.get("samples", 0) + len(labeled)
        meta["runs"] = meta.get("runs", 0) + runs

        LOCAL_PATH.parent.mkdir(parents=True, exist_ok=True)
        staging = LOCAL_PATH.with_name(LOCAL_PATH.name + ".tmp")
        staging.write_text(json.dumps(local, indent=2) + "\n")
        staging.replace(LOCAL_PATH)
        return local
    except Exception:
        # Deliberate best-effort: a failed calibration update must not break the caller.
        return None
186
+
187
+
188
def samples_from_dynamic(dynamic: dict) -> list:
    """Convert a dynamic run into confirmed calibration samples.

    Write-verb results: a verdict of ``auth-enforced`` becomes a ``missing-auth``
    sample with ``is_real=False`` (recon flagged it, the live app blocks it);
    ``EXECUTED-UNAUTH`` or any verdict starting with ``no-auth-gate`` becomes one
    with ``is_real=True``. Other verdicts are skipped. Each entry under
    ``cross_tenant_bola.leaks`` becomes a real ``bola`` sample. All samples are
    recorded with confidence ``MEDIUM``. Unauthenticated GET reachability is not
    used — a public endpoint may be intended, so it is not a clean label.
    """
    report = dynamic or {}
    samples = []

    write_results = (report.get("write_auth_enforcement", {}) or {}).get("results", []) or []
    for result in write_results:
        verdict = result.get("verdict", "")
        if verdict == "auth-enforced":
            confirmed = False
        elif verdict == "EXECUTED-UNAUTH" or verdict.startswith("no-auth-gate"):
            confirmed = True
        else:
            continue
        samples.append({"attack_class": "missing-auth", "confidence": "MEDIUM", "is_real": confirmed})

    leaks = (report.get("cross_tenant_bola", {}) or {}).get("leaks", []) or []
    samples.extend(
        {"attack_class": "bola", "confidence": "MEDIUM", "is_real": True} for _ in leaks
    )
    return samples
207
+
208
+
209
def apply(attack_class: str, confidence: str, table: dict | None) -> dict:
    """Return the calibrated estimate for a finding's (attack_class, confidence) bucket.

    Three-tier fallback: the per-(class, label) cell if it has at least
    ``min_n`` samples, else the per-label cell under the same threshold, else
    the uncalibrated prior (0.5 when the label has no prior), flagged as such
    with ``n = 0`` and a ``[0.0, 1.0]`` interval. The result always carries
    ``p``, ``ci``, ``n``, ``basis`` and ``note``.
    """
    if table:
        meta = table.get("meta", {})
        min_n = meta.get("min_n", MIN_N)
        caveat = meta.get("caveat", CAVEAT)
        # Tried in order; the per-label group is only consulted if the first tier misses.
        tiers = (
            ("class+label", "by_class_label", f"{attack_class}|{confidence}"),
            ("label", "by_label", confidence),
        )
        for basis, group, key in tiers:
            cell = table.get(group, {}).get(key)
            if cell and cell["n"] >= min_n:
                return {"p": cell["p"], "ci": cell["ci"], "n": cell["n"],
                        "basis": basis, "note": caveat}

    fallback = (table or {}).get("prior", PRIOR)
    return {
        "p": fallback.get(confidence, 0.5),
        "ci": [0.0, 1.0],
        "n": 0,
        "basis": "prior (uncalibrated)",
        "note": "no calibration data for this bucket — uncalibrated prior",
    }