@clear-capabilities/agentic-security-scanner 0.74.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (331) hide show
  1. package/CHANGELOG.md +1580 -0
  2. package/bin/.agentic-security/findings.json +1577 -0
  3. package/bin/.agentic-security/last-scan.json +1577 -0
  4. package/bin/.agentic-security/last-scan.json.sig +1 -0
  5. package/bin/.agentic-security/scan-history.json +465 -0
  6. package/bin/.agentic-security/streak.json +25 -0
  7. package/bin/agentic-security-audit.js +198 -0
  8. package/bin/agentic-security-consistency.js +80 -0
  9. package/bin/agentic-security-diff.js +136 -0
  10. package/bin/agentic-security-lsp.js +12 -0
  11. package/bin/agentic-security-mcp.js +40 -0
  12. package/bin/agentic-security-rule.js +153 -0
  13. package/bin/agentic-security.js +1683 -0
  14. package/dist/117.index.js +207 -0
  15. package/dist/178.index.js +250 -0
  16. package/dist/218.index.js +793 -0
  17. package/dist/227.index.js +192 -0
  18. package/dist/301.index.js +167 -0
  19. package/dist/384.index.js +18 -0
  20. package/dist/476.index.js +126 -0
  21. package/dist/513.index.js +373 -0
  22. package/dist/520.index.js +13 -0
  23. package/dist/601.index.js +1038 -0
  24. package/dist/634.index.js +1892 -0
  25. package/dist/637.index.js +216 -0
  26. package/dist/660.index.js +131 -0
  27. package/dist/675.index.js +451 -0
  28. package/dist/826.index.js +188 -0
  29. package/dist/830.index.js +133 -0
  30. package/dist/agentic-security.mjs +272 -0
  31. package/dist/agentic-security.mjs.sha256 +1 -0
  32. package/dist/calibration-seed.json +27 -0
  33. package/package.json +77 -0
  34. package/src/.agentic-security/findings.json +80844 -0
  35. package/src/.agentic-security/last-scan.json +80844 -0
  36. package/src/.agentic-security/last-scan.json.sig +1 -0
  37. package/src/.agentic-security/scan-history.json +8408 -0
  38. package/src/.agentic-security/streak.json +26 -0
  39. package/src/badge.js +188 -0
  40. package/src/compare.js +203 -0
  41. package/src/dataflow/.agentic-security/findings.json +3487 -0
  42. package/src/dataflow/.agentic-security/last-scan.json +3487 -0
  43. package/src/dataflow/.agentic-security/last-scan.json.sig +1 -0
  44. package/src/dataflow/.agentic-security/scan-history.json +735 -0
  45. package/src/dataflow/.agentic-security/streak.json +24 -0
  46. package/src/dataflow/CLAUDE.md +38 -0
  47. package/src/dataflow/access-paths.js +172 -0
  48. package/src/dataflow/async-sequencing.js +177 -0
  49. package/src/dataflow/backward.js +201 -0
  50. package/src/dataflow/catalog-expanded.js +485 -0
  51. package/src/dataflow/catalog.js +659 -0
  52. package/src/dataflow/cross-repo.js +219 -0
  53. package/src/dataflow/engine.js +588 -0
  54. package/src/dataflow/exception-flow.js +116 -0
  55. package/src/dataflow/exploit-prover.js +187 -0
  56. package/src/dataflow/higher-order.js +221 -0
  57. package/src/dataflow/ifds.js +347 -0
  58. package/src/dataflow/implicit-flow.js +129 -0
  59. package/src/dataflow/incremental.js +229 -0
  60. package/src/dataflow/index.js +181 -0
  61. package/src/dataflow/numeric-domain.js +192 -0
  62. package/src/dataflow/path-feasibility.js +114 -0
  63. package/src/dataflow/points-to.js +337 -0
  64. package/src/dataflow/polyglot.js +190 -0
  65. package/src/dataflow/proven-clean.js +159 -0
  66. package/src/dataflow/receiver-context.js +76 -0
  67. package/src/dataflow/sanitizer-proof.js +154 -0
  68. package/src/dataflow/soft-taint.js +140 -0
  69. package/src/dataflow/string-domain.js +234 -0
  70. package/src/dataflow/stub-aware-filter.js +100 -0
  71. package/src/dataflow/summaries.js +132 -0
  72. package/src/dataflow/symbolic-exec.js +238 -0
  73. package/src/dataflow/tabulation.js +135 -0
  74. package/src/engine.js +7763 -0
  75. package/src/history-scan.js +229 -0
  76. package/src/index.js +3 -0
  77. package/src/integrations/.agentic-security/findings.json +1504 -0
  78. package/src/integrations/.agentic-security/last-scan.json +1504 -0
  79. package/src/integrations/.agentic-security/scan-history.json +40 -0
  80. package/src/integrations/.agentic-security/streak.json +21 -0
  81. package/src/integrations/index.js +321 -0
  82. package/src/integrations/tickets.js +200 -0
  83. package/src/ir/.agentic-security/findings.json +3036 -0
  84. package/src/ir/.agentic-security/last-scan.json +3036 -0
  85. package/src/ir/.agentic-security/last-scan.json.sig +1 -0
  86. package/src/ir/.agentic-security/scan-history.json +364 -0
  87. package/src/ir/.agentic-security/streak.json +23 -0
  88. package/src/ir/CLAUDE.md +172 -0
  89. package/src/ir/callgraph.js +73 -0
  90. package/src/ir/class-hierarchy.js +195 -0
  91. package/src/ir/index.js +152 -0
  92. package/src/ir/parser-cs.js +260 -0
  93. package/src/ir/parser-java.js +286 -0
  94. package/src/ir/parser-js.js +413 -0
  95. package/src/ir/parser-kt.js +258 -0
  96. package/src/ir/parser-py-cst.js +136 -0
  97. package/src/ir/parser-py.helper.py +501 -0
  98. package/src/ir/parser-py.js +312 -0
  99. package/src/ir/ssa.js +315 -0
  100. package/src/ir/type-stubs.js +288 -0
  101. package/src/leaderboard.js +152 -0
  102. package/src/llm-validator/.agentic-security/findings.json +1891 -0
  103. package/src/llm-validator/.agentic-security/last-scan.json +1891 -0
  104. package/src/llm-validator/.agentic-security/last-scan.json.sig +1 -0
  105. package/src/llm-validator/.agentic-security/scan-history.json +168 -0
  106. package/src/llm-validator/.agentic-security/streak.json +20 -0
  107. package/src/llm-validator/consistency.js +141 -0
  108. package/src/llm-validator/index.js +437 -0
  109. package/src/lsp/.agentic-security/findings.json +28 -0
  110. package/src/lsp/.agentic-security/last-scan.json +28 -0
  111. package/src/lsp/.agentic-security/scan-history.json +79 -0
  112. package/src/lsp/.agentic-security/streak.json +22 -0
  113. package/src/lsp/server.js +275 -0
  114. package/src/mcp/.agentic-security/findings.json +8358 -0
  115. package/src/mcp/.agentic-security/last-scan.json +8358 -0
  116. package/src/mcp/.agentic-security/last-scan.json.sig +1 -0
  117. package/src/mcp/.agentic-security/scan-history.json +1125 -0
  118. package/src/mcp/.agentic-security/streak.json +22 -0
  119. package/src/mcp/CLAUDE.md +54 -0
  120. package/src/mcp/audit.js +136 -0
  121. package/src/mcp/redact.js +75 -0
  122. package/src/mcp/server.js +158 -0
  123. package/src/mcp/stdio.js +83 -0
  124. package/src/mcp/tools.js +940 -0
  125. package/src/mcp/validate.js +49 -0
  126. package/src/personality.js +164 -0
  127. package/src/poc-video.js +239 -0
  128. package/src/posture/.agentic-security/findings.json +51239 -0
  129. package/src/posture/.agentic-security/last-scan.json +51239 -0
  130. package/src/posture/.agentic-security/last-scan.json.sig +1 -0
  131. package/src/posture/.agentic-security/scan-history.json +5557 -0
  132. package/src/posture/.agentic-security/streak.json +24 -0
  133. package/src/posture/CLAUDE.md +42 -0
  134. package/src/posture/adversarial-self-test.js +114 -0
  135. package/src/posture/adversary-agent.js +204 -0
  136. package/src/posture/agents-memory.js +135 -0
  137. package/src/posture/ai-code-fingerprint.js +171 -0
  138. package/src/posture/aibom.js +284 -0
  139. package/src/posture/api-inventory.js +96 -0
  140. package/src/posture/attack-playbooks.js +305 -0
  141. package/src/posture/auditor-agent.js +115 -0
  142. package/src/posture/auth-posture-import.js +135 -0
  143. package/src/posture/baseline-compare.js +114 -0
  144. package/src/posture/blast-radius.js +836 -0
  145. package/src/posture/bounty-prediction.js +141 -0
  146. package/src/posture/business-logic.js +239 -0
  147. package/src/posture/calibration-drift.js +93 -0
  148. package/src/posture/calibration-seed.json +27 -0
  149. package/src/posture/calibration.js +204 -0
  150. package/src/posture/clustering.js +75 -0
  151. package/src/posture/concurrency-checker.js +265 -0
  152. package/src/posture/confidence.js +65 -0
  153. package/src/posture/container-runtime.js +149 -0
  154. package/src/posture/counterfactual.js +109 -0
  155. package/src/posture/cross-lang-graphql.js +165 -0
  156. package/src/posture/cross-lang-grpc.js +166 -0
  157. package/src/posture/cross-lang-meta.js +101 -0
  158. package/src/posture/cross-lang-openapi.js +187 -0
  159. package/src/posture/cross-lang-orm.js +153 -0
  160. package/src/posture/cross-lang-queues.js +210 -0
  161. package/src/posture/crown-jewels.js +110 -0
  162. package/src/posture/custom-rules.js +361 -0
  163. package/src/posture/cve-alert-daemon.js +433 -0
  164. package/src/posture/cve-lookup.js +129 -0
  165. package/src/posture/dead-code.js +430 -0
  166. package/src/posture/defender-agent.js +158 -0
  167. package/src/posture/deploy-platform.js +204 -0
  168. package/src/posture/detector-fuzz.js +61 -0
  169. package/src/posture/deterministic.js +99 -0
  170. package/src/posture/drift.js +165 -0
  171. package/src/posture/epss.js +156 -0
  172. package/src/posture/exploitability-probability.js +212 -0
  173. package/src/posture/exploitability.js +121 -0
  174. package/src/posture/feature-flags.js +110 -0
  175. package/src/posture/finding-defaults.js +132 -0
  176. package/src/posture/fix-history.js +411 -0
  177. package/src/posture/fix-plan.js +121 -0
  178. package/src/posture/fix-verify-loop.js +157 -0
  179. package/src/posture/fix-verify.js +130 -0
  180. package/src/posture/flow-narration.js +105 -0
  181. package/src/posture/grader-calibration.js +156 -0
  182. package/src/posture/harness-discovery.js +113 -0
  183. package/src/posture/holdout-eval.js +144 -0
  184. package/src/posture/iac-reachability.js +163 -0
  185. package/src/posture/iam-policy.js +128 -0
  186. package/src/posture/integrity.js +97 -0
  187. package/src/posture/learning.js +166 -0
  188. package/src/posture/license-policy.js +109 -0
  189. package/src/posture/llm-redteam-prompts.js +418 -0
  190. package/src/posture/llm-redteam.js +303 -0
  191. package/src/posture/material-change.js +163 -0
  192. package/src/posture/mitigation-composite.js +55 -0
  193. package/src/posture/mttr.js +91 -0
  194. package/src/posture/network-policy-import.js +126 -0
  195. package/src/posture/path-predicates.js +99 -0
  196. package/src/posture/persona-prioritization.js +153 -0
  197. package/src/posture/poc-cwe-map.js +51 -0
  198. package/src/posture/poc-generator.js +500 -0
  199. package/src/posture/policy-gate.js +174 -0
  200. package/src/posture/pre-incident-archaeology.js +110 -0
  201. package/src/posture/profile.js +93 -0
  202. package/src/posture/reachability-filter.js +42 -0
  203. package/src/posture/regression-test-gen.js +200 -0
  204. package/src/posture/reverse-blast-radius.js +110 -0
  205. package/src/posture/router.js +109 -0
  206. package/src/posture/rule-overrides.js +198 -0
  207. package/src/posture/rule-pack-signing.js +209 -0
  208. package/src/posture/rule-packs.js +143 -0
  209. package/src/posture/rule-synthesis.js +108 -0
  210. package/src/posture/ruleset-version.js +71 -0
  211. package/src/posture/sbom.js +129 -0
  212. package/src/posture/schema-aware-bridge.js +207 -0
  213. package/src/posture/security-trend.js +87 -0
  214. package/src/posture/semantic-clone.js +114 -0
  215. package/src/posture/specification-mining.js +170 -0
  216. package/src/posture/stable-id.js +75 -0
  217. package/src/posture/stack-playbook.js +229 -0
  218. package/src/posture/streak.js +249 -0
  219. package/src/posture/suppressions.js +135 -0
  220. package/src/posture/telemetry-ingest.js +112 -0
  221. package/src/posture/threat-model.js +145 -0
  222. package/src/posture/three-agent-pipeline.js +74 -0
  223. package/src/posture/triage.js +146 -0
  224. package/src/posture/trust-boundary-diagram.js +115 -0
  225. package/src/posture/type-narrowing.js +129 -0
  226. package/src/posture/validator-metrics.js +179 -0
  227. package/src/posture/verifier-ephemeral.js +118 -0
  228. package/src/posture/verifier-target.js +147 -0
  229. package/src/posture/verifier.js +257 -0
  230. package/src/posture/version.js +75 -0
  231. package/src/posture/waf-ingest.js +200 -0
  232. package/src/posture/why-fired.js +141 -0
  233. package/src/pr-comment.js +172 -0
  234. package/src/pr-delta.js +198 -0
  235. package/src/report/.agentic-security/findings.json +79 -0
  236. package/src/report/.agentic-security/last-scan.json +79 -0
  237. package/src/report/.agentic-security/last-scan.json.sig +1 -0
  238. package/src/report/.agentic-security/scan-history.json +332 -0
  239. package/src/report/.agentic-security/streak.json +23 -0
  240. package/src/report/index.js +1136 -0
  241. package/src/report/mascot.js +42 -0
  242. package/src/runScan.js +141 -0
  243. package/src/sast/.agentic-security/findings.json +5051 -0
  244. package/src/sast/.agentic-security/last-scan.json +5051 -0
  245. package/src/sast/.agentic-security/last-scan.json.sig +1 -0
  246. package/src/sast/.agentic-security/scan-history.json +788 -0
  247. package/src/sast/.agentic-security/streak.json +23 -0
  248. package/src/sast/CLAUDE.md +39 -0
  249. package/src/sast/_comment-strip.js +46 -0
  250. package/src/sast/agent-tool-escalation.js +131 -0
  251. package/src/sast/auth-provider.js +171 -0
  252. package/src/sast/authz.js +236 -0
  253. package/src/sast/bench-shape/.agentic-security/findings.json +28 -0
  254. package/src/sast/bench-shape/.agentic-security/last-scan.json +28 -0
  255. package/src/sast/bench-shape/.agentic-security/scan-history.json +24 -0
  256. package/src/sast/bench-shape/.agentic-security/streak.json +22 -0
  257. package/src/sast/bench-shape/index.js +62 -0
  258. package/src/sast/claude-hook-injection.js +199 -0
  259. package/src/sast/claude-md-prompt-injection.js +170 -0
  260. package/src/sast/claude-settings.js +165 -0
  261. package/src/sast/client-side.js +149 -0
  262. package/src/sast/cpp-bench-extras.js +122 -0
  263. package/src/sast/cpp-dataflow.js +430 -0
  264. package/src/sast/cpp.js +248 -0
  265. package/src/sast/csharp.js +152 -0
  266. package/src/sast/csrf.js +82 -0
  267. package/src/sast/dart-flutter.js +173 -0
  268. package/src/sast/db-rls.js +147 -0
  269. package/src/sast/db-taint.js +215 -0
  270. package/src/sast/defi-deep.js +242 -0
  271. package/src/sast/deserialization-gadgets.js +113 -0
  272. package/src/sast/django-hardening.js +230 -0
  273. package/src/sast/env-hygiene.js +125 -0
  274. package/src/sast/fastapi-hardening.js +145 -0
  275. package/src/sast/go-extended.js +84 -0
  276. package/src/sast/host-header.js +106 -0
  277. package/src/sast/index.js +17 -0
  278. package/src/sast/java-ast-folding.js +561 -0
  279. package/src/sast/java-bench-extras.js +708 -0
  280. package/src/sast/java-collection-passthrough.js +178 -0
  281. package/src/sast/java-constant-fold.js +244 -0
  282. package/src/sast/java-deserialization.js +125 -0
  283. package/src/sast/jndi.js +104 -0
  284. package/src/sast/juliet-shape.js +324 -0
  285. package/src/sast/jwt-exp.js +104 -0
  286. package/src/sast/kotlin.js +82 -0
  287. package/src/sast/laravel-hardening.js +198 -0
  288. package/src/sast/ldap-injection.js +100 -0
  289. package/src/sast/llm-owasp.js +465 -0
  290. package/src/sast/llm-stored-prompt.js +103 -0
  291. package/src/sast/llm-trading-agent.js +161 -0
  292. package/src/sast/llm.js +308 -0
  293. package/src/sast/logic.js +140 -0
  294. package/src/sast/mass-assignment.js +101 -0
  295. package/src/sast/mcp-audit.js +242 -0
  296. package/src/sast/mobile-manifest.js +195 -0
  297. package/src/sast/model-load.js +164 -0
  298. package/src/sast/mutation-xss.js +87 -0
  299. package/src/sast/nosql-injection.js +82 -0
  300. package/src/sast/open-redirect.js +119 -0
  301. package/src/sast/php.js +91 -0
  302. package/src/sast/pipeline.js +122 -0
  303. package/src/sast/primary-cwe-java.js +155 -0
  304. package/src/sast/prompt-firewall.js +151 -0
  305. package/src/sast/prompt-template.js +157 -0
  306. package/src/sast/prototype-pollution.js +112 -0
  307. package/src/sast/python-sinks.js +195 -0
  308. package/src/sast/quarkus-hardening.js +102 -0
  309. package/src/sast/rag-poisoning.js +118 -0
  310. package/src/sast/rate-limit.js +128 -0
  311. package/src/sast/response-splitting.js +138 -0
  312. package/src/sast/ruby.js +108 -0
  313. package/src/sast/rust.js +105 -0
  314. package/src/sast/solidity.js +167 -0
  315. package/src/sast/springboot-hardening.js +186 -0
  316. package/src/sast/ssrf-cloud-metadata.js +80 -0
  317. package/src/sast/ssti.js +116 -0
  318. package/src/sast/swift.js +162 -0
  319. package/src/sast/toctou.js +95 -0
  320. package/src/sast/webhook.js +101 -0
  321. package/src/sast/xpath-injection.js +51 -0
  322. package/src/sast/xxe.js +140 -0
  323. package/src/sast/zip-slip.js +200 -0
  324. package/src/sca/base-images.json +45 -0
  325. package/src/sca/container.js +107 -0
  326. package/src/sca/dep-confusion.js +134 -0
  327. package/src/sca/index.js +6 -0
  328. package/src/sca/popular-packages.json +41 -0
  329. package/src/sca/sarif-ingest.js +187 -0
  330. package/src/sca/vuln-function-hints.json +89 -0
  331. package/src/secrets/index.js +4 -0
@@ -0,0 +1,130 @@
1
+ // Closed-loop /fix verification (Sentinel-parity FR-L4-4, FR-L4-5).
2
+ //
3
+ // Given a candidate patch (the new file content + the finding stableId being
4
+ // fixed), verify it:
5
+ //
6
+ // 1. The original finding's stableId no longer fires on the patched file.
7
+ // 2. No new findings at severity ≥ medium were introduced by the patch.
8
+ // 3. The project's existing linter (when present) passes on the patched file.
9
+ //
10
+ // If any of those fail, the caller is expected to NOT apply the patch and
11
+ // instead surface a "fix plan" — a numbered list of steps the engineer can
12
+ // follow — rather than dump a broken patch on the user.
13
+
14
+ import { spawnSync } from 'node:child_process';
15
+ import * as fs from 'node:fs';
16
+ import * as path from 'node:path';
17
+ import { runFullScan } from '../engine.js';
18
+
19
// Severity ordering used for threshold checks: a lower rank is more severe.
const SEVERITY_RANK = { critical: 0, high: 1, medium: 2, low: 3, info: 4 };

/**
 * Run a focused re-scan over just the patched file(s) using the in-memory
 * engine. No filesystem write is needed — the new content is handed to the
 * engine via the fileContents map.
 *
 * @param {object} [params]
 * @param {string} [params.scanRoot] - forwarded to runFullScan unchanged.
 * @param {string} [params.originalFindingStableId] - stableId of the finding
 *   the patch is meant to remove; when falsy the "still present" check is skipped.
 * @param {Object<string, string>} params.files - { [relPath]: newContent }.
 *   Must be a non-empty plain map; arrays and empty maps are rejected.
 * @param {object} [params.depFileContents] - forwarded to runFullScan unchanged.
 * @returns {Promise<object>} `{ ok: false, reason }` for a rejected input, a
 *   failed re-scan (`error` carries the message) or a surviving original
 *   finding (`stableId` echoed back); otherwise `{ ok, reason, introduced }`
 *   where `introduced` lists every finding at severity medium or above.
 */
export async function verifyPatch({
  scanRoot,
  originalFindingStableId,
  files, // { [relPath]: newContent }
  depFileContents = {},
} = {}) {
  // An array or an empty map would scan nothing and report a vacuous pass.
  const hasFiles = files !== null && typeof files === 'object' &&
    !Array.isArray(files) && Object.keys(files).length > 0;
  if (!hasFiles) return { ok: false, reason: 'no-files-provided' };

  const fileContents = { ...files };
  let scan;
  try {
    scan = await runFullScan({ fileContents, depFileContents, scanRoot }, () => {});
  } catch (e) {
    // Anything can be thrown, not only Error instances.
    const error = e instanceof Error ? e.message : String(e);
    return { ok: false, reason: 'rescan-failed', error };
  }

  const findings = Array.isArray(scan?.findings) ? scan.findings.filter(Boolean) : [];

  const stillHasOriginal = Boolean(originalFindingStableId) &&
    findings.some((f) => f.stableId === originalFindingStableId);
  if (stillHasOriginal) {
    return { ok: false, reason: 'original-finding-still-present', stableId: originalFindingStableId };
  }

  // Severity is compared case-insensitively so that e.g. "HIGH" cannot slip
  // under the threshold. Unknown severities are not counted.
  const isMediumOrAbove = (f) => {
    const rank = SEVERITY_RANK[String(f.severity).toLowerCase()];
    return typeof rank === 'number' && rank <= SEVERITY_RANK.medium;
  };
  // The fileContents map IS the patched files, so every finding is in scope.
  // There is no pre-patch baseline here: every finding at or above medium in
  // the re-scan is reported as "introduced".
  const mediumOrAbove = findings.filter(isMediumOrAbove);
  const clean = mediumOrAbove.length === 0;
  return {
    ok: clean,
    reason: clean ? 'verified' : 'introduced-new-findings',
    introduced: mediumOrAbove.map((f) => ({
      vuln: f.vuln,
      file: f.file,
      line: f.line,
      severity: f.severity,
      stableId: f.stableId,
    })),
  };
}
57
+
58
/**
 * Detect which linter the project uses and run it on the patched files.
 *
 * The linter is chosen by the presence of a well-known config file directly
 * under `scanRoot`, combined with the extensions found in `filePaths`. The
 * first match wins, in this order: ESLint, ruff, flake8, golangci-lint,
 * checkstyle.
 *
 * @param {string} scanRoot - directory used for config lookup and handed to
 *   runLinter as the working directory.
 * @param {string[]} filePaths - paths of the patched files; non-string
 *   entries are ignored.
 * @returns {object} whatever runLinter returns for the selected linter, or
 *   `{ ok: true, runner: 'none' }` when the input is empty or no linter is
 *   configured (silent pass).
 */
export function runProjectLinter(scanRoot, filePaths) {
  if (!scanRoot || !Array.isArray(filePaths) || filePaths.length === 0) {
    return { ok: true, runner: 'none' };
  }
  const has = (p) => { try { return fs.existsSync(path.join(scanRoot, p)); } catch { return false; } };
  const hasAny = (names) => names.some(has);

  // A path that begins with "-" would be parsed by the linter as a
  // command-line option; anchoring it with "./" keeps it a file operand.
  const asOperand = (p) => (p.startsWith('-') ? `./${p}` : p);
  const select = (pattern) =>
    filePaths.filter((p) => typeof p === 'string' && pattern.test(p)).map(asOperand);

  const jsFiles = select(/\.(?:js|jsx|ts|tsx|mjs|cjs)$/i);
  const pyFiles = select(/\.py$/i);
  const goFiles = select(/\.go$/i);
  const javaFiles = select(/\.java$/i);

  const eslintConfigs = [
    '.eslintrc', '.eslintrc.json', '.eslintrc.js', '.eslintrc.cjs',
    '.eslintrc.yml', '.eslintrc.yaml',
    'eslint.config.js', 'eslint.config.mjs', 'eslint.config.cjs',
  ];
  if (jsFiles.length && hasAny(eslintConfigs)) {
    return runLinter(scanRoot, 'eslint', ['--no-error-on-unmatched-pattern', ...jsFiles]);
  }
  if (pyFiles.length && hasAny(['pyproject.toml', 'ruff.toml', '.ruff.toml'])) {
    return runLinter(scanRoot, 'ruff', ['check', ...pyFiles]);
  }
  if (pyFiles.length && has('.flake8')) {
    return runLinter(scanRoot, 'flake8', pyFiles);
  }
  const golangciConfigs = ['.golangci.yml', '.golangci.yaml', '.golangci.toml', '.golangci.json'];
  if (goFiles.length && hasAny(golangciConfigs)) {
    return runLinter(scanRoot, 'golangci-lint', ['run', ...goFiles]);
  }
  if (javaFiles.length && has('checkstyle.xml')) {
    return runLinter(scanRoot, 'checkstyle', ['-c', 'checkstyle.xml', ...javaFiles]);
  }
  return { ok: true, runner: 'none' };
}
89
+
90
/**
 * Run one linter binary synchronously (60 s limit) and normalise the outcome.
 *
 * @param {string} cwd - working directory for the child process.
 * @param {string} cmd - linter executable name or path.
 * @param {string[]} args - arguments passed to the executable (no shell).
 * @returns {object} one of:
 *   - `{ ok: true, runner, skipped: true, reason: 'binary-missing' }` when the
 *     binary cannot be found (or spawnSync itself throws),
 *   - `{ ok: false, runner, reason, output }` when the process produced no exit
 *     code: `timed-out`, `spawn-failed` (plus `error`) or `killed-by-signal`
 *     (plus `signal`),
 *   - `{ ok, runner, exitCode, output }` otherwise, with `ok` true only for
 *     exit code 0. `output` is at most the last 2000 characters.
 */
function runLinter(cwd, cmd, args) {
  let r;
  try {
    r = spawnSync(cmd, args, { cwd, encoding: 'utf8', timeout: 60_000 });
  } catch (e) {
    return { ok: true, runner: cmd, skipped: true, reason: 'binary-missing', error: e.message };
  }

  const firstStream = (r.stderr || r.stdout || '').slice(-2000);
  if (r.error) {
    if (r.error.code === 'ENOENT') {
      return { ok: true, runner: cmd, skipped: true, reason: 'binary-missing' };
    }
    if (r.error.code === 'ETIMEDOUT') {
      return { ok: false, runner: cmd, reason: 'timed-out', output: firstStream };
    }
    // EACCES, ENOBUFS, ... — the linter did not run to completion, but this
    // is not a timeout and must not be reported as one.
    return {
      ok: false, runner: cmd, reason: 'spawn-failed', error: r.error.message, output: firstStream,
    };
  }
  if (r.status === null) {
    // No spawn error and no exit code: the child was terminated by a signal.
    return {
      ok: false, runner: cmd, reason: 'killed-by-signal', signal: r.signal, output: firstStream,
    };
  }
  return {
    ok: r.status === 0,
    runner: cmd,
    exitCode: r.status,
    output: ((r.stderr || '') + (r.stdout || '')).slice(-2000),
  };
}
110
+
111
/**
 * Top-level verify: re-scan + lint. Returns the combined verdict plus a
 * human-readable summary string suitable for surfacing to the user.
 *
 * NOTE(review): only the paths (the keys of `files`) are handed to the linter,
 * not the new contents — presumably the caller has already written the
 * patched content to those paths; confirm against the call sites.
 *
 * @param {object} [params]
 * @param {string} [params.scanRoot]
 * @param {string} [params.originalFindingStableId]
 * @param {Object<string, string>} [params.files] - { [relPath]: newContent }
 * @param {object} [params.depFileContents]
 * @returns {Promise<{ok: boolean, rescan: object, lint: object, summary: string}>}
 *   `ok` is true only when the re-scan passed and the linter passed or was
 *   skipped.
 */
export async function verifyFix({
  scanRoot,
  originalFindingStableId,
  files,
  depFileContents,
} = {}) {
  const rescan = await verifyPatch({ scanRoot, originalFindingStableId, files, depFileContents });
  const lint = runProjectLinter(scanRoot, Object.keys(files || {}));

  // Coerce explicitly: `lint.skipped` is absent on a plain failure, and
  // `true && (false || undefined)` would otherwise leak `undefined` as the verdict.
  const ok = Boolean(rescan.ok) && Boolean(lint.ok || lint.skipped);

  let lintLine;
  if (lint.runner === 'none') {
    lintLine = 'skipped (no linter config)';
  } else if (lint.skipped) {
    lintLine = `${lint.runner} not installed`;
  } else if (lint.ok) {
    lintLine = `${lint.runner} PASS`;
  } else if (lint.exitCode == null) {
    // Timeouts and similar failures carry a reason instead of an exit code.
    lintLine = `${lint.runner} FAIL (${lint.reason || 'no exit code'})`;
  } else {
    lintLine = `${lint.runner} FAIL (exit ${lint.exitCode})`;
  }

  const summary = [
    `re-scan: ${rescan.ok ? 'PASS' : 'FAIL — ' + rescan.reason}`,
    `linter: ${lintLine}`,
  ].join('\n');
  return { ok, rescan, lint, summary };
}
@@ -0,0 +1,105 @@
1
+ // LLM-driven flow narration (FR-LOGIC-6).
2
+ //
3
+ // For each high-severity finding, produce a one-paragraph narrative of:
4
+ // - how the attacker gets to this code path
5
+ // - what they get if it works
6
+ // - what it costs the business
7
+ //
8
+ // Two modes:
9
+ // 1. LLM mode (opt-in via opts.useLlm or AGENTIC_SECURITY_FLOW_NARRATION_LLM=1, and AGENTIC_SECURITY_LLM_ENDPOINT set): post the finding to
10
+ // the configured LLM endpoint, get back a sanitized narrative.
11
+ // 2. Template mode (default): emit a deterministic template based on the
12
+ // finding's family + cost-framing data from blast-radius.
13
+ //
14
+ // Fail-closed: any LLM error → template fallback, never a missing field.
15
+
16
// Deterministic narration templates, keyed by finding family. Each entry is a
// function that receives the finding and returns one paragraph of text; most
// entries ignore the finding and return fixed wording.
const TEMPLATES = {
  'sql-injection': (finding) =>
    `An unauthenticated attacker sends a crafted request to ${_routeOf(finding)} containing UNION-style SQL syntax in the ${finding.source?.variable || 'tainted'} field. The server's database driver executes the injected query verbatim, returning rows from any table the connection has read access to. Typical impact: full table dump of users (emails, password hashes), bypass of authentication via boolean-blind exfiltration. If the DB role has write privileges, the attacker can also INSERT/UPDATE arbitrary rows. Recovery cost: incident response, customer notification, password reset, regulatory reporting if PII leaked.`,

  'command-injection': (finding) =>
    `The handler at ${finding.file}:${finding.line} passes user-controlled input to a shell-spawning function. An attacker can append shell metacharacters (";", "$(...)", backticks) to execute arbitrary commands as the application's UID. Typical impact: read of /etc/passwd, /proc/self/environ (env vars including secrets), outbound connections to attacker-controlled hosts (data exfil). On unprivileged containers the blast radius is limited to that container; on privileged or root-owned processes, the attacker can pivot to the host.`,

  'xss': () =>
    `An attacker injects HTML/JS markup into user-controllable input. The server reflects (or stores) it without encoding, so when a victim browser renders the page, the attacker's script executes in the victim's session origin. Typical impact: session cookie theft, CSRF-bypass on internal endpoints, account takeover via API calls executed under the victim's auth. Cost: incident response, customer notification, potential data egress depending on what the victim's session can access.`,

  'ssrf': () =>
    `The handler fetches a URL constructed from user input. An attacker supplies a URL pointing at cloud-metadata endpoints (169.254.169.254 on AWS, metadata.google.internal on GCP) or internal services not exposed externally. Typical impact: theft of IAM credentials attached to the instance, fingerprinting / exploitation of internal services, port-scanning the VPC. Cost: full AWS account compromise in the worst case (IAM credential rotation, audit, blast-radius review of every action taken under the leaked credentials).`,

  'path-traversal': () =>
    `The handler opens a file at a path derived from user input without confining the resolved path to an intended directory. An attacker submits "../../etc/passwd" (or %-encoded variants) to read arbitrary files the application has access to. Typical impact: leakage of config files, secrets, source code, /etc/passwd. Cost: depends on what files are readable — usually low-to-medium unless secrets land in the readable set.`,

  'code-injection': () =>
    `User input is fed into a code-evaluation function (eval, new Function, exec). An attacker supplies arbitrary code that executes in the application's runtime context, with full access to the application's data, env, and outbound network. Typical impact: equivalent to remote code execution; same recovery cost as command-injection.`,

  'csrf': (finding) =>
    `The state-changing endpoint at ${finding.file}:${finding.line} doesn't validate that the request originated from your own application. An attacker hosts a page that issues a same-shape request from a logged-in victim's browser. Typical impact: state changes performed under the victim's identity — password change, money movement, role escalation. Cost: depends on what state can change; for billing endpoints, this is fraud-level.`,

  'open-redirect': () =>
    `The endpoint redirects to a URL the attacker controls. Used as part of phishing chains: victim clicks a legitimate-looking link to your domain, gets redirected to attacker.example, enters credentials thinking they're still on your site. Typical impact: phishing-amplified credential theft; reputational damage if your domain ends up on a phish-tracking list.`,

  'insecure-deserialization': () =>
    `The handler deserializes attacker-controlled bytes via pickle/yaml-load/Marshal. The deserialization callback invokes arbitrary code from class constructors / __reduce__ / __wakeup__. Typical impact: equivalent to remote code execution. Cost: full incident response, including investigating whether the attacker established persistence.`,

  'xxe': (finding) =>
    `The XML parser at ${finding.file}:${finding.line} resolves external entities. An attacker submits XML referencing file:///etc/passwd or http://internal/. Typical impact: file disclosure, SSRF, blind out-of-band exfiltration of secrets. Cost: similar to SSRF + path-traversal combined.`,
};
38
+
39
// Format a finding's location as "file:line". A missing (or otherwise falsy)
// part is shown as "?", and a missing finding yields the "<endpoint>" placeholder.
function _routeOf(f) {
  if (f) {
    const file = f.file || '?';
    const line = f.line || '?';
    return `${file}:${line}`;
  }
  return '<endpoint>';
}
43
+
44
// Pick the narration template for a finding's family and render it; families
// without a template get a generic one-line description instead.
function _templateFor(f) {
  const fam = f.family;
  // Own-property lookup only: a family such as "constructor" or "__proto__"
  // must not resolve to something inherited from Object.prototype.
  const template = Object.hasOwn(TEMPLATES, fam) ? TEMPLATES[fam] : undefined;
  if (typeof template === 'function') return template(f);
  return `A finding of type "${f.vuln || fam || 'unknown'}" at ${_routeOf(f)}. Severity: ${f.severity || 'unknown'}. Review the remediation field for class-specific guidance.`;
}
49
+
50
// LLM-free rendering path: delegates straight to the template lookup for the
// finding and returns whatever text that produces.
function _renderTemplate(f) {
  const narrative = _templateFor(f);
  return narrative;
}
55
+
56
/**
 * Optional LLM call. Does nothing unless AGENTIC_SECURITY_LLM_ENDPOINT is
 * set. POSTs `{ prompt }` as JSON (with a Bearer token when
 * AGENTIC_SECURITY_LLM_API_KEY is set) and returns the sanitized reply text.
 *
 * @param {object} f - the finding to narrate.
 * @returns {Promise<string|null>} the narrative, or null on any failure
 *   (no endpoint, network error, timeout, non-2xx status, unparseable or
 *   too-short reply) so the caller can use a fallback.
 */
async function _renderLlm(f) {
  const endpoint = process.env.AGENTIC_SECURITY_LLM_ENDPOINT;
  if (!endpoint) return null;
  const apiKey = process.env.AGENTIC_SECURITY_LLM_API_KEY;
  const headers = { 'Content-Type': 'application/json' };
  if (apiKey) headers['Authorization'] = `Bearer ${apiKey}`;
  try {
    // Built inside the try so that a malformed finding (e.g. a non-string
    // snippet) produces null rather than a rejected promise.
    const prompt = `You are explaining a security finding to a developer in one paragraph.
Vuln: ${f.vuln}
CWE: ${f.cwe}
Severity: ${f.severity}
Location: ${f.file}:${f.line}
Snippet: ${String(f.snippet || '').slice(0, 200)}

Write ONE paragraph (5-7 sentences) covering: (1) how an attacker reaches this code, (2) what they get if exploited, (3) typical recovery cost. Plain English, no marketing language, no emoji.`;
    const r = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify({ prompt }),
      // Without a deadline an unresponsive endpoint would stall the caller forever.
      signal: AbortSignal.timeout(30_000),
    });
    if (!r.ok) return null;
    const j = await r.json().catch(() => null);
    const text = j && (j.response || j.text || j.content || j.output ||
      j.choices?.[0]?.message?.content || j.message?.content);
    if (typeof text !== 'string' || text.length < 30) return null;
    // Sanitize: strip control chars, markdown fences, HTML metachars.
    return text.replace(/[\x00-\x1f\x7f]/g, ' ').replace(/[<>&]/g, ' ')
      .replace(/```/g, '').replace(/\s+/g, ' ').trim().slice(0, 1500);
  } catch { return null; }
}
84
+
85
/**
 * Annotate findings in place with `f.narration`.
 *
 * Findings whose severity does not match critical/high get `narration = null`.
 * For the rest, the LLM renderer is tried only when `opts.useLlm` is truthy or
 * AGENTIC_SECURITY_FLOW_NARRATION_LLM is "1"; whenever it is not tried or
 * yields nothing, the template renderer supplies the text.
 *
 * @param {object[]} findings - mutated in place; non-object entries are skipped.
 *   A non-array argument is ignored.
 * @param {{useLlm?: boolean}|null} [opts]
 * @returns {Promise<void>}
 */
export async function annotateNarration(findings, opts = {}) {
  if (!Array.isArray(findings)) return;
  // `opts?.` because the default parameter only covers undefined, not null.
  const useLlm = Boolean(opts?.useLlm) || process.env.AGENTIC_SECURITY_FLOW_NARRATION_LLM === '1';
  for (const f of findings) {
    if (!f || typeof f !== 'object') continue;
    // Only narrate severity ≥ high to keep output tight on noisy projects.
    if (!/critical|high/i.test(f.severity || '')) {
      f.narration = null;
      continue;
    }
    // LLM calls are awaited one at a time, in findings order.
    let text = useLlm ? await _renderLlm(f) : null;
    if (!text) text = _renderTemplate(f);
    f.narration = text;
  }
}
104
+
105
+ export const _internals = { TEMPLATES, _renderTemplate, _templateFor };
@@ -0,0 +1,156 @@
1
+ // Human ⇆ LLM grader calibration (eval-post recommendation #5).
2
+ //
3
+ // The Anthropic eval-post quote we're implementing:
4
+ // "LLM-based rubrics should be frequently calibrated against expert
5
+ // human judgment to grade these agents effectively."
6
+ //
7
+ // We have two grader streams in this codebase already:
8
+ // - HUMAN: `/triage` writes per-finding verdicts (tp/fp/wontfix) to
9
+ // `.agentic-security/triage-feedback.json`, keyed by stableId.
10
+ // - LLM: `llm-validator` writes per-finding verdicts (accept/reject/
11
+ // escalate) to its cache under `.agentic-security/llm-cache/*.json`,
12
+ // plus `validator_verdict` on each finding in `last-scan.json`.
13
+ //
14
+ // This module joins them on stableId and reports inter-rater agreement
15
+ // (Cohen's κ) over the overlap. κ < 0.6 means the LLM rubric is drifting
16
+ // from human judgment — operator should re-tune the prompt or escalate
17
+ // human review.
18
+ //
19
+ // Verdict mapping (TP/FP are the only ones κ measures):
20
+ // HUMAN tp ↔ LLM accept ("real finding")
21
+ // HUMAN fp ↔ LLM reject ("false positive")
22
+ // HUMAN wontfix → excluded from κ (not a quality signal)
23
+ // LLM escalate → excluded from κ (deliberate "I don't know")
24
+ // LLM unvalidated → excluded from κ (validator didn't run)
25
+
26
+ import * as fs from 'node:fs';
27
+ import * as path from 'node:path';
28
+
29
// Locations of the two grader streams, relative to the scan root.
const TRIAGE_FILE = '.agentic-security/triage-feedback.json';
const SCAN_FILE = '.agentic-security/last-scan.json';

/**
 * Read the human triage entries stored under `scanRoot`.
 *
 * Best-effort: a missing file, unparsable JSON, or an `entries` value that is
 * not an array all yield `[]`, so callers can always iterate the result and
 * read its `.length`.
 *
 * @param {string} scanRoot - Directory containing `.agentic-security/`.
 * @returns {Array<object>} The `entries` array from the triage file, or `[]`.
 */
function _loadTriageFeedback(scanRoot) {
  const fp = path.join(scanRoot, TRIAGE_FILE);
  if (!fs.existsSync(fp)) return [];
  try {
    const entries = JSON.parse(fs.readFileSync(fp, 'utf8'))?.entries;
    // A truthy non-array (object, string) used to leak through `|| []`.
    return Array.isArray(entries) ? entries : [];
  } catch {
    return [];
  }
}

/**
 * Read the per-finding validator verdicts out of the last scan file.
 *
 * Best-effort: a missing file, unparsable JSON, or a non-array `findings`
 * yields `[]`. Entries that are not objects are skipped individually so one
 * malformed finding does not discard every other verdict.
 *
 * @param {string} scanRoot - Directory containing `.agentic-security/`.
 * @returns {Array<{stableId: string, verdict: string, confidence: number|null}>}
 *   Only findings that carry both a `stableId` and a `validator_verdict`.
 */
function _loadScanVerdicts(scanRoot) {
  const fp = path.join(scanRoot, SCAN_FILE);
  if (!fs.existsSync(fp)) return [];
  try {
    const findings = JSON.parse(fs.readFileSync(fp, 'utf8'))?.findings;
    if (!Array.isArray(findings)) return [];
    return findings
      .filter((f) => f !== null && typeof f === 'object')
      .map((f) => ({
        stableId: f.stableId || null,
        verdict: f.validator_verdict || null,
        confidence: typeof f.llm_confidence === 'number' ? f.llm_confidence : null,
      }))
      .filter((e) => e.stableId && e.verdict);
  } catch {
    return [];
  }
}
51
+
52
/**
 * Cohen's κ for two binary raters.
 *
 *   p_o = observed agreement = concordant / n
 *   p_e = chance agreement   = sum over classes of (p_human_c * p_llm_c)
 *   κ   = (p_o - p_e) / (1 - p_e)
 *
 * κ = 1 is perfect agreement, κ = 0 is chance, κ < 0 is worse than chance.
 * κ >= 0.6 is commonly read as "substantial agreement" (Landis & Koch).
 *
 * @param {Array<{human: string, llm: string}>} pairs - Ratings; the value
 *   'positive' is one class and anything else counts as the other.
 * @returns {{kappa: number|null, reason?: string, pO?: number, pE?: number, n?: number}}
 *   `kappa` is null (with a `reason`) when there are no pairs or when p_e is
 *   saturated without perfect agreement.
 */
export function cohensKappa(pairs) {
  if (!Array.isArray(pairs) || pairs.length === 0) {
    return { kappa: null, reason: 'no-pairs' };
  }
  const n = pairs.length;
  const tally = { agree: 0, humanPos: 0, llmPos: 0 };
  for (const pair of pairs) {
    const { human, llm } = pair;
    if (human === llm) tally.agree += 1;
    if (human === 'positive') tally.humanPos += 1;
    if (llm === 'positive') tally.llmPos += 1;
  }
  const pO = tally.agree / n;
  const pHumanPos = tally.humanPos / n;
  const pLlmPos = tally.llmPos / n;
  const pE = pHumanPos * pLlmPos + (1 - pHumanPos) * (1 - pLlmPos);
  if (pE < 0.9999) {
    return { kappa: (pO - pE) / (1 - pE), pO, pE, n };
  }
  // Both raters sit (almost) entirely in one class, so 1 - p_e is ~0 and the
  // ratio is undefined. Report perfect agreement when that is what happened,
  // otherwise flag the saturation instead of dividing by ~0.
  const perfect = pO === 1;
  return {
    kappa: perfect ? 1 : null,
    reason: perfect ? 'perfect-agreement' : 'pE-saturated',
    pO,
    pE,
    n,
  };
}
77
+
78
/**
 * Join human triage entries with LLM validator verdicts on `stableId`.
 *
 * For each stableId the triage entry with the greatest `at` string wins (the
 * first one seen wins ties). Verdicts are mapped to the binary used for κ:
 * human tp / LLM accept -> 'positive', human fp / LLM reject -> 'negative'.
 * wontfix, escalate, unvalidated, not-applicable and "no LLM verdict" rows are
 * left out of `pairs` and counted under `excluded`; rows with any other
 * verdict value are dropped without being counted.
 *
 * Entries that are null or not objects are ignored, and a missing list is
 * treated as empty, so malformed feedback files cannot crash the join.
 *
 * @param {Iterable<object>|null|undefined} triageEntries - `{ stableId, verdict, at }` records.
 * @param {Iterable<object>|null|undefined} validatorEntries - `{ stableId, verdict, confidence }` records.
 * @returns {{pairs: Array<{stableId: string, human: string, llm: string, llmConfidence: *}>,
 *   excluded: Record<string, number>, totalTriaged: number, totalValidated: number}}
 */
export function joinHumanLlm(triageEntries, validatorEntries) {
  const isRecord = (x) => x !== null && typeof x === 'object';

  // Most-recent triage entry wins per stableId.
  const latestHuman = new Map();
  for (const e of triageEntries ?? []) {
    if (!isRecord(e) || !e.stableId) continue;
    const prev = latestHuman.get(e.stableId);
    if (!prev || String(e.at || '').localeCompare(String(prev.at || '')) > 0) {
      latestHuman.set(e.stableId, e);
    }
  }

  // Later validator entries overwrite earlier ones for the same stableId.
  const llmById = new Map();
  for (const v of validatorEntries ?? []) {
    if (!isRecord(v)) continue;
    llmById.set(v.stableId, v);
  }

  const pairs = [];
  const excluded = {
    human_wontfix: 0,
    llm_escalate: 0,
    llm_unvalidated: 0,
    llm_not_applicable: 0,
    no_llm_for_this_stableid: 0,
  };
  for (const [stableId, hum] of latestHuman) {
    const llm = llmById.get(stableId);
    if (!llm) { excluded.no_llm_for_this_stableid++; continue; }
    if (hum.verdict === 'wontfix') { excluded.human_wontfix++; continue; }
    if (llm.verdict === 'escalate') { excluded.llm_escalate++; continue; }
    if (llm.verdict === 'unvalidated') { excluded.llm_unvalidated++; continue; }
    if (llm.verdict === 'not-applicable') { excluded.llm_not_applicable++; continue; }
    // Map to binary; anything outside the known vocabulary is not comparable.
    const humanBin = hum.verdict === 'tp' ? 'positive' : hum.verdict === 'fp' ? 'negative' : null;
    const llmBin = llm.verdict === 'accept' ? 'positive' : llm.verdict === 'reject' ? 'negative' : null;
    if (!humanBin || !llmBin) continue;
    pairs.push({ stableId, human: humanBin, llm: llmBin, llmConfidence: llm.confidence });
  }
  return { pairs, excluded, totalTriaged: latestHuman.size, totalValidated: llmById.size };
}
113
+
114
// Minimum number of comparable pairs before the alarm may fire; on a smaller
// sample the confidence interval swamps κ.
const MIN_N_FOR_ALARM = 10;

/**
 * Build the full human-vs-LLM calibration report for a scan root.
 *
 * Loads both grader streams, joins them on stableId, computes Cohen's κ over
 * the overlap and decides whether to raise the drift alarm. The alarm fires
 * only when κ is defined, the sample has at least MIN_N_FOR_ALARM pairs, and
 * κ is below `alarmAt`.
 *
 * The `note` never claims agreement on a sample that is too small to judge:
 * with fewer than MIN_N_FOR_ALARM pairs the alarm is suppressed, so the note
 * says the sample is insufficient instead of reporting agreement.
 *
 * @param {string} scanRoot - Directory containing `.agentic-security/`.
 * @param {{alarmAt?: number}} [options] - κ threshold below which the
 *   operator should re-tune. Defaults to 0.6, the "substantial agreement"
 *   cutoff in Landis & Koch (1977).
 * @returns {object} Report with counts, κ, p_o, p_e, interpretation, `alarm`,
 *   `alarmThreshold` and a human-readable `note`.
 */
export function calibrateGraders(scanRoot, { alarmAt = 0.6 } = {}) {
  const triage = _loadTriageFeedback(scanRoot);
  const llm = _loadScanVerdicts(scanRoot);
  const join = joinHumanLlm(triage, llm);
  const kapp = cohensKappa(join.pairs);
  const alarm = kapp.kappa !== null && kapp.n >= MIN_N_FOR_ALARM && kapp.kappa < alarmAt;

  let note;
  if (alarm) {
    note = `LLM verdicts diverging from human triage (κ=${kapp.kappa?.toFixed(3)} < ${alarmAt}). Re-tune the validator prompt or escalate human review.`;
  } else if (kapp.kappa === null) {
    note = `Insufficient overlap (n=${kapp.n ?? 0}); skip calibration until more triage feedback accumulates.`;
  } else if (kapp.n < MIN_N_FOR_ALARM) {
    // Alarm is suppressed on small samples, so agreement must not be claimed either.
    note = `Sample too small to judge agreement (κ=${kapp.kappa.toFixed(3)}, n=${kapp.n} < ${MIN_N_FOR_ALARM}); keep collecting triage feedback.`;
  } else {
    note = `Validator and human triage substantially agree (κ=${kapp.kappa.toFixed(3)}, n=${kapp.n}).`;
  }

  return {
    when: new Date().toISOString(),
    triageEntries: triage.length,
    validatorEntries: llm.length,
    overlap: join.pairs.length,
    excluded: join.excluded,
    kappa: kapp.kappa,
    pObserved: kapp.pO,
    pExpected: kapp.pE,
    kappaInterpretation: _interpretKappa(kapp.kappa, kapp.n),
    alarm,
    alarmThreshold: alarmAt,
    note,
  };
}
146
+
147
/**
 * Translate a κ value into a qualitative label.
 *
 * @param {number|null} k - Cohen's κ, or null when it is undefined.
 * @param {number} n - Number of pairs κ was computed from.
 * @returns {string} 'undefined' for null κ, 'insufficient-sample' when n is
 *   below MIN_N_FOR_ALARM, otherwise a band label from 'worse-than-chance'
 *   up to 'almost-perfect'.
 */
function _interpretKappa(k, n) {
  if (k === null) return 'undefined';
  if (n < MIN_N_FOR_ALARM) return 'insufficient-sample';
  // Each band applies to κ strictly below its upper bound; first hit wins.
  const bands = [
    [0, 'worse-than-chance'],
    [0.2, 'slight'],
    [0.4, 'fair'],
    [0.6, 'moderate'],
    [0.8, 'substantial'],
  ];
  const band = bands.find(([upper]) => k < upper);
  return band ? band[1] : 'almost-perfect';
}
@@ -0,0 +1,113 @@
1
+ // Multi-harness configuration discovery.
2
+ //
3
+ // Finds every agent-harness configuration directory the user has, both at
4
+ // the project root AND under ~/. The discovered files feed the
5
+ // claude-settings / claude-md-prompt-injection / claude-hook-injection
6
+ // detectors so we audit Claude / Cursor / Codex / Gemini / Kiro / OpenCode /
7
+ // Trae / Qwen / Zed / Continue / Aider with one sweep.
8
+ //
9
+ // Used by the `/scan --harness` mode.
10
+
11
+ import * as fs from 'node:fs/promises';
12
+ import * as path from 'node:path';
13
+ import * as os from 'node:os';
14
+
15
// Dot-directories, one per agent harness, looked up under a root directory.
export const HARNESS_DIRS = [
  '.claude',
  '.cursor',
  '.codex',
  '.gemini',
  '.kiro',
  '.opencode',
  '.trae',
  '.qwen',
  '.zed',
  '.continue',
  '.aider',
  '.codebuddy',
  '.copilot',
];

// File names probed directly inside each harness directory.
const HARNESS_FILES = [
  // settings + permissions
  'settings.json',
  'settings.local.json',
  'config.json',
  // instruction files (lifted into context every session)
  'CLAUDE.md',
  'AGENTS.md',
  'GEMINI.md',
  'CURSOR.md',
  'CODEX.md',
  'KIRO.md',
  'QWEN.md',
  'TRAE.md',
  'OPENCODE.md',
  'SYSTEM_PROMPT.md',
  // mcp
  'mcp.json',
  '.mcp.json',
  'mcp_servers.json',
  'claude_desktop_config.json',
  // hooks
  'hooks.json',
  'hooks.yml',
  'hooks.yaml',
];

// Sub-directories of a harness directory that hold instruction-style files.
const HARNESS_SUBDIRS = [
  'agents',
  'skills',
  'commands',
  'hooks',
  'rules',
];
34
+
35
// Files larger than this many bytes are not read.
const MAX_FILE_SIZE = 1_000_000;

/**
 * Read a file as UTF-8 without ever throwing.
 *
 * @param {string} fp - Path to read.
 * @returns {Promise<string|null>} The file text, or null when the file is
 *   larger than MAX_FILE_SIZE or when stat/read fails for any reason.
 */
async function _readSafe(fp) {
  try {
    const { size } = await fs.stat(fp);
    return size > MAX_FILE_SIZE ? null : await fs.readFile(fp, 'utf8');
  } catch {
    return null;
  }
}
44
+
45
/**
 * Collect the readable config files of one harness directory into `out`.
 *
 * Probes every HARNESS_FILES name directly under `harnessRoot`, then every
 * regular file ending in .md/.json/.yaml/.yml inside each HARNESS_SUBDIRS
 * directory (one level deep, no recursion). Each hit is stored as
 * `out[absolutePath] = content`.
 *
 * @param {string} harnessRoot - Path of the harness directory.
 * @param {string} harnessName - Directory name of the harness; currently unused.
 * @param {Record<string, string>} out - Accumulator mutated in place.
 * @returns {Promise<void>}
 */
async function _walkHarnessDir(harnessRoot, harnessName, out) {
  // Kept in the signature for callers; nothing here depends on it.
  void harnessName;

  const capture = async (fp) => {
    const content = await _readSafe(fp);
    if (content !== null) out[fp] = content;
  };
  const looksLikeConfig = /\.(?:md|json|yaml|yml)$/i;

  // Top-level config files.
  for (const fileName of HARNESS_FILES) {
    await capture(path.join(harnessRoot, fileName));
  }

  // Subdirs holding instruction-style files.
  for (const sub of HARNESS_SUBDIRS) {
    const subDir = path.join(harnessRoot, sub);
    try {
      const entries = await fs.readdir(subDir, { withFileTypes: true });
      for (const entry of entries) {
        if (entry.isFile() && looksLikeConfig.test(entry.name)) {
          await capture(path.join(subDir, entry.name));
        }
      }
    } catch {
      // Directory does not exist (or cannot be listed): nothing to collect.
    }
  }
}
70
+
71
/**
 * Discover agent-harness configuration files.
 *
 * Looks in, in this order:
 *   1. Instruction files placed directly at the project root (CLAUDE.md, ...).
 *   2. Harness directories under the project root (e.g. <root>/.claude).
 *   3. Harness directories under the home directory (e.g. ~/.claude), only
 *      when `opts.includeHome` is truthy.
 *
 * @param {string} projectRoot - Project directory to inspect.
 * @param {{includeHome?: boolean}} [opts]
 * @returns {Promise<Record<string, string>>} Map of file path to file content.
 */
export async function discoverHarnessConfigs(projectRoot, opts = {}) {
  const includeHome = Boolean(opts.includeHome);
  const out = {};

  // Project-rooted instruction files commonly placed at repo root.
  const rootInstructionFiles = [
    'CLAUDE.md', 'AGENTS.md', 'GEMINI.md', 'CURSOR.md', 'CODEX.md',
    'KIRO.md', 'QWEN.md', 'TRAE.md', 'OPENCODE.md',
  ];
  for (const fileName of rootInstructionFiles) {
    const fp = path.join(projectRoot, fileName);
    const content = await _readSafe(fp);
    if (content !== null) out[fp] = content;
  }

  // Walk every harness directory that exists beneath `base`.
  const walkUnder = async (base) => {
    for (const dir of HARNESS_DIRS) {
      const harnessRoot = path.join(base, dir);
      try {
        await fs.access(harnessRoot);
      } catch {
        continue;
      }
      await _walkHarnessDir(harnessRoot, dir, out);
    }
  };

  await walkUnder(projectRoot);

  // The home directory is resolved only after the project walk, and only on opt-in.
  const home = includeHome ? os.homedir() : '';
  if (home) await walkUnder(home);

  return out;
}
104
+
105
/**
 * Inventory of which harnesses are present (for grade / summary).
 *
 * A path counts for a harness only when it contains a whole path segment
 * named `.<harness>`; a directory such as `app.codex/` does not count. When
 * several harness segments appear in one path (a project checked out inside
 * another harness directory), the innermost one is used, since that is the
 * directory the file actually belongs to.
 *
 * @param {Record<string, string>|null|undefined} fileContents - Map keyed by
 *   file path; only the keys are inspected.
 * @returns {string[]} Sorted, de-duplicated harness names without the dot.
 */
export function summarizeHarnessPresence(fileContents) {
  // Lookarounds keep the separators unconsumed so adjacent segments can both match.
  const harnessSegment = /(?<=^|[\\/])\.(claude|cursor|codex|gemini|kiro|opencode|trae|qwen|zed|continue|aider|codebuddy|copilot)(?=[\\/])/g;
  const present = new Set();
  for (const fp of Object.keys(fileContents || {})) {
    let innermost = null;
    for (const match of fp.matchAll(harnessSegment)) innermost = match[1];
    if (innermost !== null) present.add(innermost);
  }
  return [...present].sort();
}
@@ -0,0 +1,144 @@
1
+ // Held-out evaluator (premortem #16).
2
+ //
3
+ // Takes a labeled JSONL file of (predicted_confidence, actual_label) records
4
+ // and computes the honest measurements an auditor will ask for: Brier score,
5
+ // expected calibration error (ECE), per-family precision/recall.
6
+ //
7
+ // Replaces the tautological computeBrierFromHistory removed in premortem #9.
8
+ //
9
+ // Input file shape — one JSON object per line:
10
+ //
11
+ // {"family": "sql-injection", "stableId": "...", "predicted": 0.78,
12
+ // "actual": 1, "note": "TP — exploited in pen-test"}
13
+ // {"family": "xss", "predicted": 0.65, "actual": 0,
14
+ // "note": "FP — sanitizer present but flow-analysis missed it"}
15
+ //
16
+ // `actual` must be 0 (false-positive / clean) or 1 (true-positive). `predicted`
17
+ // is the calibrated_confidence we'd ship to the customer.
18
+ //
19
+ // Output shape — see `evaluateHeldOut` return value.
20
+
21
+ import * as fs from 'node:fs';
22
+ import { brierScore, computeBrierOnHeldOut, wilsonInterval } from './calibration.js';
23
+
24
// Default number of equal-width bins used for expected calibration error.
const ECE_BINS_DEFAULT = 10;

/**
 * Parse labeled JSONL text into normalized sample records.
 *
 * Blank lines, lines that are not valid JSON, and records without a numeric
 * `predicted` or a 0/1/boolean `actual` are skipped. `predicted` is clamped
 * to [0, 1]; `family` defaults to 'unknown', `stableId` to null and `note`
 * to '' when they are not strings.
 *
 * @param {string} text - One JSON object per line.
 * @returns {Array<{family: string, stableId: string|null, predicted: number, actual: 0|1, note: string}>}
 */
export function parseLabeledJsonl(text) {
  if (typeof text !== 'string' || !text.length) return [];

  const toLabel = (value) => {
    if (value === 1 || value === true) return 1;
    if (value === 0 || value === false) return 0;
    return null;
  };
  const stringOr = (value, fallback) => (typeof value === 'string' ? value : fallback);

  const records = [];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      continue; // skip malformed lines
    }
    if (!parsed || typeof parsed !== 'object') continue;
    if (typeof parsed.predicted !== 'number') continue;

    const actual = toLabel(parsed.actual);
    if (actual === null) continue;

    records.push({
      family: stringOr(parsed.family, 'unknown'),
      stableId: stringOr(parsed.stableId, null),
      predicted: Math.max(0, Math.min(1, parsed.predicted)),
      actual,
      note: stringOr(parsed.note, ''),
    });
  }
  return records;
}
51
+
52
/**
 * Load and parse a labeled JSONL file.
 *
 * @param {string} filepath - Path of the file to read.
 * @returns {Array<object>} Parsed samples, or `[]` when the path is empty or
 *   the file does not exist.
 */
export function loadLabeledJsonl(filepath) {
  const readable = Boolean(filepath) && fs.existsSync(filepath);
  if (!readable) return [];
  const text = fs.readFileSync(filepath, 'utf8');
  return parseLabeledJsonl(text);
}
56
+
57
/**
 * Expected calibration error over equal-width probability bins.
 *
 *   ECE = sum over bins of (|bin| / N) * |mean_pred - mean_actual|
 *
 * ECE = 0 is perfect; <= 0.05 is commonly considered good calibration.
 *
 * @param {Array<{predicted: number, actual: number}>} samples
 * @param {number} [nBins] - Number of bins; defaults to ECE_BINS_DEFAULT.
 * @returns {{ece: number, perBin: Array<object>, nBins: number, total: number}|null}
 *   null when `samples` is not a non-empty array. Empty bins are reported as
 *   `{ bin, n: 0 }`.
 */
export function expectedCalibrationError(samples, nBins = ECE_BINS_DEFAULT) {
  if (!Array.isArray(samples) || samples.length === 0) return null;
  const total = samples.length;
  const buckets = Array.from({ length: nBins }, () => ({ count: 0, predSum: 0, actualSum: 0 }));

  for (const sample of samples) {
    const p = Math.max(0, Math.min(1, sample.predicted));
    // The 0.99999 cap applies to the bin index only, so a prediction of 1.0
    // lands in the last bin while the bin mean still uses the true value.
    const slot = Math.min(nBins - 1, Math.floor(Math.min(0.99999, p) * nBins));
    const bucket = buckets[slot];
    bucket.count += 1;
    bucket.predSum += p;
    bucket.actualSum += sample.actual;
  }

  let ece = 0;
  const perBin = [];
  for (const [bin, bucket] of buckets.entries()) {
    if (bucket.count === 0) {
      perBin.push({ bin, n: 0 });
      continue;
    }
    const meanPred = bucket.predSum / bucket.count;
    const meanActual = bucket.actualSum / bucket.count;
    ece += (bucket.count / total) * Math.abs(meanPred - meanActual);
    perBin.push({
      bin,
      n: bucket.count,
      mean_pred: meanPred,
      mean_actual: meanActual,
      gap: meanPred - meanActual,
    });
  }
  return { ece, perBin, nBins, total };
}
85
+
86
/**
 * Count labeled samples per vulnerability family.
 *
 * There is no separate decision threshold here: every sample is a finding the
 * engine reported, so `tp` counts samples labeled 1 and `fp` counts samples
 * labeled 0. `fn` and `tn` are always 0 and exist only to keep the record
 * shape uniform.
 *
 * Family names come from the labeled file, so tallies are accumulated in a
 * Map and only converted to a plain object at the end. Indexing a plain
 * object with a name like `__proto__` or `constructor` would otherwise
 * increment counters on `Object.prototype` / `Object`.
 *
 * @param {Iterable<{family?: string, actual: number}>} samples
 * @returns {Record<string, {tp: number, fp: number, fn: number, tn: number, n: number}>}
 *   Plain object keyed by family name ('unknown' when missing).
 */
export function perFamily(samples) {
  const tallies = new Map();
  for (const s of samples) {
    // String() mirrors the key coercion a plain-object lookup would apply.
    const family = String(s.family || 'unknown');
    let tally = tallies.get(family);
    if (!tally) {
      tally = { tp: 0, fp: 0, fn: 0, tn: 0, n: 0 };
      tallies.set(family, tally);
    }
    tally.n++;
    if (s.actual === 1) tally.tp++;
    else if (s.actual === 0) tally.fp++;
  }
  // fromEntries defines own data properties, so even '__proto__' stays a normal key.
  return Object.fromEntries(tallies);
}
100
+
101
/**
 * One-shot evaluation: Brier + ECE + per-family TP/FP + overall precision.
 *
 * @param {Array<{family?: string, predicted: number, actual: number}>} samples
 * @returns {object} `{ ok: false, reason: 'no-samples' }` when `samples` is
 *   not a non-empty array (never a tautological zero score); otherwise
 *   `{ ok: true, n, brier, ece, eceDetail, precision, precisionCi95,
 *   perFamily, notes }`, where `notes` lists caveats about sample size and
 *   missed targets.
 */
export function evaluateHeldOut(samples) {
  if (!Array.isArray(samples) || samples.length === 0) {
    return { ok: false, reason: 'no-samples' };
  }

  const scored = samples.map((s) => ({ predicted: s.predicted, actual: s.actual }));
  const brierR = computeBrierOnHeldOut(scored);
  const ece = expectedCalibrationError(samples);
  const fams = perFamily(samples);

  let totalTP = 0;
  let totalFP = 0;
  for (const fam of Object.values(fams)) {
    totalTP += fam.tp;
    totalFP += fam.fp;
  }
  const labelled = totalTP + totalFP;
  const precision = labelled > 0 ? totalTP / labelled : 0;
  // Wilson CI on the overall positive-rate as a calibration sanity check.
  const ci = wilsonInterval(totalTP, labelled);

  const notes = [];
  if (samples.length < 100) {
    notes.push('n<100: Brier and ECE have wide confidence; treat as directional, not decision-grade.');
  }
  if (brierR.brier !== null && brierR.brier > 0.10) {
    notes.push(`brier=${brierR.brier.toFixed(3)} exceeds PRD target 0.10`);
  }
  if (ece && ece.ece > 0.05) {
    notes.push(`ece=${ece.ece.toFixed(3)} exceeds 0.05 calibration target`);
  }

  return {
    ok: true,
    n: samples.length,
    brier: brierR.brier,
    ece: ece?.ece ?? null,
    eceDetail: ece,
    precision,
    precisionCi95: ci,
    perFamily: fams,
    notes,
  };
}
134
+
135
/**
 * CLI-friendly one-line summary of an `evaluateHeldOut` result.
 *
 * @param {object|null|undefined} result
 * @returns {string} `held-out: <reason>` when the result is missing or not
 *   ok; otherwise n, Brier, ECE and precision with its 95% interval, joined
 *   by ' · ' and formatted to three decimals.
 */
export function summarize(result) {
  if (!result || !result.ok) {
    return `held-out: ${result?.reason || 'unknown error'}`;
  }
  const fixed3 = (value) => value.toFixed(3);
  const fixed3OrNull = (value) => (value != null ? fixed3(value) : 'null');
  const ciLow = fixed3(result.precisionCi95[0]);
  const ciHigh = fixed3(result.precisionCi95[1]);
  const parts = [
    `n=${result.n}`,
    `brier=${fixed3OrNull(result.brier)}`,
    `ece=${fixed3OrNull(result.ece)}`,
    `precision=${fixed3(result.precision)} CI95=[${ciLow},${ciHigh}]`,
  ];
  return parts.join(' · ');
}