cipher-security 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/bin/cipher.js +10 -0
  2. package/lib/analyze/consistency.js +566 -0
  3. package/lib/analyze/constitution.js +110 -0
  4. package/lib/analyze/sharding.js +251 -0
  5. package/lib/autonomous/agent-tool.js +165 -0
  6. package/lib/autonomous/framework.js +17 -0
  7. package/lib/autonomous/handoff.js +506 -0
  8. package/lib/autonomous/modes/blue.js +26 -0
  9. package/lib/autonomous/modes/red.js +28 -0
  10. package/lib/benchmark/agent.js +88 -26
  11. package/lib/benchmark/baselines.js +3 -0
  12. package/lib/benchmark/claude-code-solver.js +254 -0
  13. package/lib/benchmark/cognitive.js +283 -0
  14. package/lib/benchmark/index.js +12 -2
  15. package/lib/benchmark/knowledge.js +281 -0
  16. package/lib/benchmark/llm.js +156 -15
  17. package/lib/benchmark/models.js +5 -2
  18. package/lib/benchmark/nyu-ctf.js +192 -0
  19. package/lib/benchmark/overthewire.js +347 -0
  20. package/lib/benchmark/picoctf.js +281 -0
  21. package/lib/benchmark/prompts.js +280 -0
  22. package/lib/benchmark/registry.js +219 -0
  23. package/lib/benchmark/remote-solver.js +356 -0
  24. package/lib/benchmark/remote-target.js +263 -0
  25. package/lib/benchmark/reporter.js +35 -0
  26. package/lib/benchmark/runner.js +174 -10
  27. package/lib/benchmark/sandbox.js +35 -0
  28. package/lib/benchmark/scorer.js +22 -4
  29. package/lib/benchmark/solver.js +34 -1
  30. package/lib/benchmark/tools.js +262 -16
  31. package/lib/commands.js +9 -0
  32. package/lib/execution/council.js +434 -0
  33. package/lib/execution/parallel.js +292 -0
  34. package/lib/gates/circuit-breaker.js +135 -0
  35. package/lib/gates/confidence.js +302 -0
  36. package/lib/gates/corrections.js +219 -0
  37. package/lib/gates/self-check.js +245 -0
  38. package/lib/gateway/commands.js +727 -0
  39. package/lib/guardrails/engine.js +364 -0
  40. package/lib/mcp/server.js +349 -3
  41. package/lib/memory/compressor.js +94 -7
  42. package/lib/pipeline/hooks.js +288 -0
  43. package/lib/pipeline/index.js +11 -0
  44. package/lib/review/budget.js +210 -0
  45. package/lib/review/engine.js +526 -0
  46. package/lib/review/layers/acceptance-auditor.js +279 -0
  47. package/lib/review/layers/blind-hunter.js +500 -0
  48. package/lib/review/layers/defense-in-depth.js +209 -0
  49. package/lib/review/layers/edge-case-hunter.js +266 -0
  50. package/lib/review/panel.js +519 -0
  51. package/lib/review/two-stage.js +244 -0
  52. package/lib/session/cost-tracker.js +203 -0
  53. package/lib/session/logger.js +349 -0
  54. package/package.json +1 -1
@@ -0,0 +1,245 @@
1
+ // Copyright (c) 2026 defconxt. All rights reserved.
2
+ // Licensed under AGPL-3.0 — see LICENSE file for details.
3
+ // CIPHER is a trademark of defconxt.
4
+
5
+ /**
6
+ * Self-Check Module — Post-output hallucination and verification gate.
7
+ *
8
+ * Analyzes agent output for unverified claims, hedging language,
9
+ * rationalization patterns, and hallucination red flags. Returns
10
+ * structured findings with a trust score.
11
+ *
12
+ * Sources:
13
+ * - SuperClaude SelfCheckProtocol: 4 mandatory questions + 7 hallucination red flags
14
+ * - Existing confidence.js: detectHedging() + detectRationalizations()
15
+ *
16
+ * @module gates/self-check
17
+ */
18
+
19
+ import { detectHedging, detectRationalizations } from './confidence.js';
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // Hallucination red flags (from SuperClaude)
23
+ // ---------------------------------------------------------------------------
24
+
25
/**
 * 7 hallucination red flags — phrases that assert a positive outcome.
 *
 * Each entry carries:
 *  - `pattern`     — case-insensitive regex locating the claim in the text
 *  - `category`    — machine-readable label attached to the resulting finding
 *  - `description` — human-readable explanation of why the claim is suspicious
 *
 * A flag is only reported by the consumer when no evidence phrase is found
 * near the matched claim.
 */
const HALLUCINATION_RED_FLAGS = [
  {
    pattern: /\btests?\s+pass(?:es|ed|ing)?\b/i,
    category: 'claim-without-output',
    description: 'Claims tests pass without showing test output',
  },
  {
    pattern: /\beverything\s+works?\b/i,
    category: 'blanket-success',
    description: 'Blanket "everything works" without specific evidence',
  },
  {
    pattern: /\bimplementation\s+(?:is\s+)?complete\b/i,
    category: 'premature-completion',
    description: 'Claims implementation complete without verification',
  },
  {
    pattern: /\bshould\s+work\s+now\b/i,
    category: 'unverified-fix',
    description: 'Claims fix works without running verification',
  },
  {
    // Accepts the ASCII apostrophe, the typographic apostrophe (U+2019) that
    // generated text frequently contains, and the uncontracted "I am".
    pattern: /\bI(?:['\u2019]m|\s+am)\s+confident\b/i,
    category: 'false-confidence',
    description: 'Expresses confidence without supporting evidence',
  },
  {
    pattern: /\bjust\s+fixed\s+it\b/i,
    category: 'unverified-fix',
    description: 'Claims fix applied without showing verification',
  },
  {
    pattern: /\bminor\s+issue\b/i,
    category: 'severity-downplay',
    description: 'Downplays issue severity without analysis',
  },
];
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Evidence phrases — presence near a claim reduces suspicion
70
+ // ---------------------------------------------------------------------------
71
+
72
/**
 * Phrases whose presence near a claim counts as supporting evidence.
 * All patterns are non-global, so `.test()` carries no `lastIndex` state
 * between calls.
 */
const EVIDENCE_PHRASES = [
  // "output shows", "logs confirm", "results indicate" — noun may be plural
  /\b(?:outputs?|results?|logs?|traces?)\s*(?:shows?|confirms?|indicates?)\b/i,
  /\bexit\s*code\s*(?:0|zero)\b/i,
  /\b\d+\s+(?:tests?\s+)?pass(?:ed|ing)?\b/i, // "42 tests passing"
  /\bverified\s+(?:by|via|with|using)\b/i,
  /\bas\s+(?:shown|confirmed|demonstrated)\s+(?:by|above|below)\b/i,
  /\b(?:stdout|stderr|output):\s/i,
  /```[\s\S]{10,}```/, // code blocks with substantial content
];

/** Default number of characters inspected on each side of a claim. */
const DEFAULT_EVIDENCE_RADIUS = 200;

/**
 * Check if text contains evidence near a claim.
 *
 * The search window spans `radius` characters before and `radius` characters
 * after `claimIndex` (so up to `2 * radius` characters in total), clamped to
 * the bounds of `text`.
 *
 * @param {string} text
 * @param {number} claimIndex — position of the claim in text
 * @param {number} [radius=200] — characters to inspect on each side
 * @returns {boolean} true when any evidence phrase matches inside the window
 */
function hasNearbyEvidence(text, claimIndex, radius = DEFAULT_EVIDENCE_RADIUS) {
  const from = Math.max(0, claimIndex - radius);
  const to = Math.min(text.length, claimIndex + radius);
  const nearby = text.slice(from, to);
  return EVIDENCE_PHRASES.some((phrase) => phrase.test(nearby));
}
95
+
96
+ // ---------------------------------------------------------------------------
97
+ // Unverified completion detection
98
+ // ---------------------------------------------------------------------------
99
+
100
/** Phrases that claim a task is finished or free of problems. */
const COMPLETION_CLAIMS = [
  /\b(?:task|work|implementation|feature|fix)\s+(?:is\s+)?(?:done|complete|finished|ready)\b/i,
  /\bsuccessfully\s+(?:implemented|completed|fixed|resolved)\b/i,
  /\ball\s+(?:tests?\s+)?pass(?:es|ed|ing)?\b/i,
  /\bno\s+(?:errors?|failures?|issues?|problems?)\b/i,
];

/**
 * Detect completion claims without nearby evidence.
 *
 * Every occurrence of each claim pattern is inspected, so an evidenced first
 * occurrence does not hide a later unsupported one. At most one claim is
 * reported per pattern (the first unsupported occurrence), which keeps the
 * number of findings — and therefore the trust score scale — bounded by the
 * number of patterns.
 *
 * @param {string} text
 * @returns {Array<{text: string, category: string}>}
 */
function detectUnverifiedClaims(text) {
  if (typeof text !== 'string' || text.length === 0) return [];

  const claims = [];
  for (const pattern of COMPLETION_CLAIMS) {
    // matchAll requires a global regex; build a fresh one so the shared
    // pattern objects are never given stateful lastIndex behavior.
    const scanFlags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const scanner = new RegExp(pattern.source, scanFlags);
    for (const occurrence of text.matchAll(scanner)) {
      if (!hasNearbyEvidence(text, occurrence.index)) {
        claims.push({ text: occurrence[0], category: 'unverified-completion' });
        break;
      }
    }
  }
  return claims;
}
122
+
123
+ // ---------------------------------------------------------------------------
124
+ // detectHallucinations (exported for direct use)
125
+ // ---------------------------------------------------------------------------
126
+
127
/**
 * Detect hallucination red flags in agent output.
 *
 * For each red-flag pattern, every occurrence in the text is inspected and
 * the first one lacking nearby evidence is reported. At most one flag is
 * emitted per pattern. Non-string or empty input yields no flags rather than
 * throwing, since this function is exported for direct use.
 *
 * @param {string} text
 * @returns {{ found: boolean, flags: Array<{text: string, category: string, description: string}> }}
 */
export function detectHallucinations(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return { found: false, flags: [] };
  }

  const flags = [];
  for (const { pattern, category, description } of HALLUCINATION_RED_FLAGS) {
    // matchAll requires a global regex; use a fresh copy so the shared
    // pattern objects stay stateless.
    const scanFlags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    const scanner = new RegExp(pattern.source, scanFlags);
    for (const occurrence of text.matchAll(scanner)) {
      if (!hasNearbyEvidence(text, occurrence.index)) {
        flags.push({ text: occurrence[0], category, description });
        break;
      }
    }
  }
  return { found: flags.length > 0, flags };
}
142
+
143
+ // ---------------------------------------------------------------------------
144
+ // SelfChecker
145
+ // ---------------------------------------------------------------------------
146
+
147
+ /**
148
+ * @typedef {Object} SelfCheckFinding
149
+ * @property {'hedging'|'hallucination'|'rationalization'|'unverified'} type
150
+ * @property {string} text — The matched text
151
+ * @property {'warning'|'error'} severity
152
+ * @property {string} category — Specific category within the type
153
+ */
154
+
155
+ /**
156
+ * @typedef {Object} SelfCheckResult
157
+ * @property {SelfCheckFinding[]} findings
158
+ * @property {number} score — Trust score (0–1), 1.0 = no issues
159
+ * @property {boolean} shouldProceed — false when score < 0.5
160
+ */
161
+
162
/** Trust-score penalty applied per finding of severity 'error'. */
const SELF_CHECK_ERROR_PENALTY = 0.15;
/** Trust-score penalty applied per finding of any other severity. */
const SELF_CHECK_WARNING_PENALTY = 0.08;
/** Minimum trust score at which the result recommends proceeding. */
const SELF_CHECK_PROCEED_THRESHOLD = 0.5;

/**
 * Post-output verification gate: runs the hedging, rationalization,
 * hallucination, and unverified-completion detectors over agent output and
 * condenses their findings into a trust score.
 */
export class SelfChecker {
  /**
   * Analyze agent output for quality issues.
   *
   * Empty, whitespace-only, and non-string input is treated as "nothing to
   * check" and yields a perfect score instead of throwing.
   *
   * @param {string} text — Agent output text
   * @returns {SelfCheckResult}
   */
  check(text) {
    if (typeof text !== 'string' || text.trim().length === 0) {
      return { findings: [], score: 1.0, shouldProceed: true };
    }

    /** @type {SelfCheckFinding[]} */
    const findings = [
      // 1. Hedging detection (reuse from confidence.js)
      ...detectHedging(text).matches.map((phrase) => ({
        type: 'hedging',
        text: phrase,
        severity: 'warning',
        category: 'hedging-language',
      })),
      // 2. Rationalization detection (reuse from confidence.js)
      ...detectRationalizations(text).rationalizations.map((entry) => ({
        type: 'rationalization',
        text: entry.text,
        severity: 'error',
        category: entry.category,
      })),
      // 3. Hallucination red flags
      ...detectHallucinations(text).flags.map((flag) => ({
        type: 'hallucination',
        text: flag.text,
        severity: 'error',
        category: flag.category,
      })),
      // 4. Unverified completion claims
      ...detectUnverifiedClaims(text).map((claim) => ({
        type: 'unverified',
        text: claim.text,
        severity: 'warning',
        category: claim.category,
      })),
    ];

    const score = this._computeScore(findings);

    return {
      findings,
      score,
      shouldProceed: score >= SELF_CHECK_PROCEED_THRESHOLD,
    };
  }

  /**
   * Compute trust score from findings.
   * Errors weigh more than warnings; the result is rounded to three decimal
   * places and never drops below 0.
   * @private
   * @param {SelfCheckFinding[]} findings
   * @returns {number} score in the range 0–1, 1.0 meaning no findings
   */
  _computeScore(findings) {
    if (findings.length === 0) return 1.0;

    const penalty = findings.reduce(
      (total, finding) =>
        total + (finding.severity === 'error' ? SELF_CHECK_ERROR_PENALTY : SELF_CHECK_WARNING_PENALTY),
      0,
    );

    // toFixed trims floating-point noise (e.g. 0.8500000000000001).
    return Math.max(0, Number((1 - penalty).toFixed(3)));
  }
}