cipher-security 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cipher.js +10 -0
- package/lib/analyze/consistency.js +566 -0
- package/lib/analyze/constitution.js +110 -0
- package/lib/analyze/sharding.js +251 -0
- package/lib/autonomous/agent-tool.js +165 -0
- package/lib/autonomous/framework.js +17 -0
- package/lib/autonomous/handoff.js +506 -0
- package/lib/autonomous/modes/blue.js +26 -0
- package/lib/autonomous/modes/red.js +28 -0
- package/lib/benchmark/agent.js +88 -26
- package/lib/benchmark/baselines.js +3 -0
- package/lib/benchmark/claude-code-solver.js +254 -0
- package/lib/benchmark/cognitive.js +283 -0
- package/lib/benchmark/index.js +12 -2
- package/lib/benchmark/knowledge.js +281 -0
- package/lib/benchmark/llm.js +156 -15
- package/lib/benchmark/models.js +5 -2
- package/lib/benchmark/nyu-ctf.js +192 -0
- package/lib/benchmark/overthewire.js +347 -0
- package/lib/benchmark/picoctf.js +281 -0
- package/lib/benchmark/prompts.js +280 -0
- package/lib/benchmark/registry.js +219 -0
- package/lib/benchmark/remote-solver.js +356 -0
- package/lib/benchmark/remote-target.js +263 -0
- package/lib/benchmark/reporter.js +35 -0
- package/lib/benchmark/runner.js +174 -10
- package/lib/benchmark/sandbox.js +35 -0
- package/lib/benchmark/scorer.js +22 -4
- package/lib/benchmark/solver.js +34 -1
- package/lib/benchmark/tools.js +262 -16
- package/lib/commands.js +9 -0
- package/lib/execution/council.js +434 -0
- package/lib/execution/parallel.js +292 -0
- package/lib/gates/circuit-breaker.js +135 -0
- package/lib/gates/confidence.js +302 -0
- package/lib/gates/corrections.js +219 -0
- package/lib/gates/self-check.js +245 -0
- package/lib/gateway/commands.js +727 -0
- package/lib/guardrails/engine.js +364 -0
- package/lib/mcp/server.js +349 -3
- package/lib/memory/compressor.js +94 -7
- package/lib/pipeline/hooks.js +288 -0
- package/lib/pipeline/index.js +11 -0
- package/lib/review/budget.js +210 -0
- package/lib/review/engine.js +526 -0
- package/lib/review/layers/acceptance-auditor.js +279 -0
- package/lib/review/layers/blind-hunter.js +500 -0
- package/lib/review/layers/defense-in-depth.js +209 -0
- package/lib/review/layers/edge-case-hunter.js +266 -0
- package/lib/review/panel.js +519 -0
- package/lib/review/two-stage.js +244 -0
- package/lib/session/cost-tracker.js +203 -0
- package/lib/session/logger.js +349 -0
- package/package.json +1 -1
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
// CIPHER is a trademark of defconxt.
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Self-Check Module — Post-output hallucination and verification gate.
|
|
7
|
+
*
|
|
8
|
+
* Analyzes agent output for unverified claims, hedging language,
|
|
9
|
+
* rationalization patterns, and hallucination red flags. Returns
|
|
10
|
+
* structured findings with a trust score.
|
|
11
|
+
*
|
|
12
|
+
* Sources:
|
|
13
|
+
* - SuperClaude SelfCheckProtocol: 4 mandatory questions + 7 hallucination red flags
|
|
14
|
+
* - Existing confidence.js: detectHedging() + detectRationalizations()
|
|
15
|
+
*
|
|
16
|
+
* @module gates/self-check
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { detectHedging, detectRationalizations } from './confidence.js';
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Hallucination red flags (from SuperClaude)
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/**
 * 7 hallucination red flags — phrasings that assert a positive outcome.
 *
 * Each entry pairs a case-insensitive `pattern` with a `category` label and a
 * human-readable `description`. detectHallucinations() reports a match only
 * when hasNearbyEvidence() finds no evidence phrase around it.
 */
const HALLUCINATION_RED_FLAGS = [
  {
    pattern: /\btests?\s+pass(?:es|ed|ing)?\b/i,
    category: 'claim-without-output',
    description: 'Claims tests pass without showing test output',
  },
  {
    pattern: /\beverything\s+works?\b/i,
    category: 'blanket-success',
    description: 'Blanket "everything works" without specific evidence',
  },
  {
    pattern: /\bimplementation\s+(?:is\s+)?complete\b/i,
    category: 'premature-completion',
    description: 'Claims implementation complete without verification',
  },
  {
    pattern: /\bshould\s+work\s+now\b/i,
    category: 'unverified-fix',
    description: 'Claims fix works without running verification',
  },
  {
    // Accept the straight and the typographic apostrophe as well as the
    // uncontracted "I am", so the claim is caught however it is typed.
    pattern: /\bI(?:['’]m|\s+am)\s+confident\b/i,
    category: 'false-confidence',
    description: 'Expresses confidence without supporting evidence',
  },
  {
    pattern: /\bjust\s+fixed\s+it\b/i,
    category: 'unverified-fix',
    description: 'Claims fix applied without showing verification',
  },
  {
    pattern: /\bminor\s+issue\b/i,
    category: 'severity-downplay',
    description: 'Downplays issue severity without analysis',
  },
];
|
|
67
|
+
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
// Evidence phrases — presence near a claim reduces suspicion
|
|
70
|
+
// ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
const EVIDENCE_PHRASES = [
  /\b(?:output|result|log|trace)\s*(?:shows?|confirms?|indicates?)\b/i,
  /\bexit\s*code\s*(?:0|zero)\b/i,
  /\b\d+\s+(?:tests?\s+)?pass(?:ed|ing)?\b/i, // "42 tests passing"
  /\bverified\s+(?:by|via|with|using)\b/i,
  /\bas\s+(?:shown|confirmed|demonstrated)\s+(?:by|above|below)\b/i,
  /\b(?:stdout|stderr|output):\s/i,
  /```[\s\S]{10,}```/, // code blocks with substantial content
];

// Number of characters inspected on each side of a claim position.
const EVIDENCE_WINDOW_RADIUS = 200;

// Fenced code block with at least 10 characters between the fences. Only ever
// used through String.prototype.matchAll, which works on a copy of the regex,
// so the global flag's lastIndex state never leaks between calls.
const FENCED_CODE_BLOCK = /```[\s\S]{10,}?```/g;

/**
 * Check if text contains evidence near a claim.
 *
 * Evidence phrases are searched in a window of 200 characters on each side of
 * the claim position. Fenced code blocks are matched against the whole text
 * and count as evidence when any part of the block overlaps that window;
 * otherwise a block longer than the window (its closing fence cut off by the
 * slice) would never be recognised.
 *
 * @param {string} text
 * @param {number} claimIndex — position of the claim in text
 * @returns {boolean} true when an evidence phrase or code block is nearby
 */
function hasNearbyEvidence(text, claimIndex) {
  const windowStart = Math.max(0, claimIndex - EVIDENCE_WINDOW_RADIUS);
  const windowEnd = Math.min(text.length, claimIndex + EVIDENCE_WINDOW_RADIUS);
  const nearbyText = text.slice(windowStart, windowEnd);

  if (EVIDENCE_PHRASES.some((phrase) => phrase.test(nearbyText))) {
    return true;
  }

  // A block whose fences straddle the window edge is still nearby evidence.
  for (const block of text.matchAll(FENCED_CODE_BLOCK)) {
    const blockEnd = block.index + block[0].length;
    if (block.index < windowEnd && blockEnd > windowStart) {
      return true;
    }
  }
  return false;
}
|
|
95
|
+
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Unverified completion detection
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
const COMPLETION_CLAIMS = [
  /\b(?:task|work|implementation|feature|fix)\s+(?:is\s+)?(?:done|complete|finished|ready)\b/i,
  /\bsuccessfully\s+(?:implemented|completed|fixed|resolved)\b/i,
  /\ball\s+(?:tests?\s+)?pass(?:es|ed|ing)?\b/i,
  /\bno\s+(?:errors?|failures?|issues?|problems?)\b/i,
];

/**
 * Detect completion claims without nearby evidence.
 *
 * Every occurrence of each COMPLETION_CLAIMS pattern is examined, and at most
 * one claim per pattern is reported: the first occurrence for which
 * hasNearbyEvidence() returns false. Checking only the first occurrence would
 * let an evidenced claim early in the text hide an unevidenced one later on.
 *
 * @param {string} text
 * @returns {Array<{text: string, category: string}>} empty for non-string or
 *   empty input
 */
function detectUnverifiedClaims(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return [];
  }

  const claims = [];
  for (const pattern of COMPLETION_CLAIMS) {
    // matchAll needs a global regex; build one per call so no lastIndex state
    // is shared with the module-level pattern.
    const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    for (const occurrence of text.matchAll(new RegExp(pattern.source, flags))) {
      if (!hasNearbyEvidence(text, occurrence.index)) {
        claims.push({ text: occurrence[0], category: 'unverified-completion' });
        break; // one finding per pattern
      }
    }
  }
  return claims;
}
|
|
122
|
+
|
|
123
|
+
// ---------------------------------------------------------------------------
|
|
124
|
+
// detectHallucinations (exported for direct use)
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
|
|
127
|
+
/**
 * Detect hallucination red flags in agent output.
 *
 * Every occurrence of each HALLUCINATION_RED_FLAGS pattern is examined, and at
 * most one flag per pattern is reported: the first occurrence for which
 * hasNearbyEvidence() returns false. Checking only the first occurrence would
 * let an evidenced claim early in the text hide an unevidenced one later on.
 *
 * @param {string} text
 * @returns {{ found: boolean, flags: Array<{text: string, category: string, description: string}> }}
 *   `found` is false and `flags` is empty for non-string or empty input
 */
export function detectHallucinations(text) {
  const flags = [];
  if (typeof text !== 'string' || text.length === 0) {
    return { found: false, flags };
  }

  for (const { pattern, category, description } of HALLUCINATION_RED_FLAGS) {
    // matchAll needs a global regex; build one per call so no lastIndex state
    // is shared with the module-level pattern.
    const globalFlags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
    for (const occurrence of text.matchAll(new RegExp(pattern.source, globalFlags))) {
      if (!hasNearbyEvidence(text, occurrence.index)) {
        flags.push({ text: occurrence[0], category, description });
        break; // one flag per pattern
      }
    }
  }
  return { found: flags.length > 0, flags };
}
|
|
142
|
+
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
// SelfChecker
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* @typedef {Object} SelfCheckFinding
|
|
149
|
+
* @property {'hedging'|'hallucination'|'rationalization'|'unverified'} type
|
|
150
|
+
* @property {string} text — The matched text
|
|
151
|
+
* @property {'warning'|'error'} severity
|
|
152
|
+
* @property {string} category — Specific category within the type
|
|
153
|
+
*/
|
|
154
|
+
|
|
155
|
+
/**
|
|
156
|
+
* @typedef {Object} SelfCheckResult
|
|
157
|
+
* @property {SelfCheckFinding[]} findings
|
|
158
|
+
* @property {number} score — Trust score (0–1), 1.0 = no issues
|
|
159
|
+
* @property {boolean} shouldProceed — false when score < 0.5
|
|
160
|
+
*/
|
|
161
|
+
|
|
162
|
+
// Amount subtracted from the trust score for each finding, by severity.
const SELF_CHECK_ERROR_PENALTY = 0.15;
const SELF_CHECK_WARNING_PENALTY = 0.08;

export class SelfChecker {
  /**
   * Analyze agent output for quality issues.
   *
   * Runs four detectors in order — hedging, rationalization, hallucination
   * red flags, unverified completion claims — and collects their matches as
   * findings. Hedging and unverified claims are recorded as warnings;
   * rationalizations and hallucination flags as errors.
   *
   * @param {string} text — Agent output text
   * @returns {SelfCheckResult} for falsy or whitespace-only input: no
   *   findings, score 1.0, shouldProceed true
   * @throws {TypeError} when text is a truthy value that is not a string
   */
  check(text) {
    // Falsy input (null, undefined, '', ...) has nothing to analyze.
    if (!text) {
      return { findings: [], score: 1.0, shouldProceed: true };
    }
    // Fail with a clear message instead of "text.trim is not a function".
    if (typeof text !== 'string') {
      throw new TypeError(
        `SelfChecker.check() expects a string, received ${typeof text}`,
      );
    }
    if (text.trim().length === 0) {
      return { findings: [], score: 1.0, shouldProceed: true };
    }

    /** @type {SelfCheckFinding[]} */
    const findings = [];

    // 1. Hedging detection (reuse from confidence.js)
    const hedging = detectHedging(text);
    for (const match of hedging.matches) {
      findings.push({
        type: 'hedging',
        text: match,
        severity: 'warning',
        category: 'hedging-language',
      });
    }

    // 2. Rationalization detection (reuse from confidence.js)
    const rationalizations = detectRationalizations(text);
    for (const { text: matched, category } of rationalizations.rationalizations) {
      findings.push({
        type: 'rationalization',
        text: matched,
        severity: 'error',
        category,
      });
    }

    // 3. Hallucination red flags
    const hallucinations = detectHallucinations(text);
    for (const { text: matched, category } of hallucinations.flags) {
      findings.push({
        type: 'hallucination',
        text: matched,
        severity: 'error',
        category,
      });
    }

    // 4. Unverified completion claims
    const unverified = detectUnverifiedClaims(text);
    for (const { text: matched, category } of unverified) {
      findings.push({
        type: 'unverified',
        text: matched,
        severity: 'warning',
        category,
      });
    }

    // Compute trust score
    const score = this._computeScore(findings);

    return {
      findings,
      score,
      shouldProceed: score >= 0.5,
    };
  }

  /**
   * Compute trust score from findings.
   *
   * Starts at 1.0 and subtracts 0.15 per 'error' finding and 0.08 per finding
   * of any other severity. The result is rounded to three decimals and never
   * drops below 0.
   *
   * @param {SelfCheckFinding[]} findings
   * @returns {number} trust score in the range 0–1
   * @private
   */
  _computeScore(findings) {
    if (findings.length === 0) return 1.0;

    let penalty = 0;
    for (const finding of findings) {
      penalty += finding.severity === 'error'
        ? SELF_CHECK_ERROR_PENALTY
        : SELF_CHECK_WARNING_PENALTY;
    }

    // toFixed removes floating-point noise from the accumulated penalty.
    return Math.max(0, Number((1 - penalty).toFixed(3)));
  }
}
|