erosolar-cli 1.7.22 → 1.7.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/responseVerifier.d.ts +108 -5
- package/dist/core/responseVerifier.d.ts.map +1 -1
- package/dist/core/responseVerifier.js +483 -233
- package/dist/core/responseVerifier.js.map +1 -1
- package/dist/shell/interactiveShell.d.ts +7 -2
- package/dist/shell/interactiveShell.d.ts.map +1 -1
- package/dist/shell/interactiveShell.js +41 -15
- package/dist/shell/interactiveShell.js.map +1 -1
- package/package.json +1 -1
|
@@ -9,11 +9,248 @@
|
|
|
9
9
|
*
|
|
10
10
|
* @license MIT
|
|
11
11
|
*/
|
|
12
|
-
import { exec } from 'node:child_process';
|
|
12
|
+
import { exec, execFile, spawn } from 'node:child_process';
|
|
13
13
|
import { promisify } from 'node:util';
|
|
14
14
|
import * as fs from 'node:fs/promises';
|
|
15
15
|
import * as path from 'node:path';
|
|
16
16
|
const execAsync = promisify(exec);
|
|
17
|
+
/**
 * Spawns a fresh isolated erosolar-cli instance for testing.
 *
 * Runs `node <cwd>/dist/bin/erosolar.js --plain` with EROSOLAR_TEST_MODE=1 and
 * NO_COLOR=1, accumulates stdout/stderr, and resolves once startup output is
 * seen or after 2 seconds, whichever comes first.
 *
 * @param {string} cwd - Directory containing dist/bin/erosolar.js; also the child's cwd.
 * @param {number} [timeout=60000] - Milliseconds after which the child is sent SIGTERM.
 * @returns {Promise<object>} Handle with `process`, `stdin`, live `output` / `errors`
 *   accessors, and `exitPromise` (resolves with the exit code, or 1 on spawn error).
 * @throws {Error} If the CLI entry point cannot be accessed.
 */
async function spawnIsolatedCLI(cwd, timeout = 60000) {
    const cliPath = path.join(cwd, 'dist/bin/erosolar.js');
    // Verify CLI exists
    try {
        await fs.access(cliPath);
    }
    catch {
        throw new Error(`CLI not found at ${cliPath}. Run build first.`);
    }
    let output = '';
    let errors = '';
    const child = spawn('node', [cliPath, '--plain'], {
        cwd,
        env: { ...process.env, EROSOLAR_TEST_MODE: '1', NO_COLOR: '1' },
        stdio: ['pipe', 'pipe', 'pipe']
    });
    child.stdout.on('data', (data) => { output += data.toString(); });
    child.stderr.on('data', (data) => { errors += data.toString(); });
    // Kill the child if it outlives the allowed time.
    const killTimer = setTimeout(() => {
        child.kill('SIGTERM');
        errors += `\nTimeout after ${timeout}ms`;
    }, timeout);
    const exitPromise = new Promise((resolve) => {
        child.on('close', (code) => {
            clearTimeout(killTimer);
            resolve(code);
        });
        child.on('error', (err) => {
            // Also clear here so a failed spawn does not leave the kill timer pending.
            clearTimeout(killTimer);
            errors += err.message;
            resolve(1);
        });
    });
    // Wait for startup (look for prompt or give it 2 seconds)
    await new Promise((resolve) => {
        const finish = () => {
            // Clear both timers so neither keeps the event loop alive afterwards.
            clearInterval(poll);
            clearTimeout(fallback);
            resolve();
        };
        const poll = setInterval(() => {
            if (output.includes('erosolar') || output.includes('>') || output.length > 100) {
                finish();
            }
        }, 100);
        const fallback = setTimeout(finish, 2000);
    });
    return {
        process: child,
        stdin: child.stdin,
        // Accessors instead of plain values: strings are immutable, so copying
        // them here would freeze the caller's view at startup time.
        get output() { return output; },
        get errors() { return errors; },
        exitPromise
    };
}
|
|
66
|
+
/**
 * Sends a command to the spawned CLI and waits for the response.
 *
 * Writes `command` plus a newline to the CLI's stdin, then polls `cli.output`
 * every 200 ms. The wait ends when new output has appeared and then stopped
 * growing for one poll, or when `waitMs` elapses.
 *
 * @param {object} cli - Handle exposing `stdin.write()` and a live `output` string.
 * @param {string} command - Command text, without the trailing newline.
 * @param {number} [waitMs=5000] - Maximum time to wait for output to settle.
 * @returns {Promise<string>} Output produced after the command was sent ('' if none).
 */
async function sendCommand(cli, command, waitMs = 5000) {
    const outputBefore = cli.output.length;
    cli.stdin.write(command + '\n');
    // Wait for output to stabilize
    await new Promise((resolve) => {
        let lastLength = cli.output.length;
        const finish = () => {
            // Cancel both timers; otherwise the deadline keeps the event loop
            // alive for the remainder of waitMs after an early finish.
            clearInterval(poll);
            clearTimeout(deadline);
            resolve();
        };
        const poll = setInterval(() => {
            const current = cli.output.length;
            if (current > lastLength) {
                // Still growing: remember the new size and check again next tick.
                lastLength = current;
                return;
            }
            if (current > outputBefore) {
                finish();
            }
        }, 200);
        const deadline = setTimeout(finish, waitMs);
    });
    return cli.output.slice(outputBefore);
}
|
|
88
|
+
/**
 * Reports whether `output` satisfies an expected pattern.
 * The pattern is tried as a literal substring first, then as a case-insensitive
 * regular expression. Text that is not a valid regex counts as "no match"
 * instead of throwing.
 */
function outputMatchesPattern(output, pattern) {
    if (output.includes(pattern)) {
        return true;
    }
    try {
        return new RegExp(pattern, 'i').test(output);
    }
    catch {
        return false;
    }
}
/**
 * Runs an isolated runtime test in a fresh CLI instance.
 *
 * Optionally rebuilds the project, spawns the CLI, sends each command, shuts the
 * CLI down, then checks the collected output against `test.expectedOutputs` and,
 * when both `test.expectedBehavior` and `llmVerifier` are given, asks the LLM to
 * assess the behavior.
 *
 * @param {object} test - Test spec: `commands`, optional `expectedOutputs`,
 *   `expectedBehavior`, `requiresBuild`, `timeout`.
 * @param {string} cwd - Project directory the CLI is built in and spawned from.
 * @param {Function} [llmVerifier] - Async function taking a prompt and returning text.
 * @returns {Promise<object>} Result with `success`, `output`, `errors`, `exitCode`,
 *   `duration`, `matchedPatterns`, `unmatchedPatterns` and optional `llmAssessment`.
 *   Failures are reported in `errors`; this function does not reject.
 */
export async function runIsolatedTest(test, cwd, llmVerifier) {
    const startTime = Date.now();
    const result = {
        test,
        success: false,
        output: '',
        errors: '',
        exitCode: null,
        duration: 0,
        matchedPatterns: [],
        unmatchedPatterns: []
    };
    let cli;
    try {
        // Rebuild if required
        if (test.requiresBuild) {
            try {
                await execAsync('npm run build', { cwd, timeout: 120000 });
            }
            catch (buildErr) {
                result.errors = `Build failed: ${buildErr instanceof Error ? buildErr.message : 'unknown'}`;
                result.duration = Date.now() - startTime;
                return result;
            }
        }
        // Spawn fresh CLI
        cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
        // Execute each command
        for (const cmd of test.commands) {
            const cmdOutput = await sendCommand(cli, cmd);
            result.output += `> ${cmd}\n${cmdOutput}\n`;
        }
        // Gracefully exit: ask the CLI to quit, give it a moment, then terminate.
        cli.stdin.write('/quit\n');
        await new Promise(resolve => setTimeout(resolve, 500));
        cli.process.kill('SIGTERM');
        result.exitCode = await cli.exitPromise;
        result.errors = cli.errors;
        // Check expected output patterns
        if (test.expectedOutputs) {
            for (const pattern of test.expectedOutputs) {
                const bucket = outputMatchesPattern(result.output, pattern)
                    ? result.matchedPatterns
                    : result.unmatchedPatterns;
                bucket.push(pattern);
            }
        }
        // LLM assessment of behavior if specified
        if (test.expectedBehavior && llmVerifier) {
            const assessPrompt = `Assess if this CLI output demonstrates the expected behavior.

EXPECTED BEHAVIOR: ${test.expectedBehavior}

CLI OUTPUT:
---
${result.output.slice(0, 4000)}
---

Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanation"}`;
            try {
                const assessment = await llmVerifier(assessPrompt);
                const match = assessment.match(/\{[\s\S]*\}/);
                if (match) {
                    const parsed = JSON.parse(match[0]);
                    result.llmAssessment = `${parsed.matches ? '✅' : '❌'} [${parsed.confidence}%] ${parsed.reasoning}`;
                    // Below 70% confidence the behavior is treated as not demonstrated.
                    if (!parsed.matches || parsed.confidence < 70) {
                        result.unmatchedPatterns.push(`behavior: ${test.expectedBehavior}`);
                    }
                    else {
                        result.matchedPatterns.push(`behavior: ${test.expectedBehavior}`);
                    }
                }
            }
            catch {
                result.llmAssessment = 'LLM assessment failed';
            }
        }
        // Determine success: nothing unmatched, and at least one match unless
        // no output patterns were requested at all.
        result.success = result.unmatchedPatterns.length === 0 &&
            (result.matchedPatterns.length > 0 || !test.expectedOutputs?.length);
    }
    catch (err) {
        result.errors = err instanceof Error ? err.message : 'Unknown error';
    }
    finally {
        // If something failed after the spawn, do not leave the child running
        // until its own timeout fires.
        if (cli && cli.process.exitCode === null && cli.process.signalCode === null) {
            cli.process.kill('SIGTERM');
        }
    }
    result.duration = Date.now() - startTime;
    return result;
}
|
|
179
|
+
/**
 * Generates isolated runtime tests for self-referential claims
 * (claims about erosolar-cli's own behavior/features).
 *
 * A claim is self-referential when its statement mentions "erosolar", "cli",
 * "command" or "feature" (case-insensitive), or its category is 'behavior' or
 * 'feature'. The LLM is asked for a JSON array of tests. If the call fails, the
 * reply has no parsable array, or none of the entries has a `commands` array of
 * strings, a `/help` smoke test is generated for each self-referential claim.
 *
 * @param {Array<object>} claims - Claims with `statement` and optional `category`.
 * @param {Function} llmVerifier - Async function taking a prompt and returning text.
 * @returns {Promise<Array<object>>} Test specs; empty when no claim qualifies.
 */
export async function generateIsolatedTests(claims, llmVerifier) {
    const keywords = ['erosolar', 'cli', 'command', 'feature'];
    const selfClaims = claims.filter((c) => {
        // Guard against claims whose statement is not a string.
        const text = typeof c.statement === 'string' ? c.statement.toLowerCase() : '';
        return keywords.some((word) => text.includes(word)) ||
            c.category === 'behavior' ||
            c.category === 'feature';
    });
    if (selfClaims.length === 0)
        return [];
    const prompt = `Generate isolated CLI tests for these claims about erosolar-cli behavior.

CLAIMS:
${selfClaims.map((c, i) => `${i + 1}. ${c.statement}`).join('\n')}

For each claim, generate a test that:
1. Spawns a fresh CLI instance
2. Sends commands to test the claimed behavior
3. Checks expected output patterns

Return JSON array:
[{
"id": "test-1",
"description": "what we're testing",
"commands": ["command1", "command2"],
"expectedOutputs": ["pattern1", "pattern2"],
"expectedBehavior": "description of expected behavior",
"requiresBuild": false,
"timeout": 30000
}]

Output ONLY valid JSON array.`;
    // A test is runnable only if the runner can iterate its commands.
    const isRunnable = (t) => t !== null && typeof t === 'object' &&
        Array.isArray(t.commands) && t.commands.every((cmd) => typeof cmd === 'string');
    try {
        const response = await llmVerifier(prompt);
        const match = response.match(/\[[\s\S]*\]/);
        if (match) {
            const parsed = JSON.parse(match[0]);
            if (Array.isArray(parsed)) {
                const usable = parsed.filter(isRunnable);
                // An explicitly empty array is honoured; an array with only
                // malformed entries falls through to the smoke tests.
                if (usable.length > 0 || parsed.length === 0) {
                    return usable;
                }
            }
        }
    }
    catch {
        // Fall through to manual generation
    }
    // Fallback: generate basic tests for self-claims
    return selfClaims.map((c, i) => ({
        id: `iso-${i}`,
        description: c.statement,
        commands: ['/help'], // Basic smoke test
        expectedBehavior: c.statement,
        timeout: 30000
    }));
}
|
|
233
|
+
/**
 * Runs all isolated tests and returns aggregated results.
 *
 * Without an `llmVerifier` nothing is run and an empty, all-passed report is
 * returned. Otherwise tests are generated from the claims and executed one after
 * another.
 *
 * @param {Array<object>} claims - Claims to derive tests from.
 * @param {string} cwd - Project directory the CLI runs in.
 * @param {Function} [llmVerifier] - Async function taking a prompt and returning text.
 * @returns {Promise<object>} `{ tests, summary: { total, passed, failed }, allPassed }`.
 */
export async function runIsolatedVerification(claims, cwd, llmVerifier) {
    if (!llmVerifier) {
        return { tests: [], summary: { total: 0, passed: 0, failed: 0 }, allPassed: true };
    }
    const generated = await generateIsolatedTests(claims, llmVerifier);
    const outcomes = [];
    let passed = 0;
    // Sequential on purpose: each test spawns its own CLI process.
    for (const spec of generated) {
        const outcome = await runIsolatedTest(spec, cwd, llmVerifier);
        outcomes.push(outcome);
        if (outcome.success) {
            passed += 1;
        }
    }
    const failed = outcomes.length - passed;
    return {
        tests: outcomes,
        summary: { total: generated.length, passed, failed },
        allPassed: failed === 0
    };
}
|
|
17
254
|
/**
|
|
18
255
|
* LLM-based claim extraction prompt.
|
|
19
256
|
* Used when pattern matching isn't sufficient.
|
|
@@ -764,57 +1001,12 @@ export function generateVerificationTest(claim) {
|
|
|
764
1001
|
}
|
|
765
1002
|
}
|
|
766
1003
|
/**
 * Verify all claims in an assistant response using LLM-based semantic analysis.
 * Requires a VerificationContext with an llmVerifier function.
 * All claim extraction and verification is done via LLM.
 *
 * Delegates to verifyResponseComprehensive. For compatibility with the earlier
 * `verifyResponse(response, responseId)` signature, a string passed as the second
 * argument is treated as the response id, and a missing context becomes an empty
 * context instead of causing a TypeError.
 *
 * @param {string} response - Assistant response text to verify.
 * @param {object} [context] - Verification context; should carry `llmVerifier`.
 * @param {string} [responseId] - Identifier to attach to the report.
 * @returns {Promise<object>} The report produced by verifyResponseComprehensive.
 */
export async function verifyResponse(response, context, responseId) {
    if (typeof context === 'string') {
        // Legacy call shape: (response, responseId).
        return verifyResponseComprehensive(response, {}, responseId ?? context);
    }
    return verifyResponseComprehensive(response, context ?? {}, responseId);
}
|
|
819
1011
|
/**
|
|
820
1012
|
* Format a verification report for display
|
|
@@ -854,10 +1046,11 @@ export function formatVerificationReport(report) {
|
|
|
854
1046
|
return lines.join('\n');
|
|
855
1047
|
}
|
|
856
1048
|
/**
 * Quick verification - returns true if response claims are valid.
 * Requires a VerificationContext with llmVerifier for LLM-based semantic analysis.
 *
 * A missing context is replaced by an empty one, so callers still using the
 * earlier one-argument form get a normal verdict instead of a TypeError.
 *
 * @param {string} response - Assistant response text to verify.
 * @param {object} [context] - Verification context; should carry `llmVerifier`.
 * @returns {Promise<boolean>} True when the overall verdict is 'verified' or
 *   'partially_verified'.
 */
export async function quickVerify(response, context) {
    const report = await verifyResponse(response, context ?? {});
    return report.overallVerdict === 'verified' || report.overallVerdict === 'partially_verified';
}
|
|
863
1056
|
/**
|
|
@@ -1120,203 +1313,59 @@ export function generateExtendedVerificationTest(claim, context) {
|
|
|
1120
1313
|
case 'error_fixed':
|
|
1121
1314
|
case 'feature_implemented':
|
|
1122
1315
|
case 'refactor_complete':
|
|
1123
|
-
// These require semantic verification -
|
|
1316
|
+
// These require semantic verification - LLM is required
|
|
1124
1317
|
return async () => {
|
|
1125
|
-
if (context.llmVerifier) {
|
|
1126
|
-
return
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
const resolvedPath = path.isAbsolute(relatedPath)
|
|
1133
|
-
? relatedPath
|
|
1134
|
-
: path.resolve(context.workingDirectory, relatedPath);
|
|
1135
|
-
const stats = await fs.stat(resolvedPath);
|
|
1136
|
-
const recentlyModified = (Date.now() - stats.mtimeMs) < 10 * 60 * 1000;
|
|
1137
|
-
return {
|
|
1138
|
-
...baseResult,
|
|
1139
|
-
verified: recentlyModified,
|
|
1140
|
-
confidence: 'low',
|
|
1141
|
-
evidence: recentlyModified
|
|
1142
|
-
? `Related file ${relatedPath} was recently modified`
|
|
1143
|
-
: `Related file ${relatedPath} exists but wasn't recently modified`
|
|
1144
|
-
};
|
|
1145
|
-
}
|
|
1146
|
-
catch {
|
|
1147
|
-
return {
|
|
1148
|
-
...baseResult,
|
|
1149
|
-
verified: false,
|
|
1150
|
-
confidence: 'low',
|
|
1151
|
-
evidence: 'Could not verify - no LLM available and related file not found'
|
|
1152
|
-
};
|
|
1153
|
-
}
|
|
1318
|
+
if (!context.llmVerifier) {
|
|
1319
|
+
return {
|
|
1320
|
+
...baseResult,
|
|
1321
|
+
verified: false,
|
|
1322
|
+
confidence: 'low',
|
|
1323
|
+
evidence: 'Semantic verification requires LLM verifier'
|
|
1324
|
+
};
|
|
1154
1325
|
}
|
|
1155
|
-
return
|
|
1156
|
-
...baseResult,
|
|
1157
|
-
verified: false,
|
|
1158
|
-
confidence: 'low',
|
|
1159
|
-
evidence: 'Semantic verification required but no LLM verifier available'
|
|
1160
|
-
};
|
|
1326
|
+
return verifyClaimWithLLM(claim, context);
|
|
1161
1327
|
};
|
|
1162
1328
|
case 'data_transformed':
|
|
1163
|
-
return async () => {
|
|
1164
|
-
// Check if we have before/after state to compare
|
|
1165
|
-
if (context.previousState && context.currentState) {
|
|
1166
|
-
const inputKey = claim.params.input;
|
|
1167
|
-
const outputKey = claim.params.output;
|
|
1168
|
-
if (inputKey && outputKey) {
|
|
1169
|
-
const inputExists = context.previousState[inputKey] !== undefined;
|
|
1170
|
-
const outputExists = context.currentState[outputKey] !== undefined;
|
|
1171
|
-
return {
|
|
1172
|
-
...baseResult,
|
|
1173
|
-
verified: inputExists && outputExists,
|
|
1174
|
-
confidence: inputExists && outputExists ? 'medium' : 'low',
|
|
1175
|
-
evidence: `Input "${inputKey}" ${inputExists ? 'found' : 'missing'}, Output "${outputKey}" ${outputExists ? 'found' : 'missing'}`
|
|
1176
|
-
};
|
|
1177
|
-
}
|
|
1178
|
-
}
|
|
1179
|
-
// Fall back to LLM verification
|
|
1180
|
-
if (context.llmVerifier) {
|
|
1181
|
-
return verifyClaimWithLLM(claim, context);
|
|
1182
|
-
}
|
|
1183
|
-
return {
|
|
1184
|
-
...baseResult,
|
|
1185
|
-
verified: false,
|
|
1186
|
-
confidence: 'low',
|
|
1187
|
-
evidence: 'Cannot verify data transformation without state comparison or LLM'
|
|
1188
|
-
};
|
|
1189
|
-
};
|
|
1190
1329
|
case 'database_updated':
|
|
1191
|
-
return async () => {
|
|
1192
|
-
// Can't directly verify database changes without connection info
|
|
1193
|
-
// Check if there's a command we can run
|
|
1194
|
-
const checkCommand = claim.params.checkCommand;
|
|
1195
|
-
if (checkCommand) {
|
|
1196
|
-
try {
|
|
1197
|
-
const { stdout } = await execAsync(checkCommand, {
|
|
1198
|
-
timeout: 10000,
|
|
1199
|
-
cwd: context.workingDirectory
|
|
1200
|
-
});
|
|
1201
|
-
return {
|
|
1202
|
-
...baseResult,
|
|
1203
|
-
verified: true,
|
|
1204
|
-
confidence: 'medium',
|
|
1205
|
-
evidence: `Check command output: ${stdout.slice(0, 200)}`
|
|
1206
|
-
};
|
|
1207
|
-
}
|
|
1208
|
-
catch (err) {
|
|
1209
|
-
return {
|
|
1210
|
-
...baseResult,
|
|
1211
|
-
verified: false,
|
|
1212
|
-
confidence: 'medium',
|
|
1213
|
-
evidence: 'Database check command failed',
|
|
1214
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1215
|
-
};
|
|
1216
|
-
}
|
|
1217
|
-
}
|
|
1218
|
-
// Fall back to LLM
|
|
1219
|
-
if (context.llmVerifier) {
|
|
1220
|
-
return verifyClaimWithLLM(claim, context);
|
|
1221
|
-
}
|
|
1222
|
-
return {
|
|
1223
|
-
...baseResult,
|
|
1224
|
-
verified: false,
|
|
1225
|
-
confidence: 'low',
|
|
1226
|
-
evidence: 'Cannot verify database changes without check command or LLM'
|
|
1227
|
-
};
|
|
1228
|
-
};
|
|
1229
1330
|
case 'permission_granted':
|
|
1230
|
-
return async () => {
|
|
1231
|
-
const targetPath = claim.params.path;
|
|
1232
|
-
const expectedMode = claim.params.mode;
|
|
1233
|
-
if (targetPath) {
|
|
1234
|
-
try {
|
|
1235
|
-
const resolvedPath = path.isAbsolute(targetPath)
|
|
1236
|
-
? targetPath
|
|
1237
|
-
: path.resolve(context.workingDirectory, targetPath);
|
|
1238
|
-
const stats = await fs.stat(resolvedPath);
|
|
1239
|
-
const mode = (stats.mode & 0o777).toString(8);
|
|
1240
|
-
if (expectedMode) {
|
|
1241
|
-
const matches = mode === expectedMode;
|
|
1242
|
-
return {
|
|
1243
|
-
...baseResult,
|
|
1244
|
-
verified: matches,
|
|
1245
|
-
confidence: 'high',
|
|
1246
|
-
evidence: matches
|
|
1247
|
-
? `File has expected permissions: ${mode}`
|
|
1248
|
-
: `Expected mode ${expectedMode}, got ${mode}`
|
|
1249
|
-
};
|
|
1250
|
-
}
|
|
1251
|
-
return {
|
|
1252
|
-
...baseResult,
|
|
1253
|
-
verified: true,
|
|
1254
|
-
confidence: 'medium',
|
|
1255
|
-
evidence: `File permissions: ${mode}`
|
|
1256
|
-
};
|
|
1257
|
-
}
|
|
1258
|
-
catch (err) {
|
|
1259
|
-
return {
|
|
1260
|
-
...baseResult,
|
|
1261
|
-
verified: false,
|
|
1262
|
-
confidence: 'high',
|
|
1263
|
-
evidence: 'Could not check file permissions',
|
|
1264
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1265
|
-
};
|
|
1266
|
-
}
|
|
1267
|
-
}
|
|
1268
|
-
// Fall back to LLM
|
|
1269
|
-
if (context.llmVerifier) {
|
|
1270
|
-
return verifyClaimWithLLM(claim, context);
|
|
1271
|
-
}
|
|
1272
|
-
return {
|
|
1273
|
-
...baseResult,
|
|
1274
|
-
verified: false,
|
|
1275
|
-
confidence: 'low',
|
|
1276
|
-
evidence: 'Cannot verify permission without file path or LLM'
|
|
1277
|
-
};
|
|
1278
|
-
};
|
|
1279
1331
|
case 'generic':
|
|
1280
1332
|
default:
|
|
1281
|
-
//
|
|
1333
|
+
// All these claim types require LLM verification
|
|
1282
1334
|
return async () => {
|
|
1283
|
-
if (context.llmVerifier) {
|
|
1284
|
-
return
|
|
1335
|
+
if (!context.llmVerifier) {
|
|
1336
|
+
return {
|
|
1337
|
+
...baseResult,
|
|
1338
|
+
verified: false,
|
|
1339
|
+
confidence: 'low',
|
|
1340
|
+
evidence: `${claim.type} verification requires LLM verifier`
|
|
1341
|
+
};
|
|
1285
1342
|
}
|
|
1286
|
-
return
|
|
1287
|
-
...baseResult,
|
|
1288
|
-
verified: false,
|
|
1289
|
-
confidence: 'low',
|
|
1290
|
-
evidence: 'Generic claim requires LLM verification which is not available'
|
|
1291
|
-
};
|
|
1343
|
+
return verifyClaimWithLLM(claim, context);
|
|
1292
1344
|
};
|
|
1293
1345
|
}
|
|
1294
1346
|
}
|
|
1295
1347
|
/**
|
|
1296
|
-
* Comprehensive verification using
|
|
1348
|
+
* Comprehensive verification using LLM-based semantic analysis.
|
|
1349
|
+
* Requires an LLM verifier - all claims are verified through LLM semantic analysis.
|
|
1297
1350
|
*/
|
|
1298
1351
|
export async function verifyResponseComprehensive(response, context, responseId) {
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1352
|
+
if (!context.llmVerifier) {
|
|
1353
|
+
return {
|
|
1354
|
+
responseId: responseId || `response-${Date.now()}`,
|
|
1355
|
+
timestamp: new Date().toISOString(),
|
|
1356
|
+
claims: [],
|
|
1357
|
+
results: [],
|
|
1358
|
+
summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
|
|
1359
|
+
overallVerdict: 'unverified'
|
|
1360
|
+
};
|
|
1361
|
+
}
|
|
1362
|
+
// Extract ALL claims using LLM (required)
|
|
1363
|
+
const claims = await extractClaimsWithLLM(response, context.llmVerifier);
|
|
1303
1364
|
const results = [];
|
|
1304
1365
|
for (const claim of claims) {
|
|
1305
|
-
//
|
|
1306
|
-
const standardTypes = [
|
|
1307
|
-
'file_created', 'file_modified', 'file_deleted', 'code_compiles',
|
|
1308
|
-
'tests_pass', 'git_committed', 'package_published', 'command_executed',
|
|
1309
|
-
'dependency_installed', 'service_running', 'url_accessible', 'content_contains'
|
|
1310
|
-
];
|
|
1311
|
-
let test;
|
|
1312
|
-
if (standardTypes.includes(claim.type)) {
|
|
1313
|
-
test = generateVerificationTest(claim);
|
|
1314
|
-
}
|
|
1315
|
-
else {
|
|
1316
|
-
test = generateExtendedVerificationTest(claim, context);
|
|
1317
|
-
}
|
|
1366
|
+
// ALL claims are verified via LLM semantic analysis
|
|
1318
1367
|
try {
|
|
1319
|
-
const result = await
|
|
1368
|
+
const result = await verifyClaimWithLLM(claim, context);
|
|
1320
1369
|
results.push(result);
|
|
1321
1370
|
}
|
|
1322
1371
|
catch (err) {
|
|
@@ -1324,7 +1373,7 @@ export async function verifyResponseComprehensive(response, context, responseId)
|
|
|
1324
1373
|
claim,
|
|
1325
1374
|
verified: false,
|
|
1326
1375
|
confidence: 'low',
|
|
1327
|
-
evidence: '
|
|
1376
|
+
evidence: 'LLM verification failed',
|
|
1328
1377
|
error: err instanceof Error ? err.message : 'Unknown error',
|
|
1329
1378
|
timestamp: new Date().toISOString()
|
|
1330
1379
|
});
|
|
@@ -1814,4 +1863,205 @@ export async function verifyResponseHybrid(response, context, responseId) {
|
|
|
1814
1863
|
overallVerdict
|
|
1815
1864
|
};
|
|
1816
1865
|
}
|
|
1866
|
+
// Prompt template for claim extraction. Placeholders {RESPONSE}, {CONTEXT} and
// {WORKING_DIR} are substituted by extractUniversalClaims before the prompt is
// sent to the LLM verifier.
const UNIVERSAL_EXTRACT = `Extract ALL verifiable claims from this AI response. Include explicit claims, implicit claims, state changes, results, assertions.

RESPONSE:
---
{RESPONSE}
---
CONTEXT: {CONTEXT}
DIR: {WORKING_DIR}

Return JSON array: [{"id":"c1","statement":"claim","category":"file_op|code|state|data|behavior|fact|other","verifiable":true/false,"verificationApproach":"how","priority":"critical|high|medium|low","context":{}}]
Output ONLY valid JSON.`;
// Prompt template asking the LLM for a verification plan for one claim.
// Placeholders {STATEMENT}, {CATEGORY}, {APPROACH}, {CONTEXT}, {WORKING_DIR} and
// {PLATFORM} are substituted by verifyUniversalClaim.
const UNIVERSAL_GEN = `Generate verification code for: {STATEMENT}
Category: {CATEGORY} | Approach: {APPROACH} | Context: {CONTEXT} | Dir: {WORKING_DIR} | Platform: {PLATFORM}

Use shell/javascript/python. READ-ONLY only.
Return JSON: {"steps":[{"type":"shell|javascript|python","code":"code","desc":"what"}],"success":"success criteria","failure":"failure criteria","confPass":0-100,"confFail":0-100,"safe":{"ok":true/false,"why":"reason"}}
Output ONLY valid JSON.`;
// Prompt template for the overall trust assessment. Placeholders {RESPONSE},
// {CLAIMS} and {RESULTS} are substituted by verifyResponseUniversal.
const UNIVERSAL_ASSESS = `Assess: RESPONSE:{RESPONSE} CLAIMS:{CLAIMS} RESULTS:{RESULTS}
Return JSON: {"trust":0-100,"summary":"text","concerns":[]}
Output ONLY valid JSON.`;
|
|
1886
|
+
// Patterns that mark generated verification code as unsafe to execute.
// NOTE(review): this is a denylist of substrings; it is coarse (e.g. /kill/i also
// matches "skill") and is not a sandbox.
const UNSAFE = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /exec\s*\(/i, /child_process/i, /os\.system/i, /subprocess/i, /curl.*\|.*sh/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i, /kill/i];
/**
 * Checks generated verification code against the UNSAFE denylist and a length cap.
 *
 * @param {string} c - Code to check.
 * @returns {{safe: boolean, reason: string}} `safe: false` with the source of the
 *   first matching pattern, 'too long' for code over 5000 characters, or
 *   'not a string' for non-string input; otherwise `{ safe: true, reason: 'ok' }`.
 */
export function validateUniversalCode(c) {
    // A step whose code is missing or malformed is rejected rather than
    // crashing on `c.length`.
    if (typeof c !== 'string') {
        return { safe: false, reason: 'not a string' };
    }
    const hit = UNSAFE.find((pattern) => pattern.test(c));
    if (hit) {
        return { safe: false, reason: hit.source };
    }
    return c.length > 5000 ? { safe: false, reason: 'too long' } : { safe: true, reason: 'ok' };
}
|
|
1893
|
+
// Promise-based execFile: runs a program with an argument array, no shell involved.
const execFileAsync = promisify(execFile);
/**
 * Tells whether the JavaScript wrapper reported a failure.
 * The wrapper prints one JSON object as its final line: {ok:1,...} on success,
 * {ok:0,e} when the step's code threw. Output that cannot be parsed is not
 * treated as a failure.
 */
function jsWrapperReportedFailure(stdout) {
    const lines = stdout.trimEnd().split('\n');
    try {
        return JSON.parse(lines[lines.length - 1])?.ok === 0;
    }
    catch {
        return false;
    }
}
/**
 * Executes one verification step after validating its code.
 *
 * - 'shell' steps run through the shell via execAsync.
 * - 'javascript' steps are wrapped in an async IIFE and run with `node -e`.
 * - 'python' steps run with `python3 -c`.
 * JavaScript and Python code is passed as a process argument without a shell, so
 * `$VAR`, backticks and quotes in the code reach the interpreter unchanged.
 *
 * @param {{type: string, code: string}} s - Step to execute.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{ok: boolean, out: string}>} `ok` is false when the code is
 *   rejected, the type is unknown, the process fails, or a JavaScript step threw.
 */
async function runUniversalStep(s, cwd) {
    if (typeof s?.code !== 'string') {
        return { ok: false, out: 'missing code' };
    }
    const v = validateUniversalCode(s.code);
    if (!v.safe)
        return { ok: false, out: v.reason };
    try {
        switch (s.type) {
            case 'shell': {
                const { stdout, stderr } = await execAsync(s.code, { cwd, timeout: 30000, maxBuffer: 5 * 1024 * 1024 });
                return { ok: true, out: stdout + stderr };
            }
            case 'javascript': {
                const w = `(async()=>{try{const fs=require('fs').promises;const r=await(async()=>{${s.code}})();console.log(JSON.stringify({ok:1,r}))}catch(e){console.log(JSON.stringify({ok:0,e:e.message}))}})()`;
                const { stdout } = await execFileAsync('node', ['-e', w], { cwd, timeout: 30000 });
                // The wrapper exits 0 even when the code threw, so read its verdict.
                return { ok: !jsWrapperReportedFailure(stdout), out: stdout };
            }
            case 'python': {
                const { stdout, stderr } = await execFileAsync('python3', ['-c', s.code], { cwd, timeout: 30000 });
                return { ok: true, out: stdout + stderr };
            }
            default:
                return { ok: false, out: 'unknown type' };
        }
    }
    catch (e) {
        return { ok: false, out: e instanceof Error ? e.message : 'err' };
    }
}
|
|
1917
|
+
/**
 * Extracts verifiable claims from an AI response.
 *
 * With an LLM verifier, fills the UNIVERSAL_EXTRACT template and parses the JSON
 * array in the reply. Without one, or when the LLM path fails or yields no usable
 * claim objects, claims come from the pattern-based extractClaims() instead,
 * mapped to the universal claim shape.
 *
 * @param {string} r - The AI response text.
 * @param {object} ctx - Context with `workingDirectory`, optional
 *   `conversationHistory` (array of strings) and optional `llmVerifier`.
 * @returns {Promise<Array<object>>} Claims with at least a string `statement`.
 */
export async function extractUniversalClaims(r, ctx) {
    const fromPatterns = () => extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
    if (!ctx.llmVerifier)
        return fromPatterns();
    try {
        const fill = {
            RESPONSE: r.slice(0, 8000),
            CONTEXT: ctx.conversationHistory?.slice(-3).join('\n') || '',
            WORKING_DIR: String(ctx.workingDirectory)
        };
        // Single pass with a replacer function: inserted text is never rescanned
        // for placeholders, and "$&" / "$'" in the response stay literal.
        const p = UNIVERSAL_EXTRACT.replace(/\{(RESPONSE|CONTEXT|WORKING_DIR)\}/g, (_, key) => fill[key]);
        const res = await ctx.llmVerifier(p);
        const m = res.match(/\[[\s\S]*\]/);
        if (m) {
            const parsed = JSON.parse(m[0]);
            if (Array.isArray(parsed)) {
                // Downstream code calls statement.toLowerCase(), so drop entries
                // that are not claim objects.
                const usable = parsed.filter((c) => c !== null && typeof c === 'object' && typeof c.statement === 'string');
                if (usable.length > 0 || parsed.length === 0)
                    return usable;
            }
        }
    }
    catch { /* fall through */ }
    return fromPatterns();
}
|
|
1930
|
+
/**
 * Verifies a single universal claim by asking the LLM for a verification plan
 * and executing its steps.
 *
 * @param {object} claim - Claim with `statement`, `category`, `verifiable`,
 *   `verificationApproach` and `context`.
 * @param {object} ctx - Context with `workingDirectory` and optional `llmVerifier`.
 * @returns {Promise<object>} Result with `verified`, `confidence`, `method`,
 *   `evidence`, `reasoning`, and for executed plans `executedCode` / `rawOutput`.
 *   `method` is 'skip' (not verifiable or no LLM), 'blocked' (plan not declared
 *   safe), 'error' (unusable plan or failure), or the step types joined by '+'.
 */
export async function verifyUniversalClaim(claim, ctx) {
    const base = { claim, timestamp: new Date().toISOString() };
    if (!claim.verifiable)
        return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'Not verifiable', reasoning: 'Cannot verify' };
    if (!ctx.llmVerifier)
        return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'No LLM', reasoning: 'Needs LLM' };
    try {
        const fill = {
            STATEMENT: String(claim.statement),
            CATEGORY: String(claim.category),
            APPROACH: String(claim.verificationApproach),
            CONTEXT: String(JSON.stringify(claim.context)),
            WORKING_DIR: String(ctx.workingDirectory),
            PLATFORM: process.platform
        };
        // Single pass with a replacer function: inserted text is never rescanned
        // for placeholders, and "$&" / "$'" in claim text stay literal.
        const p = UNIVERSAL_GEN.replace(/\{(STATEMENT|CATEGORY|APPROACH|CONTEXT|WORKING_DIR|PLATFORM)\}/g, (_, key) => fill[key]);
        const res = await ctx.llmVerifier(p);
        const m = res.match(/\{[\s\S]*\}/);
        if (!m)
            throw new Error('bad');
        const plan = JSON.parse(m[0]);
        // A plan without an explicit safety attestation is treated as unsafe.
        if (!plan.safe?.ok)
            return { ...base, verified: false, confidence: 0, method: 'blocked', evidence: plan.safe?.why ?? 'No safety attestation', reasoning: 'Unsafe' };
        // With no steps nothing would run, so "all passed" would be vacuous.
        if (!Array.isArray(plan.steps) || plan.steps.length === 0)
            throw new Error('Plan has no verification steps');
        let allOk = true;
        let out = '';
        let code = '';
        for (const s of plan.steps) {
            code += s.code + '\n';
            const r = await runUniversalStep(s, ctx.workingDirectory);
            out += r.out + '\n';
            if (!r.ok)
                allOk = false;
        }
        return { ...base, verified: allOk, confidence: allOk ? plan.confPass : plan.confFail, method: plan.steps.map(s => s.type).join('+'), evidence: allOk ? plan.success : plan.failure, reasoning: allOk ? 'All passed' : 'Some failed', executedCode: code, rawOutput: out.slice(0, 2000) };
    }
    catch (e) {
        return { ...base, verified: false, confidence: 10, method: 'error', evidence: 'Failed', reasoning: e instanceof Error ? e.message : 'err' };
    }
}
|
|
1959
|
+
export async function verifyResponseUniversal(response, ctx, id) {
|
|
1960
|
+
const claims = await extractUniversalClaims(response, ctx);
|
|
1961
|
+
const results = [];
|
|
1962
|
+
// Identify self-referential claims (about erosolar-cli itself)
|
|
1963
|
+
const selfClaims = claims.filter(c => c.statement.toLowerCase().includes('erosolar') ||
|
|
1964
|
+
c.statement.toLowerCase().includes('cli') ||
|
|
1965
|
+
c.category === 'behavior' ||
|
|
1966
|
+
c.category === 'feature');
|
|
1967
|
+
const regularClaims = claims.filter(c => !selfClaims.includes(c));
|
|
1968
|
+
// Run isolated runtime tests for self-referential claims
|
|
1969
|
+
if (selfClaims.length > 0 && ctx.llmVerifier) {
|
|
1970
|
+
const isoResults = await runIsolatedVerification(selfClaims.map(c => ({ statement: c.statement, category: c.category, context: c.context })), ctx.workingDirectory, ctx.llmVerifier);
|
|
1971
|
+
// Convert isolated results to UniversalVerificationResult
|
|
1972
|
+
for (let i = 0; i < selfClaims.length && i < isoResults.tests.length; i++) {
|
|
1973
|
+
const claim = selfClaims[i];
|
|
1974
|
+
const isoTest = isoResults.tests[i];
|
|
1975
|
+
results.push({
|
|
1976
|
+
claim,
|
|
1977
|
+
verified: isoTest.success,
|
|
1978
|
+
confidence: isoTest.success ? 90 : (isoTest.matchedPatterns.length > 0 ? 50 : 20),
|
|
1979
|
+
method: 'isolated-runtime',
|
|
1980
|
+
evidence: isoTest.success ? `Verified in fresh CLI instance` : `Failed: ${isoTest.unmatchedPatterns.join(', ')}`,
|
|
1981
|
+
reasoning: isoTest.llmAssessment || (isoTest.success ? 'All patterns matched in isolated runtime' : 'Patterns not matched'),
|
|
1982
|
+
executedCode: isoTest.test.commands.join('\n'),
|
|
1983
|
+
rawOutput: isoTest.output.slice(0, 2000),
|
|
1984
|
+
timestamp: new Date().toISOString()
|
|
1985
|
+
});
|
|
1986
|
+
}
|
|
1987
|
+
}
|
|
1988
|
+
// Verify regular claims with standard approach
|
|
1989
|
+
for (const c of regularClaims) {
|
|
1990
|
+
results.push(c.verifiable || c.priority === 'critical' || c.priority === 'high'
|
|
1991
|
+
? await verifyUniversalClaim(c, ctx)
|
|
1992
|
+
: { claim: c, verified: false, confidence: 0, method: 'skip', evidence: 'Low priority', reasoning: 'Skipped', timestamp: new Date().toISOString() });
|
|
1993
|
+
}
|
|
1994
|
+
const vClaims = claims.filter(c => c.verifiable).length;
|
|
1995
|
+
const verified = results.filter(r => r.verified).length;
|
|
1996
|
+
const failed = results.filter(r => !r.verified && r.confidence > 50).length;
|
|
1997
|
+
const inconclusive = results.filter(r => !r.verified && r.confidence <= 50 && r.method !== 'skip').length;
|
|
1998
|
+
const avgConf = results.length ? results.reduce((s, r) => s + r.confidence, 0) / results.length : 0;
|
|
1999
|
+
// Count isolated tests for assessment
|
|
2000
|
+
const isoCount = results.filter(r => r.method === 'isolated-runtime').length;
|
|
2001
|
+
const isoVerified = results.filter(r => r.method === 'isolated-runtime' && r.verified).length;
|
|
2002
|
+
let assessment = '', trust = 0;
|
|
2003
|
+
if (ctx.llmVerifier)
|
|
2004
|
+
try {
|
|
2005
|
+
const isoSummary = isoCount > 0 ? ` Isolated runtime tests: ${isoVerified}/${isoCount} passed.` : '';
|
|
2006
|
+
const p = UNIVERSAL_ASSESS.replace('{RESPONSE}', response.slice(0, 4000)).replace('{CLAIMS}', JSON.stringify(claims.slice(0, 15))).replace('{RESULTS}', JSON.stringify(results.slice(0, 15)));
|
|
2007
|
+
const r = await ctx.llmVerifier(p);
|
|
2008
|
+
const m = r.match(/\{[\s\S]*\}/);
|
|
2009
|
+
if (m) {
|
|
2010
|
+
const a = JSON.parse(m[0]);
|
|
2011
|
+
trust = a.trust;
|
|
2012
|
+
assessment = a.summary + isoSummary + (a.concerns?.length ? ` Concerns: ${a.concerns.join('; ')}` : '');
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
catch {
|
|
2016
|
+
trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
|
|
2017
|
+
assessment = `${verified}/${vClaims} verified`;
|
|
2018
|
+
}
|
|
2019
|
+
else {
|
|
2020
|
+
trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
|
|
2021
|
+
assessment = `${verified}/${vClaims} verified`;
|
|
2022
|
+
}
|
|
2023
|
+
return { responseId: id || `u-${Date.now()}`, originalResponse: response, timestamp: new Date().toISOString(), claims, results, summary: { totalClaims: claims.length, verifiableClaims: vClaims, verified, failed, inconclusive, averageConfidence: Math.round(avgConf) }, overallAssessment: assessment, trustScore: trust };
|
|
2024
|
+
}
|
|
2025
|
+
/**
 * Quick trust estimate for a response based only on its most important claims.
 *
 * Extracts claims via `extractUniversalClaims`, keeps at most the first five
 * that are verifiable and have `critical` or `high` priority, and verifies
 * them one at a time with `verifyUniversalClaim`.
 *
 * @param {string} r - Response text handed to `extractUniversalClaims`.
 * @param {object} ctx - Verification context forwarded unchanged to the
 *   extraction and verification helpers.
 * @returns {Promise<{trustScore: number, summary: string}>} Percentage of the
 *   shortlisted claims that verified, or a neutral score of 50 when no claim
 *   qualifies.
 */
export async function quickUniversalVerify(r, ctx) {
    const extracted = await extractUniversalClaims(r, ctx);
    const isUrgent = (claim) => claim.priority === 'critical' || claim.priority === 'high';
    const shortlist = extracted
        .filter((claim) => claim.verifiable && isUrgent(claim))
        .slice(0, 5);
    if (shortlist.length === 0) {
        return { trustScore: 50, summary: 'No critical claims' };
    }
    // Claims are checked sequentially, each verification awaited before the next starts.
    let passed = 0;
    for (const claim of shortlist) {
        const outcome = await verifyUniversalClaim(claim, ctx);
        if (outcome.verified) {
            passed += 1;
        }
    }
    const trustScore = Math.round(passed / shortlist.length * 100);
    return { trustScore, summary: `${passed}/${shortlist.length} critical verified` };
}
|
|
2036
|
+
/**
 * Renders a universal verification report as plain text.
 *
 * The output contains a banner, a trust line with a 10-cell bar, the overall
 * assessment, claim counters, then up to four `isolated-runtime` results
 * followed by up to six results from any other method.
 *
 * @param {object} r - Report to render. Reads `trustScore`, `overallAssessment`,
 *   `summary.{totalClaims, verified, failed, inconclusive}` and `results`
 *   (each with `method`, `verified`, `confidence`, `claim.statement` and an
 *   optional `reasoning`).
 * @returns {string} The formatted report.
 */
export function formatUniversalReport(r) {
    const BAR_WIDTH = 10;
    // The trust score is not guaranteed to lie in 0-100 (it can come from an
    // LLM reply or from a ratio that exceeds 1). String.prototype.repeat throws
    // a RangeError on a negative count, so clamp the filled-cell count.
    const rawFill = Math.round(r.trustScore / 10);
    const filled = Math.min(BAR_WIDTH, Math.max(0, Number.isNaN(rawFill) ? 0 : rawFill));
    const bar = '█'.repeat(filled) + '░'.repeat(BAR_WIDTH - filled);
    const icon = r.trustScore >= 80 ? '✅' : r.trustScore >= 50 ? '⚠️' : '❌';
    // Only add the ellipsis when text was actually cut off.
    const clip = (text, max) => (text.length > max ? `${text.slice(0, max)}...` : text);
    let out = `╔════════════════════════════════════════════════════════════╗\n║ UNIVERSAL VERIFICATION REPORT ║\n╚════════════════════════════════════════════════════════════╝\n\n`;
    out += `Trust: ${icon} ${r.trustScore}/100 [${bar}]\n${r.overallAssessment}\n\nClaims: ${r.summary.totalClaims} | ✅ ${r.summary.verified} | ❌ ${r.summary.failed} | ❓ ${r.summary.inconclusive}\n\n`;
    // Group results by method
    const isoResults = r.results.filter((x) => x.method === 'isolated-runtime');
    const otherResults = r.results.filter((x) => x.method !== 'isolated-runtime');
    // Show isolated runtime tests first (most robust verification)
    if (isoResults.length > 0) {
        out += `🔬 ISOLATED RUNTIME TESTS (fresh CLI instance):\n`;
        for (const x of isoResults.slice(0, 4)) {
            out += ` ${x.verified ? '✅' : '❌'} [${x.confidence}%] ${clip(x.claim.statement, 50)}\n`;
            if (x.reasoning) {
                out += ` └─ ${x.reasoning.slice(0, 60)}\n`;
            }
        }
        if (isoResults.length > 4) {
            out += ` ... +${isoResults.length - 4} more isolated tests\n`;
        }
        out += '\n';
    }
    // Show other verification results
    if (otherResults.length > 0) {
        out += `📋 STANDARD VERIFICATION:\n`;
        for (const x of otherResults.slice(0, 6)) {
            // Unverified results with confidence above 50 count as failures; the rest are inconclusive.
            out += ` ${x.verified ? '✅' : x.confidence > 50 ? '❌' : '❓'} [${x.confidence}%] ${clip(x.claim.statement, 50)}\n`;
        }
        if (otherResults.length > 6) {
            out += ` ... +${otherResults.length - 6} more\n`;
        }
    }
    return out;
}
|
|
1817
2067
|
//# sourceMappingURL=responseVerifier.js.map
|