erosolar-cli 1.7.24 → 1.7.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/responseVerifier.d.ts +29 -210
- package/dist/core/responseVerifier.d.ts.map +1 -1
- package/dist/core/responseVerifier.js +241 -1834
- package/dist/core/responseVerifier.js.map +1 -1
- package/dist/shell/interactiveShell.d.ts.map +1 -1
- package/dist/shell/interactiveShell.js +12 -13
- package/dist/shell/interactiveShell.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,11 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* AI Response Verification System
|
|
2
|
+
* AI Response Verification System - Isolated Runtime Only
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* 2. Generating runtime verification tests
|
|
7
|
-
* 3. Executing tests to verify claims
|
|
8
|
-
* 4. Reporting verification results
|
|
4
|
+
* Verifies assistant claims by spawning fresh CLI instances and running
|
|
5
|
+
* actual runtime tests. All verification happens in isolation.
|
|
9
6
|
*
|
|
10
7
|
* @license MIT
|
|
11
8
|
*/
|
|
@@ -14,6 +11,9 @@ import { promisify } from 'node:util';
|
|
|
14
11
|
import * as fs from 'node:fs/promises';
|
|
15
12
|
import * as path from 'node:path';
|
|
16
13
|
const execAsync = promisify(exec);
|
|
14
|
+
// ============================================================================
|
|
15
|
+
// ISOLATED RUNTIME - Core Functions
|
|
16
|
+
// ============================================================================
|
|
17
17
|
/**
|
|
18
18
|
* Spawns a fresh isolated erosolar-cli instance for testing
|
|
19
19
|
*/
|
|
@@ -45,7 +45,7 @@ async function spawnIsolatedCLI(cwd, timeout = 60000) {
|
|
|
45
45
|
errors += `\nTimeout after ${timeout}ms`;
|
|
46
46
|
}, timeout);
|
|
47
47
|
child.on('close', () => clearTimeout(timeoutId));
|
|
48
|
-
// Wait for startup
|
|
48
|
+
// Wait for startup
|
|
49
49
|
await new Promise(resolve => {
|
|
50
50
|
const checkStartup = setInterval(() => {
|
|
51
51
|
if (output.includes('erosolar') || output.includes('>') || output.length > 100) {
|
|
@@ -69,7 +69,6 @@ async function spawnIsolatedCLI(cwd, timeout = 60000) {
|
|
|
69
69
|
async function sendCommand(cli, command, waitMs = 5000) {
|
|
70
70
|
const outputBefore = cli.output.length;
|
|
71
71
|
cli.stdin.write(command + '\n');
|
|
72
|
-
// Wait for output to stabilize
|
|
73
72
|
await new Promise(resolve => {
|
|
74
73
|
let lastLength = cli.output.length;
|
|
75
74
|
const checkInterval = setInterval(() => {
|
|
@@ -86,7 +85,25 @@ async function sendCommand(cli, command, waitMs = 5000) {
|
|
|
86
85
|
return cli.output.slice(outputBefore);
|
|
87
86
|
}
|
|
88
87
|
/**
|
|
89
|
-
*
|
|
88
|
+
* Run a shell command for verification (file checks, etc.)
|
|
89
|
+
*/
|
|
90
|
+
async function runShellVerification(cmd, cwd) {
|
|
91
|
+
// Safety check - block dangerous commands
|
|
92
|
+
const dangerous = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i];
|
|
93
|
+
for (const p of dangerous) {
|
|
94
|
+
if (p.test(cmd))
|
|
95
|
+
return { ok: false, out: `Blocked dangerous command: ${p.source}` };
|
|
96
|
+
}
|
|
97
|
+
try {
|
|
98
|
+
const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
|
|
99
|
+
return { ok: true, out: stdout + stderr };
|
|
100
|
+
}
|
|
101
|
+
catch (e) {
|
|
102
|
+
return { ok: false, out: e instanceof Error ? e.message : 'Command failed' };
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Runs an isolated runtime test
|
|
90
107
|
*/
|
|
91
108
|
export async function runIsolatedTest(test, cwd, llmVerifier) {
|
|
92
109
|
const startTime = Date.now();
|
|
@@ -112,19 +129,29 @@ export async function runIsolatedTest(test, cwd, llmVerifier) {
|
|
|
112
129
|
return result;
|
|
113
130
|
}
|
|
114
131
|
}
|
|
115
|
-
//
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
132
|
+
// Run shell commands first if any (file checks, etc.)
|
|
133
|
+
if (test.shellCommands && test.shellCommands.length > 0) {
|
|
134
|
+
for (const cmd of test.shellCommands) {
|
|
135
|
+
const shellResult = await runShellVerification(cmd, cwd);
|
|
136
|
+
result.output += `$ ${cmd}\n${shellResult.out}\n`;
|
|
137
|
+
if (!shellResult.ok) {
|
|
138
|
+
result.errors += shellResult.out + '\n';
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
// Run CLI commands if any
|
|
143
|
+
if (test.commands && test.commands.length > 0) {
|
|
144
|
+
const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
|
|
145
|
+
for (const cmd of test.commands) {
|
|
146
|
+
const cmdOutput = await sendCommand(cli, cmd);
|
|
147
|
+
result.output += `> ${cmd}\n${cmdOutput}\n`;
|
|
148
|
+
}
|
|
149
|
+
cli.stdin.write('/quit\n');
|
|
150
|
+
await new Promise(resolve => setTimeout(resolve, 500));
|
|
151
|
+
cli.process.kill('SIGTERM');
|
|
152
|
+
result.exitCode = await cli.exitPromise;
|
|
153
|
+
result.errors += cli.errors;
|
|
121
154
|
}
|
|
122
|
-
// Gracefully exit
|
|
123
|
-
cli.stdin.write('/quit\n');
|
|
124
|
-
await new Promise(resolve => setTimeout(resolve, 500));
|
|
125
|
-
cli.process.kill('SIGTERM');
|
|
126
|
-
result.exitCode = await cli.exitPromise;
|
|
127
|
-
result.errors = cli.errors;
|
|
128
155
|
// Check expected output patterns
|
|
129
156
|
if (test.expectedOutputs) {
|
|
130
157
|
for (const pattern of test.expectedOutputs) {
|
|
@@ -136,13 +163,13 @@ export async function runIsolatedTest(test, cwd, llmVerifier) {
|
|
|
136
163
|
}
|
|
137
164
|
}
|
|
138
165
|
}
|
|
139
|
-
// LLM assessment of behavior
|
|
166
|
+
// LLM assessment of behavior
|
|
140
167
|
if (test.expectedBehavior && llmVerifier) {
|
|
141
|
-
const assessPrompt = `Assess if this
|
|
168
|
+
const assessPrompt = `Assess if this output demonstrates the expected behavior.
|
|
142
169
|
|
|
143
|
-
EXPECTED
|
|
170
|
+
EXPECTED: ${test.expectedBehavior}
|
|
144
171
|
|
|
145
|
-
|
|
172
|
+
OUTPUT:
|
|
146
173
|
---
|
|
147
174
|
${result.output.slice(0, 4000)}
|
|
148
175
|
---
|
|
@@ -168,7 +195,7 @@ Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanat
|
|
|
168
195
|
}
|
|
169
196
|
// Determine success
|
|
170
197
|
result.success = result.unmatchedPatterns.length === 0 &&
|
|
171
|
-
(result.matchedPatterns.length > 0 || !test.expectedOutputs?.length);
|
|
198
|
+
(result.matchedPatterns.length > 0 || (!test.expectedOutputs?.length && !test.expectedBehavior));
|
|
172
199
|
}
|
|
173
200
|
catch (err) {
|
|
174
201
|
result.errors = err instanceof Error ? err.message : 'Unknown error';
|
|
@@ -176,1212 +203,196 @@ Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanat
|
|
|
176
203
|
result.duration = Date.now() - startTime;
|
|
177
204
|
return result;
|
|
178
205
|
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
export async function generateIsolatedTests(claims, llmVerifier) {
|
|
184
|
-
const selfClaims = claims.filter(c => c.statement.toLowerCase().includes('erosolar') ||
|
|
185
|
-
c.statement.toLowerCase().includes('cli') ||
|
|
186
|
-
c.statement.toLowerCase().includes('command') ||
|
|
187
|
-
c.statement.toLowerCase().includes('feature') ||
|
|
188
|
-
c.category === 'behavior' ||
|
|
189
|
-
c.category === 'feature');
|
|
190
|
-
if (selfClaims.length === 0)
|
|
191
|
-
return [];
|
|
192
|
-
const prompt = `Generate isolated CLI tests for these claims about erosolar-cli behavior.
|
|
206
|
+
// ============================================================================
|
|
207
|
+
// CLAIM EXTRACTION - LLM extracts claims from responses
|
|
208
|
+
// ============================================================================
|
|
209
|
+
const EXTRACT_CLAIMS_PROMPT = `Extract ALL verifiable claims from this AI assistant response.
|
|
193
210
|
|
|
194
|
-
|
|
195
|
-
|
|
211
|
+
RESPONSE:
|
|
212
|
+
---
|
|
213
|
+
{RESPONSE}
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
CONTEXT: {CONTEXT}
|
|
217
|
+
WORKING_DIR: {WORKING_DIR}
|
|
196
218
|
|
|
197
|
-
For each claim,
|
|
198
|
-
1.
|
|
199
|
-
2.
|
|
200
|
-
3.
|
|
219
|
+
For each claim, determine:
|
|
220
|
+
1. What specific assertion is being made
|
|
221
|
+
2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
|
|
222
|
+
3. How it can be verified (shell command, file check, CLI test, etc.)
|
|
223
|
+
4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)
|
|
201
224
|
|
|
202
225
|
Return JSON array:
|
|
203
226
|
[{
|
|
204
|
-
"id": "
|
|
205
|
-
"
|
|
206
|
-
"
|
|
207
|
-
"
|
|
208
|
-
"
|
|
209
|
-
"
|
|
210
|
-
"timeout": 30000
|
|
227
|
+
"id": "c1",
|
|
228
|
+
"statement": "the specific claim",
|
|
229
|
+
"category": "file_op|code|command|state|behavior|fact",
|
|
230
|
+
"verifiable": true,
|
|
231
|
+
"priority": "critical|high|medium|low",
|
|
232
|
+
"context": {"path": "/path/if/relevant", "command": "if relevant"}
|
|
211
233
|
}]
|
|
212
234
|
|
|
213
235
|
Output ONLY valid JSON array.`;
|
|
236
|
+
/**
|
|
237
|
+
* Extract claims from assistant response using LLM
|
|
238
|
+
*/
|
|
239
|
+
async function extractClaims(response, ctx) {
|
|
240
|
+
if (!ctx.llmVerifier)
|
|
241
|
+
return [];
|
|
214
242
|
try {
|
|
215
|
-
const
|
|
216
|
-
|
|
243
|
+
const prompt = EXTRACT_CLAIMS_PROMPT
|
|
244
|
+
.replace('{RESPONSE}', response.slice(0, 8000))
|
|
245
|
+
.replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '')
|
|
246
|
+
.replace('{WORKING_DIR}', ctx.workingDirectory);
|
|
247
|
+
const result = await ctx.llmVerifier(prompt);
|
|
248
|
+
const match = result.match(/\[[\s\S]*\]/);
|
|
217
249
|
if (match) {
|
|
218
250
|
return JSON.parse(match[0]);
|
|
219
251
|
}
|
|
220
252
|
}
|
|
221
253
|
catch {
|
|
222
|
-
// Fall through
|
|
254
|
+
// Fall through
|
|
223
255
|
}
|
|
224
|
-
|
|
225
|
-
return selfClaims.map((c, i) => ({
|
|
226
|
-
id: `iso-${i}`,
|
|
227
|
-
description: c.statement,
|
|
228
|
-
commands: ['/help'], // Basic smoke test
|
|
229
|
-
expectedBehavior: c.statement,
|
|
230
|
-
timeout: 30000
|
|
231
|
-
}));
|
|
256
|
+
return [];
|
|
232
257
|
}
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
if (!llmVerifier) {
|
|
238
|
-
return { tests: [], summary: { total: 0, passed: 0, failed: 0 }, allPassed: true };
|
|
239
|
-
}
|
|
240
|
-
const tests = await generateIsolatedTests(claims, llmVerifier);
|
|
241
|
-
const results = [];
|
|
242
|
-
for (const test of tests) {
|
|
243
|
-
const result = await runIsolatedTest(test, cwd, llmVerifier);
|
|
244
|
-
results.push(result);
|
|
245
|
-
}
|
|
246
|
-
const passed = results.filter(r => r.success).length;
|
|
247
|
-
const failed = results.filter(r => !r.success).length;
|
|
248
|
-
return {
|
|
249
|
-
tests: results,
|
|
250
|
-
summary: { total: tests.length, passed, failed },
|
|
251
|
-
allPassed: failed === 0
|
|
252
|
-
};
|
|
253
|
-
}
|
|
254
|
-
/**
|
|
255
|
-
* LLM-based claim extraction prompt.
|
|
256
|
-
* Used when pattern matching isn't sufficient.
|
|
257
|
-
*/
|
|
258
|
-
const CLAIM_EXTRACTION_PROMPT = `Analyze this assistant response and extract ALL verifiable claims - anything the assistant claims to have done or accomplished.
|
|
259
|
-
|
|
260
|
-
For each claim, identify:
|
|
261
|
-
1. Type: One of these claim types:
|
|
262
|
-
|
|
263
|
-
FILE OPERATIONS:
|
|
264
|
-
- file_created: A new file was created
|
|
265
|
-
- file_modified: An existing file was changed
|
|
266
|
-
- file_deleted: A file was removed
|
|
267
|
-
- content_contains: A file contains specific content
|
|
268
|
-
|
|
269
|
-
BUILD/TEST:
|
|
270
|
-
- code_compiles: Code builds/compiles without errors
|
|
271
|
-
- tests_pass: Tests run successfully
|
|
272
|
-
|
|
273
|
-
VERSION CONTROL:
|
|
274
|
-
- git_committed: Changes were committed to git
|
|
275
|
-
- package_published: Package was published to npm
|
|
276
|
-
|
|
277
|
-
SYSTEM:
|
|
278
|
-
- command_executed: A shell command was run
|
|
279
|
-
- dependency_installed: A package/dependency was installed
|
|
280
|
-
- service_running: A service/server is running (on a port)
|
|
281
|
-
- url_accessible: A URL is accessible/working
|
|
282
|
-
- env_var_set: Environment variable was set
|
|
283
|
-
|
|
284
|
-
CONFIGURATION:
|
|
285
|
-
- config_changed: Configuration file was updated
|
|
286
|
-
- permission_granted: File permissions were changed
|
|
287
|
-
|
|
288
|
-
API/DATA:
|
|
289
|
-
- api_response: API returned expected response
|
|
290
|
-
- database_updated: Database record was modified
|
|
291
|
-
- data_transformed: Data was transformed correctly
|
|
292
|
-
|
|
293
|
-
SEMANTIC (require deeper analysis):
|
|
294
|
-
- error_fixed: An error/bug was fixed
|
|
295
|
-
- feature_implemented: A feature was implemented
|
|
296
|
-
- refactor_complete: Code was refactored
|
|
297
|
-
|
|
298
|
-
CATCH-ALL:
|
|
299
|
-
- generic: Any other verifiable claim not covered above
|
|
258
|
+
// ============================================================================
|
|
259
|
+
// TEST GENERATION - LLM generates isolated tests for claims
|
|
260
|
+
// ============================================================================
|
|
261
|
+
const GENERATE_TESTS_PROMPT = `Generate isolated runtime tests for these claims.
|
|
300
262
|
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
Return a JSON array of claims. Each claim should have:
|
|
304
|
-
- type: one of the types above
|
|
305
|
-
- description: human readable description of what was claimed
|
|
306
|
-
- params: Object with relevant fields:
|
|
307
|
-
- path: file path (for file operations)
|
|
308
|
-
- command: shell command (for command_executed)
|
|
309
|
-
- version: version number (for package_published)
|
|
310
|
-
- hash: git commit hash (for git_committed)
|
|
311
|
-
- package: package name (for dependency_installed)
|
|
312
|
-
- port: port number (for service_running)
|
|
313
|
-
- name: process/service name (for service_running, env_var_set)
|
|
314
|
-
- url: URL (for url_accessible, api_response)
|
|
315
|
-
- content: text to search for (for content_contains)
|
|
316
|
-
- key: config key path like "server.port" (for config_changed)
|
|
317
|
-
- value: expected value (for config_changed, env_var_set)
|
|
318
|
-
- status: HTTP status code (for api_response)
|
|
319
|
-
- body: expected response body (for api_response)
|
|
320
|
-
- mode: file permission mode like "755" (for permission_granted)
|
|
321
|
-
- checkCommand: command that can verify the claim (for database_updated)
|
|
263
|
+
CLAIMS:
|
|
264
|
+
{CLAIMS}
|
|
322
265
|
|
|
323
|
-
|
|
266
|
+
WORKING_DIR: {WORKING_DIR}
|
|
267
|
+
PLATFORM: {PLATFORM}
|
|
324
268
|
|
|
325
|
-
|
|
269
|
+
For each claim, generate a test that verifies it using:
|
|
270
|
+
- Shell commands (for file checks, git status, etc.)
|
|
271
|
+
- CLI commands (for testing CLI behavior in fresh instance)
|
|
272
|
+
- Expected output patterns
|
|
326
273
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
274
|
+
Return JSON array:
|
|
275
|
+
[{
|
|
276
|
+
"id": "test-1",
|
|
277
|
+
"description": "what we're testing",
|
|
278
|
+
"shellCommands": ["ls -la path", "cat file"],
|
|
279
|
+
"commands": ["/help", "some input"],
|
|
280
|
+
"expectedOutputs": ["pattern1", "pattern2"],
|
|
281
|
+
"expectedBehavior": "description for LLM assessment",
|
|
282
|
+
"requiresBuild": false,
|
|
283
|
+
"timeout": 30000
|
|
284
|
+
}]
|
|
331
285
|
|
|
332
|
-
|
|
286
|
+
Use READ-ONLY commands only. No destructive operations.
|
|
287
|
+
Output ONLY valid JSON array.`;
|
|
333
288
|
/**
|
|
334
|
-
*
|
|
335
|
-
* Falls back to pattern matching if LLM extraction fails.
|
|
289
|
+
* Generate isolated tests for claims
|
|
336
290
|
*/
|
|
337
|
-
|
|
338
|
-
if (!
|
|
339
|
-
|
|
340
|
-
return extractClaims(response);
|
|
341
|
-
}
|
|
291
|
+
async function generateTests(claims, ctx) {
|
|
292
|
+
if (!ctx.llmVerifier || claims.length === 0)
|
|
293
|
+
return [];
|
|
342
294
|
try {
|
|
343
|
-
const prompt =
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
}
|
|
350
|
-
const parsed = JSON.parse(jsonMatch[0]);
|
|
351
|
-
return parsed.map(claim => ({
|
|
352
|
-
type: claim.type,
|
|
353
|
-
description: claim.description,
|
|
354
|
-
evidence: 'Extracted by LLM',
|
|
355
|
-
params: claim.params
|
|
356
|
-
}));
|
|
357
|
-
}
|
|
358
|
-
catch {
|
|
359
|
-
// LLM extraction failed, fall back to patterns
|
|
360
|
-
return extractClaims(response);
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
/**
|
|
364
|
-
* Extract verifiable claims from an assistant response.
|
|
365
|
-
* Covers common patterns for file operations, builds, tests, git, and npm.
|
|
366
|
-
*/
|
|
367
|
-
export function extractClaims(response) {
|
|
368
|
-
const claims = [];
|
|
369
|
-
const seenPaths = new Set();
|
|
370
|
-
// Helper to add file claim if not duplicate
|
|
371
|
-
const addFileClaim = (type, path, evidence) => {
|
|
372
|
-
if (path && !seenPaths.has(path)) {
|
|
373
|
-
seenPaths.add(path);
|
|
374
|
-
claims.push({
|
|
375
|
-
type,
|
|
376
|
-
description: `File ${path} was ${type === 'file_created' ? 'created' : 'modified'}`,
|
|
377
|
-
evidence,
|
|
378
|
-
params: { path }
|
|
379
|
-
});
|
|
380
|
-
}
|
|
381
|
-
};
|
|
382
|
-
// Pattern: File creation claims - comprehensive patterns
|
|
383
|
-
const fileCreationPatterns = [
|
|
384
|
-
// "I created file X", "Created X", "I've created X"
|
|
385
|
-
/(?:I(?:'ve)?\s+)?(?:created|wrote|written|generated|added)\s+(?:a\s+)?(?:new\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
386
|
-
// "File X created", "File created at X"
|
|
387
|
-
/(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:created|written|generated)/gi,
|
|
388
|
-
// "Created file at X", "Wrote file to X"
|
|
389
|
-
/(?:created|wrote)\s+(?:a\s+)?(?:new\s+)?file\s+(?:at|to|in)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
390
|
-
// "File created successfully" with path nearby
|
|
391
|
-
/[`"']([^\s`"',]+\.[a-zA-Z0-9]+)[`"']\s+(?:has been\s+)?(?:created|written)/gi,
|
|
392
|
-
// "successfully created X"
|
|
393
|
-
/successfully\s+(?:created|wrote|generated)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
394
|
-
// "The file X now exists" or "X now contains"
|
|
395
|
-
/(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?(?:exists|contains)/gi,
|
|
396
|
-
];
|
|
397
|
-
for (const pattern of fileCreationPatterns) {
|
|
398
|
-
pattern.lastIndex = 0; // Reset regex state
|
|
399
|
-
let match;
|
|
400
|
-
while ((match = pattern.exec(response)) !== null) {
|
|
401
|
-
if (match[1]) {
|
|
402
|
-
addFileClaim('file_created', match[1], match[0]);
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
}
|
|
406
|
-
// Pattern: File modification claims - comprehensive patterns
|
|
407
|
-
const fileModPatterns = [
|
|
408
|
-
// "I modified X", "Updated X", "I've edited X"
|
|
409
|
-
/(?:I(?:'ve)?\s+)?(?:modified|updated|changed|edited|fixed|patched|amended)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
410
|
-
// "File X was updated"
|
|
411
|
-
/(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:modified|updated|changed|edited|fixed)/gi,
|
|
412
|
-
// "Made changes to X"
|
|
413
|
-
/(?:made\s+)?changes?\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
414
|
-
// "X has been updated"
|
|
415
|
-
/[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+has\s+been\s+(?:updated|modified|changed|edited)/gi,
|
|
416
|
-
// "successfully updated X"
|
|
417
|
-
/successfully\s+(?:updated|modified|edited|fixed)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
418
|
-
];
|
|
419
|
-
for (const pattern of fileModPatterns) {
|
|
420
|
-
pattern.lastIndex = 0;
|
|
421
|
-
let match;
|
|
422
|
-
while ((match = pattern.exec(response)) !== null) {
|
|
423
|
-
if (match[1]) {
|
|
424
|
-
addFileClaim('file_modified', match[1], match[0]);
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
// Pattern: Command execution claims
|
|
429
|
-
const cmdPatterns = [
|
|
430
|
-
/(?:I(?:'ve)?\s+)?(?:ran|executed|run|running)\s+`([^`]+)`/gi,
|
|
431
|
-
/(?:Running|Executed|Ran)\s+`([^`]+)`/gi,
|
|
432
|
-
/`([^`]+)`\s+(?:completed|succeeded|finished|passed)/gi,
|
|
433
|
-
/executed\s+(?:the\s+)?command[:\s]+`([^`]+)`/gi,
|
|
434
|
-
];
|
|
435
|
-
const seenCommands = new Set();
|
|
436
|
-
for (const pattern of cmdPatterns) {
|
|
437
|
-
pattern.lastIndex = 0;
|
|
438
|
-
let match;
|
|
439
|
-
while ((match = pattern.exec(response)) !== null) {
|
|
440
|
-
const command = match[1];
|
|
441
|
-
if (command && !seenCommands.has(command)) {
|
|
442
|
-
seenCommands.add(command);
|
|
443
|
-
claims.push({
|
|
444
|
-
type: 'command_executed',
|
|
445
|
-
description: `Command "${command.slice(0, 50)}${command.length > 50 ? '...' : ''}" was executed`,
|
|
446
|
-
evidence: match[0],
|
|
447
|
-
params: { command }
|
|
448
|
-
});
|
|
449
|
-
}
|
|
450
|
-
}
|
|
451
|
-
}
|
|
452
|
-
// Pattern: Build/compile success claims
|
|
453
|
-
const buildPatterns = [
|
|
454
|
-
/(?:build|compilation|type[- ]?check)\s+(?:passed|succeeded|completed|successful|success)/gi,
|
|
455
|
-
/(?:successfully|passed)\s+(?:the\s+)?(?:build|compilation|type[- ]?check)/gi,
|
|
456
|
-
/no\s+(?:type\s+)?errors/gi,
|
|
457
|
-
/type[- ]?check(?:ing)?\s+(?:passed|succeeded|completed)/gi,
|
|
458
|
-
/(?:built|compiled)\s+successfully/gi,
|
|
459
|
-
/build\s+(?:is\s+)?(?:complete|successful)/gi,
|
|
460
|
-
];
|
|
461
|
-
let hasBuildClaim = false;
|
|
462
|
-
for (const pattern of buildPatterns) {
|
|
463
|
-
pattern.lastIndex = 0;
|
|
464
|
-
if (!hasBuildClaim && pattern.test(response)) {
|
|
465
|
-
claims.push({
|
|
466
|
-
type: 'code_compiles',
|
|
467
|
-
description: 'Code compiles without errors',
|
|
468
|
-
evidence: response.match(pattern)?.[0] || '',
|
|
469
|
-
params: {}
|
|
470
|
-
});
|
|
471
|
-
hasBuildClaim = true;
|
|
472
|
-
break;
|
|
473
|
-
}
|
|
474
|
-
}
|
|
475
|
-
// Pattern: Test pass claims
|
|
476
|
-
const testPatterns = [
|
|
477
|
-
/(?:all\s+)?tests?\s+(?:pass|passed|passing|succeeded|successful)/gi,
|
|
478
|
-
/(?:passed|passing)\s+(?:all\s+)?tests?/gi,
|
|
479
|
-
/(\d+)\s+tests?\s+passed/gi,
|
|
480
|
-
/tests?\s+(?:completed|finished)\s+successfully/gi,
|
|
481
|
-
/(?:test|tests)\s+suite\s+(?:passed|succeeded)/gi,
|
|
482
|
-
/all\s+(\d+)\s+tests?\s+(?:pass|passed)/gi,
|
|
483
|
-
];
|
|
484
|
-
let hasTestClaim = false;
|
|
485
|
-
for (const pattern of testPatterns) {
|
|
486
|
-
pattern.lastIndex = 0;
|
|
487
|
-
const match = pattern.exec(response);
|
|
488
|
-
if (!hasTestClaim && match) {
|
|
489
|
-
claims.push({
|
|
490
|
-
type: 'tests_pass',
|
|
491
|
-
description: 'Tests pass',
|
|
492
|
-
evidence: match[0],
|
|
493
|
-
params: { count: match[1] ? parseInt(match[1], 10) : undefined }
|
|
494
|
-
});
|
|
495
|
-
hasTestClaim = true;
|
|
496
|
-
break;
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
// Pattern: Git commit claims
|
|
500
|
-
const gitPatterns = [
|
|
501
|
-
/committed\s+(?:the\s+)?(?:changes?\s+)?(?:with\s+message\s+)?["']?([^"'\n]+)["']?/gi,
|
|
502
|
-
/\[(?:main|master|[a-zA-Z0-9/_-]+)\s+([a-f0-9]{7,})\]/gi,
|
|
503
|
-
/git\s+commit.*-m\s+["']([^"']+)["']/gi,
|
|
504
|
-
/created\s+(?:a\s+)?commit/gi,
|
|
505
|
-
/changes?\s+(?:have\s+been\s+)?committed/gi,
|
|
506
|
-
/commit\s+([a-f0-9]{7,})/gi,
|
|
507
|
-
];
|
|
508
|
-
let hasGitClaim = false;
|
|
509
|
-
for (const pattern of gitPatterns) {
|
|
510
|
-
pattern.lastIndex = 0;
|
|
511
|
-
const match = pattern.exec(response);
|
|
512
|
-
if (!hasGitClaim && match) {
|
|
513
|
-
claims.push({
|
|
514
|
-
type: 'git_committed',
|
|
515
|
-
description: 'Changes were committed',
|
|
516
|
-
evidence: match[0],
|
|
517
|
-
params: { hash: match[1] }
|
|
518
|
-
});
|
|
519
|
-
hasGitClaim = true;
|
|
520
|
-
break;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
// Pattern: Package publish claims
|
|
524
|
-
const publishPatterns = [
|
|
525
|
-
/published\s+(?:to\s+)?(?:npm|registry)/gi,
|
|
526
|
-
/\+\s+[a-z@/_-]+@(\d+\.\d+\.\d+)/gi,
|
|
527
|
-
/npm\s+publish/gi,
|
|
528
|
-
/package\s+(?:was\s+)?published/gi,
|
|
529
|
-
/published\s+(?:version\s+)?v?(\d+\.\d+\.\d+)/gi,
|
|
530
|
-
/successfully\s+published/gi,
|
|
531
|
-
];
|
|
532
|
-
let hasPublishClaim = false;
|
|
533
|
-
for (const pattern of publishPatterns) {
|
|
534
|
-
pattern.lastIndex = 0;
|
|
535
|
-
const match = pattern.exec(response);
|
|
536
|
-
if (!hasPublishClaim && match) {
|
|
537
|
-
claims.push({
|
|
538
|
-
type: 'package_published',
|
|
539
|
-
description: 'Package was published',
|
|
540
|
-
evidence: match[0],
|
|
541
|
-
params: { version: match[1] }
|
|
542
|
-
});
|
|
543
|
-
hasPublishClaim = true;
|
|
544
|
-
break;
|
|
545
|
-
}
|
|
546
|
-
}
|
|
547
|
-
// Pattern: File deletion claims
|
|
548
|
-
const deletionPatterns = [
|
|
549
|
-
/(?:I(?:'ve)?\s+)?(?:deleted|removed)\s+(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
550
|
-
/[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:deleted|removed)/gi,
|
|
551
|
-
];
|
|
552
|
-
for (const pattern of deletionPatterns) {
|
|
553
|
-
pattern.lastIndex = 0;
|
|
554
|
-
let match;
|
|
555
|
-
while ((match = pattern.exec(response)) !== null) {
|
|
556
|
-
const filePath = match[1];
|
|
557
|
-
if (filePath && !seenPaths.has(filePath)) {
|
|
558
|
-
seenPaths.add(filePath);
|
|
559
|
-
claims.push({
|
|
560
|
-
type: 'file_deleted',
|
|
561
|
-
description: `File ${filePath} was deleted`,
|
|
562
|
-
evidence: match[0],
|
|
563
|
-
params: { path: filePath }
|
|
564
|
-
});
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
}
|
|
568
|
-
// Pattern: Dependency installation claims
|
|
569
|
-
const installPatterns = [
|
|
570
|
-
/(?:installed|added)\s+(?:the\s+)?(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?/gi,
|
|
571
|
-
/npm\s+install(?:ed)?\s+[`"']?([^\s`"',]+)[`"']?/gi,
|
|
572
|
-
/(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?\s+(?:was\s+)?installed/gi,
|
|
573
|
-
];
|
|
574
|
-
for (const pattern of installPatterns) {
|
|
575
|
-
pattern.lastIndex = 0;
|
|
576
|
-
let match;
|
|
577
|
-
while ((match = pattern.exec(response)) !== null) {
|
|
578
|
-
const packageName = match[1];
|
|
579
|
-
if (packageName) {
|
|
580
|
-
claims.push({
|
|
581
|
-
type: 'dependency_installed',
|
|
582
|
-
description: `Package ${packageName} was installed`,
|
|
583
|
-
evidence: match[0],
|
|
584
|
-
params: { package: packageName }
|
|
585
|
-
});
|
|
586
|
-
}
|
|
587
|
-
}
|
|
588
|
-
}
|
|
589
|
-
// Pattern: Service running claims
|
|
590
|
-
const servicePatterns = [
|
|
591
|
-
/(?:server|service|app(?:lication)?)\s+(?:is\s+)?(?:running|started|listening)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
|
|
592
|
-
/(?:listening|running)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
|
|
593
|
-
/started\s+(?:the\s+)?(?:server|service)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
|
|
594
|
-
/(?:port\s+)?(\d+)\s+is\s+(?:now\s+)?(?:open|listening)/gi,
|
|
595
|
-
];
|
|
596
|
-
for (const pattern of servicePatterns) {
|
|
597
|
-
pattern.lastIndex = 0;
|
|
598
|
-
const match = pattern.exec(response);
|
|
599
|
-
if (match && match[1]) {
|
|
600
|
-
const port = parseInt(match[1], 10);
|
|
601
|
-
if (port > 0 && port < 65536) {
|
|
602
|
-
claims.push({
|
|
603
|
-
type: 'service_running',
|
|
604
|
-
description: `Service running on port ${port}`,
|
|
605
|
-
evidence: match[0],
|
|
606
|
-
params: { port }
|
|
607
|
-
});
|
|
608
|
-
break; // Only one service claim per response
|
|
609
|
-
}
|
|
610
|
-
}
|
|
611
|
-
}
|
|
612
|
-
// Pattern: URL accessible claims
|
|
613
|
-
const urlPatterns = [
|
|
614
|
-
/(?:accessible|available|live)\s+at\s+(https?:\/\/[^\s]+)/gi,
|
|
615
|
-
/(?:visit|open|access)\s+(https?:\/\/[^\s]+)/gi,
|
|
616
|
-
/(https?:\/\/[^\s]+)\s+(?:is\s+)?(?:now\s+)?(?:accessible|available|live)/gi,
|
|
617
|
-
/deployed\s+(?:to|at)\s+(https?:\/\/[^\s]+)/gi,
|
|
618
|
-
];
|
|
619
|
-
for (const pattern of urlPatterns) {
|
|
620
|
-
pattern.lastIndex = 0;
|
|
621
|
-
const match = pattern.exec(response);
|
|
622
|
-
if (match && match[1]) {
|
|
623
|
-
const url = match[1].replace(/[.,;:!?)]+$/, ''); // Remove trailing punctuation
|
|
624
|
-
claims.push({
|
|
625
|
-
type: 'url_accessible',
|
|
626
|
-
description: `URL ${url} is accessible`,
|
|
627
|
-
evidence: match[0],
|
|
628
|
-
params: { url }
|
|
629
|
-
});
|
|
630
|
-
break; // Only one URL claim per response
|
|
631
|
-
}
|
|
632
|
-
}
|
|
633
|
-
// Pattern: Content contains claims
|
|
634
|
-
const contentPatterns = [
|
|
635
|
-
/(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?contains?\s+[`"']([^`"']+)[`"']/gi,
|
|
636
|
-
/added\s+[`"']([^`"']+)[`"']\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
|
|
637
|
-
];
|
|
638
|
-
for (const pattern of contentPatterns) {
|
|
639
|
-
pattern.lastIndex = 0;
|
|
640
|
-
const match = pattern.exec(response);
|
|
295
|
+
const prompt = GENERATE_TESTS_PROMPT
|
|
296
|
+
.replace('{CLAIMS}', JSON.stringify(claims.slice(0, 10)))
|
|
297
|
+
.replace('{WORKING_DIR}', ctx.workingDirectory)
|
|
298
|
+
.replace('{PLATFORM}', process.platform);
|
|
299
|
+
const result = await ctx.llmVerifier(prompt);
|
|
300
|
+
const match = result.match(/\[[\s\S]*\]/);
|
|
641
301
|
if (match) {
|
|
642
|
-
|
|
643
|
-
// Pattern 2: added "text" to file
|
|
644
|
-
const isPattern2 = pattern.source.startsWith('added');
|
|
645
|
-
const filePath = isPattern2 ? match[2] : match[1];
|
|
646
|
-
const content = isPattern2 ? match[1] : match[2];
|
|
647
|
-
if (filePath && content) {
|
|
648
|
-
claims.push({
|
|
649
|
-
type: 'content_contains',
|
|
650
|
-
description: `File ${filePath} contains specified content`,
|
|
651
|
-
evidence: match[0],
|
|
652
|
-
params: { path: filePath, content }
|
|
653
|
-
});
|
|
654
|
-
}
|
|
655
|
-
}
|
|
656
|
-
}
|
|
657
|
-
return claims;
|
|
658
|
-
}
|
|
659
|
-
/**
 * Generate a verification test for a claim.
 *
 * Maps `claim.type` to an async thunk. Nothing is executed until the thunk is
 * called; it resolves to `{ claim, timestamp, verified, confidence, evidence }`
 * plus an optional `error` message. `timestamp` is captured when the test is
 * generated, not when it runs.
 *
 * Values taken from `claim.params` are never spliced into a shell command as
 * raw text: they are either validated against a strict pattern first or handed
 * to the shell through an environment variable.
 *
 * @param {{ type: string, params: Record<string, any> }} claim - Claim to verify.
 * @returns {() => Promise<object>} Thunk that performs the check when invoked.
 */
export function generateVerificationTest(claim) {
    const baseResult = {
        claim,
        timestamp: new Date().toISOString()
    };
    // Turn any caught value into a printable message.
    const describeError = (err) => (err instanceof Error ? err.message : 'Unknown error');
    // Relative claim paths are resolved against the current working directory.
    const resolveFromCwd = (filePath) => (path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath));
    switch (claim.type) {
        case 'file_created':
        case 'file_modified':
            return async () => {
                const filePath = claim.params.path;
                try {
                    const stats = await fs.stat(resolveFromCwd(filePath));
                    const recentlyModified = (Date.now() - stats.mtimeMs) < 5 * 60 * 1000; // Within 5 minutes
                    return {
                        ...baseResult,
                        verified: stats.isFile(),
                        confidence: recentlyModified ? 'high' : 'medium',
                        evidence: `File exists. Size: ${stats.size} bytes. Modified: ${stats.mtime.toISOString()}`
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'File does not exist',
                        error: describeError(err)
                    };
                }
            };
        case 'file_deleted':
            return async () => {
                const filePath = claim.params.path;
                if (!filePath) {
                    // Without a path nothing can be checked; do not report success.
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'low',
                        evidence: 'No file path provided'
                    };
                }
                try {
                    await fs.stat(resolveFromCwd(filePath));
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'File still exists (deletion claim is false)'
                    };
                }
                catch (err) {
                    // Only "no such entry" proves the deletion; a permission or
                    // I/O error says nothing about whether the file is gone.
                    const code = err && typeof err === 'object' ? err.code : undefined;
                    if (code === 'ENOENT' || code === 'ENOTDIR') {
                        return {
                            ...baseResult,
                            verified: true,
                            confidence: 'high',
                            evidence: 'File does not exist (deletion verified)'
                        };
                    }
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'medium',
                        evidence: 'Could not determine whether the file exists',
                        error: describeError(err)
                    };
                }
            };
        case 'code_compiles':
            return async () => {
                try {
                    const { stdout, stderr } = await execAsync('npm run type-check 2>&1 || npm run build 2>&1', {
                        timeout: 60000,
                        cwd: process.cwd()
                    });
                    const output = stdout + stderr;
                    const hasErrors = /error/i.test(output) && !/0 errors/i.test(output);
                    return {
                        ...baseResult,
                        verified: !hasErrors,
                        confidence: 'high',
                        evidence: hasErrors ? `Compilation errors found: ${output.slice(0, 500)}` : 'Code compiles successfully'
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'Compilation check failed',
                        error: describeError(err)
                    };
                }
            };
        case 'tests_pass':
            return async () => {
                try {
                    const { stdout, stderr } = await execAsync('npm test 2>&1', {
                        timeout: 120000,
                        cwd: process.cwd()
                    });
                    const output = stdout + stderr;
                    const hasFailed = /fail|error/i.test(output) && !/0 failed/i.test(output);
                    return {
                        ...baseResult,
                        verified: !hasFailed,
                        confidence: 'high',
                        evidence: hasFailed ? `Test failures: ${output.slice(0, 500)}` : 'All tests pass'
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'Test execution failed',
                        error: describeError(err)
                    };
                }
            };
        case 'git_committed':
            return async () => {
                try {
                    const hash = claim.params.hash;
                    if (hash) {
                        // The hash ends up on a command line, so accept hex only.
                        if (typeof hash !== 'string' || !/^[0-9a-f]{4,40}$/i.test(hash)) {
                            return {
                                ...baseResult,
                                verified: false,
                                confidence: 'low',
                                evidence: 'Claimed commit hash is not a valid git object id'
                            };
                        }
                        const { stdout } = await execAsync('git log -1 --oneline', {
                            timeout: 5000,
                            cwd: process.cwd()
                        });
                        if (stdout.includes(hash.slice(0, 7))) {
                            return {
                                ...baseResult,
                                verified: true,
                                confidence: 'high',
                                evidence: `Commit found: ${stdout.trim()}`
                            };
                        }
                        // Not HEAD: the claim only holds if the object exists
                        // in this repository and really is a commit.
                        try {
                            const { stdout: objectType } = await execAsync(`git cat-file -t ${hash}`, {
                                timeout: 5000,
                                cwd: process.cwd()
                            });
                            const isCommit = objectType.trim() === 'commit';
                            return {
                                ...baseResult,
                                verified: isCommit,
                                confidence: isCommit ? 'medium' : 'high',
                                evidence: isCommit
                                    ? `Commit ${hash} exists but is not the most recent commit`
                                    : `Object ${hash} is a ${objectType.trim()}, not a commit`
                            };
                        }
                        catch (lookupErr) {
                            return {
                                ...baseResult,
                                verified: false,
                                confidence: 'high',
                                evidence: `Commit ${hash} not found in repository`,
                                error: describeError(lookupErr)
                            };
                        }
                    }
                    // No hash claimed: fall back to reporting the latest commit.
                    const { stdout: logOutput } = await execAsync('git log -1 --format="%H %s"', {
                        timeout: 5000,
                        cwd: process.cwd()
                    });
                    return {
                        ...baseResult,
                        verified: true,
                        confidence: 'medium',
                        evidence: `Most recent commit: ${logOutput.trim()}`
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'Git check failed',
                        error: describeError(err)
                    };
                }
            };
        case 'package_published':
            return async () => {
                try {
                    // Read package.json to get name and version
                    const pkgPath = path.resolve(process.cwd(), 'package.json');
                    const pkgContent = await fs.readFile(pkgPath, 'utf-8');
                    const pkg = JSON.parse(pkgContent);
                    // Both values are placed on a command line; refuse anything
                    // that is not a plain package name / version string.
                    const nameIsSafe = typeof pkg.name === 'string'
                        && /^(?:@[a-z0-9][a-z0-9._-]*\/)?[a-z0-9][a-z0-9._-]*$/i.test(pkg.name);
                    const versionIsSafe = typeof pkg.version === 'string'
                        && /^[0-9a-z][0-9a-z.+-]*$/i.test(pkg.version);
                    if (!nameIsSafe || !versionIsSafe) {
                        return {
                            ...baseResult,
                            verified: false,
                            confidence: 'low',
                            evidence: 'package.json name or version is missing or malformed'
                        };
                    }
                    const { stdout } = await execAsync(`npm view ${pkg.name}@${pkg.version} version 2>&1`, {
                        timeout: 10000
                    });
                    const published = stdout.trim() === pkg.version;
                    return {
                        ...baseResult,
                        verified: published,
                        confidence: 'high',
                        evidence: published ? `${pkg.name}@${pkg.version} found on npm` : 'Version not found on npm'
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'medium',
                        evidence: 'Could not verify npm publication',
                        error: describeError(err)
                    };
                }
            };
        case 'command_executed':
            // Can't really verify past command execution, just acknowledge
            return async () => ({
                ...baseResult,
                verified: true, // Assume true since we can't replay
                confidence: 'low',
                evidence: 'Command execution cannot be retroactively verified'
            });
        case 'dependency_installed':
            return async () => {
                const packageName = claim.params.package;
                if (!packageName) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'low',
                        evidence: 'No package name provided'
                    };
                }
                try {
                    // Check if package exists in node_modules
                    const modulePath = path.resolve(process.cwd(), 'node_modules', packageName);
                    await fs.stat(modulePath);
                    // Also verify in package.json
                    const pkgPath = path.resolve(process.cwd(), 'package.json');
                    const pkgContent = await fs.readFile(pkgPath, 'utf-8');
                    const pkg = JSON.parse(pkgContent);
                    const inDeps = pkg.dependencies?.[packageName] || pkg.devDependencies?.[packageName];
                    return {
                        ...baseResult,
                        verified: true,
                        confidence: inDeps ? 'high' : 'medium',
                        evidence: inDeps
                            ? `Package ${packageName} installed (${inDeps})`
                            : `Package ${packageName} found in node_modules but not in package.json`
                    };
                }
                catch {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: `Package ${packageName} not found in node_modules`
                    };
                }
            };
        case 'service_running':
            return async () => {
                const port = claim.params.port;
                const name = claim.params.name;
                try {
                    if (port) {
                        // Only a real TCP/UDP port number may reach the shell.
                        const portNumber = Number(port);
                        if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
                            return {
                                ...baseResult,
                                verified: false,
                                confidence: 'low',
                                evidence: `Invalid port: ${String(port).slice(0, 20)}`
                            };
                        }
                        // Check if port is in use. The netstat fallback anchors
                        // the number after "." or ":" and before whitespace so
                        // that 80 does not match 8080 or part of an address.
                        const { stdout } = await execAsync(`lsof -i :${portNumber} 2>/dev/null || netstat -an | grep -E "[.:]${portNumber}[[:space:]]"`, {
                            timeout: 5000
                        });
                        const isRunning = stdout.trim().length > 0;
                        return {
                            ...baseResult,
                            verified: isRunning,
                            confidence: 'high',
                            evidence: isRunning ? `Service running on port ${portNumber}` : `No service found on port ${portNumber}`
                        };
                    }
                    else if (name) {
                        // Check if process is running by name. The name travels
                        // in an environment variable: it cannot break out of
                        // the command, and it stays out of the shell's own
                        // command line, which `pgrep -f` would otherwise match.
                        const { stdout } = await execAsync('pgrep -f -- "$VERIFY_PROCESS_NAME" 2>/dev/null || ps aux | grep -e "$VERIFY_PROCESS_NAME" | grep -v grep', {
                            timeout: 5000,
                            env: { ...process.env, VERIFY_PROCESS_NAME: String(name) }
                        });
                        const isRunning = stdout.trim().length > 0;
                        return {
                            ...baseResult,
                            verified: isRunning,
                            confidence: 'medium',
                            evidence: isRunning ? `Process "${name}" appears to be running` : `Process "${name}" not found`
                        };
                    }
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'low',
                        evidence: 'No port or service name provided for verification'
                    };
                }
                catch {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'medium',
                        evidence: 'Could not verify service status'
                    };
                }
            };
        case 'url_accessible':
            return async () => {
                const url = claim.params.url;
                if (!url) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'low',
                        evidence: 'No URL provided'
                    };
                }
                try {
                    // The URL is passed through the environment and bound to
                    // --url, so shell metacharacters or a leading "-" in the
                    // claim cannot alter the command.
                    const { stdout } = await execAsync('curl -s -o /dev/null -w "%{http_code}" --url "$VERIFY_URL" 2>&1', {
                        timeout: 10000,
                        env: { ...process.env, VERIFY_URL: String(url) }
                    });
                    const statusCode = parseInt(stdout.trim(), 10);
                    const isAccessible = statusCode >= 200 && statusCode < 400;
                    return {
                        ...baseResult,
                        verified: isAccessible,
                        confidence: 'high',
                        evidence: `URL returned status ${statusCode}`
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'URL is not accessible',
                        error: describeError(err)
                    };
                }
            };
        case 'content_contains':
            return async () => {
                const filePath = claim.params.path;
                const searchText = claim.params.content;
                if (!filePath || !searchText) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'low',
                        evidence: 'Missing file path or search content'
                    };
                }
                try {
                    const content = await fs.readFile(resolveFromCwd(filePath), 'utf-8');
                    const contains = content.includes(searchText);
                    return {
                        ...baseResult,
                        verified: contains,
                        confidence: 'high',
                        evidence: contains
                            ? `File contains the expected content`
                            : `File does not contain "${searchText.slice(0, 50)}..."`
                    };
                }
                catch (err) {
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'high',
                        evidence: 'Could not read file',
                        error: describeError(err)
                    };
                }
            };
        default:
            return async () => ({
                ...baseResult,
                verified: false,
                confidence: 'low',
                evidence: `Unknown claim type: ${claim.type}`
            });
    }
}
|
|
1003
|
-
/**
 * Verify the claims made in an assistant response.
 *
 * Thin entry point: the three arguments are forwarded unchanged to
 * verifyResponseComprehensive and its report is returned as-is.
 * NOTE(review): the previous comment said extraction and verification are
 * LLM-driven and require `context.llmVerifier`; that logic lives in the
 * delegate and is not visible here - confirm there.
 *
 * @param response - Assistant response to verify.
 * @param context - Verification context handed to the delegate.
 * @param responseId - Optional identifier handed to the delegate.
 * @returns Promise resolving to the delegate's verification report.
 */
export async function verifyResponse(response, context, responseId) {
    const report = await verifyResponseComprehensive(response, context, responseId);
    return report;
}
|
|
1011
|
-
/**
|
|
1012
|
-
* Format a verification report for display
|
|
1013
|
-
*/
|
|
1014
|
-
export function formatVerificationReport(report) {
|
|
1015
|
-
const lines = [];
|
|
1016
|
-
lines.push('═══════════════════════════════════════════════════════════');
|
|
1017
|
-
lines.push(' RESPONSE VERIFICATION REPORT');
|
|
1018
|
-
lines.push('═══════════════════════════════════════════════════════════');
|
|
1019
|
-
lines.push('');
|
|
1020
|
-
const verdictEmoji = {
|
|
1021
|
-
verified: '✅',
|
|
1022
|
-
partially_verified: '⚠️',
|
|
1023
|
-
unverified: '❓',
|
|
1024
|
-
contradicted: '❌'
|
|
1025
|
-
};
|
|
1026
|
-
lines.push(`Verdict: ${verdictEmoji[report.overallVerdict]} ${report.overallVerdict.toUpperCase()}`);
|
|
1027
|
-
lines.push(`Claims: ${report.summary.total} total, ${report.summary.verified} verified, ${report.summary.failed} failed`);
|
|
1028
|
-
lines.push('');
|
|
1029
|
-
if (report.results.length > 0) {
|
|
1030
|
-
lines.push('Verification Details:');
|
|
1031
|
-
lines.push('─────────────────────');
|
|
1032
|
-
for (const result of report.results) {
|
|
1033
|
-
const icon = result.verified ? '✅' : (result.confidence === 'high' ? '❌' : '❓');
|
|
1034
|
-
lines.push(`${icon} ${result.claim.description}`);
|
|
1035
|
-
lines.push(` Evidence: ${result.evidence.slice(0, 100)}`);
|
|
1036
|
-
if (result.error) {
|
|
1037
|
-
lines.push(` Error: ${result.error}`);
|
|
1038
|
-
}
|
|
302
|
+
return JSON.parse(match[0]);
|
|
1039
303
|
}
|
|
1040
304
|
}
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
}
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
}
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
const report = await verifyResponse(response, context);
|
|
1054
|
-
return report.overallVerdict === 'verified' || report.overallVerdict === 'partially_verified';
|
|
1055
|
-
}
|
|
1056
|
-
/**
|
|
1057
|
-
* LLM-based verification prompt for claims that can't be programmatically verified
|
|
1058
|
-
*/
|
|
1059
|
-
const LLM_VERIFICATION_PROMPT = `You are a verification assistant. Analyze whether the following claim is likely TRUE or FALSE based on the evidence provided.
|
|
1060
|
-
|
|
1061
|
-
CLAIM: {CLAIM}
|
|
1062
|
-
|
|
1063
|
-
EVIDENCE/CONTEXT:
|
|
1064
|
-
{CONTEXT}
|
|
1065
|
-
|
|
1066
|
-
Respond with a JSON object:
|
|
1067
|
-
{
|
|
1068
|
-
"verdict": "verified" | "unverified" | "inconclusive",
|
|
1069
|
-
"confidence": "high" | "medium" | "low",
|
|
1070
|
-
"reasoning": "Brief explanation of your analysis",
|
|
1071
|
-
"suggested_test": "Optional: A command or check that could verify this claim"
|
|
1072
|
-
}
|
|
1073
|
-
|
|
1074
|
-
Be conservative - only mark as "verified" if there's strong evidence. Mark as "inconclusive" if you can't determine the truth.`;
|
|
1075
|
-
/**
|
|
1076
|
-
* Verify a claim using LLM when runtime verification isn't possible
|
|
1077
|
-
*/
|
|
1078
|
-
export async function verifyClaimWithLLM(claim, context) {
|
|
1079
|
-
const baseResult = {
|
|
1080
|
-
claim,
|
|
1081
|
-
timestamp: new Date().toISOString()
|
|
1082
|
-
};
|
|
1083
|
-
if (!context.llmVerifier) {
|
|
1084
|
-
return {
|
|
1085
|
-
...baseResult,
|
|
1086
|
-
verified: false,
|
|
1087
|
-
confidence: 'low',
|
|
1088
|
-
evidence: 'No LLM verifier available for semantic verification'
|
|
305
|
+
catch {
|
|
306
|
+
// Fall through to basic tests
|
|
307
|
+
}
|
|
308
|
+
// Fallback: generate basic tests
|
|
309
|
+
return claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).map((c, i) => {
|
|
310
|
+
const test = {
|
|
311
|
+
id: `test-${i}`,
|
|
312
|
+
description: c.statement,
|
|
313
|
+
commands: [],
|
|
314
|
+
shellCommands: [],
|
|
315
|
+
expectedBehavior: c.statement,
|
|
316
|
+
timeout: 30000
|
|
1089
317
|
};
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
if (context.previousState) {
|
|
1095
|
-
contextParts.push(`Previous state: ${JSON.stringify(context.previousState, null, 2)}`);
|
|
1096
|
-
}
|
|
1097
|
-
if (context.currentState) {
|
|
1098
|
-
contextParts.push(`Current state: ${JSON.stringify(context.currentState, null, 2)}`);
|
|
318
|
+
// Add basic verification based on category
|
|
319
|
+
if (c.category === 'file_op' && c.context['path']) {
|
|
320
|
+
test.shellCommands = [`test -f "${c.context['path']}" && echo "EXISTS" || echo "NOT_FOUND"`];
|
|
321
|
+
test.expectedOutputs = ['EXISTS'];
|
|
1099
322
|
}
|
|
1100
|
-
if (
|
|
1101
|
-
|
|
323
|
+
else if (c.category === 'code') {
|
|
324
|
+
test.shellCommands = ['npm run build 2>&1 | tail -5'];
|
|
1102
325
|
}
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
const prompt = LLM_VERIFICATION_PROMPT
|
|
1106
|
-
.replace('{CLAIM}', claim.description)
|
|
1107
|
-
.replace('{CONTEXT}', contextParts.join('\n\n'));
|
|
1108
|
-
const result = await context.llmVerifier(prompt);
|
|
1109
|
-
// Parse LLM response
|
|
1110
|
-
const jsonMatch = result.match(/\{[\s\S]*\}/);
|
|
1111
|
-
if (!jsonMatch) {
|
|
1112
|
-
return {
|
|
1113
|
-
...baseResult,
|
|
1114
|
-
verified: false,
|
|
1115
|
-
confidence: 'low',
|
|
1116
|
-
evidence: 'LLM verification returned invalid response'
|
|
1117
|
-
};
|
|
326
|
+
else if (c.category === 'behavior') {
|
|
327
|
+
test.commands = ['/help'];
|
|
1118
328
|
}
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
...baseResult,
|
|
1122
|
-
verified: parsed.verdict === 'verified',
|
|
1123
|
-
confidence: parsed.confidence || 'medium',
|
|
1124
|
-
evidence: `LLM Analysis: ${parsed.reasoning}${parsed.suggested_test ? ` (Suggested test: ${parsed.suggested_test})` : ''}`
|
|
1125
|
-
};
|
|
1126
|
-
}
|
|
1127
|
-
catch (err) {
|
|
1128
|
-
return {
|
|
1129
|
-
...baseResult,
|
|
1130
|
-
verified: false,
|
|
1131
|
-
confidence: 'low',
|
|
1132
|
-
evidence: 'LLM verification failed',
|
|
1133
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1134
|
-
};
|
|
1135
|
-
}
|
|
1136
|
-
}
|
|
1137
|
-
/**
|
|
1138
|
-
* Generate verification test for extended claim types
|
|
1139
|
-
*/
|
|
1140
|
-
export function generateExtendedVerificationTest(claim, context) {
|
|
1141
|
-
const baseResult = {
|
|
1142
|
-
claim,
|
|
1143
|
-
timestamp: new Date().toISOString()
|
|
1144
|
-
};
|
|
1145
|
-
switch (claim.type) {
|
|
1146
|
-
case 'api_response':
|
|
1147
|
-
return async () => {
|
|
1148
|
-
const url = claim.params.url;
|
|
1149
|
-
const expectedStatus = claim.params.status;
|
|
1150
|
-
const expectedBody = claim.params.body;
|
|
1151
|
-
if (!url) {
|
|
1152
|
-
return {
|
|
1153
|
-
...baseResult,
|
|
1154
|
-
verified: false,
|
|
1155
|
-
confidence: 'low',
|
|
1156
|
-
evidence: 'No API URL provided'
|
|
1157
|
-
};
|
|
1158
|
-
}
|
|
1159
|
-
try {
|
|
1160
|
-
const { stdout } = await execAsync(`curl -s -w "\\n%{http_code}" "${url}" 2>&1`, { timeout: 15000 });
|
|
1161
|
-
const lines = stdout.trim().split('\n');
|
|
1162
|
-
const statusCode = parseInt(lines.pop() || '0', 10);
|
|
1163
|
-
const body = lines.join('\n');
|
|
1164
|
-
let verified = true;
|
|
1165
|
-
const evidenceParts = [];
|
|
1166
|
-
if (expectedStatus && statusCode !== expectedStatus) {
|
|
1167
|
-
verified = false;
|
|
1168
|
-
evidenceParts.push(`Expected status ${expectedStatus}, got ${statusCode}`);
|
|
1169
|
-
}
|
|
1170
|
-
else {
|
|
1171
|
-
evidenceParts.push(`Status: ${statusCode}`);
|
|
1172
|
-
}
|
|
1173
|
-
if (expectedBody && !body.includes(expectedBody)) {
|
|
1174
|
-
verified = false;
|
|
1175
|
-
evidenceParts.push(`Expected body to contain "${expectedBody.slice(0, 50)}..."`);
|
|
1176
|
-
}
|
|
1177
|
-
return {
|
|
1178
|
-
...baseResult,
|
|
1179
|
-
verified,
|
|
1180
|
-
confidence: 'high',
|
|
1181
|
-
evidence: evidenceParts.join('. ')
|
|
1182
|
-
};
|
|
1183
|
-
}
|
|
1184
|
-
catch (err) {
|
|
1185
|
-
return {
|
|
1186
|
-
...baseResult,
|
|
1187
|
-
verified: false,
|
|
1188
|
-
confidence: 'high',
|
|
1189
|
-
evidence: 'API request failed',
|
|
1190
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1191
|
-
};
|
|
1192
|
-
}
|
|
1193
|
-
};
|
|
1194
|
-
case 'env_var_set':
|
|
1195
|
-
return async () => {
|
|
1196
|
-
const varName = claim.params.name;
|
|
1197
|
-
const expectedValue = claim.params.value;
|
|
1198
|
-
if (!varName) {
|
|
1199
|
-
return {
|
|
1200
|
-
...baseResult,
|
|
1201
|
-
verified: false,
|
|
1202
|
-
confidence: 'low',
|
|
1203
|
-
evidence: 'No environment variable name provided'
|
|
1204
|
-
};
|
|
1205
|
-
}
|
|
1206
|
-
const actualValue = process.env[varName];
|
|
1207
|
-
if (actualValue === undefined) {
|
|
1208
|
-
return {
|
|
1209
|
-
...baseResult,
|
|
1210
|
-
verified: false,
|
|
1211
|
-
confidence: 'high',
|
|
1212
|
-
evidence: `Environment variable ${varName} is not set`
|
|
1213
|
-
};
|
|
1214
|
-
}
|
|
1215
|
-
if (expectedValue && actualValue !== expectedValue) {
|
|
1216
|
-
return {
|
|
1217
|
-
...baseResult,
|
|
1218
|
-
verified: false,
|
|
1219
|
-
confidence: 'high',
|
|
1220
|
-
evidence: `Expected ${varName}="${expectedValue}", got "${actualValue}"`
|
|
1221
|
-
};
|
|
1222
|
-
}
|
|
1223
|
-
return {
|
|
1224
|
-
...baseResult,
|
|
1225
|
-
verified: true,
|
|
1226
|
-
confidence: 'high',
|
|
1227
|
-
evidence: `${varName} is set${expectedValue ? ` to expected value` : `: ${actualValue.slice(0, 50)}`}`
|
|
1228
|
-
};
|
|
1229
|
-
};
|
|
1230
|
-
case 'config_changed':
|
|
1231
|
-
return async () => {
|
|
1232
|
-
const configPath = claim.params.path;
|
|
1233
|
-
const expectedKey = claim.params.key;
|
|
1234
|
-
const expectedValue = claim.params.value;
|
|
1235
|
-
if (!configPath) {
|
|
1236
|
-
return {
|
|
1237
|
-
...baseResult,
|
|
1238
|
-
verified: false,
|
|
1239
|
-
confidence: 'low',
|
|
1240
|
-
evidence: 'No config file path provided'
|
|
1241
|
-
};
|
|
1242
|
-
}
|
|
1243
|
-
try {
|
|
1244
|
-
const resolvedPath = path.isAbsolute(configPath)
|
|
1245
|
-
? configPath
|
|
1246
|
-
: path.resolve(context.workingDirectory, configPath);
|
|
1247
|
-
const content = await fs.readFile(resolvedPath, 'utf-8');
|
|
1248
|
-
// Try to parse as JSON
|
|
1249
|
-
let config;
|
|
1250
|
-
try {
|
|
1251
|
-
config = JSON.parse(content);
|
|
1252
|
-
}
|
|
1253
|
-
catch {
|
|
1254
|
-
// Not JSON, check raw content
|
|
1255
|
-
if (expectedValue && content.includes(String(expectedValue))) {
|
|
1256
|
-
return {
|
|
1257
|
-
...baseResult,
|
|
1258
|
-
verified: true,
|
|
1259
|
-
confidence: 'medium',
|
|
1260
|
-
evidence: `Config file contains expected value`
|
|
1261
|
-
};
|
|
1262
|
-
}
|
|
1263
|
-
return {
|
|
1264
|
-
...baseResult,
|
|
1265
|
-
verified: true,
|
|
1266
|
-
confidence: 'low',
|
|
1267
|
-
evidence: 'Config file exists but format unknown'
|
|
1268
|
-
};
|
|
1269
|
-
}
|
|
1270
|
-
if (expectedKey) {
|
|
1271
|
-
const keys = expectedKey.split('.');
|
|
1272
|
-
let value = config;
|
|
1273
|
-
for (const key of keys) {
|
|
1274
|
-
value = value?.[key];
|
|
1275
|
-
}
|
|
1276
|
-
if (expectedValue !== undefined) {
|
|
1277
|
-
const matches = JSON.stringify(value) === JSON.stringify(expectedValue);
|
|
1278
|
-
return {
|
|
1279
|
-
...baseResult,
|
|
1280
|
-
verified: matches,
|
|
1281
|
-
confidence: 'high',
|
|
1282
|
-
evidence: matches
|
|
1283
|
-
? `${expectedKey} has expected value`
|
|
1284
|
-
: `${expectedKey} = ${JSON.stringify(value)}, expected ${JSON.stringify(expectedValue)}`
|
|
1285
|
-
};
|
|
1286
|
-
}
|
|
1287
|
-
return {
|
|
1288
|
-
...baseResult,
|
|
1289
|
-
verified: value !== undefined,
|
|
1290
|
-
confidence: 'high',
|
|
1291
|
-
evidence: value !== undefined
|
|
1292
|
-
? `${expectedKey} exists: ${JSON.stringify(value).slice(0, 100)}`
|
|
1293
|
-
: `${expectedKey} not found in config`
|
|
1294
|
-
};
|
|
1295
|
-
}
|
|
1296
|
-
return {
|
|
1297
|
-
...baseResult,
|
|
1298
|
-
verified: true,
|
|
1299
|
-
confidence: 'medium',
|
|
1300
|
-
evidence: 'Config file exists and is valid JSON'
|
|
1301
|
-
};
|
|
1302
|
-
}
|
|
1303
|
-
catch (err) {
|
|
1304
|
-
return {
|
|
1305
|
-
...baseResult,
|
|
1306
|
-
verified: false,
|
|
1307
|
-
confidence: 'high',
|
|
1308
|
-
evidence: 'Could not read config file',
|
|
1309
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1310
|
-
};
|
|
1311
|
-
}
|
|
1312
|
-
};
|
|
1313
|
-
case 'error_fixed':
|
|
1314
|
-
case 'feature_implemented':
|
|
1315
|
-
case 'refactor_complete':
|
|
1316
|
-
// These require semantic verification - LLM is required
|
|
1317
|
-
return async () => {
|
|
1318
|
-
if (!context.llmVerifier) {
|
|
1319
|
-
return {
|
|
1320
|
-
...baseResult,
|
|
1321
|
-
verified: false,
|
|
1322
|
-
confidence: 'low',
|
|
1323
|
-
evidence: 'Semantic verification requires LLM verifier'
|
|
1324
|
-
};
|
|
1325
|
-
}
|
|
1326
|
-
return verifyClaimWithLLM(claim, context);
|
|
1327
|
-
};
|
|
1328
|
-
case 'data_transformed':
|
|
1329
|
-
case 'database_updated':
|
|
1330
|
-
case 'permission_granted':
|
|
1331
|
-
case 'generic':
|
|
1332
|
-
default:
|
|
1333
|
-
// All these claim types require LLM verification
|
|
1334
|
-
return async () => {
|
|
1335
|
-
if (!context.llmVerifier) {
|
|
1336
|
-
return {
|
|
1337
|
-
...baseResult,
|
|
1338
|
-
verified: false,
|
|
1339
|
-
confidence: 'low',
|
|
1340
|
-
evidence: `${claim.type} verification requires LLM verifier`
|
|
1341
|
-
};
|
|
1342
|
-
}
|
|
1343
|
-
return verifyClaimWithLLM(claim, context);
|
|
1344
|
-
};
|
|
1345
|
-
}
|
|
329
|
+
return test;
|
|
330
|
+
});
|
|
1346
331
|
}
|
|
332
|
+
// ============================================================================
|
|
333
|
+
// MAIN VERIFICATION API
|
|
334
|
+
// ============================================================================
|
|
1347
335
|
/**
|
|
1348
|
-
*
|
|
1349
|
-
*
|
|
336
|
+
* Verify an assistant response using isolated runtime tests.
|
|
337
|
+
* This is the main entry point for verification.
|
|
1350
338
|
*/
|
|
1351
|
-
export async function
|
|
1352
|
-
|
|
339
|
+
export async function verifyResponse(response, ctx, responseId) {
|
|
340
|
+
const timestamp = new Date().toISOString();
|
|
341
|
+
const id = responseId || `verify-${Date.now()}`;
|
|
342
|
+
// Extract claims from response
|
|
343
|
+
const claims = await extractClaims(response, ctx);
|
|
344
|
+
if (claims.length === 0) {
|
|
1353
345
|
return {
|
|
1354
|
-
responseId:
|
|
1355
|
-
timestamp
|
|
346
|
+
responseId: id,
|
|
347
|
+
timestamp,
|
|
1356
348
|
claims: [],
|
|
1357
349
|
results: [],
|
|
1358
350
|
summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
|
|
1359
|
-
overallVerdict: 'unverified'
|
|
351
|
+
overallVerdict: 'unverified',
|
|
352
|
+
trustScore: 50
|
|
1360
353
|
};
|
|
1361
354
|
}
|
|
1362
|
-
//
|
|
1363
|
-
const
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
355
|
+
// Generate isolated tests for claims
|
|
356
|
+
const tests = await generateTests(claims, ctx);
|
|
357
|
+
// Run all isolated tests
|
|
358
|
+
const testResults = [];
|
|
359
|
+
for (const test of tests) {
|
|
360
|
+
const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
|
|
361
|
+
testResults.push(result);
|
|
362
|
+
}
|
|
363
|
+
// Map test results back to claims
|
|
364
|
+
const results = claims.map((claim, i) => {
|
|
365
|
+
const testResult = testResults[i];
|
|
366
|
+
if (!testResult) {
|
|
367
|
+
return {
|
|
1373
368
|
claim,
|
|
1374
369
|
verified: false,
|
|
1375
370
|
confidence: 'low',
|
|
1376
|
-
evidence: '
|
|
1377
|
-
|
|
1378
|
-
timestamp
|
|
1379
|
-
}
|
|
371
|
+
evidence: 'No test generated',
|
|
372
|
+
method: 'skip',
|
|
373
|
+
timestamp
|
|
374
|
+
};
|
|
1380
375
|
}
|
|
1381
|
-
|
|
376
|
+
return {
|
|
377
|
+
claim,
|
|
378
|
+
verified: testResult.success,
|
|
379
|
+
confidence: testResult.success ? 'high' : (testResult.matchedPatterns.length > 0 ? 'medium' : 'low'),
|
|
380
|
+
evidence: testResult.success
|
|
381
|
+
? `Verified in isolated runtime: ${testResult.matchedPatterns.join(', ')}`
|
|
382
|
+
: `Failed: ${testResult.unmatchedPatterns.join(', ')}`,
|
|
383
|
+
method: 'isolated-runtime',
|
|
384
|
+
reasoning: testResult.llmAssessment,
|
|
385
|
+
executedCode: [...(testResult.test.shellCommands || []), ...(testResult.test.commands || [])].join('\n'),
|
|
386
|
+
rawOutput: testResult.output.slice(0, 2000),
|
|
387
|
+
error: testResult.errors || undefined,
|
|
388
|
+
timestamp
|
|
389
|
+
};
|
|
390
|
+
});
|
|
391
|
+
// Calculate summary
|
|
1382
392
|
const verified = results.filter(r => r.verified).length;
|
|
1383
393
|
const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
|
|
1384
394
|
const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
|
|
395
|
+
// Determine verdict
|
|
1385
396
|
let overallVerdict;
|
|
1386
397
|
if (failed > 0) {
|
|
1387
398
|
overallVerdict = 'contradicted';
|
|
@@ -1395,673 +406,69 @@ export async function verifyResponseComprehensive(response, context, responseId)
|
|
|
1395
406
|
else {
|
|
1396
407
|
overallVerdict = 'unverified';
|
|
1397
408
|
}
|
|
409
|
+
// Calculate trust score
|
|
410
|
+
const trustScore = claims.length > 0
|
|
411
|
+
? Math.round((verified / claims.length) * 100)
|
|
412
|
+
: 50;
|
|
1398
413
|
return {
|
|
1399
|
-
responseId:
|
|
1400
|
-
timestamp
|
|
414
|
+
responseId: id,
|
|
415
|
+
timestamp,
|
|
1401
416
|
claims,
|
|
1402
417
|
results,
|
|
1403
|
-
summary: {
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
failed,
|
|
1407
|
-
inconclusive
|
|
1408
|
-
},
|
|
1409
|
-
overallVerdict
|
|
418
|
+
summary: { total: claims.length, verified, failed, inconclusive },
|
|
419
|
+
overallVerdict,
|
|
420
|
+
trustScore
|
|
1410
421
|
};
|
|
1411
422
|
}
|
|
1412
423
|
/**
|
|
1413
|
-
*
|
|
1414
|
-
*/
|
|
1415
|
-
/**
 * Pick the verification strategy for a claim from its `type`.
 *
 * @param {{ type: string }} claim - Claim whose type selects the strategy.
 * @returns {string} One of 'filesystem', 'network', 'runtime', 'semantic',
 *   'comparison', or 'llm' for 'generic' and any unlisted type.
 */
export function getVerificationStrategy(claim) {
    // Each entry pairs a strategy with the claim types it handles.
    const strategyGroups = [
        ['filesystem', [
            'file_created',
            'file_modified',
            'file_deleted',
            'content_contains',
            'config_changed',
            'permission_granted'
        ]],
        ['network', [
            'url_accessible',
            'api_response',
            'service_running'
        ]],
        ['runtime', [
            'code_compiles',
            'tests_pass',
            'command_executed',
            'dependency_installed',
            'git_committed',
            'package_published',
            'env_var_set'
        ]],
        ['semantic', [
            'error_fixed',
            'feature_implemented',
            'refactor_complete',
            'data_transformed'
        ]],
        ['comparison', [
            'database_updated'
        ]]
    ];
    const claimType = claim.type;
    const hit = strategyGroups.find(([, handledTypes]) => handledTypes.includes(claimType));
    // 'generic' and every unrecognised type fall through to the LLM strategy.
    return hit === undefined ? 'llm' : hit[0];
}
|
|
1448
|
-
/**
|
|
1449
|
-
* Prompt for LLM to generate verification code
|
|
424
|
+
* Format verification report for display
|
|
1450
425
|
*/
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
Parameters: {CLAIM_PARAMS}
|
|
1458
|
-
|
|
1459
|
-
WORKING DIRECTORY: {WORKING_DIR}
|
|
1460
|
-
|
|
1461
|
-
Generate a verification test. Choose the most appropriate approach:
|
|
1462
|
-
|
|
1463
|
-
1. SHELL COMMAND - For file operations, git, npm, system checks
|
|
1464
|
-
2. JAVASCRIPT - For complex logic, API calls, JSON parsing
|
|
1465
|
-
3. API - For HTTP endpoints, external services
|
|
426
|
+
export function formatVerificationReport(report) {
|
|
427
|
+
const bar = '█'.repeat(Math.round(report.trustScore / 10)) + '░'.repeat(10 - Math.round(report.trustScore / 10));
|
|
428
|
+
const icon = report.trustScore >= 80 ? '✅' : report.trustScore >= 50 ? '⚠️' : '❌';
|
|
429
|
+
let out = `╔════════════════════════════════════════════════════════════╗
|
|
430
|
+
║ ISOLATED RUNTIME VERIFICATION REPORT ║
|
|
431
|
+
╚════════════════════════════════════════════════════════════╝
|
|
1466
432
|
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
- Code must output a clear result that can be parsed
|
|
1471
|
-
- For shell: output should be parseable (exit code 0 = verified, non-zero = failed)
|
|
1472
|
-
- For JavaScript: must export/return { verified: boolean, evidence: string }
|
|
1473
|
-
- Do NOT use interactive commands
|
|
1474
|
-
- Do NOT access sensitive data or credentials
|
|
433
|
+
`;
|
|
434
|
+
out += `Trust: ${icon} ${report.trustScore}/100 [${bar}]
|
|
435
|
+
Verdict: ${report.overallVerdict.toUpperCase()}
|
|
1475
436
|
|
|
1476
|
-
|
|
1477
|
-
{
|
|
1478
|
-
"testType": "shell" | "javascript" | "api",
|
|
1479
|
-
"code": "the verification code",
|
|
1480
|
-
"description": "what this test does",
|
|
1481
|
-
"expectedOutcome": "what success looks like",
|
|
1482
|
-
"safeToRun": true | false,
|
|
1483
|
-
"safetyReason": "why it's safe/unsafe"
|
|
1484
|
-
}
|
|
437
|
+
Claims: ${report.summary.total} | ✅ ${report.summary.verified} | ❌ ${report.summary.failed} | ❓ ${report.summary.inconclusive}
|
|
1485
438
|
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
}
|
|
1494
|
-
try {
|
|
1495
|
-
const prompt = VERIFICATION_CODE_GENERATION_PROMPT
|
|
1496
|
-
.replace('{CLAIM_TYPE}', claim.type)
|
|
1497
|
-
.replace('{CLAIM_DESCRIPTION}', claim.description)
|
|
1498
|
-
.replace('{CLAIM_EVIDENCE}', claim.evidence)
|
|
1499
|
-
.replace('{CLAIM_PARAMS}', JSON.stringify(claim.params, null, 2))
|
|
1500
|
-
.replace('{WORKING_DIR}', context.workingDirectory);
|
|
1501
|
-
const result = await context.llmVerifier(prompt);
|
|
1502
|
-
// Parse the JSON response
|
|
1503
|
-
const jsonMatch = result.match(/\{[\s\S]*\}/);
|
|
1504
|
-
if (!jsonMatch) {
|
|
1505
|
-
return null;
|
|
1506
|
-
}
|
|
1507
|
-
const parsed = JSON.parse(jsonMatch[0]);
|
|
1508
|
-
return {
|
|
1509
|
-
claim,
|
|
1510
|
-
testType: parsed.testType,
|
|
1511
|
-
code: parsed.code,
|
|
1512
|
-
description: parsed.description,
|
|
1513
|
-
expectedOutcome: parsed.expectedOutcome,
|
|
1514
|
-
safetyCheck: parsed.safeToRun
|
|
1515
|
-
};
|
|
1516
|
-
}
|
|
1517
|
-
catch (err) {
|
|
1518
|
-
console.error('Failed to generate verification code:', err);
|
|
1519
|
-
return null;
|
|
1520
|
-
}
|
|
1521
|
-
}
|
|
1522
|
-
/**
|
|
1523
|
-
* Safety patterns to block dangerous code
|
|
1524
|
-
*/
|
|
1525
|
-
const DANGEROUS_PATTERNS = [
|
|
1526
|
-
/\brm\s+-rf?\b/i, // rm commands
|
|
1527
|
-
/\brmdir\b/i, // rmdir
|
|
1528
|
-
/\bdd\s+if=/i, // dd (disk destroyer)
|
|
1529
|
-
/\bmkfs\b/i, // format filesystem
|
|
1530
|
-
/\b>\s*\/dev\//i, // write to devices
|
|
1531
|
-
/\bchmod\s+777\b/i, // dangerous permissions
|
|
1532
|
-
/\bsudo\b/i, // sudo commands
|
|
1533
|
-
/\bcurl.*\|\s*sh\b/i, // pipe to shell
|
|
1534
|
-
/\bwget.*\|\s*sh\b/i, // pipe to shell
|
|
1535
|
-
/\beval\s*\(/i, // eval in JS
|
|
1536
|
-
/new\s+Function\s*\(/i, // Function constructor
|
|
1537
|
-
/child_process/i, // subprocess in JS (unless we control it)
|
|
1538
|
-
/\bexec\s*\(/i, // exec calls
|
|
1539
|
-
/\bspawn\s*\(/i, // spawn calls
|
|
1540
|
-
/writeFile/i, // file writes
|
|
1541
|
-
/appendFile/i, // file appends
|
|
1542
|
-
/unlink\s*\(/i, // file deletion
|
|
1543
|
-
/rmSync/i, // sync deletion
|
|
1544
|
-
/fs\.rm/i, // fs remove
|
|
1545
|
-
/DROP\s+TABLE/i, // SQL injection
|
|
1546
|
-
/DELETE\s+FROM/i, // SQL deletion
|
|
1547
|
-
/TRUNCATE/i, // SQL truncate
|
|
1548
|
-
/;\s*--/, // SQL comment injection
|
|
1549
|
-
/process\.exit/i, // process exit
|
|
1550
|
-
/require\s*\(\s*['"]child/i, // require child_process
|
|
1551
|
-
];
|
|
1552
|
-
/**
|
|
1553
|
-
* Validate that generated code is safe to execute
|
|
1554
|
-
*/
|
|
1555
|
-
export function validateGeneratedCode(test) {
|
|
1556
|
-
// First check the LLM's own safety assessment
|
|
1557
|
-
if (!test.safetyCheck) {
|
|
1558
|
-
return { safe: false, reason: 'LLM marked code as unsafe' };
|
|
1559
|
-
}
|
|
1560
|
-
// Check against dangerous patterns
|
|
1561
|
-
for (const pattern of DANGEROUS_PATTERNS) {
|
|
1562
|
-
if (pattern.test(test.code)) {
|
|
1563
|
-
return {
|
|
1564
|
-
safe: false,
|
|
1565
|
-
reason: `Dangerous pattern detected: ${pattern.source}`
|
|
1566
|
-
};
|
|
1567
|
-
}
|
|
1568
|
-
}
|
|
1569
|
-
// Additional checks for shell commands
|
|
1570
|
-
if (test.testType === 'shell') {
|
|
1571
|
-
// Only allow specific safe commands
|
|
1572
|
-
const safeShellPrefixes = [
|
|
1573
|
-
'ls', 'cat', 'head', 'tail', 'grep', 'find', 'stat', 'file',
|
|
1574
|
-
'test', 'echo', 'pwd', 'wc', 'diff', 'cmp',
|
|
1575
|
-
'git log', 'git status', 'git show', 'git diff', 'git branch',
|
|
1576
|
-
'npm view', 'npm list', 'npm ls',
|
|
1577
|
-
'node -e', 'node --eval',
|
|
1578
|
-
'curl -s', 'curl --silent', 'wget -q',
|
|
1579
|
-
'jq', 'python -c', 'python3 -c',
|
|
1580
|
-
'lsof', 'netstat', 'ss', 'ps',
|
|
1581
|
-
'which', 'type', 'command -v',
|
|
1582
|
-
];
|
|
1583
|
-
const trimmedCode = test.code.trim().toLowerCase();
|
|
1584
|
-
const startsWithSafe = safeShellPrefixes.some(prefix => trimmedCode.startsWith(prefix.toLowerCase()));
|
|
1585
|
-
if (!startsWithSafe) {
|
|
1586
|
-
// Check if it's a simple test/check command
|
|
1587
|
-
if (!trimmedCode.startsWith('[') && !trimmedCode.startsWith('if ')) {
|
|
1588
|
-
return {
|
|
1589
|
-
safe: false,
|
|
1590
|
-
reason: 'Shell command does not start with a known safe prefix'
|
|
1591
|
-
};
|
|
1592
|
-
}
|
|
1593
|
-
}
|
|
1594
|
-
}
|
|
1595
|
-
// For JavaScript, ensure it's a simple expression
|
|
1596
|
-
if (test.testType === 'javascript') {
|
|
1597
|
-
// Check code length - very long code is suspicious
|
|
1598
|
-
if (test.code.length > 2000) {
|
|
1599
|
-
return { safe: false, reason: 'JavaScript code too long' };
|
|
439
|
+
`;
|
|
440
|
+
out += `🔬 ISOLATED RUNTIME TESTS:\n`;
|
|
441
|
+
for (const r of report.results.slice(0, 8)) {
|
|
442
|
+
const statusIcon = r.verified ? '✅' : r.confidence === 'high' ? '❌' : '❓';
|
|
443
|
+
out += ` ${statusIcon} [${r.confidence}] ${r.claim.statement.slice(0, 50)}...\n`;
|
|
444
|
+
if (r.reasoning) {
|
|
445
|
+
out += ` └─ ${r.reasoning.slice(0, 60)}\n`;
|
|
1600
446
|
}
|
|
1601
447
|
}
|
|
1602
|
-
|
|
1603
|
-
}
|
|
1604
|
-
/**
|
|
1605
|
-
* Execute a generated verification test
|
|
1606
|
-
*/
|
|
1607
|
-
export async function executeGeneratedTest(test, context) {
|
|
1608
|
-
const baseResult = {
|
|
1609
|
-
claim: test.claim,
|
|
1610
|
-
timestamp: new Date().toISOString()
|
|
1611
|
-
};
|
|
1612
|
-
// Validate safety first
|
|
1613
|
-
const safetyResult = validateGeneratedCode(test);
|
|
1614
|
-
if (!safetyResult.safe) {
|
|
1615
|
-
return {
|
|
1616
|
-
...baseResult,
|
|
1617
|
-
verified: false,
|
|
1618
|
-
confidence: 'low',
|
|
1619
|
-
evidence: `Generated test blocked: ${safetyResult.reason}`,
|
|
1620
|
-
error: 'Safety validation failed'
|
|
1621
|
-
};
|
|
1622
|
-
}
|
|
1623
|
-
try {
|
|
1624
|
-
switch (test.testType) {
|
|
1625
|
-
case 'shell': {
|
|
1626
|
-
const { stdout, stderr } = await execAsync(test.code, {
|
|
1627
|
-
cwd: context.workingDirectory,
|
|
1628
|
-
timeout: 10000, // 10 second timeout
|
|
1629
|
-
maxBuffer: 1024 * 1024 // 1MB max output
|
|
1630
|
-
});
|
|
1631
|
-
const output = (stdout + stderr).trim();
|
|
1632
|
-
// Shell convention: exit 0 = success
|
|
1633
|
-
return {
|
|
1634
|
-
...baseResult,
|
|
1635
|
-
verified: true,
|
|
1636
|
-
confidence: 'high',
|
|
1637
|
-
evidence: `Test passed. Output: ${output.slice(0, 500)}`
|
|
1638
|
-
};
|
|
1639
|
-
}
|
|
1640
|
-
case 'javascript': {
|
|
1641
|
-
// Execute JavaScript in a sandboxed way using node -e
|
|
1642
|
-
const wrappedCode = `
|
|
1643
|
-
const result = (async () => {
|
|
1644
|
-
${test.code}
|
|
1645
|
-
})();
|
|
1646
|
-
result.then(r => console.log(JSON.stringify(r))).catch(e => {
|
|
1647
|
-
console.log(JSON.stringify({ verified: false, evidence: e.message }));
|
|
1648
|
-
});
|
|
1649
|
-
`;
|
|
1650
|
-
const { stdout } = await execAsync(`node -e ${JSON.stringify(wrappedCode)}`, {
|
|
1651
|
-
cwd: context.workingDirectory,
|
|
1652
|
-
timeout: 10000
|
|
1653
|
-
});
|
|
1654
|
-
try {
|
|
1655
|
-
const result = JSON.parse(stdout.trim());
|
|
1656
|
-
return {
|
|
1657
|
-
...baseResult,
|
|
1658
|
-
verified: result.verified,
|
|
1659
|
-
confidence: 'high',
|
|
1660
|
-
evidence: result.evidence
|
|
1661
|
-
};
|
|
1662
|
-
}
|
|
1663
|
-
catch {
|
|
1664
|
-
return {
|
|
1665
|
-
...baseResult,
|
|
1666
|
-
verified: false,
|
|
1667
|
-
confidence: 'medium',
|
|
1668
|
-
evidence: `JavaScript output: ${stdout.slice(0, 500)}`
|
|
1669
|
-
};
|
|
1670
|
-
}
|
|
1671
|
-
}
|
|
1672
|
-
case 'api': {
|
|
1673
|
-
// For API tests, use curl
|
|
1674
|
-
const { stdout } = await execAsync(test.code, {
|
|
1675
|
-
cwd: context.workingDirectory,
|
|
1676
|
-
timeout: 15000
|
|
1677
|
-
});
|
|
1678
|
-
// Try to parse as JSON result
|
|
1679
|
-
try {
|
|
1680
|
-
const result = JSON.parse(stdout.trim());
|
|
1681
|
-
return {
|
|
1682
|
-
...baseResult,
|
|
1683
|
-
verified: Boolean(result.verified ?? result.success ?? result.ok),
|
|
1684
|
-
confidence: 'high',
|
|
1685
|
-
evidence: `API response: ${JSON.stringify(result).slice(0, 500)}`
|
|
1686
|
-
};
|
|
1687
|
-
}
|
|
1688
|
-
catch {
|
|
1689
|
-
// Non-JSON response - check for success indicators
|
|
1690
|
-
const isSuccess = stdout.includes('200') || stdout.includes('success') || stdout.includes('ok');
|
|
1691
|
-
return {
|
|
1692
|
-
...baseResult,
|
|
1693
|
-
verified: isSuccess,
|
|
1694
|
-
confidence: 'medium',
|
|
1695
|
-
evidence: `API output: ${stdout.slice(0, 500)}`
|
|
1696
|
-
};
|
|
1697
|
-
}
|
|
1698
|
-
}
|
|
1699
|
-
default:
|
|
1700
|
-
return {
|
|
1701
|
-
...baseResult,
|
|
1702
|
-
verified: false,
|
|
1703
|
-
confidence: 'low',
|
|
1704
|
-
evidence: `Unknown test type: ${test.testType}`
|
|
1705
|
-
};
|
|
1706
|
-
}
|
|
1707
|
-
}
|
|
1708
|
-
catch (err) {
|
|
1709
|
-
// Command failed (non-zero exit) = verification failed
|
|
1710
|
-
return {
|
|
1711
|
-
...baseResult,
|
|
1712
|
-
verified: false,
|
|
1713
|
-
confidence: 'high',
|
|
1714
|
-
evidence: `Test failed: ${err instanceof Error ? err.message : 'Unknown error'}`,
|
|
1715
|
-
error: err instanceof Error ? err.message : 'Unknown error'
|
|
1716
|
-
};
|
|
1717
|
-
}
|
|
1718
|
-
}
|
|
1719
|
-
/**
|
|
1720
|
-
* Verify a claim using LLM-generated runtime test
|
|
1721
|
-
*/
|
|
1722
|
-
export async function verifyWithGeneratedTest(claim, context) {
|
|
1723
|
-
const baseResult = {
|
|
1724
|
-
claim,
|
|
1725
|
-
timestamp: new Date().toISOString()
|
|
1726
|
-
};
|
|
1727
|
-
// Generate verification code
|
|
1728
|
-
const test = await generateVerificationCode(claim, context);
|
|
1729
|
-
if (!test) {
|
|
1730
|
-
return {
|
|
1731
|
-
...baseResult,
|
|
1732
|
-
verified: false,
|
|
1733
|
-
confidence: 'low',
|
|
1734
|
-
evidence: 'Failed to generate verification test'
|
|
1735
|
-
};
|
|
1736
|
-
}
|
|
1737
|
-
// Execute the generated test
|
|
1738
|
-
return executeGeneratedTest(test, context);
|
|
1739
|
-
}
|
|
1740
|
-
/**
|
|
1741
|
-
* Full verification using LLM-generated tests
|
|
1742
|
-
* This is the most powerful verification method - LLM decides HOW to verify each claim
|
|
1743
|
-
*/
|
|
1744
|
-
export async function verifyResponseWithGeneratedTests(response, context, responseId) {
|
|
1745
|
-
// Extract claims using LLM
|
|
1746
|
-
const claims = context.llmVerifier
|
|
1747
|
-
? await extractClaimsWithLLM(response, context.llmVerifier)
|
|
1748
|
-
: extractClaims(response);
|
|
1749
|
-
const results = [];
|
|
1750
|
-
for (const claim of claims) {
|
|
1751
|
-
// For each claim, generate and run a custom verification test
|
|
1752
|
-
const result = await verifyWithGeneratedTest(claim, context);
|
|
1753
|
-
results.push(result);
|
|
1754
|
-
}
|
|
1755
|
-
const verified = results.filter(r => r.verified).length;
|
|
1756
|
-
const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
|
|
1757
|
-
const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
|
|
1758
|
-
let overallVerdict;
|
|
1759
|
-
if (failed > 0) {
|
|
1760
|
-
overallVerdict = 'contradicted';
|
|
1761
|
-
}
|
|
1762
|
-
else if (verified === claims.length && claims.length > 0) {
|
|
1763
|
-
overallVerdict = 'verified';
|
|
448
|
+
if (report.results.length > 8) {
|
|
449
|
+
out += ` ... +${report.results.length - 8} more\n`;
|
|
1764
450
|
}
|
|
1765
|
-
|
|
1766
|
-
overallVerdict = 'partially_verified';
|
|
1767
|
-
}
|
|
1768
|
-
else {
|
|
1769
|
-
overallVerdict = 'unverified';
|
|
1770
|
-
}
|
|
1771
|
-
return {
|
|
1772
|
-
responseId: responseId || `response-${Date.now()}`,
|
|
1773
|
-
timestamp: new Date().toISOString(),
|
|
1774
|
-
claims,
|
|
1775
|
-
results,
|
|
1776
|
-
summary: {
|
|
1777
|
-
total: claims.length,
|
|
1778
|
-
verified,
|
|
1779
|
-
failed,
|
|
1780
|
-
inconclusive
|
|
1781
|
-
},
|
|
1782
|
-
overallVerdict
|
|
1783
|
-
};
|
|
451
|
+
return out;
|
|
1784
452
|
}
|
|
1785
453
|
/**
|
|
1786
|
-
*
|
|
454
|
+
* Quick verification - verify only critical/high priority claims
|
|
1787
455
|
*/
|
|
1788
|
-
export async function
|
|
1789
|
-
const claims =
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
if (safety.safe) {
|
|
1801
|
-
// Use generated test
|
|
1802
|
-
result = await executeGeneratedTest(generatedTest, context);
|
|
1803
|
-
results.push(result);
|
|
1804
|
-
continue;
|
|
1805
|
-
}
|
|
1806
|
-
}
|
|
1807
|
-
}
|
|
1808
|
-
// Fall back to predefined verification
|
|
1809
|
-
const standardTypes = [
|
|
1810
|
-
'file_created', 'file_modified', 'file_deleted', 'code_compiles',
|
|
1811
|
-
'tests_pass', 'git_committed', 'package_published', 'command_executed',
|
|
1812
|
-
'dependency_installed', 'service_running', 'url_accessible', 'content_contains'
|
|
1813
|
-
];
|
|
1814
|
-
let test;
|
|
1815
|
-
if (standardTypes.includes(claim.type)) {
|
|
1816
|
-
test = generateVerificationTest(claim);
|
|
1817
|
-
}
|
|
1818
|
-
else {
|
|
1819
|
-
test = generateExtendedVerificationTest(claim, context);
|
|
1820
|
-
}
|
|
1821
|
-
try {
|
|
1822
|
-
result = await test();
|
|
1823
|
-
}
|
|
1824
|
-
catch (err) {
|
|
1825
|
-
result = {
|
|
1826
|
-
claim,
|
|
1827
|
-
verified: false,
|
|
1828
|
-
confidence: 'low',
|
|
1829
|
-
evidence: 'Verification failed',
|
|
1830
|
-
error: err instanceof Error ? err.message : 'Unknown error',
|
|
1831
|
-
timestamp: new Date().toISOString()
|
|
1832
|
-
};
|
|
1833
|
-
}
|
|
1834
|
-
results.push(result);
|
|
1835
|
-
}
|
|
1836
|
-
const verified = results.filter(r => r.verified).length;
|
|
1837
|
-
const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
|
|
1838
|
-
const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
|
|
1839
|
-
let overallVerdict;
|
|
1840
|
-
if (failed > 0) {
|
|
1841
|
-
overallVerdict = 'contradicted';
|
|
1842
|
-
}
|
|
1843
|
-
else if (verified === claims.length && claims.length > 0) {
|
|
1844
|
-
overallVerdict = 'verified';
|
|
1845
|
-
}
|
|
1846
|
-
else if (verified > 0) {
|
|
1847
|
-
overallVerdict = 'partially_verified';
|
|
1848
|
-
}
|
|
1849
|
-
else {
|
|
1850
|
-
overallVerdict = 'unverified';
|
|
456
|
+
export async function quickVerify(response, ctx) {
|
|
457
|
+
const claims = await extractClaims(response, ctx);
|
|
458
|
+
const critical = claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).slice(0, 3);
|
|
459
|
+
if (critical.length === 0) {
|
|
460
|
+
return { trustScore: 50, summary: 'No critical claims to verify' };
|
|
461
|
+
}
|
|
462
|
+
const tests = await generateTests(critical, ctx);
|
|
463
|
+
let verified = 0;
|
|
464
|
+
for (const test of tests) {
|
|
465
|
+
const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
|
|
466
|
+
if (result.success)
|
|
467
|
+
verified++;
|
|
1851
468
|
}
|
|
1852
469
|
return {
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
claims,
|
|
1856
|
-
results,
|
|
1857
|
-
summary: {
|
|
1858
|
-
total: claims.length,
|
|
1859
|
-
verified,
|
|
1860
|
-
failed,
|
|
1861
|
-
inconclusive
|
|
1862
|
-
},
|
|
1863
|
-
overallVerdict
|
|
470
|
+
trustScore: Math.round((verified / critical.length) * 100),
|
|
471
|
+
summary: `${verified}/${critical.length} critical claims verified`
|
|
1864
472
|
};
|
|
1865
473
|
}
|
|
1866
|
-
const UNIVERSAL_EXTRACT = `Extract ALL verifiable claims from this AI response. Include explicit claims, implicit claims, state changes, results, assertions.
|
|
1867
|
-
|
|
1868
|
-
RESPONSE:
|
|
1869
|
-
---
|
|
1870
|
-
{RESPONSE}
|
|
1871
|
-
---
|
|
1872
|
-
CONTEXT: {CONTEXT}
|
|
1873
|
-
DIR: {WORKING_DIR}
|
|
1874
|
-
|
|
1875
|
-
Return JSON array: [{"id":"c1","statement":"claim","category":"file_op|code|state|data|behavior|fact|other","verifiable":true/false,"verificationApproach":"how","priority":"critical|high|medium|low","context":{}}]
|
|
1876
|
-
Output ONLY valid JSON.`;
|
|
1877
|
-
const UNIVERSAL_GEN = `Generate verification code for: {STATEMENT}
|
|
1878
|
-
Category: {CATEGORY} | Approach: {APPROACH} | Context: {CONTEXT} | Dir: {WORKING_DIR} | Platform: {PLATFORM}
|
|
1879
|
-
|
|
1880
|
-
Use shell/javascript/python. READ-ONLY only.
|
|
1881
|
-
Return JSON: {"steps":[{"type":"shell|javascript|python","code":"code","desc":"what"}],"success":"success criteria","failure":"failure criteria","confPass":0-100,"confFail":0-100,"safe":{"ok":true/false,"why":"reason"}}
|
|
1882
|
-
Output ONLY valid JSON.`;
|
|
1883
|
-
const UNIVERSAL_ASSESS = `Assess: RESPONSE:{RESPONSE} CLAIMS:{CLAIMS} RESULTS:{RESULTS}
|
|
1884
|
-
Return JSON: {"trust":0-100,"summary":"text","concerns":[]}
|
|
1885
|
-
Output ONLY valid JSON.`;
|
|
1886
|
-
const UNSAFE = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /exec\s*\(/i, /child_process/i, /os\.system/i, /subprocess/i, /curl.*\|.*sh/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i, /kill/i];
|
|
1887
|
-
export function validateUniversalCode(c) {
|
|
1888
|
-
for (const p of UNSAFE)
|
|
1889
|
-
if (p.test(c))
|
|
1890
|
-
return { safe: false, reason: p.source };
|
|
1891
|
-
return c.length > 5000 ? { safe: false, reason: 'too long' } : { safe: true, reason: 'ok' };
|
|
1892
|
-
}
|
|
1893
|
-
async function runUniversalStep(s, cwd) {
|
|
1894
|
-
const v = validateUniversalCode(s.code);
|
|
1895
|
-
if (!v.safe)
|
|
1896
|
-
return { ok: false, out: v.reason };
|
|
1897
|
-
try {
|
|
1898
|
-
if (s.type === 'shell') {
|
|
1899
|
-
const { stdout, stderr } = await execAsync(s.code, { cwd, timeout: 30000, maxBuffer: 5 * 1024 * 1024 });
|
|
1900
|
-
return { ok: true, out: stdout + stderr };
|
|
1901
|
-
}
|
|
1902
|
-
if (s.type === 'javascript') {
|
|
1903
|
-
const w = `(async()=>{try{const fs=require('fs').promises;const r=await(async()=>{${s.code}})();console.log(JSON.stringify({ok:1,r}))}catch(e){console.log(JSON.stringify({ok:0,e:e.message}))}})()`;
|
|
1904
|
-
const { stdout } = await execAsync(`node -e ${JSON.stringify(w)}`, { cwd, timeout: 30000 });
|
|
1905
|
-
return { ok: true, out: stdout };
|
|
1906
|
-
}
|
|
1907
|
-
if (s.type === 'python') {
|
|
1908
|
-
const { stdout, stderr } = await execAsync(`python3 -c ${JSON.stringify(s.code)}`, { cwd, timeout: 30000 });
|
|
1909
|
-
return { ok: true, out: stdout + stderr };
|
|
1910
|
-
}
|
|
1911
|
-
return { ok: false, out: 'unknown type' };
|
|
1912
|
-
}
|
|
1913
|
-
catch (e) {
|
|
1914
|
-
return { ok: false, out: e instanceof Error ? e.message : 'err' };
|
|
1915
|
-
}
|
|
1916
|
-
}
|
|
1917
|
-
export async function extractUniversalClaims(r, ctx) {
|
|
1918
|
-
if (!ctx.llmVerifier)
|
|
1919
|
-
return extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
|
|
1920
|
-
try {
|
|
1921
|
-
const p = UNIVERSAL_EXTRACT.replace('{RESPONSE}', r.slice(0, 8000)).replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '').replace('{WORKING_DIR}', ctx.workingDirectory);
|
|
1922
|
-
const res = await ctx.llmVerifier(p);
|
|
1923
|
-
const m = res.match(/\[[\s\S]*\]/);
|
|
1924
|
-
if (m)
|
|
1925
|
-
return JSON.parse(m[0]);
|
|
1926
|
-
}
|
|
1927
|
-
catch { /* fall through */ }
|
|
1928
|
-
return extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
|
|
1929
|
-
}
|
|
1930
|
-
export async function verifyUniversalClaim(claim, ctx) {
|
|
1931
|
-
const base = { claim, timestamp: new Date().toISOString() };
|
|
1932
|
-
if (!claim.verifiable)
|
|
1933
|
-
return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'Not verifiable', reasoning: 'Cannot verify' };
|
|
1934
|
-
if (!ctx.llmVerifier)
|
|
1935
|
-
return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'No LLM', reasoning: 'Needs LLM' };
|
|
1936
|
-
try {
|
|
1937
|
-
const p = UNIVERSAL_GEN.replace('{STATEMENT}', claim.statement).replace('{CATEGORY}', claim.category).replace('{APPROACH}', claim.verificationApproach).replace('{CONTEXT}', JSON.stringify(claim.context)).replace('{WORKING_DIR}', ctx.workingDirectory).replace('{PLATFORM}', process.platform);
|
|
1938
|
-
const res = await ctx.llmVerifier(p);
|
|
1939
|
-
const m = res.match(/\{[\s\S]*\}/);
|
|
1940
|
-
if (!m)
|
|
1941
|
-
throw new Error('bad');
|
|
1942
|
-
const plan = JSON.parse(m[0]);
|
|
1943
|
-
if (!plan.safe.ok)
|
|
1944
|
-
return { ...base, verified: false, confidence: 0, method: 'blocked', evidence: plan.safe.why, reasoning: 'Unsafe' };
|
|
1945
|
-
let allOk = true, out = '', code = '';
|
|
1946
|
-
for (const s of plan.steps) {
|
|
1947
|
-
code += s.code + '\n';
|
|
1948
|
-
const r = await runUniversalStep(s, ctx.workingDirectory);
|
|
1949
|
-
out += r.out + '\n';
|
|
1950
|
-
if (!r.ok)
|
|
1951
|
-
allOk = false;
|
|
1952
|
-
}
|
|
1953
|
-
return { ...base, verified: allOk, confidence: allOk ? plan.confPass : plan.confFail, method: plan.steps.map(s => s.type).join('+'), evidence: allOk ? plan.success : plan.failure, reasoning: allOk ? 'All passed' : 'Some failed', executedCode: code, rawOutput: out.slice(0, 2000) };
|
|
1954
|
-
}
|
|
1955
|
-
catch (e) {
|
|
1956
|
-
return { ...base, verified: false, confidence: 10, method: 'error', evidence: 'Failed', reasoning: e instanceof Error ? e.message : 'err' };
|
|
1957
|
-
}
|
|
1958
|
-
}
|
|
1959
|
-
export async function verifyResponseUniversal(response, ctx, id) {
|
|
1960
|
-
const claims = await extractUniversalClaims(response, ctx);
|
|
1961
|
-
const results = [];
|
|
1962
|
-
// Identify self-referential claims (about erosolar-cli itself)
|
|
1963
|
-
const selfClaims = claims.filter(c => c.statement.toLowerCase().includes('erosolar') ||
|
|
1964
|
-
c.statement.toLowerCase().includes('cli') ||
|
|
1965
|
-
c.category === 'behavior' ||
|
|
1966
|
-
c.category === 'feature');
|
|
1967
|
-
const regularClaims = claims.filter(c => !selfClaims.includes(c));
|
|
1968
|
-
// Run isolated runtime tests for self-referential claims
|
|
1969
|
-
if (selfClaims.length > 0 && ctx.llmVerifier) {
|
|
1970
|
-
const isoResults = await runIsolatedVerification(selfClaims.map(c => ({ statement: c.statement, category: c.category, context: c.context })), ctx.workingDirectory, ctx.llmVerifier);
|
|
1971
|
-
// Convert isolated results to UniversalVerificationResult
|
|
1972
|
-
for (let i = 0; i < selfClaims.length && i < isoResults.tests.length; i++) {
|
|
1973
|
-
const claim = selfClaims[i];
|
|
1974
|
-
const isoTest = isoResults.tests[i];
|
|
1975
|
-
results.push({
|
|
1976
|
-
claim,
|
|
1977
|
-
verified: isoTest.success,
|
|
1978
|
-
confidence: isoTest.success ? 90 : (isoTest.matchedPatterns.length > 0 ? 50 : 20),
|
|
1979
|
-
method: 'isolated-runtime',
|
|
1980
|
-
evidence: isoTest.success ? `Verified in fresh CLI instance` : `Failed: ${isoTest.unmatchedPatterns.join(', ')}`,
|
|
1981
|
-
reasoning: isoTest.llmAssessment || (isoTest.success ? 'All patterns matched in isolated runtime' : 'Patterns not matched'),
|
|
1982
|
-
executedCode: isoTest.test.commands.join('\n'),
|
|
1983
|
-
rawOutput: isoTest.output.slice(0, 2000),
|
|
1984
|
-
timestamp: new Date().toISOString()
|
|
1985
|
-
});
|
|
1986
|
-
}
|
|
1987
|
-
}
|
|
1988
|
-
// Verify regular claims with standard approach
|
|
1989
|
-
for (const c of regularClaims) {
|
|
1990
|
-
results.push(c.verifiable || c.priority === 'critical' || c.priority === 'high'
|
|
1991
|
-
? await verifyUniversalClaim(c, ctx)
|
|
1992
|
-
: { claim: c, verified: false, confidence: 0, method: 'skip', evidence: 'Low priority', reasoning: 'Skipped', timestamp: new Date().toISOString() });
|
|
1993
|
-
}
|
|
1994
|
-
const vClaims = claims.filter(c => c.verifiable).length;
|
|
1995
|
-
const verified = results.filter(r => r.verified).length;
|
|
1996
|
-
const failed = results.filter(r => !r.verified && r.confidence > 50).length;
|
|
1997
|
-
const inconclusive = results.filter(r => !r.verified && r.confidence <= 50 && r.method !== 'skip').length;
|
|
1998
|
-
const avgConf = results.length ? results.reduce((s, r) => s + r.confidence, 0) / results.length : 0;
|
|
1999
|
-
// Count isolated tests for assessment
|
|
2000
|
-
const isoCount = results.filter(r => r.method === 'isolated-runtime').length;
|
|
2001
|
-
const isoVerified = results.filter(r => r.method === 'isolated-runtime' && r.verified).length;
|
|
2002
|
-
let assessment = '', trust = 0;
|
|
2003
|
-
if (ctx.llmVerifier)
|
|
2004
|
-
try {
|
|
2005
|
-
const isoSummary = isoCount > 0 ? ` Isolated runtime tests: ${isoVerified}/${isoCount} passed.` : '';
|
|
2006
|
-
const p = UNIVERSAL_ASSESS.replace('{RESPONSE}', response.slice(0, 4000)).replace('{CLAIMS}', JSON.stringify(claims.slice(0, 15))).replace('{RESULTS}', JSON.stringify(results.slice(0, 15)));
|
|
2007
|
-
const r = await ctx.llmVerifier(p);
|
|
2008
|
-
const m = r.match(/\{[\s\S]*\}/);
|
|
2009
|
-
if (m) {
|
|
2010
|
-
const a = JSON.parse(m[0]);
|
|
2011
|
-
trust = a.trust;
|
|
2012
|
-
assessment = a.summary + isoSummary + (a.concerns?.length ? ` Concerns: ${a.concerns.join('; ')}` : '');
|
|
2013
|
-
}
|
|
2014
|
-
}
|
|
2015
|
-
catch {
|
|
2016
|
-
trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
|
|
2017
|
-
assessment = `${verified}/${vClaims} verified`;
|
|
2018
|
-
}
|
|
2019
|
-
else {
|
|
2020
|
-
trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
|
|
2021
|
-
assessment = `${verified}/${vClaims} verified`;
|
|
2022
|
-
}
|
|
2023
|
-
return { responseId: id || `u-${Date.now()}`, originalResponse: response, timestamp: new Date().toISOString(), claims, results, summary: { totalClaims: claims.length, verifiableClaims: vClaims, verified, failed, inconclusive, averageConfidence: Math.round(avgConf) }, overallAssessment: assessment, trustScore: trust };
|
|
2024
|
-
}
|
|
2025
|
-
export async function quickUniversalVerify(r, ctx) {
|
|
2026
|
-
const claims = await extractUniversalClaims(r, ctx);
|
|
2027
|
-
const crit = claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).slice(0, 5);
|
|
2028
|
-
if (!crit.length)
|
|
2029
|
-
return { trustScore: 50, summary: 'No critical claims' };
|
|
2030
|
-
let v = 0;
|
|
2031
|
-
for (const c of crit)
|
|
2032
|
-
if ((await verifyUniversalClaim(c, ctx)).verified)
|
|
2033
|
-
v++;
|
|
2034
|
-
return { trustScore: Math.round(v / crit.length * 100), summary: `${v}/${crit.length} critical verified` };
|
|
2035
|
-
}
|
|
2036
|
-
export function formatUniversalReport(r) {
|
|
2037
|
-
const bar = '█'.repeat(Math.round(r.trustScore / 10)) + '░'.repeat(10 - Math.round(r.trustScore / 10));
|
|
2038
|
-
const icon = r.trustScore >= 80 ? '✅' : r.trustScore >= 50 ? '⚠️' : '❌';
|
|
2039
|
-
let out = `╔════════════════════════════════════════════════════════════╗\n║ UNIVERSAL VERIFICATION REPORT ║\n╚════════════════════════════════════════════════════════════╝\n\n`;
|
|
2040
|
-
out += `Trust: ${icon} ${r.trustScore}/100 [${bar}]\n${r.overallAssessment}\n\nClaims: ${r.summary.totalClaims} | ✅ ${r.summary.verified} | ❌ ${r.summary.failed} | ❓ ${r.summary.inconclusive}\n\n`;
|
|
2041
|
-
// Group results by method
|
|
2042
|
-
const isoResults = r.results.filter(x => x.method === 'isolated-runtime');
|
|
2043
|
-
const otherResults = r.results.filter(x => x.method !== 'isolated-runtime');
|
|
2044
|
-
// Show isolated runtime tests first (most robust verification)
|
|
2045
|
-
if (isoResults.length > 0) {
|
|
2046
|
-
out += `🔬 ISOLATED RUNTIME TESTS (fresh CLI instance):\n`;
|
|
2047
|
-
for (const x of isoResults.slice(0, 4)) {
|
|
2048
|
-
out += ` ${x.verified ? '✅' : '❌'} [${x.confidence}%] ${x.claim.statement.slice(0, 50)}...\n`;
|
|
2049
|
-
if (x.reasoning)
|
|
2050
|
-
out += ` └─ ${x.reasoning.slice(0, 60)}\n`;
|
|
2051
|
-
}
|
|
2052
|
-
if (isoResults.length > 4)
|
|
2053
|
-
out += ` ... +${isoResults.length - 4} more isolated tests\n`;
|
|
2054
|
-
out += '\n';
|
|
2055
|
-
}
|
|
2056
|
-
// Show other verification results
|
|
2057
|
-
if (otherResults.length > 0) {
|
|
2058
|
-
out += `📋 STANDARD VERIFICATION:\n`;
|
|
2059
|
-
for (const x of otherResults.slice(0, 6)) {
|
|
2060
|
-
out += ` ${x.verified ? '✅' : x.confidence > 50 ? '❌' : '❓'} [${x.confidence}%] ${x.claim.statement.slice(0, 50)}...\n`;
|
|
2061
|
-
}
|
|
2062
|
-
if (otherResults.length > 6)
|
|
2063
|
-
out += ` ... +${otherResults.length - 6} more\n`;
|
|
2064
|
-
}
|
|
2065
|
-
return out;
|
|
2066
|
-
}
|
|
2067
474
|
//# sourceMappingURL=responseVerifier.js.map
|