erosolar-cli 1.7.23 → 1.7.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1604 +1,398 @@
1
1
  /**
2
- * AI Response Verification System
2
+ * AI Response Verification System - Isolated Runtime Only
3
3
  *
4
- * Automatically verifies assistant claims by:
5
- * 1. Extracting verifiable claims from responses
6
- * 2. Generating runtime verification tests
7
- * 3. Executing tests to verify claims
8
- * 4. Reporting verification results
4
+ * Verifies assistant claims by spawning fresh CLI instances and running
5
+ * actual runtime tests. All verification happens in isolation.
9
6
  *
10
7
  * @license MIT
11
8
  */
12
- import { exec } from 'node:child_process';
9
+ import { exec, spawn } from 'node:child_process';
13
10
  import { promisify } from 'node:util';
14
11
  import * as fs from 'node:fs/promises';
15
12
  import * as path from 'node:path';
16
13
  const execAsync = promisify(exec);
14
+ // ============================================================================
15
+ // ISOLATED RUNTIME - Core Functions
16
+ // ============================================================================
17
17
  /**
18
- * LLM-based claim extraction prompt.
19
- * Used when pattern matching isn't sufficient.
18
+ * Spawns a fresh isolated erosolar-cli instance for testing
20
19
  */
21
- const CLAIM_EXTRACTION_PROMPT = `Analyze this assistant response and extract ALL verifiable claims - anything the assistant claims to have done or accomplished.
22
-
23
- For each claim, identify:
24
- 1. Type: One of these claim types:
25
-
26
- FILE OPERATIONS:
27
- - file_created: A new file was created
28
- - file_modified: An existing file was changed
29
- - file_deleted: A file was removed
30
- - content_contains: A file contains specific content
31
-
32
- BUILD/TEST:
33
- - code_compiles: Code builds/compiles without errors
34
- - tests_pass: Tests run successfully
35
-
36
- VERSION CONTROL:
37
- - git_committed: Changes were committed to git
38
- - package_published: Package was published to npm
39
-
40
- SYSTEM:
41
- - command_executed: A shell command was run
42
- - dependency_installed: A package/dependency was installed
43
- - service_running: A service/server is running (on a port)
44
- - url_accessible: A URL is accessible/working
45
- - env_var_set: Environment variable was set
46
-
47
- CONFIGURATION:
48
- - config_changed: Configuration file was updated
49
- - permission_granted: File permissions were changed
50
-
51
- API/DATA:
52
- - api_response: API returned expected response
53
- - database_updated: Database record was modified
54
- - data_transformed: Data was transformed correctly
55
-
56
- SEMANTIC (require deeper analysis):
57
- - error_fixed: An error/bug was fixed
58
- - feature_implemented: A feature was implemented
59
- - refactor_complete: Code was refactored
60
-
61
- CATCH-ALL:
62
- - generic: Any other verifiable claim not covered above
63
-
64
- 2. The specific details needed to verify the claim
65
-
66
- Return a JSON array of claims. Each claim should have:
67
- - type: one of the types above
68
- - description: human readable description of what was claimed
69
- - params: Object with relevant fields:
70
- - path: file path (for file operations)
71
- - command: shell command (for command_executed)
72
- - version: version number (for package_published)
73
- - hash: git commit hash (for git_committed)
74
- - package: package name (for dependency_installed)
75
- - port: port number (for service_running)
76
- - name: process/service name (for service_running, env_var_set)
77
- - url: URL (for url_accessible, api_response)
78
- - content: text to search for (for content_contains)
79
- - key: config key path like "server.port" (for config_changed)
80
- - value: expected value (for config_changed, env_var_set)
81
- - status: HTTP status code (for api_response)
82
- - body: expected response body (for api_response)
83
- - mode: file permission mode like "755" (for permission_granted)
84
- - checkCommand: command that can verify the claim (for database_updated)
85
-
86
- IMPORTANT: Extract ALL claims, including semantic ones like "I fixed the bug" or "The feature is now working". Use the 'generic' type for claims that don't fit other categories.
87
-
88
- Only skip opinions ("I think..."), explanations of what code does, or future plans ("will do", "should work").
89
-
90
- Response to analyze:
91
- ---
92
- {RESPONSE}
93
- ---
94
-
95
- Return ONLY valid JSON array, no other text.`;
96
- /**
97
- * Extract claims using LLM (for complex responses).
98
- * Falls back to pattern matching if LLM extraction fails.
99
- */
100
- export async function extractClaimsWithLLM(response, llmCall) {
101
- if (!llmCall) {
102
- // No LLM available, use pattern matching
103
- return extractClaims(response);
104
- }
20
+ async function spawnIsolatedCLI(cwd, timeout = 60000) {
21
+ const cliPath = path.join(cwd, 'dist/bin/erosolar.js');
22
+ // Verify CLI exists
105
23
  try {
106
- const prompt = CLAIM_EXTRACTION_PROMPT.replace('{RESPONSE}', response.slice(0, 4000));
107
- const result = await llmCall(prompt);
108
- // Parse JSON response
109
- const jsonMatch = result.match(/\[[\s\S]*\]/);
110
- if (!jsonMatch) {
111
- return extractClaims(response);
112
- }
113
- const parsed = JSON.parse(jsonMatch[0]);
114
- return parsed.map(claim => ({
115
- type: claim.type,
116
- description: claim.description,
117
- evidence: 'Extracted by LLM',
118
- params: claim.params
119
- }));
24
+ await fs.access(cliPath);
120
25
  }
121
26
  catch {
122
- // LLM extraction failed, fall back to patterns
123
- return extractClaims(response);
124
- }
27
+ throw new Error(`CLI not found at ${cliPath}. Run build first.`);
28
+ }
29
+ let output = '';
30
+ let errors = '';
31
+ let exitResolve;
32
+ const exitPromise = new Promise(resolve => { exitResolve = resolve; });
33
+ const child = spawn('node', [cliPath, '--plain'], {
34
+ cwd,
35
+ env: { ...process.env, EROSOLAR_TEST_MODE: '1', NO_COLOR: '1' },
36
+ stdio: ['pipe', 'pipe', 'pipe']
37
+ });
38
+ child.stdout.on('data', (data) => { output += data.toString(); });
39
+ child.stderr.on('data', (data) => { errors += data.toString(); });
40
+ child.on('close', (code) => { exitResolve(code); });
41
+ child.on('error', (err) => { errors += err.message; exitResolve(1); });
42
+ // Set timeout
43
+ const timeoutId = setTimeout(() => {
44
+ child.kill('SIGTERM');
45
+ errors += `\nTimeout after ${timeout}ms`;
46
+ }, timeout);
47
+ child.on('close', () => clearTimeout(timeoutId));
48
+ // Wait for startup
49
+ await new Promise(resolve => {
50
+ const checkStartup = setInterval(() => {
51
+ if (output.includes('erosolar') || output.includes('>') || output.length > 100) {
52
+ clearInterval(checkStartup);
53
+ resolve();
54
+ }
55
+ }, 100);
56
+ setTimeout(() => { clearInterval(checkStartup); resolve(); }, 2000);
57
+ });
58
+ return {
59
+ process: child,
60
+ stdin: child.stdin,
61
+ output,
62
+ errors,
63
+ exitPromise
64
+ };
125
65
  }
126
66
  /**
127
- * Extract verifiable claims from an assistant response.
128
- * Covers common patterns for file operations, builds, tests, git, and npm.
67
+ * Sends a command to the spawned CLI and waits for response
129
68
  */
130
- export function extractClaims(response) {
131
- const claims = [];
132
- const seenPaths = new Set();
133
- // Helper to add file claim if not duplicate
134
- const addFileClaim = (type, path, evidence) => {
135
- if (path && !seenPaths.has(path)) {
136
- seenPaths.add(path);
137
- claims.push({
138
- type,
139
- description: `File ${path} was ${type === 'file_created' ? 'created' : 'modified'}`,
140
- evidence,
141
- params: { path }
142
- });
143
- }
144
- };
145
- // Pattern: File creation claims - comprehensive patterns
146
- const fileCreationPatterns = [
147
- // "I created file X", "Created X", "I've created X"
148
- /(?:I(?:'ve)?\s+)?(?:created|wrote|written|generated|added)\s+(?:a\s+)?(?:new\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
149
- // "File X created", "File created at X"
150
- /(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:created|written|generated)/gi,
151
- // "Created file at X", "Wrote file to X"
152
- /(?:created|wrote)\s+(?:a\s+)?(?:new\s+)?file\s+(?:at|to|in)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
153
- // "File created successfully" with path nearby
154
- /[`"']([^\s`"',]+\.[a-zA-Z0-9]+)[`"']\s+(?:has been\s+)?(?:created|written)/gi,
155
- // "successfully created X"
156
- /successfully\s+(?:created|wrote|generated)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
157
- // "The file X now exists" or "X now contains"
158
- /(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?(?:exists|contains)/gi,
159
- ];
160
- for (const pattern of fileCreationPatterns) {
161
- pattern.lastIndex = 0; // Reset regex state
162
- let match;
163
- while ((match = pattern.exec(response)) !== null) {
164
- if (match[1]) {
165
- addFileClaim('file_created', match[1], match[0]);
69
+ async function sendCommand(cli, command, waitMs = 5000) {
70
+ const outputBefore = cli.output.length;
71
+ cli.stdin.write(command + '\n');
72
+ await new Promise(resolve => {
73
+ let lastLength = cli.output.length;
74
+ const checkInterval = setInterval(() => {
75
+ if (cli.output.length > lastLength) {
76
+ lastLength = cli.output.length;
166
77
  }
167
- }
168
- }
169
- // Pattern: File modification claims - comprehensive patterns
170
- const fileModPatterns = [
171
- // "I modified X", "Updated X", "I've edited X"
172
- /(?:I(?:'ve)?\s+)?(?:modified|updated|changed|edited|fixed|patched|amended)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
173
- // "File X was updated"
174
- /(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:modified|updated|changed|edited|fixed)/gi,
175
- // "Made changes to X"
176
- /(?:made\s+)?changes?\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
177
- // "X has been updated"
178
- /[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+has\s+been\s+(?:updated|modified|changed|edited)/gi,
179
- // "successfully updated X"
180
- /successfully\s+(?:updated|modified|edited|fixed)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
181
- ];
182
- for (const pattern of fileModPatterns) {
183
- pattern.lastIndex = 0;
184
- let match;
185
- while ((match = pattern.exec(response)) !== null) {
186
- if (match[1]) {
187
- addFileClaim('file_modified', match[1], match[0]);
188
- }
189
- }
190
- }
191
- // Pattern: Command execution claims
192
- const cmdPatterns = [
193
- /(?:I(?:'ve)?\s+)?(?:ran|executed|run|running)\s+`([^`]+)`/gi,
194
- /(?:Running|Executed|Ran)\s+`([^`]+)`/gi,
195
- /`([^`]+)`\s+(?:completed|succeeded|finished|passed)/gi,
196
- /executed\s+(?:the\s+)?command[:\s]+`([^`]+)`/gi,
197
- ];
198
- const seenCommands = new Set();
199
- for (const pattern of cmdPatterns) {
200
- pattern.lastIndex = 0;
201
- let match;
202
- while ((match = pattern.exec(response)) !== null) {
203
- const command = match[1];
204
- if (command && !seenCommands.has(command)) {
205
- seenCommands.add(command);
206
- claims.push({
207
- type: 'command_executed',
208
- description: `Command "${command.slice(0, 50)}${command.length > 50 ? '...' : ''}" was executed`,
209
- evidence: match[0],
210
- params: { command }
211
- });
78
+ else if (cli.output.length > outputBefore) {
79
+ clearInterval(checkInterval);
80
+ resolve();
212
81
  }
213
- }
214
- }
215
- // Pattern: Build/compile success claims
216
- const buildPatterns = [
217
- /(?:build|compilation|type[- ]?check)\s+(?:passed|succeeded|completed|successful|success)/gi,
218
- /(?:successfully|passed)\s+(?:the\s+)?(?:build|compilation|type[- ]?check)/gi,
219
- /no\s+(?:type\s+)?errors/gi,
220
- /type[- ]?check(?:ing)?\s+(?:passed|succeeded|completed)/gi,
221
- /(?:built|compiled)\s+successfully/gi,
222
- /build\s+(?:is\s+)?(?:complete|successful)/gi,
223
- ];
224
- let hasBuildClaim = false;
225
- for (const pattern of buildPatterns) {
226
- pattern.lastIndex = 0;
227
- if (!hasBuildClaim && pattern.test(response)) {
228
- claims.push({
229
- type: 'code_compiles',
230
- description: 'Code compiles without errors',
231
- evidence: response.match(pattern)?.[0] || '',
232
- params: {}
233
- });
234
- hasBuildClaim = true;
235
- break;
236
- }
237
- }
238
- // Pattern: Test pass claims
239
- const testPatterns = [
240
- /(?:all\s+)?tests?\s+(?:pass|passed|passing|succeeded|successful)/gi,
241
- /(?:passed|passing)\s+(?:all\s+)?tests?/gi,
242
- /(\d+)\s+tests?\s+passed/gi,
243
- /tests?\s+(?:completed|finished)\s+successfully/gi,
244
- /(?:test|tests)\s+suite\s+(?:passed|succeeded)/gi,
245
- /all\s+(\d+)\s+tests?\s+(?:pass|passed)/gi,
246
- ];
247
- let hasTestClaim = false;
248
- for (const pattern of testPatterns) {
249
- pattern.lastIndex = 0;
250
- const match = pattern.exec(response);
251
- if (!hasTestClaim && match) {
252
- claims.push({
253
- type: 'tests_pass',
254
- description: 'Tests pass',
255
- evidence: match[0],
256
- params: { count: match[1] ? parseInt(match[1], 10) : undefined }
257
- });
258
- hasTestClaim = true;
259
- break;
260
- }
82
+ }, 200);
83
+ setTimeout(() => { clearInterval(checkInterval); resolve(); }, waitMs);
84
+ });
85
+ return cli.output.slice(outputBefore);
86
+ }
87
+ /**
88
+ * Run a shell command for verification (file checks, etc.)
89
+ */
90
+ async function runShellVerification(cmd, cwd) {
91
+ // Safety check - block dangerous commands
92
+ const dangerous = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i];
93
+ for (const p of dangerous) {
94
+ if (p.test(cmd))
95
+ return { ok: false, out: `Blocked dangerous command: ${p.source}` };
261
96
  }
262
- // Pattern: Git commit claims
263
- const gitPatterns = [
264
- /committed\s+(?:the\s+)?(?:changes?\s+)?(?:with\s+message\s+)?["']?([^"'\n]+)["']?/gi,
265
- /\[(?:main|master|[a-zA-Z0-9/_-]+)\s+([a-f0-9]{7,})\]/gi,
266
- /git\s+commit.*-m\s+["']([^"']+)["']/gi,
267
- /created\s+(?:a\s+)?commit/gi,
268
- /changes?\s+(?:have\s+been\s+)?committed/gi,
269
- /commit\s+([a-f0-9]{7,})/gi,
270
- ];
271
- let hasGitClaim = false;
272
- for (const pattern of gitPatterns) {
273
- pattern.lastIndex = 0;
274
- const match = pattern.exec(response);
275
- if (!hasGitClaim && match) {
276
- claims.push({
277
- type: 'git_committed',
278
- description: 'Changes were committed',
279
- evidence: match[0],
280
- params: { hash: match[1] }
281
- });
282
- hasGitClaim = true;
283
- break;
284
- }
97
+ try {
98
+ const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
99
+ return { ok: true, out: stdout + stderr };
285
100
  }
286
- // Pattern: Package publish claims
287
- const publishPatterns = [
288
- /published\s+(?:to\s+)?(?:npm|registry)/gi,
289
- /\+\s+[a-z@/_-]+@(\d+\.\d+\.\d+)/gi,
290
- /npm\s+publish/gi,
291
- /package\s+(?:was\s+)?published/gi,
292
- /published\s+(?:version\s+)?v?(\d+\.\d+\.\d+)/gi,
293
- /successfully\s+published/gi,
294
- ];
295
- let hasPublishClaim = false;
296
- for (const pattern of publishPatterns) {
297
- pattern.lastIndex = 0;
298
- const match = pattern.exec(response);
299
- if (!hasPublishClaim && match) {
300
- claims.push({
301
- type: 'package_published',
302
- description: 'Package was published',
303
- evidence: match[0],
304
- params: { version: match[1] }
305
- });
306
- hasPublishClaim = true;
307
- break;
308
- }
101
+ catch (e) {
102
+ return { ok: false, out: e instanceof Error ? e.message : 'Command failed' };
309
103
  }
310
- // Pattern: File deletion claims
311
- const deletionPatterns = [
312
- /(?:I(?:'ve)?\s+)?(?:deleted|removed)\s+(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
313
- /[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:deleted|removed)/gi,
314
- ];
315
- for (const pattern of deletionPatterns) {
316
- pattern.lastIndex = 0;
317
- let match;
318
- while ((match = pattern.exec(response)) !== null) {
319
- const filePath = match[1];
320
- if (filePath && !seenPaths.has(filePath)) {
321
- seenPaths.add(filePath);
322
- claims.push({
323
- type: 'file_deleted',
324
- description: `File ${filePath} was deleted`,
325
- evidence: match[0],
326
- params: { path: filePath }
327
- });
104
+ }
105
+ /**
106
+ * Runs an isolated runtime test
107
+ */
108
+ export async function runIsolatedTest(test, cwd, llmVerifier) {
109
+ const startTime = Date.now();
110
+ const result = {
111
+ test,
112
+ success: false,
113
+ output: '',
114
+ errors: '',
115
+ exitCode: null,
116
+ duration: 0,
117
+ matchedPatterns: [],
118
+ unmatchedPatterns: []
119
+ };
120
+ try {
121
+ // Rebuild if required
122
+ if (test.requiresBuild) {
123
+ try {
124
+ await execAsync('npm run build', { cwd, timeout: 120000 });
328
125
  }
329
- }
330
- }
331
- // Pattern: Dependency installation claims
332
- const installPatterns = [
333
- /(?:installed|added)\s+(?:the\s+)?(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?/gi,
334
- /npm\s+install(?:ed)?\s+[`"']?([^\s`"',]+)[`"']?/gi,
335
- /(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?\s+(?:was\s+)?installed/gi,
336
- ];
337
- for (const pattern of installPatterns) {
338
- pattern.lastIndex = 0;
339
- let match;
340
- while ((match = pattern.exec(response)) !== null) {
341
- const packageName = match[1];
342
- if (packageName) {
343
- claims.push({
344
- type: 'dependency_installed',
345
- description: `Package ${packageName} was installed`,
346
- evidence: match[0],
347
- params: { package: packageName }
348
- });
126
+ catch (buildErr) {
127
+ result.errors = `Build failed: ${buildErr instanceof Error ? buildErr.message : 'unknown'}`;
128
+ result.duration = Date.now() - startTime;
129
+ return result;
349
130
  }
350
131
  }
351
- }
352
- // Pattern: Service running claims
353
- const servicePatterns = [
354
- /(?:server|service|app(?:lication)?)\s+(?:is\s+)?(?:running|started|listening)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
355
- /(?:listening|running)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
356
- /started\s+(?:the\s+)?(?:server|service)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
357
- /(?:port\s+)?(\d+)\s+is\s+(?:now\s+)?(?:open|listening)/gi,
358
- ];
359
- for (const pattern of servicePatterns) {
360
- pattern.lastIndex = 0;
361
- const match = pattern.exec(response);
362
- if (match && match[1]) {
363
- const port = parseInt(match[1], 10);
364
- if (port > 0 && port < 65536) {
365
- claims.push({
366
- type: 'service_running',
367
- description: `Service running on port ${port}`,
368
- evidence: match[0],
369
- params: { port }
370
- });
371
- break; // Only one service claim per response
132
+ // Run shell commands first if any (file checks, etc.)
133
+ if (test.shellCommands && test.shellCommands.length > 0) {
134
+ for (const cmd of test.shellCommands) {
135
+ const shellResult = await runShellVerification(cmd, cwd);
136
+ result.output += `$ ${cmd}\n${shellResult.out}\n`;
137
+ if (!shellResult.ok) {
138
+ result.errors += shellResult.out + '\n';
139
+ }
372
140
  }
373
141
  }
374
- }
375
- // Pattern: URL accessible claims
376
- const urlPatterns = [
377
- /(?:accessible|available|live)\s+at\s+(https?:\/\/[^\s]+)/gi,
378
- /(?:visit|open|access)\s+(https?:\/\/[^\s]+)/gi,
379
- /(https?:\/\/[^\s]+)\s+(?:is\s+)?(?:now\s+)?(?:accessible|available|live)/gi,
380
- /deployed\s+(?:to|at)\s+(https?:\/\/[^\s]+)/gi,
381
- ];
382
- for (const pattern of urlPatterns) {
383
- pattern.lastIndex = 0;
384
- const match = pattern.exec(response);
385
- if (match && match[1]) {
386
- const url = match[1].replace(/[.,;:!?)]+$/, ''); // Remove trailing punctuation
387
- claims.push({
388
- type: 'url_accessible',
389
- description: `URL ${url} is accessible`,
390
- evidence: match[0],
391
- params: { url }
392
- });
393
- break; // Only one URL claim per response
394
- }
395
- }
396
- // Pattern: Content contains claims
397
- const contentPatterns = [
398
- /(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?contains?\s+[`"']([^`"']+)[`"']/gi,
399
- /added\s+[`"']([^`"']+)[`"']\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
400
- ];
401
- for (const pattern of contentPatterns) {
402
- pattern.lastIndex = 0;
403
- const match = pattern.exec(response);
404
- if (match) {
405
- // Pattern 1: file contains "text"
406
- // Pattern 2: added "text" to file
407
- const isPattern2 = pattern.source.startsWith('added');
408
- const filePath = isPattern2 ? match[2] : match[1];
409
- const content = isPattern2 ? match[1] : match[2];
410
- if (filePath && content) {
411
- claims.push({
412
- type: 'content_contains',
413
- description: `File ${filePath} contains specified content`,
414
- evidence: match[0],
415
- params: { path: filePath, content }
416
- });
142
+ // Run CLI commands if any
143
+ if (test.commands && test.commands.length > 0) {
144
+ const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
145
+ for (const cmd of test.commands) {
146
+ const cmdOutput = await sendCommand(cli, cmd);
147
+ result.output += `> ${cmd}\n${cmdOutput}\n`;
417
148
  }
149
+ cli.stdin.write('/quit\n');
150
+ await new Promise(resolve => setTimeout(resolve, 500));
151
+ cli.process.kill('SIGTERM');
152
+ result.exitCode = await cli.exitPromise;
153
+ result.errors += cli.errors;
418
154
  }
419
- }
420
- return claims;
421
- }
422
- /**
423
- * Generate a verification test for a claim
424
- */
425
- export function generateVerificationTest(claim) {
426
- const baseResult = {
427
- claim,
428
- timestamp: new Date().toISOString()
429
- };
430
- switch (claim.type) {
431
- case 'file_created':
432
- case 'file_modified':
433
- return async () => {
434
- const filePath = claim.params.path;
435
- try {
436
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
437
- const stats = await fs.stat(resolvedPath);
438
- const recentlyModified = (Date.now() - stats.mtimeMs) < 5 * 60 * 1000; // Within 5 minutes
439
- return {
440
- ...baseResult,
441
- verified: stats.isFile(),
442
- confidence: recentlyModified ? 'high' : 'medium',
443
- evidence: `File exists. Size: ${stats.size} bytes. Modified: ${stats.mtime.toISOString()}`
444
- };
155
+ // Check expected output patterns
156
+ if (test.expectedOutputs) {
157
+ for (const pattern of test.expectedOutputs) {
158
+ if (result.output.includes(pattern) || new RegExp(pattern, 'i').test(result.output)) {
159
+ result.matchedPatterns.push(pattern);
445
160
  }
446
- catch (err) {
447
- return {
448
- ...baseResult,
449
- verified: false,
450
- confidence: 'high',
451
- evidence: 'File does not exist',
452
- error: err instanceof Error ? err.message : 'Unknown error'
453
- };
454
- }
455
- };
456
- case 'file_deleted':
457
- return async () => {
458
- const filePath = claim.params.path;
459
- try {
460
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
461
- await fs.stat(resolvedPath);
462
- return {
463
- ...baseResult,
464
- verified: false,
465
- confidence: 'high',
466
- evidence: 'File still exists (deletion claim is false)'
467
- };
161
+ else {
162
+ result.unmatchedPatterns.push(pattern);
468
163
  }
469
- catch {
470
- return {
471
- ...baseResult,
472
- verified: true,
473
- confidence: 'high',
474
- evidence: 'File does not exist (deletion verified)'
475
- };
476
- }
477
- };
478
- case 'code_compiles':
479
- return async () => {
480
- try {
481
- const { stdout, stderr } = await execAsync('npm run type-check 2>&1 || npm run build 2>&1', {
482
- timeout: 60000,
483
- cwd: process.cwd()
484
- });
485
- const output = stdout + stderr;
486
- const hasErrors = /error/i.test(output) && !/0 errors/i.test(output);
487
- return {
488
- ...baseResult,
489
- verified: !hasErrors,
490
- confidence: 'high',
491
- evidence: hasErrors ? `Compilation errors found: ${output.slice(0, 500)}` : 'Code compiles successfully'
492
- };
493
- }
494
- catch (err) {
495
- return {
496
- ...baseResult,
497
- verified: false,
498
- confidence: 'high',
499
- evidence: 'Compilation check failed',
500
- error: err instanceof Error ? err.message : 'Unknown error'
501
- };
502
- }
503
- };
504
- case 'tests_pass':
505
- return async () => {
506
- try {
507
- const { stdout, stderr } = await execAsync('npm test 2>&1', {
508
- timeout: 120000,
509
- cwd: process.cwd()
510
- });
511
- const output = stdout + stderr;
512
- const hasFailed = /fail|error/i.test(output) && !/0 failed/i.test(output);
513
- return {
514
- ...baseResult,
515
- verified: !hasFailed,
516
- confidence: 'high',
517
- evidence: hasFailed ? `Test failures: ${output.slice(0, 500)}` : 'All tests pass'
518
- };
519
- }
520
- catch (err) {
521
- return {
522
- ...baseResult,
523
- verified: false,
524
- confidence: 'high',
525
- evidence: 'Test execution failed',
526
- error: err instanceof Error ? err.message : 'Unknown error'
527
- };
528
- }
529
- };
530
- case 'git_committed':
531
- return async () => {
532
- try {
533
- const { stdout } = await execAsync('git log -1 --oneline', {
534
- timeout: 5000,
535
- cwd: process.cwd()
536
- });
537
- const hash = claim.params.hash;
538
- if (hash && stdout.includes(hash.slice(0, 7))) {
539
- return {
540
- ...baseResult,
541
- verified: true,
542
- confidence: 'high',
543
- evidence: `Commit found: ${stdout.trim()}`
544
- };
545
- }
546
- // Check if there's a recent commit
547
- const { stdout: logOutput } = await execAsync('git log -1 --format="%H %s"', {
548
- timeout: 5000
549
- });
550
- return {
551
- ...baseResult,
552
- verified: true,
553
- confidence: 'medium',
554
- evidence: `Most recent commit: ${logOutput.trim()}`
555
- };
556
- }
557
- catch (err) {
558
- return {
559
- ...baseResult,
560
- verified: false,
561
- confidence: 'high',
562
- evidence: 'Git check failed',
563
- error: err instanceof Error ? err.message : 'Unknown error'
564
- };
565
- }
566
- };
567
- case 'package_published':
568
- return async () => {
569
- try {
570
- // Read package.json to get name and version
571
- const pkgPath = path.resolve(process.cwd(), 'package.json');
572
- const pkgContent = await fs.readFile(pkgPath, 'utf-8');
573
- const pkg = JSON.parse(pkgContent);
574
- const { stdout } = await execAsync(`npm view ${pkg.name}@${pkg.version} version 2>&1`, {
575
- timeout: 10000
576
- });
577
- const published = stdout.trim() === pkg.version;
578
- return {
579
- ...baseResult,
580
- verified: published,
581
- confidence: 'high',
582
- evidence: published ? `${pkg.name}@${pkg.version} found on npm` : 'Version not found on npm'
583
- };
584
- }
585
- catch (err) {
586
- return {
587
- ...baseResult,
588
- verified: false,
589
- confidence: 'medium',
590
- evidence: 'Could not verify npm publication',
591
- error: err instanceof Error ? err.message : 'Unknown error'
592
- };
593
- }
594
- };
595
- case 'command_executed':
596
- // Can't really verify past command execution, just acknowledge
597
- return async () => ({
598
- ...baseResult,
599
- verified: true, // Assume true since we can't replay
600
- confidence: 'low',
601
- evidence: 'Command execution cannot be retroactively verified'
602
- });
603
- case 'dependency_installed':
604
- return async () => {
605
- const packageName = claim.params.package;
606
- if (!packageName) {
607
- return {
608
- ...baseResult,
609
- verified: false,
610
- confidence: 'low',
611
- evidence: 'No package name provided'
612
- };
613
- }
614
- try {
615
- // Check if package exists in node_modules
616
- const modulePath = path.resolve(process.cwd(), 'node_modules', packageName);
617
- await fs.stat(modulePath);
618
- // Also verify in package.json
619
- const pkgPath = path.resolve(process.cwd(), 'package.json');
620
- const pkgContent = await fs.readFile(pkgPath, 'utf-8');
621
- const pkg = JSON.parse(pkgContent);
622
- const inDeps = pkg.dependencies?.[packageName] || pkg.devDependencies?.[packageName];
623
- return {
624
- ...baseResult,
625
- verified: true,
626
- confidence: inDeps ? 'high' : 'medium',
627
- evidence: inDeps
628
- ? `Package ${packageName} installed (${inDeps})`
629
- : `Package ${packageName} found in node_modules but not in package.json`
630
- };
631
- }
632
- catch {
633
- return {
634
- ...baseResult,
635
- verified: false,
636
- confidence: 'high',
637
- evidence: `Package ${packageName} not found in node_modules`
638
- };
639
- }
640
- };
641
- case 'service_running':
642
- return async () => {
643
- const port = claim.params.port;
644
- const name = claim.params.name;
645
- try {
646
- if (port) {
647
- // Check if port is in use
648
- const { stdout } = await execAsync(`lsof -i :${port} 2>/dev/null || netstat -an | grep ${port}`, {
649
- timeout: 5000
650
- });
651
- const isRunning = stdout.trim().length > 0;
652
- return {
653
- ...baseResult,
654
- verified: isRunning,
655
- confidence: 'high',
656
- evidence: isRunning ? `Service running on port ${port}` : `No service found on port ${port}`
657
- };
658
- }
659
- else if (name) {
660
- // Check if process is running by name
661
- const { stdout } = await execAsync(`pgrep -f "${name}" 2>/dev/null || ps aux | grep "${name}" | grep -v grep`, {
662
- timeout: 5000
663
- });
664
- const isRunning = stdout.trim().length > 0;
665
- return {
666
- ...baseResult,
667
- verified: isRunning,
668
- confidence: 'medium',
669
- evidence: isRunning ? `Process "${name}" appears to be running` : `Process "${name}" not found`
670
- };
671
- }
672
- return {
673
- ...baseResult,
674
- verified: false,
675
- confidence: 'low',
676
- evidence: 'No port or service name provided for verification'
677
- };
678
- }
679
- catch {
680
- return {
681
- ...baseResult,
682
- verified: false,
683
- confidence: 'medium',
684
- evidence: 'Could not verify service status'
685
- };
686
- }
687
- };
688
- case 'url_accessible':
689
- return async () => {
690
- const url = claim.params.url;
691
- if (!url) {
692
- return {
693
- ...baseResult,
694
- verified: false,
695
- confidence: 'low',
696
- evidence: 'No URL provided'
697
- };
698
- }
699
- try {
700
- const { stdout } = await execAsync(`curl -s -o /dev/null -w "%{http_code}" "${url}" 2>&1`, {
701
- timeout: 10000
702
- });
703
- const statusCode = parseInt(stdout.trim(), 10);
704
- const isAccessible = statusCode >= 200 && statusCode < 400;
705
- return {
706
- ...baseResult,
707
- verified: isAccessible,
708
- confidence: 'high',
709
- evidence: `URL returned status ${statusCode}`
710
- };
711
- }
712
- catch (err) {
713
- return {
714
- ...baseResult,
715
- verified: false,
716
- confidence: 'high',
717
- evidence: 'URL is not accessible',
718
- error: err instanceof Error ? err.message : 'Unknown error'
719
- };
720
- }
721
- };
722
- case 'content_contains':
723
- return async () => {
724
- const filePath = claim.params.path;
725
- const searchText = claim.params.content;
726
- if (!filePath || !searchText) {
727
- return {
728
- ...baseResult,
729
- verified: false,
730
- confidence: 'low',
731
- evidence: 'Missing file path or search content'
732
- };
733
- }
734
- try {
735
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
736
- const content = await fs.readFile(resolvedPath, 'utf-8');
737
- const contains = content.includes(searchText);
738
- return {
739
- ...baseResult,
740
- verified: contains,
741
- confidence: 'high',
742
- evidence: contains
743
- ? `File contains the expected content`
744
- : `File does not contain "${searchText.slice(0, 50)}..."`
745
- };
746
- }
747
- catch (err) {
748
- return {
749
- ...baseResult,
750
- verified: false,
751
- confidence: 'high',
752
- evidence: 'Could not read file',
753
- error: err instanceof Error ? err.message : 'Unknown error'
754
- };
755
- }
756
- };
757
- default:
758
- return async () => ({
759
- ...baseResult,
760
- verified: false,
761
- confidence: 'low',
762
- evidence: `Unknown claim type: ${claim.type}`
763
- });
764
- }
765
- }
766
- /**
767
- * Verify all claims in an assistant response using LLM-based semantic analysis.
768
- * Requires a VerificationContext with an llmVerifier function.
769
- * All claim extraction and verification is done via LLM.
770
- */
771
- export async function verifyResponse(response, context, responseId) {
772
- return verifyResponseComprehensive(response, context, responseId);
773
- }
774
- /**
775
- * Format a verification report for display
776
- */
777
- export function formatVerificationReport(report) {
778
- const lines = [];
779
- lines.push('═══════════════════════════════════════════════════════════');
780
- lines.push(' RESPONSE VERIFICATION REPORT');
781
- lines.push('═══════════════════════════════════════════════════════════');
782
- lines.push('');
783
- const verdictEmoji = {
784
- verified: '✅',
785
- partially_verified: '⚠️',
786
- unverified: '❓',
787
- contradicted: '❌'
788
- };
789
- lines.push(`Verdict: ${verdictEmoji[report.overallVerdict]} ${report.overallVerdict.toUpperCase()}`);
790
- lines.push(`Claims: ${report.summary.total} total, ${report.summary.verified} verified, ${report.summary.failed} failed`);
791
- lines.push('');
792
- if (report.results.length > 0) {
793
- lines.push('Verification Details:');
794
- lines.push('─────────────────────');
795
- for (const result of report.results) {
796
- const icon = result.verified ? '✅' : (result.confidence === 'high' ? '❌' : '❓');
797
- lines.push(`${icon} ${result.claim.description}`);
798
- lines.push(` Evidence: ${result.evidence.slice(0, 100)}`);
799
- if (result.error) {
800
- lines.push(` Error: ${result.error}`);
801
164
  }
802
165
  }
803
- }
804
- else {
805
- lines.push('No verifiable claims found in response.');
806
- }
807
- lines.push('');
808
- lines.push('═══════════════════════════════════════════════════════════');
809
- return lines.join('\n');
810
- }
811
- /**
812
- * Quick verification - returns true if response claims are valid.
813
- * Requires a VerificationContext with llmVerifier for LLM-based semantic analysis.
814
- */
815
- export async function quickVerify(response, context) {
816
- const report = await verifyResponse(response, context);
817
- return report.overallVerdict === 'verified' || report.overallVerdict === 'partially_verified';
818
- }
819
- /**
820
- * LLM-based verification prompt for claims that can't be programmatically verified
821
- */
822
- const LLM_VERIFICATION_PROMPT = `You are a verification assistant. Analyze whether the following claim is likely TRUE or FALSE based on the evidence provided.
823
-
824
- CLAIM: {CLAIM}
166
+ // LLM assessment of behavior
167
+ if (test.expectedBehavior && llmVerifier) {
168
+ const assessPrompt = `Assess if this output demonstrates the expected behavior.
825
169
 
826
- EVIDENCE/CONTEXT:
827
- {CONTEXT}
170
+ EXPECTED: ${test.expectedBehavior}
828
171
 
829
- Respond with a JSON object:
830
- {
831
- "verdict": "verified" | "unverified" | "inconclusive",
832
- "confidence": "high" | "medium" | "low",
833
- "reasoning": "Brief explanation of your analysis",
834
- "suggested_test": "Optional: A command or check that could verify this claim"
835
- }
172
+ OUTPUT:
173
+ ---
174
+ ${result.output.slice(0, 4000)}
175
+ ---
836
176
 
837
- Be conservative - only mark as "verified" if there's strong evidence. Mark as "inconclusive" if you can't determine the truth.`;
838
- /**
839
- * Verify a claim using LLM when runtime verification isn't possible
840
- */
841
- export async function verifyClaimWithLLM(claim, context) {
842
- const baseResult = {
843
- claim,
844
- timestamp: new Date().toISOString()
845
- };
846
- if (!context.llmVerifier) {
847
- return {
848
- ...baseResult,
849
- verified: false,
850
- confidence: 'low',
851
- evidence: 'No LLM verifier available for semantic verification'
852
- };
853
- }
854
- try {
855
- // Build context string
856
- const contextParts = [];
857
- if (context.previousState) {
858
- contextParts.push(`Previous state: ${JSON.stringify(context.previousState, null, 2)}`);
859
- }
860
- if (context.currentState) {
861
- contextParts.push(`Current state: ${JSON.stringify(context.currentState, null, 2)}`);
862
- }
863
- if (context.conversationHistory?.length) {
864
- contextParts.push(`Recent conversation:\n${context.conversationHistory.slice(-5).join('\n')}`);
865
- }
866
- contextParts.push(`Claim evidence: ${claim.evidence}`);
867
- contextParts.push(`Claim params: ${JSON.stringify(claim.params)}`);
868
- const prompt = LLM_VERIFICATION_PROMPT
869
- .replace('{CLAIM}', claim.description)
870
- .replace('{CONTEXT}', contextParts.join('\n\n'));
871
- const result = await context.llmVerifier(prompt);
872
- // Parse LLM response
873
- const jsonMatch = result.match(/\{[\s\S]*\}/);
874
- if (!jsonMatch) {
875
- return {
876
- ...baseResult,
877
- verified: false,
878
- confidence: 'low',
879
- evidence: 'LLM verification returned invalid response'
880
- };
881
- }
882
- const parsed = JSON.parse(jsonMatch[0]);
883
- return {
884
- ...baseResult,
885
- verified: parsed.verdict === 'verified',
886
- confidence: parsed.confidence || 'medium',
887
- evidence: `LLM Analysis: ${parsed.reasoning}${parsed.suggested_test ? ` (Suggested test: ${parsed.suggested_test})` : ''}`
888
- };
889
- }
890
- catch (err) {
891
- return {
892
- ...baseResult,
893
- verified: false,
894
- confidence: 'low',
895
- evidence: 'LLM verification failed',
896
- error: err instanceof Error ? err.message : 'Unknown error'
897
- };
898
- }
899
- }
900
- /**
901
- * Generate verification test for extended claim types
902
- */
903
- export function generateExtendedVerificationTest(claim, context) {
904
- const baseResult = {
905
- claim,
906
- timestamp: new Date().toISOString()
907
- };
908
- switch (claim.type) {
909
- case 'api_response':
910
- return async () => {
911
- const url = claim.params.url;
912
- const expectedStatus = claim.params.status;
913
- const expectedBody = claim.params.body;
914
- if (!url) {
915
- return {
916
- ...baseResult,
917
- verified: false,
918
- confidence: 'low',
919
- evidence: 'No API URL provided'
920
- };
921
- }
922
- try {
923
- const { stdout } = await execAsync(`curl -s -w "\\n%{http_code}" "${url}" 2>&1`, { timeout: 15000 });
924
- const lines = stdout.trim().split('\n');
925
- const statusCode = parseInt(lines.pop() || '0', 10);
926
- const body = lines.join('\n');
927
- let verified = true;
928
- const evidenceParts = [];
929
- if (expectedStatus && statusCode !== expectedStatus) {
930
- verified = false;
931
- evidenceParts.push(`Expected status ${expectedStatus}, got ${statusCode}`);
177
+ Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanation"}`;
178
+ try {
179
+ const assessment = await llmVerifier(assessPrompt);
180
+ const match = assessment.match(/\{[\s\S]*\}/);
181
+ if (match) {
182
+ const parsed = JSON.parse(match[0]);
183
+ result.llmAssessment = `${parsed.matches ? '✅' : '❌'} [${parsed.confidence}%] ${parsed.reasoning}`;
184
+ if (!parsed.matches || parsed.confidence < 70) {
185
+ result.unmatchedPatterns.push(`behavior: ${test.expectedBehavior}`);
932
186
  }
933
187
  else {
934
- evidenceParts.push(`Status: ${statusCode}`);
935
- }
936
- if (expectedBody && !body.includes(expectedBody)) {
937
- verified = false;
938
- evidenceParts.push(`Expected body to contain "${expectedBody.slice(0, 50)}..."`);
188
+ result.matchedPatterns.push(`behavior: ${test.expectedBehavior}`);
939
189
  }
940
- return {
941
- ...baseResult,
942
- verified,
943
- confidence: 'high',
944
- evidence: evidenceParts.join('. ')
945
- };
946
190
  }
947
- catch (err) {
948
- return {
949
- ...baseResult,
950
- verified: false,
951
- confidence: 'high',
952
- evidence: 'API request failed',
953
- error: err instanceof Error ? err.message : 'Unknown error'
954
- };
955
- }
956
- };
957
- case 'env_var_set':
958
- return async () => {
959
- const varName = claim.params.name;
960
- const expectedValue = claim.params.value;
961
- if (!varName) {
962
- return {
963
- ...baseResult,
964
- verified: false,
965
- confidence: 'low',
966
- evidence: 'No environment variable name provided'
967
- };
968
- }
969
- const actualValue = process.env[varName];
970
- if (actualValue === undefined) {
971
- return {
972
- ...baseResult,
973
- verified: false,
974
- confidence: 'high',
975
- evidence: `Environment variable ${varName} is not set`
976
- };
977
- }
978
- if (expectedValue && actualValue !== expectedValue) {
979
- return {
980
- ...baseResult,
981
- verified: false,
982
- confidence: 'high',
983
- evidence: `Expected ${varName}="${expectedValue}", got "${actualValue}"`
984
- };
985
- }
986
- return {
987
- ...baseResult,
988
- verified: true,
989
- confidence: 'high',
990
- evidence: `${varName} is set${expectedValue ? ` to expected value` : `: ${actualValue.slice(0, 50)}`}`
991
- };
992
- };
993
- case 'config_changed':
994
- return async () => {
995
- const configPath = claim.params.path;
996
- const expectedKey = claim.params.key;
997
- const expectedValue = claim.params.value;
998
- if (!configPath) {
999
- return {
1000
- ...baseResult,
1001
- verified: false,
1002
- confidence: 'low',
1003
- evidence: 'No config file path provided'
1004
- };
1005
- }
1006
- try {
1007
- const resolvedPath = path.isAbsolute(configPath)
1008
- ? configPath
1009
- : path.resolve(context.workingDirectory, configPath);
1010
- const content = await fs.readFile(resolvedPath, 'utf-8');
1011
- // Try to parse as JSON
1012
- let config;
1013
- try {
1014
- config = JSON.parse(content);
1015
- }
1016
- catch {
1017
- // Not JSON, check raw content
1018
- if (expectedValue && content.includes(String(expectedValue))) {
1019
- return {
1020
- ...baseResult,
1021
- verified: true,
1022
- confidence: 'medium',
1023
- evidence: `Config file contains expected value`
1024
- };
1025
- }
1026
- return {
1027
- ...baseResult,
1028
- verified: true,
1029
- confidence: 'low',
1030
- evidence: 'Config file exists but format unknown'
1031
- };
1032
- }
1033
- if (expectedKey) {
1034
- const keys = expectedKey.split('.');
1035
- let value = config;
1036
- for (const key of keys) {
1037
- value = value?.[key];
1038
- }
1039
- if (expectedValue !== undefined) {
1040
- const matches = JSON.stringify(value) === JSON.stringify(expectedValue);
1041
- return {
1042
- ...baseResult,
1043
- verified: matches,
1044
- confidence: 'high',
1045
- evidence: matches
1046
- ? `${expectedKey} has expected value`
1047
- : `${expectedKey} = ${JSON.stringify(value)}, expected ${JSON.stringify(expectedValue)}`
1048
- };
1049
- }
1050
- return {
1051
- ...baseResult,
1052
- verified: value !== undefined,
1053
- confidence: 'high',
1054
- evidence: value !== undefined
1055
- ? `${expectedKey} exists: ${JSON.stringify(value).slice(0, 100)}`
1056
- : `${expectedKey} not found in config`
1057
- };
1058
- }
1059
- return {
1060
- ...baseResult,
1061
- verified: true,
1062
- confidence: 'medium',
1063
- evidence: 'Config file exists and is valid JSON'
1064
- };
1065
- }
1066
- catch (err) {
1067
- return {
1068
- ...baseResult,
1069
- verified: false,
1070
- confidence: 'high',
1071
- evidence: 'Could not read config file',
1072
- error: err instanceof Error ? err.message : 'Unknown error'
1073
- };
1074
- }
1075
- };
1076
- case 'error_fixed':
1077
- case 'feature_implemented':
1078
- case 'refactor_complete':
1079
- // These require semantic verification - LLM is required
1080
- return async () => {
1081
- if (!context.llmVerifier) {
1082
- return {
1083
- ...baseResult,
1084
- verified: false,
1085
- confidence: 'low',
1086
- evidence: 'Semantic verification requires LLM verifier'
1087
- };
1088
- }
1089
- return verifyClaimWithLLM(claim, context);
1090
- };
1091
- case 'data_transformed':
1092
- case 'database_updated':
1093
- case 'permission_granted':
1094
- case 'generic':
1095
- default:
1096
- // All these claim types require LLM verification
1097
- return async () => {
1098
- if (!context.llmVerifier) {
1099
- return {
1100
- ...baseResult,
1101
- verified: false,
1102
- confidence: 'low',
1103
- evidence: `${claim.type} verification requires LLM verifier`
1104
- };
1105
- }
1106
- return verifyClaimWithLLM(claim, context);
1107
- };
1108
- }
1109
- }
1110
- /**
1111
- * Comprehensive verification using LLM-based semantic analysis.
1112
- * Requires an LLM verifier - all claims are verified through LLM semantic analysis.
1113
- */
1114
- export async function verifyResponseComprehensive(response, context, responseId) {
1115
- if (!context.llmVerifier) {
1116
- return {
1117
- responseId: responseId || `response-${Date.now()}`,
1118
- timestamp: new Date().toISOString(),
1119
- claims: [],
1120
- results: [],
1121
- summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
1122
- overallVerdict: 'unverified'
1123
- };
1124
- }
1125
- // Extract ALL claims using LLM (required)
1126
- const claims = await extractClaimsWithLLM(response, context.llmVerifier);
1127
- const results = [];
1128
- for (const claim of claims) {
1129
- // ALL claims are verified via LLM semantic analysis
1130
- try {
1131
- const result = await verifyClaimWithLLM(claim, context);
1132
- results.push(result);
1133
- }
1134
- catch (err) {
1135
- results.push({
1136
- claim,
1137
- verified: false,
1138
- confidence: 'low',
1139
- evidence: 'LLM verification failed',
1140
- error: err instanceof Error ? err.message : 'Unknown error',
1141
- timestamp: new Date().toISOString()
1142
- });
191
+ }
192
+ catch {
193
+ result.llmAssessment = 'LLM assessment failed';
194
+ }
1143
195
  }
196
+ // Determine success
197
+ result.success = result.unmatchedPatterns.length === 0 &&
198
+ (result.matchedPatterns.length > 0 || (!test.expectedOutputs?.length && !test.expectedBehavior));
1144
199
  }
1145
- const verified = results.filter(r => r.verified).length;
1146
- const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1147
- const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
1148
- let overallVerdict;
1149
- if (failed > 0) {
1150
- overallVerdict = 'contradicted';
1151
- }
1152
- else if (verified === claims.length && claims.length > 0) {
1153
- overallVerdict = 'verified';
1154
- }
1155
- else if (verified > 0) {
1156
- overallVerdict = 'partially_verified';
1157
- }
1158
- else {
1159
- overallVerdict = 'unverified';
1160
- }
1161
- return {
1162
- responseId: responseId || `response-${Date.now()}`,
1163
- timestamp: new Date().toISOString(),
1164
- claims,
1165
- results,
1166
- summary: {
1167
- total: claims.length,
1168
- verified,
1169
- failed,
1170
- inconclusive
1171
- },
1172
- overallVerdict
1173
- };
1174
- }
1175
- /**
1176
- * Determine the best verification strategy for a claim
1177
- */
1178
- export function getVerificationStrategy(claim) {
1179
- switch (claim.type) {
1180
- case 'file_created':
1181
- case 'file_modified':
1182
- case 'file_deleted':
1183
- case 'content_contains':
1184
- case 'config_changed':
1185
- case 'permission_granted':
1186
- return 'filesystem';
1187
- case 'url_accessible':
1188
- case 'api_response':
1189
- case 'service_running':
1190
- return 'network';
1191
- case 'code_compiles':
1192
- case 'tests_pass':
1193
- case 'command_executed':
1194
- case 'dependency_installed':
1195
- case 'git_committed':
1196
- case 'package_published':
1197
- case 'env_var_set':
1198
- return 'runtime';
1199
- case 'error_fixed':
1200
- case 'feature_implemented':
1201
- case 'refactor_complete':
1202
- case 'data_transformed':
1203
- return 'semantic';
1204
- case 'database_updated':
1205
- return 'comparison';
1206
- case 'generic':
1207
- default:
1208
- return 'llm';
200
+ catch (err) {
201
+ result.errors = err instanceof Error ? err.message : 'Unknown error';
1209
202
  }
203
+ result.duration = Date.now() - startTime;
204
+ return result;
1210
205
  }
1211
- /**
1212
- * Prompt for LLM to generate verification code
1213
- */
1214
- const VERIFICATION_CODE_GENERATION_PROMPT = `You are a verification code generator. Given a claim that an AI assistant made, generate code to verify if the claim is TRUE.
206
+ // ============================================================================
207
+ // CLAIM EXTRACTION - LLM extracts claims from responses
208
+ // ============================================================================
209
+ const EXTRACT_CLAIMS_PROMPT = `Extract ALL verifiable claims from this AI assistant response.
1215
210
 
1216
- CLAIM TO VERIFY:
1217
- Type: {CLAIM_TYPE}
1218
- Description: {CLAIM_DESCRIPTION}
1219
- Evidence: {CLAIM_EVIDENCE}
1220
- Parameters: {CLAIM_PARAMS}
1221
-
1222
- WORKING DIRECTORY: {WORKING_DIR}
1223
-
1224
- Generate a verification test. Choose the most appropriate approach:
211
+ RESPONSE:
212
+ ---
213
+ {RESPONSE}
214
+ ---
1225
215
 
1226
- 1. SHELL COMMAND - For file operations, git, npm, system checks
1227
- 2. JAVASCRIPT - For complex logic, API calls, JSON parsing
1228
- 3. API - For HTTP endpoints, external services
216
+ CONTEXT: {CONTEXT}
217
+ WORKING_DIR: {WORKING_DIR}
1229
218
 
1230
- IMPORTANT RULES:
1231
- - Code must be READ-ONLY and NON-DESTRUCTIVE (no writes, no deletes, no modifications)
1232
- - Code must complete quickly (under 10 seconds)
1233
- - Code must output a clear result that can be parsed
1234
- - For shell: output should be parseable (exit code 0 = verified, non-zero = failed)
1235
- - For JavaScript: must export/return { verified: boolean, evidence: string }
1236
- - Do NOT use interactive commands
1237
- - Do NOT access sensitive data or credentials
219
+ For each claim, determine:
220
+ 1. What specific assertion is being made
221
+ 2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
222
+ 3. How it can be verified (shell command, file check, CLI test, etc.)
223
+ 4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)
1238
224
 
1239
- Respond with JSON:
1240
- {
1241
- "testType": "shell" | "javascript" | "api",
1242
- "code": "the verification code",
1243
- "description": "what this test does",
1244
- "expectedOutcome": "what success looks like",
1245
- "safeToRun": true | false,
1246
- "safetyReason": "why it's safe/unsafe"
1247
- }
225
+ Return JSON array:
226
+ [{
227
+ "id": "c1",
228
+ "statement": "the specific claim",
229
+ "category": "file_op|code|command|state|behavior|fact",
230
+ "verifiable": true,
231
+ "priority": "critical|high|medium|low",
232
+ "context": {"path": "/path/if/relevant", "command": "if relevant"}
233
+ }]
1248
234
 
1249
- Only output valid JSON, nothing else.`;
235
+ Output ONLY valid JSON array.`;
1250
236
  /**
1251
- * Generate verification code using LLM
237
+ * Extract claims from assistant response using LLM
1252
238
  */
1253
- export async function generateVerificationCode(claim, context) {
1254
- if (!context.llmVerifier) {
1255
- return null;
1256
- }
239
+ async function extractClaims(response, ctx) {
240
+ if (!ctx.llmVerifier)
241
+ return [];
1257
242
  try {
1258
- const prompt = VERIFICATION_CODE_GENERATION_PROMPT
1259
- .replace('{CLAIM_TYPE}', claim.type)
1260
- .replace('{CLAIM_DESCRIPTION}', claim.description)
1261
- .replace('{CLAIM_EVIDENCE}', claim.evidence)
1262
- .replace('{CLAIM_PARAMS}', JSON.stringify(claim.params, null, 2))
1263
- .replace('{WORKING_DIR}', context.workingDirectory);
1264
- const result = await context.llmVerifier(prompt);
1265
- // Parse the JSON response
1266
- const jsonMatch = result.match(/\{[\s\S]*\}/);
1267
- if (!jsonMatch) {
1268
- return null;
243
+ const prompt = EXTRACT_CLAIMS_PROMPT
244
+ .replace('{RESPONSE}', response.slice(0, 8000))
245
+ .replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '')
246
+ .replace('{WORKING_DIR}', ctx.workingDirectory);
247
+ const result = await ctx.llmVerifier(prompt);
248
+ const match = result.match(/\[[\s\S]*\]/);
249
+ if (match) {
250
+ return JSON.parse(match[0]);
1269
251
  }
1270
- const parsed = JSON.parse(jsonMatch[0]);
1271
- return {
1272
- claim,
1273
- testType: parsed.testType,
1274
- code: parsed.code,
1275
- description: parsed.description,
1276
- expectedOutcome: parsed.expectedOutcome,
1277
- safetyCheck: parsed.safeToRun
1278
- };
1279
252
  }
1280
- catch (err) {
1281
- console.error('Failed to generate verification code:', err);
1282
- return null;
253
+ catch {
254
+ // Fall through
1283
255
  }
256
+ return [];
1284
257
  }
258
+ // ============================================================================
259
+ // TEST GENERATION - LLM generates isolated tests for claims
260
+ // ============================================================================
261
+ const GENERATE_TESTS_PROMPT = `Generate isolated runtime tests for these claims.
262
+
263
+ CLAIMS:
264
+ {CLAIMS}
265
+
266
+ WORKING_DIR: {WORKING_DIR}
267
+ PLATFORM: {PLATFORM}
268
+
269
+ For each claim, generate a test that verifies it using:
270
+ - Shell commands (for file checks, git status, etc.)
271
+ - CLI commands (for testing CLI behavior in fresh instance)
272
+ - Expected output patterns
273
+
274
+ Return JSON array:
275
+ [{
276
+ "id": "test-1",
277
+ "description": "what we're testing",
278
+ "shellCommands": ["ls -la path", "cat file"],
279
+ "commands": ["/help", "some input"],
280
+ "expectedOutputs": ["pattern1", "pattern2"],
281
+ "expectedBehavior": "description for LLM assessment",
282
+ "requiresBuild": false,
283
+ "timeout": 30000
284
+ }]
285
+
286
+ Use READ-ONLY commands only. No destructive operations.
287
+ Output ONLY valid JSON array.`;
1285
288
  /**
1286
- * Safety patterns to block dangerous code
1287
- */
1288
- const DANGEROUS_PATTERNS = [
1289
- /\brm\s+-rf?\b/i, // rm commands
1290
- /\brmdir\b/i, // rmdir
1291
- /\bdd\s+if=/i, // dd (disk destroyer)
1292
- /\bmkfs\b/i, // format filesystem
1293
- /\b>\s*\/dev\//i, // write to devices
1294
- /\bchmod\s+777\b/i, // dangerous permissions
1295
- /\bsudo\b/i, // sudo commands
1296
- /\bcurl.*\|\s*sh\b/i, // pipe to shell
1297
- /\bwget.*\|\s*sh\b/i, // pipe to shell
1298
- /\beval\s*\(/i, // eval in JS
1299
- /new\s+Function\s*\(/i, // Function constructor
1300
- /child_process/i, // subprocess in JS (unless we control it)
1301
- /\bexec\s*\(/i, // exec calls
1302
- /\bspawn\s*\(/i, // spawn calls
1303
- /writeFile/i, // file writes
1304
- /appendFile/i, // file appends
1305
- /unlink\s*\(/i, // file deletion
1306
- /rmSync/i, // sync deletion
1307
- /fs\.rm/i, // fs remove
1308
- /DROP\s+TABLE/i, // SQL injection
1309
- /DELETE\s+FROM/i, // SQL deletion
1310
- /TRUNCATE/i, // SQL truncate
1311
- /;\s*--/, // SQL comment injection
1312
- /process\.exit/i, // process exit
1313
- /require\s*\(\s*['"]child/i, // require child_process
1314
- ];
1315
- /**
1316
- * Validate that generated code is safe to execute
289
+ * Generate isolated tests for claims
1317
290
  */
1318
- export function validateGeneratedCode(test) {
1319
- // First check the LLM's own safety assessment
1320
- if (!test.safetyCheck) {
1321
- return { safe: false, reason: 'LLM marked code as unsafe' };
1322
- }
1323
- // Check against dangerous patterns
1324
- for (const pattern of DANGEROUS_PATTERNS) {
1325
- if (pattern.test(test.code)) {
1326
- return {
1327
- safe: false,
1328
- reason: `Dangerous pattern detected: ${pattern.source}`
1329
- };
291
+ async function generateTests(claims, ctx) {
292
+ if (!ctx.llmVerifier || claims.length === 0)
293
+ return [];
294
+ try {
295
+ const prompt = GENERATE_TESTS_PROMPT
296
+ .replace('{CLAIMS}', JSON.stringify(claims.slice(0, 10)))
297
+ .replace('{WORKING_DIR}', ctx.workingDirectory)
298
+ .replace('{PLATFORM}', process.platform);
299
+ const result = await ctx.llmVerifier(prompt);
300
+ const match = result.match(/\[[\s\S]*\]/);
301
+ if (match) {
302
+ return JSON.parse(match[0]);
1330
303
  }
1331
304
  }
1332
- // Additional checks for shell commands
1333
- if (test.testType === 'shell') {
1334
- // Only allow specific safe commands
1335
- const safeShellPrefixes = [
1336
- 'ls', 'cat', 'head', 'tail', 'grep', 'find', 'stat', 'file',
1337
- 'test', 'echo', 'pwd', 'wc', 'diff', 'cmp',
1338
- 'git log', 'git status', 'git show', 'git diff', 'git branch',
1339
- 'npm view', 'npm list', 'npm ls',
1340
- 'node -e', 'node --eval',
1341
- 'curl -s', 'curl --silent', 'wget -q',
1342
- 'jq', 'python -c', 'python3 -c',
1343
- 'lsof', 'netstat', 'ss', 'ps',
1344
- 'which', 'type', 'command -v',
1345
- ];
1346
- const trimmedCode = test.code.trim().toLowerCase();
1347
- const startsWithSafe = safeShellPrefixes.some(prefix => trimmedCode.startsWith(prefix.toLowerCase()));
1348
- if (!startsWithSafe) {
1349
- // Check if it's a simple test/check command
1350
- if (!trimmedCode.startsWith('[') && !trimmedCode.startsWith('if ')) {
1351
- return {
1352
- safe: false,
1353
- reason: 'Shell command does not start with a known safe prefix'
1354
- };
1355
- }
305
+ catch {
306
+ // Fall through to basic tests
307
+ }
308
+ // Fallback: generate basic tests
309
+ return claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).map((c, i) => {
310
+ const test = {
311
+ id: `test-${i}`,
312
+ description: c.statement,
313
+ commands: [],
314
+ shellCommands: [],
315
+ expectedBehavior: c.statement,
316
+ timeout: 30000
317
+ };
318
+ // Add basic verification based on category
319
+ if (c.category === 'file_op' && c.context['path']) {
320
+ test.shellCommands = [`test -f "${c.context['path']}" && echo "EXISTS" || echo "NOT_FOUND"`];
321
+ test.expectedOutputs = ['EXISTS'];
1356
322
  }
1357
- }
1358
- // For JavaScript, ensure it's a simple expression
1359
- if (test.testType === 'javascript') {
1360
- // Check code length - very long code is suspicious
1361
- if (test.code.length > 2000) {
1362
- return { safe: false, reason: 'JavaScript code too long' };
323
+ else if (c.category === 'code') {
324
+ test.shellCommands = ['npm run build 2>&1 | tail -5'];
1363
325
  }
1364
- }
1365
- return { safe: true, reason: 'All safety checks passed' };
1366
- }
1367
- /**
1368
- * Execute a generated verification test
1369
- */
1370
- export async function executeGeneratedTest(test, context) {
1371
- const baseResult = {
1372
- claim: test.claim,
1373
- timestamp: new Date().toISOString()
1374
- };
1375
- // Validate safety first
1376
- const safetyResult = validateGeneratedCode(test);
1377
- if (!safetyResult.safe) {
1378
- return {
1379
- ...baseResult,
1380
- verified: false,
1381
- confidence: 'low',
1382
- evidence: `Generated test blocked: ${safetyResult.reason}`,
1383
- error: 'Safety validation failed'
1384
- };
1385
- }
1386
- try {
1387
- switch (test.testType) {
1388
- case 'shell': {
1389
- const { stdout, stderr } = await execAsync(test.code, {
1390
- cwd: context.workingDirectory,
1391
- timeout: 10000, // 10 second timeout
1392
- maxBuffer: 1024 * 1024 // 1MB max output
1393
- });
1394
- const output = (stdout + stderr).trim();
1395
- // Shell convention: exit 0 = success
1396
- return {
1397
- ...baseResult,
1398
- verified: true,
1399
- confidence: 'high',
1400
- evidence: `Test passed. Output: ${output.slice(0, 500)}`
1401
- };
1402
- }
1403
- case 'javascript': {
1404
- // Execute JavaScript in a sandboxed way using node -e
1405
- const wrappedCode = `
1406
- const result = (async () => {
1407
- ${test.code}
1408
- })();
1409
- result.then(r => console.log(JSON.stringify(r))).catch(e => {
1410
- console.log(JSON.stringify({ verified: false, evidence: e.message }));
1411
- });
1412
- `;
1413
- const { stdout } = await execAsync(`node -e ${JSON.stringify(wrappedCode)}`, {
1414
- cwd: context.workingDirectory,
1415
- timeout: 10000
1416
- });
1417
- try {
1418
- const result = JSON.parse(stdout.trim());
1419
- return {
1420
- ...baseResult,
1421
- verified: result.verified,
1422
- confidence: 'high',
1423
- evidence: result.evidence
1424
- };
1425
- }
1426
- catch {
1427
- return {
1428
- ...baseResult,
1429
- verified: false,
1430
- confidence: 'medium',
1431
- evidence: `JavaScript output: ${stdout.slice(0, 500)}`
1432
- };
1433
- }
1434
- }
1435
- case 'api': {
1436
- // For API tests, use curl
1437
- const { stdout } = await execAsync(test.code, {
1438
- cwd: context.workingDirectory,
1439
- timeout: 15000
1440
- });
1441
- // Try to parse as JSON result
1442
- try {
1443
- const result = JSON.parse(stdout.trim());
1444
- return {
1445
- ...baseResult,
1446
- verified: Boolean(result.verified ?? result.success ?? result.ok),
1447
- confidence: 'high',
1448
- evidence: `API response: ${JSON.stringify(result).slice(0, 500)}`
1449
- };
1450
- }
1451
- catch {
1452
- // Non-JSON response - check for success indicators
1453
- const isSuccess = stdout.includes('200') || stdout.includes('success') || stdout.includes('ok');
1454
- return {
1455
- ...baseResult,
1456
- verified: isSuccess,
1457
- confidence: 'medium',
1458
- evidence: `API output: ${stdout.slice(0, 500)}`
1459
- };
1460
- }
1461
- }
1462
- default:
1463
- return {
1464
- ...baseResult,
1465
- verified: false,
1466
- confidence: 'low',
1467
- evidence: `Unknown test type: ${test.testType}`
1468
- };
326
+ else if (c.category === 'behavior') {
327
+ test.commands = ['/help'];
1469
328
  }
1470
- }
1471
- catch (err) {
1472
- // Command failed (non-zero exit) = verification failed
1473
- return {
1474
- ...baseResult,
1475
- verified: false,
1476
- confidence: 'high',
1477
- evidence: `Test failed: ${err instanceof Error ? err.message : 'Unknown error'}`,
1478
- error: err instanceof Error ? err.message : 'Unknown error'
1479
- };
1480
- }
329
+ return test;
330
+ });
1481
331
  }
332
+ // ============================================================================
333
+ // MAIN VERIFICATION API
334
+ // ============================================================================
1482
335
  /**
1483
- * Verify a claim using LLM-generated runtime test
336
+ * Verify an assistant response using isolated runtime tests.
337
+ * This is the main entry point for verification.
1484
338
  */
1485
- export async function verifyWithGeneratedTest(claim, context) {
1486
- const baseResult = {
1487
- claim,
1488
- timestamp: new Date().toISOString()
1489
- };
1490
- // Generate verification code
1491
- const test = await generateVerificationCode(claim, context);
1492
- if (!test) {
339
+ export async function verifyResponse(response, ctx, responseId) {
340
+ const timestamp = new Date().toISOString();
341
+ const id = responseId || `verify-${Date.now()}`;
342
+ // Extract claims from response
343
+ const claims = await extractClaims(response, ctx);
344
+ if (claims.length === 0) {
1493
345
  return {
1494
- ...baseResult,
1495
- verified: false,
1496
- confidence: 'low',
1497
- evidence: 'Failed to generate verification test'
346
+ responseId: id,
347
+ timestamp,
348
+ claims: [],
349
+ results: [],
350
+ summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
351
+ overallVerdict: 'unverified',
352
+ trustScore: 50
1498
353
  };
1499
354
  }
1500
- // Execute the generated test
1501
- return executeGeneratedTest(test, context);
1502
- }
1503
- /**
1504
- * Full verification using LLM-generated tests
1505
- * This is the most powerful verification method - LLM decides HOW to verify each claim
1506
- */
1507
- export async function verifyResponseWithGeneratedTests(response, context, responseId) {
1508
- // Extract claims using LLM
1509
- const claims = context.llmVerifier
1510
- ? await extractClaimsWithLLM(response, context.llmVerifier)
1511
- : extractClaims(response);
1512
- const results = [];
1513
- for (const claim of claims) {
1514
- // For each claim, generate and run a custom verification test
1515
- const result = await verifyWithGeneratedTest(claim, context);
1516
- results.push(result);
1517
- }
1518
- const verified = results.filter(r => r.verified).length;
1519
- const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1520
- const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
1521
- let overallVerdict;
1522
- if (failed > 0) {
1523
- overallVerdict = 'contradicted';
1524
- }
1525
- else if (verified === claims.length && claims.length > 0) {
1526
- overallVerdict = 'verified';
1527
- }
1528
- else if (verified > 0) {
1529
- overallVerdict = 'partially_verified';
1530
- }
1531
- else {
1532
- overallVerdict = 'unverified';
1533
- }
1534
- return {
1535
- responseId: responseId || `response-${Date.now()}`,
1536
- timestamp: new Date().toISOString(),
1537
- claims,
1538
- results,
1539
- summary: {
1540
- total: claims.length,
1541
- verified,
1542
- failed,
1543
- inconclusive
1544
- },
1545
- overallVerdict
1546
- };
1547
- }
1548
- /**
1549
- * Hybrid verification - uses generated tests when available, falls back to predefined tests
1550
- */
1551
- export async function verifyResponseHybrid(response, context, responseId) {
1552
- const claims = context.llmVerifier
1553
- ? await extractClaimsWithLLM(response, context.llmVerifier)
1554
- : extractClaims(response);
1555
- const results = [];
1556
- for (const claim of claims) {
1557
- let result;
1558
- // Try LLM-generated test first if LLM is available
1559
- if (context.llmVerifier) {
1560
- const generatedTest = await generateVerificationCode(claim, context);
1561
- if (generatedTest) {
1562
- const safety = validateGeneratedCode(generatedTest);
1563
- if (safety.safe) {
1564
- // Use generated test
1565
- result = await executeGeneratedTest(generatedTest, context);
1566
- results.push(result);
1567
- continue;
1568
- }
1569
- }
1570
- }
1571
- // Fall back to predefined verification
1572
- const standardTypes = [
1573
- 'file_created', 'file_modified', 'file_deleted', 'code_compiles',
1574
- 'tests_pass', 'git_committed', 'package_published', 'command_executed',
1575
- 'dependency_installed', 'service_running', 'url_accessible', 'content_contains'
1576
- ];
1577
- let test;
1578
- if (standardTypes.includes(claim.type)) {
1579
- test = generateVerificationTest(claim);
1580
- }
1581
- else {
1582
- test = generateExtendedVerificationTest(claim, context);
1583
- }
1584
- try {
1585
- result = await test();
1586
- }
1587
- catch (err) {
1588
- result = {
355
+ // Generate isolated tests for claims
356
+ const tests = await generateTests(claims, ctx);
357
+ // Run all isolated tests
358
+ const testResults = [];
359
+ for (const test of tests) {
360
+ const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
361
+ testResults.push(result);
362
+ }
363
+ // Map test results back to claims
364
+ const results = claims.map((claim, i) => {
365
+ const testResult = testResults[i];
366
+ if (!testResult) {
367
+ return {
1589
368
  claim,
1590
369
  verified: false,
1591
370
  confidence: 'low',
1592
- evidence: 'Verification failed',
1593
- error: err instanceof Error ? err.message : 'Unknown error',
1594
- timestamp: new Date().toISOString()
371
+ evidence: 'No test generated',
372
+ method: 'skip',
373
+ timestamp
1595
374
  };
1596
375
  }
1597
- results.push(result);
1598
- }
376
+ return {
377
+ claim,
378
+ verified: testResult.success,
379
+ confidence: testResult.success ? 'high' : (testResult.matchedPatterns.length > 0 ? 'medium' : 'low'),
380
+ evidence: testResult.success
381
+ ? `Verified in isolated runtime: ${testResult.matchedPatterns.join(', ')}`
382
+ : `Failed: ${testResult.unmatchedPatterns.join(', ')}`,
383
+ method: 'isolated-runtime',
384
+ reasoning: testResult.llmAssessment,
385
+ executedCode: [...(testResult.test.shellCommands || []), ...(testResult.test.commands || [])].join('\n'),
386
+ rawOutput: testResult.output.slice(0, 2000),
387
+ error: testResult.errors || undefined,
388
+ timestamp
389
+ };
390
+ });
391
+ // Calculate summary
1599
392
  const verified = results.filter(r => r.verified).length;
1600
393
  const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1601
394
  const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
395
+ // Determine verdict
1602
396
  let overallVerdict;
1603
397
  if (failed > 0) {
1604
398
  overallVerdict = 'contradicted';
@@ -1612,165 +406,69 @@ export async function verifyResponseHybrid(response, context, responseId) {
1612
406
  else {
1613
407
  overallVerdict = 'unverified';
1614
408
  }
409
+ // Calculate trust score
410
+ const trustScore = claims.length > 0
411
+ ? Math.round((verified / claims.length) * 100)
412
+ : 50;
1615
413
  return {
1616
- responseId: responseId || `response-${Date.now()}`,
1617
- timestamp: new Date().toISOString(),
414
+ responseId: id,
415
+ timestamp,
1618
416
  claims,
1619
417
  results,
1620
- summary: {
1621
- total: claims.length,
1622
- verified,
1623
- failed,
1624
- inconclusive
1625
- },
1626
- overallVerdict
418
+ summary: { total: claims.length, verified, failed, inconclusive },
419
+ overallVerdict,
420
+ trustScore
1627
421
  };
1628
422
  }
1629
- const UNIVERSAL_EXTRACT = `Extract ALL verifiable claims from this AI response. Include explicit claims, implicit claims, state changes, results, assertions.
423
+ /**
424
+ * Format verification report for display
425
+ */
426
+ export function formatVerificationReport(report) {
427
+ const bar = '█'.repeat(Math.round(report.trustScore / 10)) + '░'.repeat(10 - Math.round(report.trustScore / 10));
428
+ const icon = report.trustScore >= 80 ? '✅' : report.trustScore >= 50 ? '⚠️' : '❌';
429
+ let out = `╔════════════════════════════════════════════════════════════╗
430
+ ║ ISOLATED RUNTIME VERIFICATION REPORT ║
431
+ ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•
1630
432
 
1631
- RESPONSE:
1632
- ---
1633
- {RESPONSE}
1634
- ---
1635
- CONTEXT: {CONTEXT}
1636
- DIR: {WORKING_DIR}
433
+ `;
434
+ out += `Trust: ${icon} ${report.trustScore}/100 [${bar}]
435
+ Verdict: ${report.overallVerdict.toUpperCase()}
1637
436
 
1638
- Return JSON array: [{"id":"c1","statement":"claim","category":"file_op|code|state|data|behavior|fact|other","verifiable":true/false,"verificationApproach":"how","priority":"critical|high|medium|low","context":{}}]
1639
- Output ONLY valid JSON.`;
1640
- const UNIVERSAL_GEN = `Generate verification code for: {STATEMENT}
1641
- Category: {CATEGORY} | Approach: {APPROACH} | Context: {CONTEXT} | Dir: {WORKING_DIR} | Platform: {PLATFORM}
437
+ Claims: ${report.summary.total} | ✅ ${report.summary.verified} | ❌ ${report.summary.failed} | ❓ ${report.summary.inconclusive}
1642
438
 
1643
- Use shell/javascript/python. READ-ONLY only.
1644
- Return JSON: {"steps":[{"type":"shell|javascript|python","code":"code","desc":"what"}],"success":"success criteria","failure":"failure criteria","confPass":0-100,"confFail":0-100,"safe":{"ok":true/false,"why":"reason"}}
1645
- Output ONLY valid JSON.`;
1646
- const UNIVERSAL_ASSESS = `Assess: RESPONSE:{RESPONSE} CLAIMS:{CLAIMS} RESULTS:{RESULTS}
1647
- Return JSON: {"trust":0-100,"summary":"text","concerns":[]}
1648
- Output ONLY valid JSON.`;
1649
- const UNSAFE = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /exec\s*\(/i, /child_process/i, /os\.system/i, /subprocess/i, /curl.*\|.*sh/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i, /kill/i];
1650
- export function validateUniversalCode(c) {
1651
- for (const p of UNSAFE)
1652
- if (p.test(c))
1653
- return { safe: false, reason: p.source };
1654
- return c.length > 5000 ? { safe: false, reason: 'too long' } : { safe: true, reason: 'ok' };
1655
- }
1656
- async function runUniversalStep(s, cwd) {
1657
- const v = validateUniversalCode(s.code);
1658
- if (!v.safe)
1659
- return { ok: false, out: v.reason };
1660
- try {
1661
- if (s.type === 'shell') {
1662
- const { stdout, stderr } = await execAsync(s.code, { cwd, timeout: 30000, maxBuffer: 5 * 1024 * 1024 });
1663
- return { ok: true, out: stdout + stderr };
1664
- }
1665
- if (s.type === 'javascript') {
1666
- const w = `(async()=>{try{const fs=require('fs').promises;const r=await(async()=>{${s.code}})();console.log(JSON.stringify({ok:1,r}))}catch(e){console.log(JSON.stringify({ok:0,e:e.message}))}})()`;
1667
- const { stdout } = await execAsync(`node -e ${JSON.stringify(w)}`, { cwd, timeout: 30000 });
1668
- return { ok: true, out: stdout };
1669
- }
1670
- if (s.type === 'python') {
1671
- const { stdout, stderr } = await execAsync(`python3 -c ${JSON.stringify(s.code)}`, { cwd, timeout: 30000 });
1672
- return { ok: true, out: stdout + stderr };
1673
- }
1674
- return { ok: false, out: 'unknown type' };
1675
- }
1676
- catch (e) {
1677
- return { ok: false, out: e instanceof Error ? e.message : 'err' };
1678
- }
1679
- }
1680
- export async function extractUniversalClaims(r, ctx) {
1681
- if (!ctx.llmVerifier)
1682
- return extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
1683
- try {
1684
- const p = UNIVERSAL_EXTRACT.replace('{RESPONSE}', r.slice(0, 8000)).replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '').replace('{WORKING_DIR}', ctx.workingDirectory);
1685
- const res = await ctx.llmVerifier(p);
1686
- const m = res.match(/\[[\s\S]*\]/);
1687
- if (m)
1688
- return JSON.parse(m[0]);
1689
- }
1690
- catch { /* fall through */ }
1691
- return extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
1692
- }
1693
- export async function verifyUniversalClaim(claim, ctx) {
1694
- const base = { claim, timestamp: new Date().toISOString() };
1695
- if (!claim.verifiable)
1696
- return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'Not verifiable', reasoning: 'Cannot verify' };
1697
- if (!ctx.llmVerifier)
1698
- return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'No LLM', reasoning: 'Needs LLM' };
1699
- try {
1700
- const p = UNIVERSAL_GEN.replace('{STATEMENT}', claim.statement).replace('{CATEGORY}', claim.category).replace('{APPROACH}', claim.verificationApproach).replace('{CONTEXT}', JSON.stringify(claim.context)).replace('{WORKING_DIR}', ctx.workingDirectory).replace('{PLATFORM}', process.platform);
1701
- const res = await ctx.llmVerifier(p);
1702
- const m = res.match(/\{[\s\S]*\}/);
1703
- if (!m)
1704
- throw new Error('bad');
1705
- const plan = JSON.parse(m[0]);
1706
- if (!plan.safe.ok)
1707
- return { ...base, verified: false, confidence: 0, method: 'blocked', evidence: plan.safe.why, reasoning: 'Unsafe' };
1708
- let allOk = true, out = '', code = '';
1709
- for (const s of plan.steps) {
1710
- code += s.code + '\n';
1711
- const r = await runUniversalStep(s, ctx.workingDirectory);
1712
- out += r.out + '\n';
1713
- if (!r.ok)
1714
- allOk = false;
439
+ `;
440
+ out += `🔬 ISOLATED RUNTIME TESTS:\n`;
441
+ for (const r of report.results.slice(0, 8)) {
442
+ const statusIcon = r.verified ? '✅' : r.confidence === 'high' ? '❌' : '❓';
443
+ out += ` ${statusIcon} [${r.confidence}] ${r.claim.statement.slice(0, 50)}...\n`;
444
+ if (r.reasoning) {
445
+ out += ` └─ ${r.reasoning.slice(0, 60)}\n`;
1715
446
  }
1716
- return { ...base, verified: allOk, confidence: allOk ? plan.confPass : plan.confFail, method: plan.steps.map(s => s.type).join('+'), evidence: allOk ? plan.success : plan.failure, reasoning: allOk ? 'All passed' : 'Some failed', executedCode: code, rawOutput: out.slice(0, 2000) };
1717
447
  }
1718
- catch (e) {
1719
- return { ...base, verified: false, confidence: 10, method: 'error', evidence: 'Failed', reasoning: e instanceof Error ? e.message : 'err' };
448
+ if (report.results.length > 8) {
449
+ out += ` ... +${report.results.length - 8} more\n`;
1720
450
  }
451
+ return out;
1721
452
  }
1722
- export async function verifyResponseUniversal(response, ctx, id) {
1723
- const claims = await extractUniversalClaims(response, ctx);
1724
- const results = [];
1725
- for (const c of claims)
1726
- results.push(c.verifiable || c.priority === 'critical' || c.priority === 'high' ? await verifyUniversalClaim(c, ctx) : { claim: c, verified: false, confidence: 0, method: 'skip', evidence: 'Low priority', reasoning: 'Skipped', timestamp: new Date().toISOString() });
1727
- const vClaims = claims.filter(c => c.verifiable).length;
1728
- const verified = results.filter(r => r.verified).length;
1729
- const failed = results.filter(r => !r.verified && r.confidence > 50).length;
1730
- const inconclusive = results.filter(r => !r.verified && r.confidence <= 50 && r.method !== 'skip').length;
1731
- const avgConf = results.length ? results.reduce((s, r) => s + r.confidence, 0) / results.length : 0;
1732
- let assessment = '', trust = 0;
1733
- if (ctx.llmVerifier)
1734
- try {
1735
- const p = UNIVERSAL_ASSESS.replace('{RESPONSE}', response.slice(0, 4000)).replace('{CLAIMS}', JSON.stringify(claims.slice(0, 15))).replace('{RESULTS}', JSON.stringify(results.slice(0, 15)));
1736
- const r = await ctx.llmVerifier(p);
1737
- const m = r.match(/\{[\s\S]*\}/);
1738
- if (m) {
1739
- const a = JSON.parse(m[0]);
1740
- trust = a.trust;
1741
- assessment = a.summary + (a.concerns?.length ? ` Concerns: ${a.concerns.join('; ')}` : '');
1742
- }
1743
- }
1744
- catch {
1745
- trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
1746
- assessment = `${verified}/${vClaims} verified`;
1747
- }
1748
- else {
1749
- trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
1750
- assessment = `${verified}/${vClaims} verified`;
453
+ /**
454
+ * Quick verification - verify only critical/high priority claims
455
+ */
456
+ export async function quickVerify(response, ctx) {
457
+ const claims = await extractClaims(response, ctx);
458
+ const critical = claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).slice(0, 3);
459
+ if (critical.length === 0) {
460
+ return { trustScore: 50, summary: 'No critical claims to verify' };
461
+ }
462
+ const tests = await generateTests(critical, ctx);
463
+ let verified = 0;
464
+ for (const test of tests) {
465
+ const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
466
+ if (result.success)
467
+ verified++;
1751
468
  }
1752
- return { responseId: id || `u-${Date.now()}`, originalResponse: response, timestamp: new Date().toISOString(), claims, results, summary: { totalClaims: claims.length, verifiableClaims: vClaims, verified, failed, inconclusive, averageConfidence: Math.round(avgConf) }, overallAssessment: assessment, trustScore: trust };
1753
- }
1754
- export async function quickUniversalVerify(r, ctx) {
1755
- const claims = await extractUniversalClaims(r, ctx);
1756
- const crit = claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).slice(0, 5);
1757
- if (!crit.length)
1758
- return { trustScore: 50, summary: 'No critical claims' };
1759
- let v = 0;
1760
- for (const c of crit)
1761
- if ((await verifyUniversalClaim(c, ctx)).verified)
1762
- v++;
1763
- return { trustScore: Math.round(v / crit.length * 100), summary: `${v}/${crit.length} critical verified` };
1764
- }
1765
- export function formatUniversalReport(r) {
1766
- const bar = '█'.repeat(Math.round(r.trustScore / 10)) + '░'.repeat(10 - Math.round(r.trustScore / 10));
1767
- const icon = r.trustScore >= 80 ? '✅' : r.trustScore >= 50 ? '⚠️' : '❌';
1768
- let out = `╔════════════════════════════════════════════════════════════╗\nā•‘ UNIVERSAL VERIFICATION REPORT ā•‘\nā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•\n\n`;
1769
- out += `Trust: ${icon} ${r.trustScore}/100 [${bar}]\n${r.overallAssessment}\n\nClaims: ${r.summary.totalClaims} | ✅ ${r.summary.verified} | ❌ ${r.summary.failed} | ❓ ${r.summary.inconclusive}\n\n`;
1770
- for (const x of r.results.slice(0, 8))
1771
- out += `${x.verified ? '✅' : x.confidence > 50 ? '❌' : '❓'} [${x.confidence}%] ${x.claim.statement.slice(0, 55)}...\n`;
1772
- if (r.results.length > 8)
1773
- out += `... +${r.results.length - 8} more\n`;
1774
- return out;
469
+ return {
470
+ trustScore: Math.round((verified / critical.length) * 100),
471
+ summary: `${verified}/${critical.length} critical claims verified`
472
+ };
1775
473
  }
1776
474
  //# sourceMappingURL=responseVerifier.js.map