erosolar-cli 1.7.24 → 1.7.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,8 @@
1
1
  /**
2
- * AI Response Verification System
2
+ * AI Response Verification System - Isolated Runtime Only
3
3
  *
4
- * Automatically verifies assistant claims by:
5
- * 1. Extracting verifiable claims from responses
6
- * 2. Generating runtime verification tests
7
- * 3. Executing tests to verify claims
8
- * 4. Reporting verification results
4
+ * Verifies assistant claims by spawning fresh CLI instances and running
5
+ * actual runtime tests. All verification happens in isolation.
9
6
  *
10
7
  * @license MIT
11
8
  */
@@ -14,6 +11,9 @@ import { promisify } from 'node:util';
14
11
  import * as fs from 'node:fs/promises';
15
12
  import * as path from 'node:path';
16
13
  const execAsync = promisify(exec);
14
+ // ============================================================================
15
+ // ISOLATED RUNTIME - Core Functions
16
+ // ============================================================================
17
17
  /**
18
18
  * Spawns a fresh isolated erosolar-cli instance for testing
19
19
  */
@@ -45,7 +45,7 @@ async function spawnIsolatedCLI(cwd, timeout = 60000) {
45
45
  errors += `\nTimeout after ${timeout}ms`;
46
46
  }, timeout);
47
47
  child.on('close', () => clearTimeout(timeoutId));
48
- // Wait for startup (look for prompt or give it 2 seconds)
48
+ // Wait for startup
49
49
  await new Promise(resolve => {
50
50
  const checkStartup = setInterval(() => {
51
51
  if (output.includes('erosolar') || output.includes('>') || output.length > 100) {
@@ -69,7 +69,6 @@ async function spawnIsolatedCLI(cwd, timeout = 60000) {
69
69
  async function sendCommand(cli, command, waitMs = 5000) {
70
70
  const outputBefore = cli.output.length;
71
71
  cli.stdin.write(command + '\n');
72
- // Wait for output to stabilize
73
72
  await new Promise(resolve => {
74
73
  let lastLength = cli.output.length;
75
74
  const checkInterval = setInterval(() => {
@@ -86,7 +85,25 @@ async function sendCommand(cli, command, waitMs = 5000) {
86
85
  return cli.output.slice(outputBefore);
87
86
  }
88
87
  /**
89
- * Runs an isolated runtime test in a fresh CLI instance
88
+ * Run a shell command for verification (file checks, etc.)
89
+ */
90
+ async function runShellVerification(cmd, cwd) {
91
+ // Safety check - block dangerous commands
92
+ const dangerous = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i];
93
+ for (const p of dangerous) {
94
+ if (p.test(cmd))
95
+ return { ok: false, out: `Blocked dangerous command: ${p.source}` };
96
+ }
97
+ try {
98
+ const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
99
+ return { ok: true, out: stdout + stderr };
100
+ }
101
+ catch (e) {
102
+ return { ok: false, out: e instanceof Error ? e.message : 'Command failed' };
103
+ }
104
+ }
105
+ /**
106
+ * Runs an isolated runtime test
90
107
  */
91
108
  export async function runIsolatedTest(test, cwd, llmVerifier) {
92
109
  const startTime = Date.now();
@@ -112,19 +129,29 @@ export async function runIsolatedTest(test, cwd, llmVerifier) {
112
129
  return result;
113
130
  }
114
131
  }
115
- // Spawn fresh CLI
116
- const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
117
- // Execute each command
118
- for (const cmd of test.commands) {
119
- const cmdOutput = await sendCommand(cli, cmd);
120
- result.output += `> ${cmd}\n${cmdOutput}\n`;
132
+ // Run shell commands first if any (file checks, etc.)
133
+ if (test.shellCommands && test.shellCommands.length > 0) {
134
+ for (const cmd of test.shellCommands) {
135
+ const shellResult = await runShellVerification(cmd, cwd);
136
+ result.output += `$ ${cmd}\n${shellResult.out}\n`;
137
+ if (!shellResult.ok) {
138
+ result.errors += shellResult.out + '\n';
139
+ }
140
+ }
141
+ }
142
+ // Run CLI commands if any
143
+ if (test.commands && test.commands.length > 0) {
144
+ const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
145
+ for (const cmd of test.commands) {
146
+ const cmdOutput = await sendCommand(cli, cmd);
147
+ result.output += `> ${cmd}\n${cmdOutput}\n`;
148
+ }
149
+ cli.stdin.write('/quit\n');
150
+ await new Promise(resolve => setTimeout(resolve, 500));
151
+ cli.process.kill('SIGTERM');
152
+ result.exitCode = await cli.exitPromise;
153
+ result.errors += cli.errors;
121
154
  }
122
- // Gracefully exit
123
- cli.stdin.write('/quit\n');
124
- await new Promise(resolve => setTimeout(resolve, 500));
125
- cli.process.kill('SIGTERM');
126
- result.exitCode = await cli.exitPromise;
127
- result.errors = cli.errors;
128
155
  // Check expected output patterns
129
156
  if (test.expectedOutputs) {
130
157
  for (const pattern of test.expectedOutputs) {
@@ -136,13 +163,13 @@ export async function runIsolatedTest(test, cwd, llmVerifier) {
136
163
  }
137
164
  }
138
165
  }
139
- // LLM assessment of behavior if specified
166
+ // LLM assessment of behavior
140
167
  if (test.expectedBehavior && llmVerifier) {
141
- const assessPrompt = `Assess if this CLI output demonstrates the expected behavior.
168
+ const assessPrompt = `Assess if this output demonstrates the expected behavior.
142
169
 
143
- EXPECTED BEHAVIOR: ${test.expectedBehavior}
170
+ EXPECTED: ${test.expectedBehavior}
144
171
 
145
- CLI OUTPUT:
172
+ OUTPUT:
146
173
  ---
147
174
  ${result.output.slice(0, 4000)}
148
175
  ---
@@ -168,7 +195,7 @@ Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanat
168
195
  }
169
196
  // Determine success
170
197
  result.success = result.unmatchedPatterns.length === 0 &&
171
- (result.matchedPatterns.length > 0 || !test.expectedOutputs?.length);
198
+ (result.matchedPatterns.length > 0 || (!test.expectedOutputs?.length && !test.expectedBehavior));
172
199
  }
173
200
  catch (err) {
174
201
  result.errors = err instanceof Error ? err.message : 'Unknown error';
@@ -176,1212 +203,196 @@ Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanat
176
203
  result.duration = Date.now() - startTime;
177
204
  return result;
178
205
  }
179
- /**
180
- * Generates isolated runtime tests for self-referential claims
181
- * (claims about erosolar-cli's own behavior/features)
182
- */
183
- export async function generateIsolatedTests(claims, llmVerifier) {
184
- const selfClaims = claims.filter(c => c.statement.toLowerCase().includes('erosolar') ||
185
- c.statement.toLowerCase().includes('cli') ||
186
- c.statement.toLowerCase().includes('command') ||
187
- c.statement.toLowerCase().includes('feature') ||
188
- c.category === 'behavior' ||
189
- c.category === 'feature');
190
- if (selfClaims.length === 0)
191
- return [];
192
- const prompt = `Generate isolated CLI tests for these claims about erosolar-cli behavior.
206
+ // ============================================================================
207
+ // CLAIM EXTRACTION - LLM extracts claims from responses
208
+ // ============================================================================
209
+ const EXTRACT_CLAIMS_PROMPT = `Extract ALL verifiable claims from this AI assistant response.
193
210
 
194
- CLAIMS:
195
- ${selfClaims.map((c, i) => `${i + 1}. ${c.statement}`).join('\n')}
211
+ RESPONSE:
212
+ ---
213
+ {RESPONSE}
214
+ ---
215
+
216
+ CONTEXT: {CONTEXT}
217
+ WORKING_DIR: {WORKING_DIR}
196
218
 
197
- For each claim, generate a test that:
198
- 1. Spawns a fresh CLI instance
199
- 2. Sends commands to test the claimed behavior
200
- 3. Checks expected output patterns
219
+ For each claim, determine:
220
+ 1. What specific assertion is being made
221
+ 2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
222
+ 3. How it can be verified (shell command, file check, CLI test, etc.)
223
+ 4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)
201
224
 
202
225
  Return JSON array:
203
226
  [{
204
- "id": "test-1",
205
- "description": "what we're testing",
206
- "commands": ["command1", "command2"],
207
- "expectedOutputs": ["pattern1", "pattern2"],
208
- "expectedBehavior": "description of expected behavior",
209
- "requiresBuild": false,
210
- "timeout": 30000
227
+ "id": "c1",
228
+ "statement": "the specific claim",
229
+ "category": "file_op|code|command|state|behavior|fact",
230
+ "verifiable": true,
231
+ "priority": "critical|high|medium|low",
232
+ "context": {"path": "/path/if/relevant", "command": "if relevant"}
211
233
  }]
212
234
 
213
235
  Output ONLY valid JSON array.`;
236
+ /**
237
+ * Extract claims from assistant response using LLM
238
+ */
239
+ async function extractClaims(response, ctx) {
240
+ if (!ctx.llmVerifier)
241
+ return [];
214
242
  try {
215
- const response = await llmVerifier(prompt);
216
- const match = response.match(/\[[\s\S]*\]/);
243
+ const prompt = EXTRACT_CLAIMS_PROMPT
244
+ .replace('{RESPONSE}', response.slice(0, 8000))
245
+ .replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '')
246
+ .replace('{WORKING_DIR}', ctx.workingDirectory);
247
+ const result = await ctx.llmVerifier(prompt);
248
+ const match = result.match(/\[[\s\S]*\]/);
217
249
  if (match) {
218
250
  return JSON.parse(match[0]);
219
251
  }
220
252
  }
221
253
  catch {
222
- // Fall through to manual generation
254
+ // Fall through
223
255
  }
224
- // Fallback: generate basic tests for self-claims
225
- return selfClaims.map((c, i) => ({
226
- id: `iso-${i}`,
227
- description: c.statement,
228
- commands: ['/help'], // Basic smoke test
229
- expectedBehavior: c.statement,
230
- timeout: 30000
231
- }));
256
+ return [];
232
257
  }
233
- /**
234
- * Runs all isolated tests and returns aggregated results
235
- */
236
- export async function runIsolatedVerification(claims, cwd, llmVerifier) {
237
- if (!llmVerifier) {
238
- return { tests: [], summary: { total: 0, passed: 0, failed: 0 }, allPassed: true };
239
- }
240
- const tests = await generateIsolatedTests(claims, llmVerifier);
241
- const results = [];
242
- for (const test of tests) {
243
- const result = await runIsolatedTest(test, cwd, llmVerifier);
244
- results.push(result);
245
- }
246
- const passed = results.filter(r => r.success).length;
247
- const failed = results.filter(r => !r.success).length;
248
- return {
249
- tests: results,
250
- summary: { total: tests.length, passed, failed },
251
- allPassed: failed === 0
252
- };
253
- }
254
- /**
255
- * LLM-based claim extraction prompt.
256
- * Used when pattern matching isn't sufficient.
257
- */
258
- const CLAIM_EXTRACTION_PROMPT = `Analyze this assistant response and extract ALL verifiable claims - anything the assistant claims to have done or accomplished.
259
-
260
- For each claim, identify:
261
- 1. Type: One of these claim types:
262
-
263
- FILE OPERATIONS:
264
- - file_created: A new file was created
265
- - file_modified: An existing file was changed
266
- - file_deleted: A file was removed
267
- - content_contains: A file contains specific content
268
-
269
- BUILD/TEST:
270
- - code_compiles: Code builds/compiles without errors
271
- - tests_pass: Tests run successfully
272
-
273
- VERSION CONTROL:
274
- - git_committed: Changes were committed to git
275
- - package_published: Package was published to npm
276
-
277
- SYSTEM:
278
- - command_executed: A shell command was run
279
- - dependency_installed: A package/dependency was installed
280
- - service_running: A service/server is running (on a port)
281
- - url_accessible: A URL is accessible/working
282
- - env_var_set: Environment variable was set
283
-
284
- CONFIGURATION:
285
- - config_changed: Configuration file was updated
286
- - permission_granted: File permissions were changed
287
-
288
- API/DATA:
289
- - api_response: API returned expected response
290
- - database_updated: Database record was modified
291
- - data_transformed: Data was transformed correctly
292
-
293
- SEMANTIC (require deeper analysis):
294
- - error_fixed: An error/bug was fixed
295
- - feature_implemented: A feature was implemented
296
- - refactor_complete: Code was refactored
297
-
298
- CATCH-ALL:
299
- - generic: Any other verifiable claim not covered above
258
+ // ============================================================================
259
+ // TEST GENERATION - LLM generates isolated tests for claims
260
+ // ============================================================================
261
+ const GENERATE_TESTS_PROMPT = `Generate isolated runtime tests for these claims.
300
262
 
301
- 2. The specific details needed to verify the claim
302
-
303
- Return a JSON array of claims. Each claim should have:
304
- - type: one of the types above
305
- - description: human readable description of what was claimed
306
- - params: Object with relevant fields:
307
- - path: file path (for file operations)
308
- - command: shell command (for command_executed)
309
- - version: version number (for package_published)
310
- - hash: git commit hash (for git_committed)
311
- - package: package name (for dependency_installed)
312
- - port: port number (for service_running)
313
- - name: process/service name (for service_running, env_var_set)
314
- - url: URL (for url_accessible, api_response)
315
- - content: text to search for (for content_contains)
316
- - key: config key path like "server.port" (for config_changed)
317
- - value: expected value (for config_changed, env_var_set)
318
- - status: HTTP status code (for api_response)
319
- - body: expected response body (for api_response)
320
- - mode: file permission mode like "755" (for permission_granted)
321
- - checkCommand: command that can verify the claim (for database_updated)
263
+ CLAIMS:
264
+ {CLAIMS}
322
265
 
323
- IMPORTANT: Extract ALL claims, including semantic ones like "I fixed the bug" or "The feature is now working". Use the 'generic' type for claims that don't fit other categories.
266
+ WORKING_DIR: {WORKING_DIR}
267
+ PLATFORM: {PLATFORM}
324
268
 
325
- Only skip opinions ("I think..."), explanations of what code does, or future plans ("will do", "should work").
269
+ For each claim, generate a test that verifies it using:
270
+ - Shell commands (for file checks, git status, etc.)
271
+ - CLI commands (for testing CLI behavior in fresh instance)
272
+ - Expected output patterns
326
273
 
327
- Response to analyze:
328
- ---
329
- {RESPONSE}
330
- ---
274
+ Return JSON array:
275
+ [{
276
+ "id": "test-1",
277
+ "description": "what we're testing",
278
+ "shellCommands": ["ls -la path", "cat file"],
279
+ "commands": ["/help", "some input"],
280
+ "expectedOutputs": ["pattern1", "pattern2"],
281
+ "expectedBehavior": "description for LLM assessment",
282
+ "requiresBuild": false,
283
+ "timeout": 30000
284
+ }]
331
285
 
332
- Return ONLY valid JSON array, no other text.`;
286
+ Use READ-ONLY commands only. No destructive operations.
287
+ Output ONLY valid JSON array.`;
333
288
  /**
334
- * Extract claims using LLM (for complex responses).
335
- * Falls back to pattern matching if LLM extraction fails.
289
+ * Generate isolated tests for claims
336
290
  */
337
- export async function extractClaimsWithLLM(response, llmCall) {
338
- if (!llmCall) {
339
- // No LLM available, use pattern matching
340
- return extractClaims(response);
341
- }
291
+ async function generateTests(claims, ctx) {
292
+ if (!ctx.llmVerifier || claims.length === 0)
293
+ return [];
342
294
  try {
343
- const prompt = CLAIM_EXTRACTION_PROMPT.replace('{RESPONSE}', response.slice(0, 4000));
344
- const result = await llmCall(prompt);
345
- // Parse JSON response
346
- const jsonMatch = result.match(/\[[\s\S]*\]/);
347
- if (!jsonMatch) {
348
- return extractClaims(response);
349
- }
350
- const parsed = JSON.parse(jsonMatch[0]);
351
- return parsed.map(claim => ({
352
- type: claim.type,
353
- description: claim.description,
354
- evidence: 'Extracted by LLM',
355
- params: claim.params
356
- }));
357
- }
358
- catch {
359
- // LLM extraction failed, fall back to patterns
360
- return extractClaims(response);
361
- }
362
- }
363
- /**
364
- * Extract verifiable claims from an assistant response.
365
- * Covers common patterns for file operations, builds, tests, git, and npm.
366
- */
367
- export function extractClaims(response) {
368
- const claims = [];
369
- const seenPaths = new Set();
370
- // Helper to add file claim if not duplicate
371
- const addFileClaim = (type, path, evidence) => {
372
- if (path && !seenPaths.has(path)) {
373
- seenPaths.add(path);
374
- claims.push({
375
- type,
376
- description: `File ${path} was ${type === 'file_created' ? 'created' : 'modified'}`,
377
- evidence,
378
- params: { path }
379
- });
380
- }
381
- };
382
- // Pattern: File creation claims - comprehensive patterns
383
- const fileCreationPatterns = [
384
- // "I created file X", "Created X", "I've created X"
385
- /(?:I(?:'ve)?\s+)?(?:created|wrote|written|generated|added)\s+(?:a\s+)?(?:new\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
386
- // "File X created", "File created at X"
387
- /(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:created|written|generated)/gi,
388
- // "Created file at X", "Wrote file to X"
389
- /(?:created|wrote)\s+(?:a\s+)?(?:new\s+)?file\s+(?:at|to|in)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
390
- // "File created successfully" with path nearby
391
- /[`"']([^\s`"',]+\.[a-zA-Z0-9]+)[`"']\s+(?:has been\s+)?(?:created|written)/gi,
392
- // "successfully created X"
393
- /successfully\s+(?:created|wrote|generated)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
394
- // "The file X now exists" or "X now contains"
395
- /(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?(?:exists|contains)/gi,
396
- ];
397
- for (const pattern of fileCreationPatterns) {
398
- pattern.lastIndex = 0; // Reset regex state
399
- let match;
400
- while ((match = pattern.exec(response)) !== null) {
401
- if (match[1]) {
402
- addFileClaim('file_created', match[1], match[0]);
403
- }
404
- }
405
- }
406
- // Pattern: File modification claims - comprehensive patterns
407
- const fileModPatterns = [
408
- // "I modified X", "Updated X", "I've edited X"
409
- /(?:I(?:'ve)?\s+)?(?:modified|updated|changed|edited|fixed|patched|amended)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
410
- // "File X was updated"
411
- /(?:File\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:modified|updated|changed|edited|fixed)/gi,
412
- // "Made changes to X"
413
- /(?:made\s+)?changes?\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
414
- // "X has been updated"
415
- /[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+has\s+been\s+(?:updated|modified|changed|edited)/gi,
416
- // "successfully updated X"
417
- /successfully\s+(?:updated|modified|edited|fixed)\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
418
- ];
419
- for (const pattern of fileModPatterns) {
420
- pattern.lastIndex = 0;
421
- let match;
422
- while ((match = pattern.exec(response)) !== null) {
423
- if (match[1]) {
424
- addFileClaim('file_modified', match[1], match[0]);
425
- }
426
- }
427
- }
428
- // Pattern: Command execution claims
429
- const cmdPatterns = [
430
- /(?:I(?:'ve)?\s+)?(?:ran|executed|run|running)\s+`([^`]+)`/gi,
431
- /(?:Running|Executed|Ran)\s+`([^`]+)`/gi,
432
- /`([^`]+)`\s+(?:completed|succeeded|finished|passed)/gi,
433
- /executed\s+(?:the\s+)?command[:\s]+`([^`]+)`/gi,
434
- ];
435
- const seenCommands = new Set();
436
- for (const pattern of cmdPatterns) {
437
- pattern.lastIndex = 0;
438
- let match;
439
- while ((match = pattern.exec(response)) !== null) {
440
- const command = match[1];
441
- if (command && !seenCommands.has(command)) {
442
- seenCommands.add(command);
443
- claims.push({
444
- type: 'command_executed',
445
- description: `Command "${command.slice(0, 50)}${command.length > 50 ? '...' : ''}" was executed`,
446
- evidence: match[0],
447
- params: { command }
448
- });
449
- }
450
- }
451
- }
452
- // Pattern: Build/compile success claims
453
- const buildPatterns = [
454
- /(?:build|compilation|type[- ]?check)\s+(?:passed|succeeded|completed|successful|success)/gi,
455
- /(?:successfully|passed)\s+(?:the\s+)?(?:build|compilation|type[- ]?check)/gi,
456
- /no\s+(?:type\s+)?errors/gi,
457
- /type[- ]?check(?:ing)?\s+(?:passed|succeeded|completed)/gi,
458
- /(?:built|compiled)\s+successfully/gi,
459
- /build\s+(?:is\s+)?(?:complete|successful)/gi,
460
- ];
461
- let hasBuildClaim = false;
462
- for (const pattern of buildPatterns) {
463
- pattern.lastIndex = 0;
464
- if (!hasBuildClaim && pattern.test(response)) {
465
- claims.push({
466
- type: 'code_compiles',
467
- description: 'Code compiles without errors',
468
- evidence: response.match(pattern)?.[0] || '',
469
- params: {}
470
- });
471
- hasBuildClaim = true;
472
- break;
473
- }
474
- }
475
- // Pattern: Test pass claims
476
- const testPatterns = [
477
- /(?:all\s+)?tests?\s+(?:pass|passed|passing|succeeded|successful)/gi,
478
- /(?:passed|passing)\s+(?:all\s+)?tests?/gi,
479
- /(\d+)\s+tests?\s+passed/gi,
480
- /tests?\s+(?:completed|finished)\s+successfully/gi,
481
- /(?:test|tests)\s+suite\s+(?:passed|succeeded)/gi,
482
- /all\s+(\d+)\s+tests?\s+(?:pass|passed)/gi,
483
- ];
484
- let hasTestClaim = false;
485
- for (const pattern of testPatterns) {
486
- pattern.lastIndex = 0;
487
- const match = pattern.exec(response);
488
- if (!hasTestClaim && match) {
489
- claims.push({
490
- type: 'tests_pass',
491
- description: 'Tests pass',
492
- evidence: match[0],
493
- params: { count: match[1] ? parseInt(match[1], 10) : undefined }
494
- });
495
- hasTestClaim = true;
496
- break;
497
- }
498
- }
499
- // Pattern: Git commit claims
500
- const gitPatterns = [
501
- /committed\s+(?:the\s+)?(?:changes?\s+)?(?:with\s+message\s+)?["']?([^"'\n]+)["']?/gi,
502
- /\[(?:main|master|[a-zA-Z0-9/_-]+)\s+([a-f0-9]{7,})\]/gi,
503
- /git\s+commit.*-m\s+["']([^"']+)["']/gi,
504
- /created\s+(?:a\s+)?commit/gi,
505
- /changes?\s+(?:have\s+been\s+)?committed/gi,
506
- /commit\s+([a-f0-9]{7,})/gi,
507
- ];
508
- let hasGitClaim = false;
509
- for (const pattern of gitPatterns) {
510
- pattern.lastIndex = 0;
511
- const match = pattern.exec(response);
512
- if (!hasGitClaim && match) {
513
- claims.push({
514
- type: 'git_committed',
515
- description: 'Changes were committed',
516
- evidence: match[0],
517
- params: { hash: match[1] }
518
- });
519
- hasGitClaim = true;
520
- break;
521
- }
522
- }
523
- // Pattern: Package publish claims
524
- const publishPatterns = [
525
- /published\s+(?:to\s+)?(?:npm|registry)/gi,
526
- /\+\s+[a-z@/_-]+@(\d+\.\d+\.\d+)/gi,
527
- /npm\s+publish/gi,
528
- /package\s+(?:was\s+)?published/gi,
529
- /published\s+(?:version\s+)?v?(\d+\.\d+\.\d+)/gi,
530
- /successfully\s+published/gi,
531
- ];
532
- let hasPublishClaim = false;
533
- for (const pattern of publishPatterns) {
534
- pattern.lastIndex = 0;
535
- const match = pattern.exec(response);
536
- if (!hasPublishClaim && match) {
537
- claims.push({
538
- type: 'package_published',
539
- description: 'Package was published',
540
- evidence: match[0],
541
- params: { version: match[1] }
542
- });
543
- hasPublishClaim = true;
544
- break;
545
- }
546
- }
547
- // Pattern: File deletion claims
548
- const deletionPatterns = [
549
- /(?:I(?:'ve)?\s+)?(?:deleted|removed)\s+(?:the\s+)?(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
550
- /[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:was\s+)?(?:deleted|removed)/gi,
551
- ];
552
- for (const pattern of deletionPatterns) {
553
- pattern.lastIndex = 0;
554
- let match;
555
- while ((match = pattern.exec(response)) !== null) {
556
- const filePath = match[1];
557
- if (filePath && !seenPaths.has(filePath)) {
558
- seenPaths.add(filePath);
559
- claims.push({
560
- type: 'file_deleted',
561
- description: `File ${filePath} was deleted`,
562
- evidence: match[0],
563
- params: { path: filePath }
564
- });
565
- }
566
- }
567
- }
568
- // Pattern: Dependency installation claims
569
- const installPatterns = [
570
- /(?:installed|added)\s+(?:the\s+)?(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?/gi,
571
- /npm\s+install(?:ed)?\s+[`"']?([^\s`"',]+)[`"']?/gi,
572
- /(?:package|dependency)\s+[`"']?([^\s`"',]+)[`"']?\s+(?:was\s+)?installed/gi,
573
- ];
574
- for (const pattern of installPatterns) {
575
- pattern.lastIndex = 0;
576
- let match;
577
- while ((match = pattern.exec(response)) !== null) {
578
- const packageName = match[1];
579
- if (packageName) {
580
- claims.push({
581
- type: 'dependency_installed',
582
- description: `Package ${packageName} was installed`,
583
- evidence: match[0],
584
- params: { package: packageName }
585
- });
586
- }
587
- }
588
- }
589
- // Pattern: Service running claims
590
- const servicePatterns = [
591
- /(?:server|service|app(?:lication)?)\s+(?:is\s+)?(?:running|started|listening)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
592
- /(?:listening|running)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
593
- /started\s+(?:the\s+)?(?:server|service)\s+(?:on\s+)?(?:port\s+)?(\d+)/gi,
594
- /(?:port\s+)?(\d+)\s+is\s+(?:now\s+)?(?:open|listening)/gi,
595
- ];
596
- for (const pattern of servicePatterns) {
597
- pattern.lastIndex = 0;
598
- const match = pattern.exec(response);
599
- if (match && match[1]) {
600
- const port = parseInt(match[1], 10);
601
- if (port > 0 && port < 65536) {
602
- claims.push({
603
- type: 'service_running',
604
- description: `Service running on port ${port}`,
605
- evidence: match[0],
606
- params: { port }
607
- });
608
- break; // Only one service claim per response
609
- }
610
- }
611
- }
612
- // Pattern: URL accessible claims
613
- const urlPatterns = [
614
- /(?:accessible|available|live)\s+at\s+(https?:\/\/[^\s]+)/gi,
615
- /(?:visit|open|access)\s+(https?:\/\/[^\s]+)/gi,
616
- /(https?:\/\/[^\s]+)\s+(?:is\s+)?(?:now\s+)?(?:accessible|available|live)/gi,
617
- /deployed\s+(?:to|at)\s+(https?:\/\/[^\s]+)/gi,
618
- ];
619
- for (const pattern of urlPatterns) {
620
- pattern.lastIndex = 0;
621
- const match = pattern.exec(response);
622
- if (match && match[1]) {
623
- const url = match[1].replace(/[.,;:!?)]+$/, ''); // Remove trailing punctuation
624
- claims.push({
625
- type: 'url_accessible',
626
- description: `URL ${url} is accessible`,
627
- evidence: match[0],
628
- params: { url }
629
- });
630
- break; // Only one URL claim per response
631
- }
632
- }
633
- // Pattern: Content contains claims
634
- const contentPatterns = [
635
- /(?:file\s+)?[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?\s+(?:now\s+)?contains?\s+[`"']([^`"']+)[`"']/gi,
636
- /added\s+[`"']([^`"']+)[`"']\s+to\s+[`"']?([^\s`"',]+\.[a-zA-Z0-9]+)[`"']?/gi,
637
- ];
638
- for (const pattern of contentPatterns) {
639
- pattern.lastIndex = 0;
640
- const match = pattern.exec(response);
295
+ const prompt = GENERATE_TESTS_PROMPT
296
+ .replace('{CLAIMS}', JSON.stringify(claims.slice(0, 10)))
297
+ .replace('{WORKING_DIR}', ctx.workingDirectory)
298
+ .replace('{PLATFORM}', process.platform);
299
+ const result = await ctx.llmVerifier(prompt);
300
+ const match = result.match(/\[[\s\S]*\]/);
641
301
  if (match) {
642
- // Pattern 1: file contains "text"
643
- // Pattern 2: added "text" to file
644
- const isPattern2 = pattern.source.startsWith('added');
645
- const filePath = isPattern2 ? match[2] : match[1];
646
- const content = isPattern2 ? match[1] : match[2];
647
- if (filePath && content) {
648
- claims.push({
649
- type: 'content_contains',
650
- description: `File ${filePath} contains specified content`,
651
- evidence: match[0],
652
- params: { path: filePath, content }
653
- });
654
- }
655
- }
656
- }
657
- return claims;
658
- }
659
- /**
660
- * Generate a verification test for a claim
661
- */
662
- export function generateVerificationTest(claim) {
663
- const baseResult = {
664
- claim,
665
- timestamp: new Date().toISOString()
666
- };
667
- switch (claim.type) {
668
- case 'file_created':
669
- case 'file_modified':
670
- return async () => {
671
- const filePath = claim.params.path;
672
- try {
673
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
674
- const stats = await fs.stat(resolvedPath);
675
- const recentlyModified = (Date.now() - stats.mtimeMs) < 5 * 60 * 1000; // Within 5 minutes
676
- return {
677
- ...baseResult,
678
- verified: stats.isFile(),
679
- confidence: recentlyModified ? 'high' : 'medium',
680
- evidence: `File exists. Size: ${stats.size} bytes. Modified: ${stats.mtime.toISOString()}`
681
- };
682
- }
683
- catch (err) {
684
- return {
685
- ...baseResult,
686
- verified: false,
687
- confidence: 'high',
688
- evidence: 'File does not exist',
689
- error: err instanceof Error ? err.message : 'Unknown error'
690
- };
691
- }
692
- };
693
- case 'file_deleted':
694
- return async () => {
695
- const filePath = claim.params.path;
696
- try {
697
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
698
- await fs.stat(resolvedPath);
699
- return {
700
- ...baseResult,
701
- verified: false,
702
- confidence: 'high',
703
- evidence: 'File still exists (deletion claim is false)'
704
- };
705
- }
706
- catch {
707
- return {
708
- ...baseResult,
709
- verified: true,
710
- confidence: 'high',
711
- evidence: 'File does not exist (deletion verified)'
712
- };
713
- }
714
- };
715
- case 'code_compiles':
716
- return async () => {
717
- try {
718
- const { stdout, stderr } = await execAsync('npm run type-check 2>&1 || npm run build 2>&1', {
719
- timeout: 60000,
720
- cwd: process.cwd()
721
- });
722
- const output = stdout + stderr;
723
- const hasErrors = /error/i.test(output) && !/0 errors/i.test(output);
724
- return {
725
- ...baseResult,
726
- verified: !hasErrors,
727
- confidence: 'high',
728
- evidence: hasErrors ? `Compilation errors found: ${output.slice(0, 500)}` : 'Code compiles successfully'
729
- };
730
- }
731
- catch (err) {
732
- return {
733
- ...baseResult,
734
- verified: false,
735
- confidence: 'high',
736
- evidence: 'Compilation check failed',
737
- error: err instanceof Error ? err.message : 'Unknown error'
738
- };
739
- }
740
- };
741
- case 'tests_pass':
742
- return async () => {
743
- try {
744
- const { stdout, stderr } = await execAsync('npm test 2>&1', {
745
- timeout: 120000,
746
- cwd: process.cwd()
747
- });
748
- const output = stdout + stderr;
749
- const hasFailed = /fail|error/i.test(output) && !/0 failed/i.test(output);
750
- return {
751
- ...baseResult,
752
- verified: !hasFailed,
753
- confidence: 'high',
754
- evidence: hasFailed ? `Test failures: ${output.slice(0, 500)}` : 'All tests pass'
755
- };
756
- }
757
- catch (err) {
758
- return {
759
- ...baseResult,
760
- verified: false,
761
- confidence: 'high',
762
- evidence: 'Test execution failed',
763
- error: err instanceof Error ? err.message : 'Unknown error'
764
- };
765
- }
766
- };
767
- case 'git_committed':
768
- return async () => {
769
- try {
770
- const { stdout } = await execAsync('git log -1 --oneline', {
771
- timeout: 5000,
772
- cwd: process.cwd()
773
- });
774
- const hash = claim.params.hash;
775
- if (hash && stdout.includes(hash.slice(0, 7))) {
776
- return {
777
- ...baseResult,
778
- verified: true,
779
- confidence: 'high',
780
- evidence: `Commit found: ${stdout.trim()}`
781
- };
782
- }
783
- // Check if there's a recent commit
784
- const { stdout: logOutput } = await execAsync('git log -1 --format="%H %s"', {
785
- timeout: 5000
786
- });
787
- return {
788
- ...baseResult,
789
- verified: true,
790
- confidence: 'medium',
791
- evidence: `Most recent commit: ${logOutput.trim()}`
792
- };
793
- }
794
- catch (err) {
795
- return {
796
- ...baseResult,
797
- verified: false,
798
- confidence: 'high',
799
- evidence: 'Git check failed',
800
- error: err instanceof Error ? err.message : 'Unknown error'
801
- };
802
- }
803
- };
804
- case 'package_published':
805
- return async () => {
806
- try {
807
- // Read package.json to get name and version
808
- const pkgPath = path.resolve(process.cwd(), 'package.json');
809
- const pkgContent = await fs.readFile(pkgPath, 'utf-8');
810
- const pkg = JSON.parse(pkgContent);
811
- const { stdout } = await execAsync(`npm view ${pkg.name}@${pkg.version} version 2>&1`, {
812
- timeout: 10000
813
- });
814
- const published = stdout.trim() === pkg.version;
815
- return {
816
- ...baseResult,
817
- verified: published,
818
- confidence: 'high',
819
- evidence: published ? `${pkg.name}@${pkg.version} found on npm` : 'Version not found on npm'
820
- };
821
- }
822
- catch (err) {
823
- return {
824
- ...baseResult,
825
- verified: false,
826
- confidence: 'medium',
827
- evidence: 'Could not verify npm publication',
828
- error: err instanceof Error ? err.message : 'Unknown error'
829
- };
830
- }
831
- };
832
- case 'command_executed':
833
- // Can't really verify past command execution, just acknowledge
834
- return async () => ({
835
- ...baseResult,
836
- verified: true, // Assume true since we can't replay
837
- confidence: 'low',
838
- evidence: 'Command execution cannot be retroactively verified'
839
- });
840
- case 'dependency_installed':
841
- return async () => {
842
- const packageName = claim.params.package;
843
- if (!packageName) {
844
- return {
845
- ...baseResult,
846
- verified: false,
847
- confidence: 'low',
848
- evidence: 'No package name provided'
849
- };
850
- }
851
- try {
852
- // Check if package exists in node_modules
853
- const modulePath = path.resolve(process.cwd(), 'node_modules', packageName);
854
- await fs.stat(modulePath);
855
- // Also verify in package.json
856
- const pkgPath = path.resolve(process.cwd(), 'package.json');
857
- const pkgContent = await fs.readFile(pkgPath, 'utf-8');
858
- const pkg = JSON.parse(pkgContent);
859
- const inDeps = pkg.dependencies?.[packageName] || pkg.devDependencies?.[packageName];
860
- return {
861
- ...baseResult,
862
- verified: true,
863
- confidence: inDeps ? 'high' : 'medium',
864
- evidence: inDeps
865
- ? `Package ${packageName} installed (${inDeps})`
866
- : `Package ${packageName} found in node_modules but not in package.json`
867
- };
868
- }
869
- catch {
870
- return {
871
- ...baseResult,
872
- verified: false,
873
- confidence: 'high',
874
- evidence: `Package ${packageName} not found in node_modules`
875
- };
876
- }
877
- };
878
- case 'service_running':
879
- return async () => {
880
- const port = claim.params.port;
881
- const name = claim.params.name;
882
- try {
883
- if (port) {
884
- // Check if port is in use
885
- const { stdout } = await execAsync(`lsof -i :${port} 2>/dev/null || netstat -an | grep ${port}`, {
886
- timeout: 5000
887
- });
888
- const isRunning = stdout.trim().length > 0;
889
- return {
890
- ...baseResult,
891
- verified: isRunning,
892
- confidence: 'high',
893
- evidence: isRunning ? `Service running on port ${port}` : `No service found on port ${port}`
894
- };
895
- }
896
- else if (name) {
897
- // Check if process is running by name
898
- const { stdout } = await execAsync(`pgrep -f "${name}" 2>/dev/null || ps aux | grep "${name}" | grep -v grep`, {
899
- timeout: 5000
900
- });
901
- const isRunning = stdout.trim().length > 0;
902
- return {
903
- ...baseResult,
904
- verified: isRunning,
905
- confidence: 'medium',
906
- evidence: isRunning ? `Process "${name}" appears to be running` : `Process "${name}" not found`
907
- };
908
- }
909
- return {
910
- ...baseResult,
911
- verified: false,
912
- confidence: 'low',
913
- evidence: 'No port or service name provided for verification'
914
- };
915
- }
916
- catch {
917
- return {
918
- ...baseResult,
919
- verified: false,
920
- confidence: 'medium',
921
- evidence: 'Could not verify service status'
922
- };
923
- }
924
- };
925
- case 'url_accessible':
926
- return async () => {
927
- const url = claim.params.url;
928
- if (!url) {
929
- return {
930
- ...baseResult,
931
- verified: false,
932
- confidence: 'low',
933
- evidence: 'No URL provided'
934
- };
935
- }
936
- try {
937
- const { stdout } = await execAsync(`curl -s -o /dev/null -w "%{http_code}" "${url}" 2>&1`, {
938
- timeout: 10000
939
- });
940
- const statusCode = parseInt(stdout.trim(), 10);
941
- const isAccessible = statusCode >= 200 && statusCode < 400;
942
- return {
943
- ...baseResult,
944
- verified: isAccessible,
945
- confidence: 'high',
946
- evidence: `URL returned status ${statusCode}`
947
- };
948
- }
949
- catch (err) {
950
- return {
951
- ...baseResult,
952
- verified: false,
953
- confidence: 'high',
954
- evidence: 'URL is not accessible',
955
- error: err instanceof Error ? err.message : 'Unknown error'
956
- };
957
- }
958
- };
959
- case 'content_contains':
960
- return async () => {
961
- const filePath = claim.params.path;
962
- const searchText = claim.params.content;
963
- if (!filePath || !searchText) {
964
- return {
965
- ...baseResult,
966
- verified: false,
967
- confidence: 'low',
968
- evidence: 'Missing file path or search content'
969
- };
970
- }
971
- try {
972
- const resolvedPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
973
- const content = await fs.readFile(resolvedPath, 'utf-8');
974
- const contains = content.includes(searchText);
975
- return {
976
- ...baseResult,
977
- verified: contains,
978
- confidence: 'high',
979
- evidence: contains
980
- ? `File contains the expected content`
981
- : `File does not contain "${searchText.slice(0, 50)}..."`
982
- };
983
- }
984
- catch (err) {
985
- return {
986
- ...baseResult,
987
- verified: false,
988
- confidence: 'high',
989
- evidence: 'Could not read file',
990
- error: err instanceof Error ? err.message : 'Unknown error'
991
- };
992
- }
993
- };
994
- default:
995
- return async () => ({
996
- ...baseResult,
997
- verified: false,
998
- confidence: 'low',
999
- evidence: `Unknown claim type: ${claim.type}`
1000
- });
1001
- }
1002
- }
1003
- /**
1004
- * Verify all claims in an assistant response using LLM-based semantic analysis.
1005
- * Requires a VerificationContext with an llmVerifier function.
1006
- * All claim extraction and verification is done via LLM.
1007
- */
1008
- export async function verifyResponse(response, context, responseId) {
1009
- return verifyResponseComprehensive(response, context, responseId);
1010
- }
1011
- /**
1012
- * Format a verification report for display
1013
- */
1014
- export function formatVerificationReport(report) {
1015
- const lines = [];
1016
- lines.push('═══════════════════════════════════════════════════════════');
1017
- lines.push(' RESPONSE VERIFICATION REPORT');
1018
- lines.push('═══════════════════════════════════════════════════════════');
1019
- lines.push('');
1020
- const verdictEmoji = {
1021
- verified: '✅',
1022
- partially_verified: '⚠️',
1023
- unverified: '❓',
1024
- contradicted: '❌'
1025
- };
1026
- lines.push(`Verdict: ${verdictEmoji[report.overallVerdict]} ${report.overallVerdict.toUpperCase()}`);
1027
- lines.push(`Claims: ${report.summary.total} total, ${report.summary.verified} verified, ${report.summary.failed} failed`);
1028
- lines.push('');
1029
- if (report.results.length > 0) {
1030
- lines.push('Verification Details:');
1031
- lines.push('─────────────────────');
1032
- for (const result of report.results) {
1033
- const icon = result.verified ? '✅' : (result.confidence === 'high' ? '❌' : '❓');
1034
- lines.push(`${icon} ${result.claim.description}`);
1035
- lines.push(` Evidence: ${result.evidence.slice(0, 100)}`);
1036
- if (result.error) {
1037
- lines.push(` Error: ${result.error}`);
1038
- }
302
+ return JSON.parse(match[0]);
1039
303
  }
1040
304
  }
1041
- else {
1042
- lines.push('No verifiable claims found in response.');
1043
- }
1044
- lines.push('');
1045
- lines.push('═══════════════════════════════════════════════════════════');
1046
- return lines.join('\n');
1047
- }
1048
- /**
1049
- * Quick verification - returns true if response claims are valid.
1050
- * Requires a VerificationContext with llmVerifier for LLM-based semantic analysis.
1051
- */
1052
- export async function quickVerify(response, context) {
1053
- const report = await verifyResponse(response, context);
1054
- return report.overallVerdict === 'verified' || report.overallVerdict === 'partially_verified';
1055
- }
1056
- /**
1057
- * LLM-based verification prompt for claims that can't be programmatically verified
1058
- */
1059
- const LLM_VERIFICATION_PROMPT = `You are a verification assistant. Analyze whether the following claim is likely TRUE or FALSE based on the evidence provided.
1060
-
1061
- CLAIM: {CLAIM}
1062
-
1063
- EVIDENCE/CONTEXT:
1064
- {CONTEXT}
1065
-
1066
- Respond with a JSON object:
1067
- {
1068
- "verdict": "verified" | "unverified" | "inconclusive",
1069
- "confidence": "high" | "medium" | "low",
1070
- "reasoning": "Brief explanation of your analysis",
1071
- "suggested_test": "Optional: A command or check that could verify this claim"
1072
- }
1073
-
1074
- Be conservative - only mark as "verified" if there's strong evidence. Mark as "inconclusive" if you can't determine the truth.`;
1075
- /**
1076
- * Verify a claim using LLM when runtime verification isn't possible
1077
- */
1078
- export async function verifyClaimWithLLM(claim, context) {
1079
- const baseResult = {
1080
- claim,
1081
- timestamp: new Date().toISOString()
1082
- };
1083
- if (!context.llmVerifier) {
1084
- return {
1085
- ...baseResult,
1086
- verified: false,
1087
- confidence: 'low',
1088
- evidence: 'No LLM verifier available for semantic verification'
305
+ catch {
306
+ // Fall through to basic tests
307
+ }
308
+ // Fallback: generate basic tests
309
+ return claims.filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high')).map((c, i) => {
310
+ const test = {
311
+ id: `test-${i}`,
312
+ description: c.statement,
313
+ commands: [],
314
+ shellCommands: [],
315
+ expectedBehavior: c.statement,
316
+ timeout: 30000
1089
317
  };
1090
- }
1091
- try {
1092
- // Build context string
1093
- const contextParts = [];
1094
- if (context.previousState) {
1095
- contextParts.push(`Previous state: ${JSON.stringify(context.previousState, null, 2)}`);
1096
- }
1097
- if (context.currentState) {
1098
- contextParts.push(`Current state: ${JSON.stringify(context.currentState, null, 2)}`);
318
+ // Add basic verification based on category
319
+ if (c.category === 'file_op' && c.context['path']) {
320
+ test.shellCommands = [`test -f "${c.context['path']}" && echo "EXISTS" || echo "NOT_FOUND"`];
321
+ test.expectedOutputs = ['EXISTS'];
1099
322
  }
1100
- if (context.conversationHistory?.length) {
1101
- contextParts.push(`Recent conversation:\n${context.conversationHistory.slice(-5).join('\n')}`);
323
+ else if (c.category === 'code') {
324
+ test.shellCommands = ['npm run build 2>&1 | tail -5'];
1102
325
  }
1103
- contextParts.push(`Claim evidence: ${claim.evidence}`);
1104
- contextParts.push(`Claim params: ${JSON.stringify(claim.params)}`);
1105
- const prompt = LLM_VERIFICATION_PROMPT
1106
- .replace('{CLAIM}', claim.description)
1107
- .replace('{CONTEXT}', contextParts.join('\n\n'));
1108
- const result = await context.llmVerifier(prompt);
1109
- // Parse LLM response
1110
- const jsonMatch = result.match(/\{[\s\S]*\}/);
1111
- if (!jsonMatch) {
1112
- return {
1113
- ...baseResult,
1114
- verified: false,
1115
- confidence: 'low',
1116
- evidence: 'LLM verification returned invalid response'
1117
- };
326
+ else if (c.category === 'behavior') {
327
+ test.commands = ['/help'];
1118
328
  }
1119
- const parsed = JSON.parse(jsonMatch[0]);
1120
- return {
1121
- ...baseResult,
1122
- verified: parsed.verdict === 'verified',
1123
- confidence: parsed.confidence || 'medium',
1124
- evidence: `LLM Analysis: ${parsed.reasoning}${parsed.suggested_test ? ` (Suggested test: ${parsed.suggested_test})` : ''}`
1125
- };
1126
- }
1127
- catch (err) {
1128
- return {
1129
- ...baseResult,
1130
- verified: false,
1131
- confidence: 'low',
1132
- evidence: 'LLM verification failed',
1133
- error: err instanceof Error ? err.message : 'Unknown error'
1134
- };
1135
- }
1136
- }
1137
- /**
1138
- * Generate verification test for extended claim types
1139
- */
1140
- export function generateExtendedVerificationTest(claim, context) {
1141
- const baseResult = {
1142
- claim,
1143
- timestamp: new Date().toISOString()
1144
- };
1145
- switch (claim.type) {
1146
- case 'api_response':
1147
- return async () => {
1148
- const url = claim.params.url;
1149
- const expectedStatus = claim.params.status;
1150
- const expectedBody = claim.params.body;
1151
- if (!url) {
1152
- return {
1153
- ...baseResult,
1154
- verified: false,
1155
- confidence: 'low',
1156
- evidence: 'No API URL provided'
1157
- };
1158
- }
1159
- try {
1160
- const { stdout } = await execAsync(`curl -s -w "\\n%{http_code}" "${url}" 2>&1`, { timeout: 15000 });
1161
- const lines = stdout.trim().split('\n');
1162
- const statusCode = parseInt(lines.pop() || '0', 10);
1163
- const body = lines.join('\n');
1164
- let verified = true;
1165
- const evidenceParts = [];
1166
- if (expectedStatus && statusCode !== expectedStatus) {
1167
- verified = false;
1168
- evidenceParts.push(`Expected status ${expectedStatus}, got ${statusCode}`);
1169
- }
1170
- else {
1171
- evidenceParts.push(`Status: ${statusCode}`);
1172
- }
1173
- if (expectedBody && !body.includes(expectedBody)) {
1174
- verified = false;
1175
- evidenceParts.push(`Expected body to contain "${expectedBody.slice(0, 50)}..."`);
1176
- }
1177
- return {
1178
- ...baseResult,
1179
- verified,
1180
- confidence: 'high',
1181
- evidence: evidenceParts.join('. ')
1182
- };
1183
- }
1184
- catch (err) {
1185
- return {
1186
- ...baseResult,
1187
- verified: false,
1188
- confidence: 'high',
1189
- evidence: 'API request failed',
1190
- error: err instanceof Error ? err.message : 'Unknown error'
1191
- };
1192
- }
1193
- };
1194
- case 'env_var_set':
1195
- return async () => {
1196
- const varName = claim.params.name;
1197
- const expectedValue = claim.params.value;
1198
- if (!varName) {
1199
- return {
1200
- ...baseResult,
1201
- verified: false,
1202
- confidence: 'low',
1203
- evidence: 'No environment variable name provided'
1204
- };
1205
- }
1206
- const actualValue = process.env[varName];
1207
- if (actualValue === undefined) {
1208
- return {
1209
- ...baseResult,
1210
- verified: false,
1211
- confidence: 'high',
1212
- evidence: `Environment variable ${varName} is not set`
1213
- };
1214
- }
1215
- if (expectedValue && actualValue !== expectedValue) {
1216
- return {
1217
- ...baseResult,
1218
- verified: false,
1219
- confidence: 'high',
1220
- evidence: `Expected ${varName}="${expectedValue}", got "${actualValue}"`
1221
- };
1222
- }
1223
- return {
1224
- ...baseResult,
1225
- verified: true,
1226
- confidence: 'high',
1227
- evidence: `${varName} is set${expectedValue ? ` to expected value` : `: ${actualValue.slice(0, 50)}`}`
1228
- };
1229
- };
1230
- case 'config_changed':
1231
- return async () => {
1232
- const configPath = claim.params.path;
1233
- const expectedKey = claim.params.key;
1234
- const expectedValue = claim.params.value;
1235
- if (!configPath) {
1236
- return {
1237
- ...baseResult,
1238
- verified: false,
1239
- confidence: 'low',
1240
- evidence: 'No config file path provided'
1241
- };
1242
- }
1243
- try {
1244
- const resolvedPath = path.isAbsolute(configPath)
1245
- ? configPath
1246
- : path.resolve(context.workingDirectory, configPath);
1247
- const content = await fs.readFile(resolvedPath, 'utf-8');
1248
- // Try to parse as JSON
1249
- let config;
1250
- try {
1251
- config = JSON.parse(content);
1252
- }
1253
- catch {
1254
- // Not JSON, check raw content
1255
- if (expectedValue && content.includes(String(expectedValue))) {
1256
- return {
1257
- ...baseResult,
1258
- verified: true,
1259
- confidence: 'medium',
1260
- evidence: `Config file contains expected value`
1261
- };
1262
- }
1263
- return {
1264
- ...baseResult,
1265
- verified: true,
1266
- confidence: 'low',
1267
- evidence: 'Config file exists but format unknown'
1268
- };
1269
- }
1270
- if (expectedKey) {
1271
- const keys = expectedKey.split('.');
1272
- let value = config;
1273
- for (const key of keys) {
1274
- value = value?.[key];
1275
- }
1276
- if (expectedValue !== undefined) {
1277
- const matches = JSON.stringify(value) === JSON.stringify(expectedValue);
1278
- return {
1279
- ...baseResult,
1280
- verified: matches,
1281
- confidence: 'high',
1282
- evidence: matches
1283
- ? `${expectedKey} has expected value`
1284
- : `${expectedKey} = ${JSON.stringify(value)}, expected ${JSON.stringify(expectedValue)}`
1285
- };
1286
- }
1287
- return {
1288
- ...baseResult,
1289
- verified: value !== undefined,
1290
- confidence: 'high',
1291
- evidence: value !== undefined
1292
- ? `${expectedKey} exists: ${JSON.stringify(value).slice(0, 100)}`
1293
- : `${expectedKey} not found in config`
1294
- };
1295
- }
1296
- return {
1297
- ...baseResult,
1298
- verified: true,
1299
- confidence: 'medium',
1300
- evidence: 'Config file exists and is valid JSON'
1301
- };
1302
- }
1303
- catch (err) {
1304
- return {
1305
- ...baseResult,
1306
- verified: false,
1307
- confidence: 'high',
1308
- evidence: 'Could not read config file',
1309
- error: err instanceof Error ? err.message : 'Unknown error'
1310
- };
1311
- }
1312
- };
1313
- case 'error_fixed':
1314
- case 'feature_implemented':
1315
- case 'refactor_complete':
1316
- // These require semantic verification - LLM is required
1317
- return async () => {
1318
- if (!context.llmVerifier) {
1319
- return {
1320
- ...baseResult,
1321
- verified: false,
1322
- confidence: 'low',
1323
- evidence: 'Semantic verification requires LLM verifier'
1324
- };
1325
- }
1326
- return verifyClaimWithLLM(claim, context);
1327
- };
1328
- case 'data_transformed':
1329
- case 'database_updated':
1330
- case 'permission_granted':
1331
- case 'generic':
1332
- default:
1333
- // All these claim types require LLM verification
1334
- return async () => {
1335
- if (!context.llmVerifier) {
1336
- return {
1337
- ...baseResult,
1338
- verified: false,
1339
- confidence: 'low',
1340
- evidence: `${claim.type} verification requires LLM verifier`
1341
- };
1342
- }
1343
- return verifyClaimWithLLM(claim, context);
1344
- };
1345
- }
329
+ return test;
330
+ });
1346
331
  }
332
+ // ============================================================================
333
+ // MAIN VERIFICATION API
334
+ // ============================================================================
1347
335
  /**
1348
- * Comprehensive verification using LLM-based semantic analysis.
1349
- * Requires an LLM verifier - all claims are verified through LLM semantic analysis.
336
+ * Verify an assistant response using isolated runtime tests.
337
+ * This is the main entry point for verification.
1350
338
  */
1351
- export async function verifyResponseComprehensive(response, context, responseId) {
1352
- if (!context.llmVerifier) {
339
+ export async function verifyResponse(response, ctx, responseId) {
340
+ const timestamp = new Date().toISOString();
341
+ const id = responseId || `verify-${Date.now()}`;
342
+ // Extract claims from response
343
+ const claims = await extractClaims(response, ctx);
344
+ if (claims.length === 0) {
1353
345
  return {
1354
- responseId: responseId || `response-${Date.now()}`,
1355
- timestamp: new Date().toISOString(),
346
+ responseId: id,
347
+ timestamp,
1356
348
  claims: [],
1357
349
  results: [],
1358
350
  summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
1359
- overallVerdict: 'unverified'
351
+ overallVerdict: 'unverified',
352
+ trustScore: 50
1360
353
  };
1361
354
  }
1362
- // Extract ALL claims using LLM (required)
1363
- const claims = await extractClaimsWithLLM(response, context.llmVerifier);
1364
- const results = [];
1365
- for (const claim of claims) {
1366
- // ALL claims are verified via LLM semantic analysis
1367
- try {
1368
- const result = await verifyClaimWithLLM(claim, context);
1369
- results.push(result);
1370
- }
1371
- catch (err) {
1372
- results.push({
355
+ // Generate isolated tests for claims
356
+ const tests = await generateTests(claims, ctx);
357
+ // Run all isolated tests
358
+ const testResults = [];
359
+ for (const test of tests) {
360
+ const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
361
+ testResults.push(result);
362
+ }
363
+ // Map test results back to claims
364
+ const results = claims.map((claim, i) => {
365
+ const testResult = testResults[i];
366
+ if (!testResult) {
367
+ return {
1373
368
  claim,
1374
369
  verified: false,
1375
370
  confidence: 'low',
1376
- evidence: 'LLM verification failed',
1377
- error: err instanceof Error ? err.message : 'Unknown error',
1378
- timestamp: new Date().toISOString()
1379
- });
371
+ evidence: 'No test generated',
372
+ method: 'skip',
373
+ timestamp
374
+ };
1380
375
  }
1381
- }
376
+ return {
377
+ claim,
378
+ verified: testResult.success,
379
+ confidence: testResult.success ? 'high' : (testResult.matchedPatterns.length > 0 ? 'medium' : 'low'),
380
+ evidence: testResult.success
381
+ ? `Verified in isolated runtime: ${testResult.matchedPatterns.join(', ')}`
382
+ : `Failed: ${testResult.unmatchedPatterns.join(', ')}`,
383
+ method: 'isolated-runtime',
384
+ reasoning: testResult.llmAssessment,
385
+ executedCode: [...(testResult.test.shellCommands || []), ...(testResult.test.commands || [])].join('\n'),
386
+ rawOutput: testResult.output.slice(0, 2000),
387
+ error: testResult.errors || undefined,
388
+ timestamp
389
+ };
390
+ });
391
+ // Calculate summary
1382
392
  const verified = results.filter(r => r.verified).length;
1383
393
  const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1384
394
  const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
395
+ // Determine verdict
1385
396
  let overallVerdict;
1386
397
  if (failed > 0) {
1387
398
  overallVerdict = 'contradicted';
@@ -1395,673 +406,69 @@ export async function verifyResponseComprehensive(response, context, responseId)
1395
406
  else {
1396
407
  overallVerdict = 'unverified';
1397
408
  }
409
+ // Calculate trust score
410
+ const trustScore = claims.length > 0
411
+ ? Math.round((verified / claims.length) * 100)
412
+ : 50;
1398
413
  return {
1399
- responseId: responseId || `response-${Date.now()}`,
1400
- timestamp: new Date().toISOString(),
414
+ responseId: id,
415
+ timestamp,
1401
416
  claims,
1402
417
  results,
1403
- summary: {
1404
- total: claims.length,
1405
- verified,
1406
- failed,
1407
- inconclusive
1408
- },
1409
- overallVerdict
418
+ summary: { total: claims.length, verified, failed, inconclusive },
419
+ overallVerdict,
420
+ trustScore
1410
421
  };
1411
422
  }
1412
423
  /**
1413
- * Determine the best verification strategy for a claim
1414
- */
1415
- export function getVerificationStrategy(claim) {
1416
- switch (claim.type) {
1417
- case 'file_created':
1418
- case 'file_modified':
1419
- case 'file_deleted':
1420
- case 'content_contains':
1421
- case 'config_changed':
1422
- case 'permission_granted':
1423
- return 'filesystem';
1424
- case 'url_accessible':
1425
- case 'api_response':
1426
- case 'service_running':
1427
- return 'network';
1428
- case 'code_compiles':
1429
- case 'tests_pass':
1430
- case 'command_executed':
1431
- case 'dependency_installed':
1432
- case 'git_committed':
1433
- case 'package_published':
1434
- case 'env_var_set':
1435
- return 'runtime';
1436
- case 'error_fixed':
1437
- case 'feature_implemented':
1438
- case 'refactor_complete':
1439
- case 'data_transformed':
1440
- return 'semantic';
1441
- case 'database_updated':
1442
- return 'comparison';
1443
- case 'generic':
1444
- default:
1445
- return 'llm';
1446
- }
1447
- }
1448
- /**
1449
- * Prompt for LLM to generate verification code
424
+ * Format verification report for display
1450
425
  */
1451
- const VERIFICATION_CODE_GENERATION_PROMPT = `You are a verification code generator. Given a claim that an AI assistant made, generate code to verify if the claim is TRUE.
1452
-
1453
- CLAIM TO VERIFY:
1454
- Type: {CLAIM_TYPE}
1455
- Description: {CLAIM_DESCRIPTION}
1456
- Evidence: {CLAIM_EVIDENCE}
1457
- Parameters: {CLAIM_PARAMS}
1458
-
1459
- WORKING DIRECTORY: {WORKING_DIR}
1460
-
1461
- Generate a verification test. Choose the most appropriate approach:
1462
-
1463
- 1. SHELL COMMAND - For file operations, git, npm, system checks
1464
- 2. JAVASCRIPT - For complex logic, API calls, JSON parsing
1465
- 3. API - For HTTP endpoints, external services
426
+ export function formatVerificationReport(report) {
427
+ const bar = '█'.repeat(Math.round(report.trustScore / 10)) + '░'.repeat(10 - Math.round(report.trustScore / 10));
428
+ const icon = report.trustScore >= 80 ? '✅' : report.trustScore >= 50 ? '⚠️' : '❌';
429
+ let out = `╔════════════════════════════════════════════════════════════╗
430
+ ║ ISOLATED RUNTIME VERIFICATION REPORT ║
431
+ ╚════════════════════════════════════════════════════════════╝
1466
432
 
1467
- IMPORTANT RULES:
1468
- - Code must be READ-ONLY and NON-DESTRUCTIVE (no writes, no deletes, no modifications)
1469
- - Code must complete quickly (under 10 seconds)
1470
- - Code must output a clear result that can be parsed
1471
- - For shell: output should be parseable (exit code 0 = verified, non-zero = failed)
1472
- - For JavaScript: must export/return { verified: boolean, evidence: string }
1473
- - Do NOT use interactive commands
1474
- - Do NOT access sensitive data or credentials
433
+ `;
434
+ out += `Trust: ${icon} ${report.trustScore}/100 [${bar}]
435
+ Verdict: ${report.overallVerdict.toUpperCase()}
1475
436
 
1476
- Respond with JSON:
1477
- {
1478
- "testType": "shell" | "javascript" | "api",
1479
- "code": "the verification code",
1480
- "description": "what this test does",
1481
- "expectedOutcome": "what success looks like",
1482
- "safeToRun": true | false,
1483
- "safetyReason": "why it's safe/unsafe"
1484
- }
437
+ Claims: ${report.summary.total} | ✅ ${report.summary.verified} | ❌ ${report.summary.failed} | ❓ ${report.summary.inconclusive}
1485
438
 
1486
- Only output valid JSON, nothing else.`;
1487
- /**
1488
- * Generate verification code using LLM
1489
- */
1490
- export async function generateVerificationCode(claim, context) {
1491
- if (!context.llmVerifier) {
1492
- return null;
1493
- }
1494
- try {
1495
- const prompt = VERIFICATION_CODE_GENERATION_PROMPT
1496
- .replace('{CLAIM_TYPE}', claim.type)
1497
- .replace('{CLAIM_DESCRIPTION}', claim.description)
1498
- .replace('{CLAIM_EVIDENCE}', claim.evidence)
1499
- .replace('{CLAIM_PARAMS}', JSON.stringify(claim.params, null, 2))
1500
- .replace('{WORKING_DIR}', context.workingDirectory);
1501
- const result = await context.llmVerifier(prompt);
1502
- // Parse the JSON response
1503
- const jsonMatch = result.match(/\{[\s\S]*\}/);
1504
- if (!jsonMatch) {
1505
- return null;
1506
- }
1507
- const parsed = JSON.parse(jsonMatch[0]);
1508
- return {
1509
- claim,
1510
- testType: parsed.testType,
1511
- code: parsed.code,
1512
- description: parsed.description,
1513
- expectedOutcome: parsed.expectedOutcome,
1514
- safetyCheck: parsed.safeToRun
1515
- };
1516
- }
1517
- catch (err) {
1518
- console.error('Failed to generate verification code:', err);
1519
- return null;
1520
- }
1521
- }
1522
- /**
1523
- * Safety patterns to block dangerous code
1524
- */
1525
- const DANGEROUS_PATTERNS = [
1526
- /\brm\s+-rf?\b/i, // rm commands
1527
- /\brmdir\b/i, // rmdir
1528
- /\bdd\s+if=/i, // dd (disk destroyer)
1529
- /\bmkfs\b/i, // format filesystem
1530
- /\b>\s*\/dev\//i, // write to devices
1531
- /\bchmod\s+777\b/i, // dangerous permissions
1532
- /\bsudo\b/i, // sudo commands
1533
- /\bcurl.*\|\s*sh\b/i, // pipe to shell
1534
- /\bwget.*\|\s*sh\b/i, // pipe to shell
1535
- /\beval\s*\(/i, // eval in JS
1536
- /new\s+Function\s*\(/i, // Function constructor
1537
- /child_process/i, // subprocess in JS (unless we control it)
1538
- /\bexec\s*\(/i, // exec calls
1539
- /\bspawn\s*\(/i, // spawn calls
1540
- /writeFile/i, // file writes
1541
- /appendFile/i, // file appends
1542
- /unlink\s*\(/i, // file deletion
1543
- /rmSync/i, // sync deletion
1544
- /fs\.rm/i, // fs remove
1545
- /DROP\s+TABLE/i, // SQL injection
1546
- /DELETE\s+FROM/i, // SQL deletion
1547
- /TRUNCATE/i, // SQL truncate
1548
- /;\s*--/, // SQL comment injection
1549
- /process\.exit/i, // process exit
1550
- /require\s*\(\s*['"]child/i, // require child_process
1551
- ];
1552
- /**
1553
- * Validate that generated code is safe to execute
1554
- */
1555
- export function validateGeneratedCode(test) {
1556
- // First check the LLM's own safety assessment
1557
- if (!test.safetyCheck) {
1558
- return { safe: false, reason: 'LLM marked code as unsafe' };
1559
- }
1560
- // Check against dangerous patterns
1561
- for (const pattern of DANGEROUS_PATTERNS) {
1562
- if (pattern.test(test.code)) {
1563
- return {
1564
- safe: false,
1565
- reason: `Dangerous pattern detected: ${pattern.source}`
1566
- };
1567
- }
1568
- }
1569
- // Additional checks for shell commands
1570
- if (test.testType === 'shell') {
1571
- // Only allow specific safe commands
1572
- const safeShellPrefixes = [
1573
- 'ls', 'cat', 'head', 'tail', 'grep', 'find', 'stat', 'file',
1574
- 'test', 'echo', 'pwd', 'wc', 'diff', 'cmp',
1575
- 'git log', 'git status', 'git show', 'git diff', 'git branch',
1576
- 'npm view', 'npm list', 'npm ls',
1577
- 'node -e', 'node --eval',
1578
- 'curl -s', 'curl --silent', 'wget -q',
1579
- 'jq', 'python -c', 'python3 -c',
1580
- 'lsof', 'netstat', 'ss', 'ps',
1581
- 'which', 'type', 'command -v',
1582
- ];
1583
- const trimmedCode = test.code.trim().toLowerCase();
1584
- const startsWithSafe = safeShellPrefixes.some(prefix => trimmedCode.startsWith(prefix.toLowerCase()));
1585
- if (!startsWithSafe) {
1586
- // Check if it's a simple test/check command
1587
- if (!trimmedCode.startsWith('[') && !trimmedCode.startsWith('if ')) {
1588
- return {
1589
- safe: false,
1590
- reason: 'Shell command does not start with a known safe prefix'
1591
- };
1592
- }
1593
- }
1594
- }
1595
- // For JavaScript, ensure it's a simple expression
1596
- if (test.testType === 'javascript') {
1597
- // Check code length - very long code is suspicious
1598
- if (test.code.length > 2000) {
1599
- return { safe: false, reason: 'JavaScript code too long' };
439
+ `;
440
+ out += `🔬 ISOLATED RUNTIME TESTS:\n`;
441
+ for (const r of report.results.slice(0, 8)) {
442
+ const statusIcon = r.verified ? '✅' : r.confidence === 'high' ? '❌' : '❓';
443
+ out += ` ${statusIcon} [${r.confidence}] ${r.claim.statement.slice(0, 50)}...\n`;
444
+ if (r.reasoning) {
445
+ out += ` └─ ${r.reasoning.slice(0, 60)}\n`;
1600
446
  }
1601
447
  }
1602
- return { safe: true, reason: 'All safety checks passed' };
1603
- }
1604
/**
 * Execute a generated verification test and turn its outcome into a
 * verification result.
 *
 * The test is first passed through `validateGeneratedCode`; a rejected test is
 * never executed. Accepted tests run according to `test.testType`:
 *  - `shell`: the command is executed as-is; exit code 0 means verified.
 *  - `javascript`: the snippet is wrapped in an async function and run by a
 *    child `node` process; it is expected to return `{ verified, evidence }`.
 *  - `api`: the command is executed and its stdout is inspected.
 *
 * @param {{ claim: object, testType: string, code: string }} test - Generated test.
 * @param {{ workingDirectory: string }} context - Only `workingDirectory` is read here.
 * @returns {Promise<object>} `{ claim, timestamp, verified, confidence, evidence, error? }`.
 *   Never rejects for execution failures; they are reported in the result.
 */
export async function executeGeneratedTest(test, context) {
    const baseResult = {
        claim: test.claim,
        timestamp: new Date().toISOString()
    };
    // Validate safety first
    const safetyResult = validateGeneratedCode(test);
    if (!safetyResult.safe) {
        return {
            ...baseResult,
            verified: false,
            confidence: 'low',
            evidence: `Generated test blocked: ${safetyResult.reason}`,
            error: 'Safety validation failed'
        };
    }
    try {
        switch (test.testType) {
            case 'shell': {
                const { stdout, stderr } = await execAsync(test.code, {
                    cwd: context.workingDirectory,
                    timeout: 10000, // 10 second timeout
                    maxBuffer: 1024 * 1024 // 1MB max output
                });
                const output = (stdout + stderr).trim();
                // Shell convention: exit 0 = success
                return {
                    ...baseResult,
                    verified: true,
                    confidence: 'high',
                    evidence: `Test passed. Output: ${output.slice(0, 500)}`
                };
            }
            case 'javascript': {
                const wrappedCode = `
const result = (async () => {
${test.code}
})();
result.then(r => console.log(JSON.stringify(r))).catch(e => {
  console.log(JSON.stringify({ verified: false, evidence: e.message }));
});
`;
                // Feed the script to node over stdin instead of `node -e "<json>"`.
                // Inside shell double quotes the JSON-escaped newlines arrive as
                // literal "\n" text (a syntax error) and `$`/backticks are expanded
                // by the shell, so the -e form could never run a multi-line snippet.
                const pending = execAsync('node -', {
                    cwd: context.workingDirectory,
                    timeout: 10000
                });
                // A child that dies before reading stdin raises EPIPE on the pipe;
                // the failure itself is reported through the rejected exec promise.
                pending.child.stdin.on('error', () => { });
                pending.child.stdin.end(wrappedCode);
                const { stdout } = await pending;
                try {
                    const result = JSON.parse(stdout.trim());
                    return {
                        ...baseResult,
                        // Coerce so a snippet returning a non-boolean cannot leak
                        // `undefined` (or another type) into the result.
                        verified: Boolean(result.verified),
                        confidence: 'high',
                        evidence: result.evidence
                    };
                }
                catch {
                    // Output was not the expected JSON object (or was `null`).
                    return {
                        ...baseResult,
                        verified: false,
                        confidence: 'medium',
                        evidence: `JavaScript output: ${stdout.slice(0, 500)}`
                    };
                }
            }
            case 'api': {
                // For API tests, use curl
                const { stdout } = await execAsync(test.code, {
                    cwd: context.workingDirectory,
                    timeout: 15000
                });
                // Try to parse as JSON result
                try {
                    const result = JSON.parse(stdout.trim());
                    return {
                        ...baseResult,
                        verified: Boolean(result.verified ?? result.success ?? result.ok),
                        confidence: 'high',
                        evidence: `API response: ${JSON.stringify(result).slice(0, 500)}`
                    };
                }
                catch {
                    // Non-JSON response - check for success indicators.
                    // NOTE(review): plain substring matching, so "ok" also matches
                    // words such as "token"; hence only medium confidence.
                    const isSuccess = stdout.includes('200') || stdout.includes('success') || stdout.includes('ok');
                    return {
                        ...baseResult,
                        verified: isSuccess,
                        confidence: 'medium',
                        evidence: `API output: ${stdout.slice(0, 500)}`
                    };
                }
            }
            default:
                return {
                    ...baseResult,
                    verified: false,
                    confidence: 'low',
                    evidence: `Unknown test type: ${test.testType}`
                };
        }
    }
    catch (err) {
        const message = err instanceof Error ? err.message : 'Unknown error';
        // exec marks the error as `killed` when it terminated the child itself
        // (timeout or maxBuffer overflow). Such a run never finished, so it
        // cannot contradict the claim with high confidence.
        const abortedByRunner = Boolean(err && err.killed);
        return {
            ...baseResult,
            verified: false,
            confidence: abortedByRunner ? 'low' : 'high',
            evidence: `Test failed: ${message}`,
            error: message
        };
    }
}
1719
/**
 * Verify one claim with a generated runtime test.
 *
 * Asks `generateVerificationCode` for a test and hands it to
 * `executeGeneratedTest`. When no test could be generated, a low-confidence
 * "not verified" result is returned instead.
 *
 * @param {object} claim - The claim to verify.
 * @param {object} context - Verification context, forwarded unchanged.
 * @returns {Promise<object>} The verification result for `claim`.
 */
export async function verifyWithGeneratedTest(claim, context) {
    // Captured before generation so the timestamp marks when verification began.
    const startedAt = new Date().toISOString();
    const generated = await generateVerificationCode(claim, context);
    if (generated) {
        return executeGeneratedTest(generated, context);
    }
    return {
        claim,
        timestamp: startedAt,
        verified: false,
        confidence: 'low',
        evidence: 'Failed to generate verification test'
    };
}
1740
- /**
1741
- * Full verification using LLM-generated tests
1742
- * This is the most powerful verification method - LLM decides HOW to verify each claim
1743
- */
1744
- export async function verifyResponseWithGeneratedTests(response, context, responseId) {
1745
- // Extract claims using LLM
1746
- const claims = context.llmVerifier
1747
- ? await extractClaimsWithLLM(response, context.llmVerifier)
1748
- : extractClaims(response);
1749
- const results = [];
1750
- for (const claim of claims) {
1751
- // For each claim, generate and run a custom verification test
1752
- const result = await verifyWithGeneratedTest(claim, context);
1753
- results.push(result);
1754
- }
1755
- const verified = results.filter(r => r.verified).length;
1756
- const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1757
- const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
1758
- let overallVerdict;
1759
- if (failed > 0) {
1760
- overallVerdict = 'contradicted';
1761
- }
1762
- else if (verified === claims.length && claims.length > 0) {
1763
- overallVerdict = 'verified';
448
+ if (report.results.length > 8) {
449
+ out += ` ... +${report.results.length - 8} more\n`;
1764
450
  }
1765
- else if (verified > 0) {
1766
- overallVerdict = 'partially_verified';
1767
- }
1768
- else {
1769
- overallVerdict = 'unverified';
1770
- }
1771
- return {
1772
- responseId: responseId || `response-${Date.now()}`,
1773
- timestamp: new Date().toISOString(),
1774
- claims,
1775
- results,
1776
- summary: {
1777
- total: claims.length,
1778
- verified,
1779
- failed,
1780
- inconclusive
1781
- },
1782
- overallVerdict
1783
- };
451
+ return out;
1784
452
  }
1785
453
  /**
1786
- * Hybrid verification - uses generated tests when available, falls back to predefined tests
454
+ * Quick verification - verify only critical/high priority claims
1787
455
  */
1788
- export async function verifyResponseHybrid(response, context, responseId) {
1789
- const claims = context.llmVerifier
1790
- ? await extractClaimsWithLLM(response, context.llmVerifier)
1791
- : extractClaims(response);
1792
- const results = [];
1793
- for (const claim of claims) {
1794
- let result;
1795
- // Try LLM-generated test first if LLM is available
1796
- if (context.llmVerifier) {
1797
- const generatedTest = await generateVerificationCode(claim, context);
1798
- if (generatedTest) {
1799
- const safety = validateGeneratedCode(generatedTest);
1800
- if (safety.safe) {
1801
- // Use generated test
1802
- result = await executeGeneratedTest(generatedTest, context);
1803
- results.push(result);
1804
- continue;
1805
- }
1806
- }
1807
- }
1808
- // Fall back to predefined verification
1809
- const standardTypes = [
1810
- 'file_created', 'file_modified', 'file_deleted', 'code_compiles',
1811
- 'tests_pass', 'git_committed', 'package_published', 'command_executed',
1812
- 'dependency_installed', 'service_running', 'url_accessible', 'content_contains'
1813
- ];
1814
- let test;
1815
- if (standardTypes.includes(claim.type)) {
1816
- test = generateVerificationTest(claim);
1817
- }
1818
- else {
1819
- test = generateExtendedVerificationTest(claim, context);
1820
- }
1821
- try {
1822
- result = await test();
1823
- }
1824
- catch (err) {
1825
- result = {
1826
- claim,
1827
- verified: false,
1828
- confidence: 'low',
1829
- evidence: 'Verification failed',
1830
- error: err instanceof Error ? err.message : 'Unknown error',
1831
- timestamp: new Date().toISOString()
1832
- };
1833
- }
1834
- results.push(result);
1835
- }
1836
- const verified = results.filter(r => r.verified).length;
1837
- const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
1838
- const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
1839
- let overallVerdict;
1840
- if (failed > 0) {
1841
- overallVerdict = 'contradicted';
1842
- }
1843
- else if (verified === claims.length && claims.length > 0) {
1844
- overallVerdict = 'verified';
1845
- }
1846
- else if (verified > 0) {
1847
- overallVerdict = 'partially_verified';
1848
- }
1849
- else {
1850
- overallVerdict = 'unverified';
456
export async function quickVerify(response, ctx) {
    // Only verifiable claims marked critical or high are considered, capped at three.
    const isUrgent = (c) => c.priority === 'critical' || c.priority === 'high';
    const allClaims = await extractClaims(response, ctx);
    const critical = allClaims.filter(c => c.verifiable && isUrgent(c)).slice(0, 3);
    if (!critical.length) {
        return { trustScore: 50, summary: 'No critical claims to verify' };
    }
    const tests = await generateTests(critical, ctx);
    let verified = 0;
    // Tests run one after another; each result only contributes its `success` flag.
    for (const test of tests) {
        const outcome = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
        if (outcome.success) {
            verified += 1;
        }
    }
    const trustScore = Math.round((verified / critical.length) * 100);
    const summary = `${verified}/${critical.length} critical claims verified`;
    return { trustScore, summary };
}
1866
/**
 * Prompt template for claim extraction.
 * Placeholders: {RESPONSE}, {CONTEXT}, {WORKING_DIR}.
 */
const UNIVERSAL_EXTRACT = [
    'Extract ALL verifiable claims from this AI response. Include explicit claims, implicit claims, state changes, results, assertions.',
    '',
    'RESPONSE:',
    '---',
    '{RESPONSE}',
    '---',
    'CONTEXT: {CONTEXT}',
    'DIR: {WORKING_DIR}',
    '',
    'Return JSON array: [{"id":"c1","statement":"claim","category":"file_op|code|state|data|behavior|fact|other","verifiable":true/false,"verificationApproach":"how","priority":"critical|high|medium|low","context":{}}]',
    'Output ONLY valid JSON.'
].join('\n');
/**
 * Prompt template for generating a verification plan for one claim.
 * Placeholders: {STATEMENT}, {CATEGORY}, {APPROACH}, {CONTEXT}, {WORKING_DIR}, {PLATFORM}.
 */
const UNIVERSAL_GEN = [
    'Generate verification code for: {STATEMENT}',
    'Category: {CATEGORY} | Approach: {APPROACH} | Context: {CONTEXT} | Dir: {WORKING_DIR} | Platform: {PLATFORM}',
    '',
    'Use shell/javascript/python. READ-ONLY only.',
    'Return JSON: {"steps":[{"type":"shell|javascript|python","code":"code","desc":"what"}],"success":"success criteria","failure":"failure criteria","confPass":0-100,"confFail":0-100,"safe":{"ok":true/false,"why":"reason"}}',
    'Output ONLY valid JSON.'
].join('\n');
/**
 * Prompt template for the overall trust assessment.
 * Placeholders: {RESPONSE}, {CLAIMS}, {RESULTS}.
 */
const UNIVERSAL_ASSESS = [
    'Assess: RESPONSE:{RESPONSE} CLAIMS:{CLAIMS} RESULTS:{RESULTS}',
    'Return JSON: {"trust":0-100,"summary":"text","concerns":[]}',
    'Output ONLY valid JSON.'
].join('\n');
1886
/**
 * Deny-list of patterns that mark generated code as unsafe to execute.
 * NOTE(review): this is a best-effort textual filter, not a sandbox. The
 * patterns without word boundaries (e.g. /kill/i, /sudo/i) also match inside
 * longer words, so some harmless code is rejected.
 */
const UNSAFE = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /exec\s*\(/i, /child_process/i, /os\.system/i, /subprocess/i, /curl.*\|.*sh/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i, /kill/i];
/**
 * Check generated code against the deny-list and a length limit.
 *
 * @param {string} c - Code to check.
 * @returns {{ safe: boolean, reason: string }} `reason` is the source of the
 *   first matching pattern, `'too long'` (more than 5000 characters),
 *   `'not a string'`, or `'ok'`.
 */
export function validateUniversalCode(c) {
    // A plan step without usable code must be rejected, not crash on `c.length`.
    if (typeof c !== 'string') {
        return { safe: false, reason: 'not a string' };
    }
    const hit = UNSAFE.find(pattern => pattern.test(c));
    if (hit) {
        return { safe: false, reason: hit.source };
    }
    return c.length > 5000 ? { safe: false, reason: 'too long' } : { safe: true, reason: 'ok' };
}
1893
/**
 * Run one step of a verification plan.
 *
 * The step's code is checked with `validateUniversalCode` first and is not
 * executed when rejected. Supported `s.type` values are `shell`, `javascript`
 * (run by a child `node`) and `python` (run by a child `python3`).
 *
 * @param {{ type: string, code: string }} s - Plan step.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{ ok: boolean, out: string }>} `ok` is false when the code
 *   was rejected, the type is unknown, the process failed, or a JavaScript
 *   step threw; `out` carries the output or the failure reason.
 */
async function runUniversalStep(s, cwd) {
    const verdict = validateUniversalCode(s.code);
    if (!verdict.safe) {
        return { ok: false, out: verdict.reason };
    }
    try {
        if (s.type === 'shell') {
            const { stdout, stderr } = await execAsync(s.code, { cwd, timeout: 30000, maxBuffer: 5 * 1024 * 1024 });
            return { ok: true, out: stdout + stderr };
        }
        if (s.type === 'javascript') {
            // The step code sits on its own lines so a trailing `//` comment in
            // it cannot swallow the rest of the wrapper.
            const wrapped = `(async()=>{try{const fs=require('fs').promises;const r=await(async()=>{\n${s.code}\n})();console.log(JSON.stringify({ok:1,r}))}catch(e){console.log(JSON.stringify({ok:0,e:e.message}))}})()`;
            // Pass the script over stdin rather than `node -e "<json>"`: inside
            // shell double quotes, JSON-escaped newlines arrive as literal "\n"
            // text and `$`/backticks are expanded by the shell.
            const pending = execAsync('node -', { cwd, timeout: 30000 });
            // EPIPE on the pipe is reported through the rejected exec promise.
            pending.child.stdin.on('error', () => { });
            pending.child.stdin.end(wrapped);
            const { stdout } = await pending;
            // The wrapper always exits 0 and reports a thrown error as
            // {"ok":0,...} on its last output line, so the exit code alone
            // cannot tell success from failure.
            let ok = true;
            try {
                const lastLine = stdout.trim().split('\n').pop() ?? '';
                ok = JSON.parse(lastLine)?.ok !== 0;
            }
            catch {
                // Last line is not the wrapper's JSON; keep the exit-code verdict.
            }
            return { ok, out: stdout };
        }
        if (s.type === 'python') {
            // Same stdin approach as for node, for the same quoting reasons.
            const pending = execAsync('python3 -', { cwd, timeout: 30000 });
            pending.child.stdin.on('error', () => { });
            pending.child.stdin.end(s.code);
            const { stdout, stderr } = await pending;
            return { ok: true, out: stdout + stderr };
        }
        return { ok: false, out: 'unknown type' };
    }
    catch (e) {
        return { ok: false, out: e instanceof Error ? e.message : 'err' };
    }
}
1917
/**
 * Extract claims from an assistant response.
 *
 * With `ctx.llmVerifier` available, the verifier is prompted with
 * `UNIVERSAL_EXTRACT` and the first JSON array in its reply is used. Without
 * a verifier, or when the reply cannot be used, the claims come from
 * `extractClaims` and are mapped to the universal claim shape.
 *
 * @param {string} r - The assistant response text.
 * @param {object} ctx - Reads `llmVerifier`, `conversationHistory` and `workingDirectory`.
 * @returns {Promise<object[]>} Claims with `id`, `statement`, `category`,
 *   `verifiable`, `verificationApproach`, `priority` and `context`.
 */
export async function extractUniversalClaims(r, ctx) {
    // Fallback used both when no verifier exists and when its reply is unusable.
    const fallbackClaims = () => extractClaims(r).map((c, i) => ({ id: `c${i}`, statement: c.description, category: c.type, verifiable: true, verificationApproach: 'runtime', priority: 'medium', context: c.params }));
    if (!ctx.llmVerifier) {
        return fallbackClaims();
    }
    try {
        const slots = {
            RESPONSE: r.slice(0, 8000),
            CONTEXT: ctx.conversationHistory?.slice(-3).join('\n') || '',
            WORKING_DIR: ctx.workingDirectory
        };
        // Single pass with a replacer function: a string replacement would
        // interpret `$&`, `$'` etc. inside the response text, and chained
        // replaces would fill a "{CONTEXT}" that merely appears in the response.
        const p = UNIVERSAL_EXTRACT.replace(/\{(RESPONSE|CONTEXT|WORKING_DIR)\}/g, (_, key) => String(slots[key]));
        const res = await ctx.llmVerifier(p);
        const m = res.match(/\[[\s\S]*\]/);
        if (m) {
            // Callers read `statement` on every claim, so drop entries without one.
            return JSON.parse(m[0]).filter(c => c && typeof c.statement === 'string');
        }
    }
    catch {
        // Deliberate best effort: any verifier or parse failure falls back below.
    }
    return fallbackClaims();
}
1930
/**
 * Verify a single universal claim with a verifier-generated plan.
 *
 * The verifier is prompted with `UNIVERSAL_GEN`; the first JSON object in its
 * reply is the plan. Each plan step is run with `runUniversalStep`, and the
 * claim counts as verified only when every step succeeds.
 *
 * @param {object} claim - Reads `verifiable`, `statement`, `category`,
 *   `verificationApproach` and `context`.
 * @param {object} ctx - Reads `llmVerifier` and `workingDirectory`.
 * @returns {Promise<object>} Result with `verified`, `confidence`, `method`,
 *   `evidence`, `reasoning` and, when steps ran, `executedCode`/`rawOutput`.
 *   Failures are reported in the result (`method: 'error'`), never thrown.
 */
export async function verifyUniversalClaim(claim, ctx) {
    const base = { claim, timestamp: new Date().toISOString() };
    if (!claim.verifiable) {
        return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'Not verifiable', reasoning: 'Cannot verify' };
    }
    if (!ctx.llmVerifier) {
        return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'No LLM', reasoning: 'Needs LLM' };
    }
    try {
        const slots = {
            STATEMENT: claim.statement,
            CATEGORY: claim.category,
            APPROACH: claim.verificationApproach,
            CONTEXT: JSON.stringify(claim.context),
            WORKING_DIR: ctx.workingDirectory,
            PLATFORM: process.platform
        };
        // Single pass with a replacer function so `$&`-style sequences in the
        // claim text are inserted literally and placeholders that appear inside
        // an inserted value are not substituted again.
        const p = UNIVERSAL_GEN.replace(/\{(STATEMENT|CATEGORY|APPROACH|CONTEXT|WORKING_DIR|PLATFORM)\}/g, (_, key) => String(slots[key]));
        const res = await ctx.llmVerifier(p);
        const m = res.match(/\{[\s\S]*\}/);
        if (!m) {
            throw new Error('bad');
        }
        const plan = JSON.parse(m[0]);
        // Fail closed: a plan without an explicit safety approval is blocked.
        if (!plan.safe?.ok) {
            return { ...base, verified: false, confidence: 0, method: 'blocked', evidence: plan.safe?.why ?? 'No safety assessment in plan', reasoning: 'Unsafe' };
        }
        // A plan with nothing to run would otherwise "verify" the claim vacuously.
        if (!Array.isArray(plan.steps) || plan.steps.length === 0) {
            throw new Error('plan has no steps');
        }
        let allOk = true;
        let out = '';
        let code = '';
        for (const s of plan.steps) {
            code += s.code + '\n';
            const r = await runUniversalStep(s, ctx.workingDirectory);
            out += r.out + '\n';
            if (!r.ok) {
                allOk = false;
            }
        }
        return { ...base, verified: allOk, confidence: allOk ? plan.confPass : plan.confFail, method: plan.steps.map(s => s.type).join('+'), evidence: allOk ? plan.success : plan.failure, reasoning: allOk ? 'All passed' : 'Some failed', executedCode: code, rawOutput: out.slice(0, 2000) };
    }
    catch (e) {
        return { ...base, verified: false, confidence: 10, method: 'error', evidence: 'Failed', reasoning: e instanceof Error ? e.message : 'err' };
    }
}
1959
/**
 * Verify every claim in a response and build the universal report.
 *
 * Claims that look self-referential (statement mentions "erosolar" or "cli",
 * or category is `behavior`/`feature`) are checked with
 * `runIsolatedVerification` when a verifier is available. All other claims go
 * through `verifyUniversalClaim`. The trust score and assessment come from the
 * verifier when its reply is usable, otherwise from the collected results.
 *
 * @param {string} response - The assistant response text.
 * @param {object} ctx - Reads `llmVerifier` and `workingDirectory`.
 * @param {string} [id] - Report id; defaults to `u-<timestamp>`.
 * @returns {Promise<object>} Report with `claims`, `results`, `summary`,
 *   `overallAssessment` and `trustScore`.
 */
export async function verifyResponseUniversal(response, ctx, id) {
    const claims = await extractUniversalClaims(response, ctx);
    const results = [];
    // Identify self-referential claims (about erosolar-cli itself).
    // NOTE(review): substring match, so "cli" also matches e.g. "client".
    const isSelfReferential = (c) => {
        const statement = c.statement.toLowerCase();
        return statement.includes('erosolar') ||
            statement.includes('cli') ||
            c.category === 'behavior' ||
            c.category === 'feature';
    };
    // Isolated runs need the verifier. Without it these claims take the regular
    // path so they still appear in the report instead of being dropped.
    const selfClaims = ctx.llmVerifier ? claims.filter(isSelfReferential) : [];
    const selfClaimSet = new Set(selfClaims);
    const regularClaims = claims.filter(c => !selfClaimSet.has(c));
    // Run isolated runtime tests for self-referential claims
    if (selfClaims.length > 0) {
        const isoResults = await runIsolatedVerification(selfClaims.map(c => ({ statement: c.statement, category: c.category, context: c.context })), ctx.workingDirectory, ctx.llmVerifier);
        const isoTests = isoResults?.tests ?? [];
        // Results are matched to claims by position.
        for (let i = 0; i < selfClaims.length; i++) {
            const claim = selfClaims[i];
            const isoTest = isoTests[i];
            if (!isoTest) {
                // Fewer tests than claims came back: report the claim as
                // inconclusive rather than leaving it out of the report.
                results.push({
                    claim,
                    verified: false,
                    confidence: 0,
                    method: 'isolated-runtime',
                    evidence: 'No isolated test was produced for this claim',
                    reasoning: 'Inconclusive',
                    timestamp: new Date().toISOString()
                });
                continue;
            }
            results.push({
                claim,
                verified: isoTest.success,
                confidence: isoTest.success ? 90 : (isoTest.matchedPatterns.length > 0 ? 50 : 20),
                method: 'isolated-runtime',
                evidence: isoTest.success ? `Verified in fresh CLI instance` : `Failed: ${isoTest.unmatchedPatterns.join(', ')}`,
                reasoning: isoTest.llmAssessment || (isoTest.success ? 'All patterns matched in isolated runtime' : 'Patterns not matched'),
                executedCode: isoTest.test.commands.join('\n'),
                rawOutput: isoTest.output.slice(0, 2000),
                timestamp: new Date().toISOString()
            });
        }
    }
    // Verify regular claims with standard approach
    for (const c of regularClaims) {
        results.push(c.verifiable || c.priority === 'critical' || c.priority === 'high'
            ? await verifyUniversalClaim(c, ctx)
            : { claim: c, verified: false, confidence: 0, method: 'skip', evidence: 'Low priority', reasoning: 'Skipped', timestamp: new Date().toISOString() });
    }
    const vClaims = claims.filter(c => c.verifiable).length;
    const verified = results.filter(r => r.verified).length;
    const failed = results.filter(r => !r.verified && r.confidence > 50).length;
    const inconclusive = results.filter(r => !r.verified && r.confidence <= 50 && r.method !== 'skip').length;
    const avgConf = results.length ? results.reduce((s, r) => s + r.confidence, 0) / results.length : 0;
    // Count isolated tests for assessment
    const isoCount = results.filter(r => r.method === 'isolated-runtime').length;
    const isoVerified = results.filter(r => r.method === 'isolated-runtime' && r.verified).length;
    const clampTrust = (value) => Math.min(100, Math.max(0, Math.round(value)));
    // Result-derived values, used unless the verifier supplies a usable
    // assessment. Starting from them means a reply without JSON (or without a
    // numeric trust) no longer yields trust 0 and an empty assessment.
    let trust = clampTrust(avgConf * verified / Math.max(vClaims, 1));
    let assessment = `${verified}/${vClaims} verified`;
    if (ctx.llmVerifier) {
        try {
            const isoSummary = isoCount > 0 ? ` Isolated runtime tests: ${isoVerified}/${isoCount} passed.` : '';
            const slots = {
                RESPONSE: response.slice(0, 4000),
                CLAIMS: JSON.stringify(claims.slice(0, 15)),
                RESULTS: JSON.stringify(results.slice(0, 15))
            };
            // Replacer function: inserted text is taken literally (no `$&` expansion).
            const p = UNIVERSAL_ASSESS.replace(/\{(RESPONSE|CLAIMS|RESULTS)\}/g, (_, key) => slots[key]);
            const r = await ctx.llmVerifier(p);
            const m = r.match(/\{[\s\S]*\}/);
            const a = m ? JSON.parse(m[0]) : null;
            if (a && Number.isFinite(a.trust)) {
                // Build both values before assigning so a failure cannot leave
                // a verifier trust score paired with the fallback text.
                const llmAssessment = `${a.summary ?? ''}${isoSummary}${a.concerns?.length ? ` Concerns: ${a.concerns.join('; ')}` : ''}`;
                trust = clampTrust(a.trust);
                assessment = llmAssessment;
            }
        }
        catch {
            // Best effort: keep the result-derived trust and assessment.
        }
    }
    return { responseId: id || `u-${Date.now()}`, originalResponse: response, timestamp: new Date().toISOString(), claims, results, summary: { totalClaims: claims.length, verifiableClaims: vClaims, verified, failed, inconclusive, averageConfidence: Math.round(avgConf) }, overallAssessment: assessment, trustScore: trust };
}
2025
/**
 * Quick check of a response: verifies at most five verifiable claims whose
 * priority is `critical` or `high`.
 *
 * @param {string} r - The assistant response text.
 * @param {object} ctx - Verification context, forwarded unchanged.
 * @returns {Promise<{ trustScore: number, summary: string }>} Share of the
 *   checked claims that verified, as a percentage; 50 when none qualify.
 */
export async function quickUniversalVerify(r, ctx) {
    const extracted = await extractUniversalClaims(r, ctx);
    const urgent = extracted
        .filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high'))
        .slice(0, 5);
    if (urgent.length === 0) {
        return { trustScore: 50, summary: 'No critical claims' };
    }
    let confirmed = 0;
    // Claims are verified one at a time, in order.
    for (const claim of urgent) {
        const outcome = await verifyUniversalClaim(claim, ctx);
        if (outcome.verified) {
            confirmed += 1;
        }
    }
    const trustScore = Math.round(confirmed / urgent.length * 100);
    return { trustScore, summary: `${confirmed}/${urgent.length} critical verified` };
}
2036
/**
 * Render a universal verification report as plain text.
 *
 * Shows the trust score with a ten-segment bar, the overall assessment, the
 * claim counts, up to four isolated-runtime results and up to six other
 * results.
 *
 * @param {object} r - Report; reads `trustScore`, `overallAssessment`,
 *   `summary` (`totalClaims`, `verified`, `failed`, `inconclusive`) and `results`.
 * @returns {string} The formatted report.
 */
export function formatUniversalReport(r) {
    // The score can come straight from a verifier reply, so the bar is clamped:
    // String.prototype.repeat throws a RangeError for a negative count, which a
    // score above 104 or below -4 used to trigger.
    const filled = Number.isFinite(r.trustScore)
        ? Math.min(10, Math.max(0, Math.round(r.trustScore / 10)))
        : 0;
    const bar = '█'.repeat(filled) + '░'.repeat(10 - filled);
    const icon = r.trustScore >= 80 ? '✅' : r.trustScore >= 50 ? '⚠️' : '❌';
    let out = `╔════════════════════════════════════════════════════════════╗\n║ UNIVERSAL VERIFICATION REPORT ║\n╚════════════════════════════════════════════════════════════╝\n\n`;
    out += `Trust: ${icon} ${r.trustScore}/100 [${bar}]\n${r.overallAssessment}\n\nClaims: ${r.summary.totalClaims} | ✅ ${r.summary.verified} | ❌ ${r.summary.failed} | ❓ ${r.summary.inconclusive}\n\n`;
    // Group results by method
    const isoResults = r.results.filter(x => x.method === 'isolated-runtime');
    const otherResults = r.results.filter(x => x.method !== 'isolated-runtime');
    // Show isolated runtime tests first
    if (isoResults.length > 0) {
        out += `🔬 ISOLATED RUNTIME TESTS (fresh CLI instance):\n`;
        for (const x of isoResults.slice(0, 4)) {
            out += ` ${x.verified ? '✅' : '❌'} [${x.confidence}%] ${x.claim.statement.slice(0, 50)}...\n`;
            if (x.reasoning) {
                out += ` └─ ${x.reasoning.slice(0, 60)}\n`;
            }
        }
        if (isoResults.length > 4) {
            out += ` ... +${isoResults.length - 4} more isolated tests\n`;
        }
        out += '\n';
    }
    // Show other verification results
    if (otherResults.length > 0) {
        out += `📋 STANDARD VERIFICATION:\n`;
        for (const x of otherResults.slice(0, 6)) {
            // Unverified results above 50% confidence are failures; the rest are inconclusive.
            out += ` ${x.verified ? '✅' : x.confidence > 50 ? '❌' : '❓'} [${x.confidence}%] ${x.claim.statement.slice(0, 50)}...\n`;
        }
        if (otherResults.length > 6) {
            out += ` ... +${otherResults.length - 6} more\n`;
        }
    }
    return out;
}
2067
474
  //# sourceMappingURL=responseVerifier.js.map