erosolar-cli 1.7.21 → 1.7.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -764,57 +764,12 @@ export function generateVerificationTest(claim) {
764
764
  }
765
765
  }
766
766
  /**
767
- * Verify all claims in an assistant response
767
+ * Verify all claims in an assistant response using LLM-based semantic analysis.
768
+ * Requires a VerificationContext with an llmVerifier function.
769
+ * All claim extraction and verification is done via LLM.
768
770
  */
769
- export async function verifyResponse(response, responseId) {
770
- const claims = extractClaims(response);
771
- const results = [];
772
- for (const claim of claims) {
773
- const test = generateVerificationTest(claim);
774
- try {
775
- const result = await test();
776
- results.push(result);
777
- }
778
- catch (err) {
779
- results.push({
780
- claim,
781
- verified: false,
782
- confidence: 'low',
783
- evidence: 'Verification test failed to execute',
784
- error: err instanceof Error ? err.message : 'Unknown error',
785
- timestamp: new Date().toISOString()
786
- });
787
- }
788
- }
789
- const verified = results.filter(r => r.verified).length;
790
- const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
791
- const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
792
- let overallVerdict;
793
- if (failed > 0) {
794
- overallVerdict = 'contradicted';
795
- }
796
- else if (verified === claims.length && claims.length > 0) {
797
- overallVerdict = 'verified';
798
- }
799
- else if (verified > 0) {
800
- overallVerdict = 'partially_verified';
801
- }
802
- else {
803
- overallVerdict = 'unverified';
804
- }
805
- return {
806
- responseId: responseId || `response-${Date.now()}`,
807
- timestamp: new Date().toISOString(),
808
- claims,
809
- results,
810
- summary: {
811
- total: claims.length,
812
- verified,
813
- failed,
814
- inconclusive
815
- },
816
- overallVerdict
817
- };
771
+ export async function verifyResponse(response, context, responseId) {
772
+ return verifyResponseComprehensive(response, context, responseId);
818
773
  }
819
774
  /**
820
775
  * Format a verification report for display
@@ -854,10 +809,11 @@ export function formatVerificationReport(report) {
854
809
  return lines.join('\n');
855
810
  }
856
811
  /**
857
- * Quick verification - returns true if response claims are valid
812
+ * Quick verification - returns true if response claims are valid.
813
+ * Requires a VerificationContext with llmVerifier for LLM-based semantic analysis.
858
814
  */
859
- export async function quickVerify(response) {
860
- const report = await verifyResponse(response);
815
+ export async function quickVerify(response, context) {
816
+ const report = await verifyResponse(response, context);
861
817
  return report.overallVerdict === 'verified' || report.overallVerdict === 'partially_verified';
862
818
  }
863
819
  /**
@@ -1120,203 +1076,59 @@ export function generateExtendedVerificationTest(claim, context) {
1120
1076
  case 'error_fixed':
1121
1077
  case 'feature_implemented':
1122
1078
  case 'refactor_complete':
1123
- // These require semantic verification - use LLM if available
1079
+ // These require semantic verification - LLM is required
1124
1080
  return async () => {
1125
- if (context.llmVerifier) {
1126
- return verifyClaimWithLLM(claim, context);
1127
- }
1128
- // Fall back to checking if related files were modified
1129
- const relatedPath = claim.params.path;
1130
- if (relatedPath) {
1131
- try {
1132
- const resolvedPath = path.isAbsolute(relatedPath)
1133
- ? relatedPath
1134
- : path.resolve(context.workingDirectory, relatedPath);
1135
- const stats = await fs.stat(resolvedPath);
1136
- const recentlyModified = (Date.now() - stats.mtimeMs) < 10 * 60 * 1000;
1137
- return {
1138
- ...baseResult,
1139
- verified: recentlyModified,
1140
- confidence: 'low',
1141
- evidence: recentlyModified
1142
- ? `Related file ${relatedPath} was recently modified`
1143
- : `Related file ${relatedPath} exists but wasn't recently modified`
1144
- };
1145
- }
1146
- catch {
1147
- return {
1148
- ...baseResult,
1149
- verified: false,
1150
- confidence: 'low',
1151
- evidence: 'Could not verify - no LLM available and related file not found'
1152
- };
1153
- }
1081
+ if (!context.llmVerifier) {
1082
+ return {
1083
+ ...baseResult,
1084
+ verified: false,
1085
+ confidence: 'low',
1086
+ evidence: 'Semantic verification requires LLM verifier'
1087
+ };
1154
1088
  }
1155
- return {
1156
- ...baseResult,
1157
- verified: false,
1158
- confidence: 'low',
1159
- evidence: 'Semantic verification required but no LLM verifier available'
1160
- };
1089
+ return verifyClaimWithLLM(claim, context);
1161
1090
  };
1162
1091
  case 'data_transformed':
1163
- return async () => {
1164
- // Check if we have before/after state to compare
1165
- if (context.previousState && context.currentState) {
1166
- const inputKey = claim.params.input;
1167
- const outputKey = claim.params.output;
1168
- if (inputKey && outputKey) {
1169
- const inputExists = context.previousState[inputKey] !== undefined;
1170
- const outputExists = context.currentState[outputKey] !== undefined;
1171
- return {
1172
- ...baseResult,
1173
- verified: inputExists && outputExists,
1174
- confidence: inputExists && outputExists ? 'medium' : 'low',
1175
- evidence: `Input "${inputKey}" ${inputExists ? 'found' : 'missing'}, Output "${outputKey}" ${outputExists ? 'found' : 'missing'}`
1176
- };
1177
- }
1178
- }
1179
- // Fall back to LLM verification
1180
- if (context.llmVerifier) {
1181
- return verifyClaimWithLLM(claim, context);
1182
- }
1183
- return {
1184
- ...baseResult,
1185
- verified: false,
1186
- confidence: 'low',
1187
- evidence: 'Cannot verify data transformation without state comparison or LLM'
1188
- };
1189
- };
1190
1092
  case 'database_updated':
1191
- return async () => {
1192
- // Can't directly verify database changes without connection info
1193
- // Check if there's a command we can run
1194
- const checkCommand = claim.params.checkCommand;
1195
- if (checkCommand) {
1196
- try {
1197
- const { stdout } = await execAsync(checkCommand, {
1198
- timeout: 10000,
1199
- cwd: context.workingDirectory
1200
- });
1201
- return {
1202
- ...baseResult,
1203
- verified: true,
1204
- confidence: 'medium',
1205
- evidence: `Check command output: ${stdout.slice(0, 200)}`
1206
- };
1207
- }
1208
- catch (err) {
1209
- return {
1210
- ...baseResult,
1211
- verified: false,
1212
- confidence: 'medium',
1213
- evidence: 'Database check command failed',
1214
- error: err instanceof Error ? err.message : 'Unknown error'
1215
- };
1216
- }
1217
- }
1218
- // Fall back to LLM
1219
- if (context.llmVerifier) {
1220
- return verifyClaimWithLLM(claim, context);
1221
- }
1222
- return {
1223
- ...baseResult,
1224
- verified: false,
1225
- confidence: 'low',
1226
- evidence: 'Cannot verify database changes without check command or LLM'
1227
- };
1228
- };
1229
1093
  case 'permission_granted':
1230
- return async () => {
1231
- const targetPath = claim.params.path;
1232
- const expectedMode = claim.params.mode;
1233
- if (targetPath) {
1234
- try {
1235
- const resolvedPath = path.isAbsolute(targetPath)
1236
- ? targetPath
1237
- : path.resolve(context.workingDirectory, targetPath);
1238
- const stats = await fs.stat(resolvedPath);
1239
- const mode = (stats.mode & 0o777).toString(8);
1240
- if (expectedMode) {
1241
- const matches = mode === expectedMode;
1242
- return {
1243
- ...baseResult,
1244
- verified: matches,
1245
- confidence: 'high',
1246
- evidence: matches
1247
- ? `File has expected permissions: ${mode}`
1248
- : `Expected mode ${expectedMode}, got ${mode}`
1249
- };
1250
- }
1251
- return {
1252
- ...baseResult,
1253
- verified: true,
1254
- confidence: 'medium',
1255
- evidence: `File permissions: ${mode}`
1256
- };
1257
- }
1258
- catch (err) {
1259
- return {
1260
- ...baseResult,
1261
- verified: false,
1262
- confidence: 'high',
1263
- evidence: 'Could not check file permissions',
1264
- error: err instanceof Error ? err.message : 'Unknown error'
1265
- };
1266
- }
1267
- }
1268
- // Fall back to LLM
1269
- if (context.llmVerifier) {
1270
- return verifyClaimWithLLM(claim, context);
1271
- }
1272
- return {
1273
- ...baseResult,
1274
- verified: false,
1275
- confidence: 'low',
1276
- evidence: 'Cannot verify permission without file path or LLM'
1277
- };
1278
- };
1279
1094
  case 'generic':
1280
1095
  default:
1281
- // For generic claims, always try LLM verification first
1096
+ // All these claim types require LLM verification
1282
1097
  return async () => {
1283
- if (context.llmVerifier) {
1284
- return verifyClaimWithLLM(claim, context);
1098
+ if (!context.llmVerifier) {
1099
+ return {
1100
+ ...baseResult,
1101
+ verified: false,
1102
+ confidence: 'low',
1103
+ evidence: `${claim.type} verification requires LLM verifier`
1104
+ };
1285
1105
  }
1286
- return {
1287
- ...baseResult,
1288
- verified: false,
1289
- confidence: 'low',
1290
- evidence: 'Generic claim requires LLM verification which is not available'
1291
- };
1106
+ return verifyClaimWithLLM(claim, context);
1292
1107
  };
1293
1108
  }
1294
1109
  }
1295
1110
  /**
1296
- * Comprehensive verification using both runtime and LLM-based strategies
1111
+ * Comprehensive verification using LLM-based semantic analysis.
1112
+ * Requires an LLM verifier - all claims are verified through LLM semantic analysis.
1297
1113
  */
1298
1114
  export async function verifyResponseComprehensive(response, context, responseId) {
1299
- // First extract claims using LLM if available, otherwise pattern matching
1300
- const claims = context.llmVerifier
1301
- ? await extractClaimsWithLLM(response, context.llmVerifier)
1302
- : extractClaims(response);
1115
+ if (!context.llmVerifier) {
1116
+ return {
1117
+ responseId: responseId || `response-${Date.now()}`,
1118
+ timestamp: new Date().toISOString(),
1119
+ claims: [],
1120
+ results: [],
1121
+ summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
1122
+ overallVerdict: 'unverified'
1123
+ };
1124
+ }
1125
+ // Extract ALL claims using LLM (required)
1126
+ const claims = await extractClaimsWithLLM(response, context.llmVerifier);
1303
1127
  const results = [];
1304
1128
  for (const claim of claims) {
1305
- // Check if this is a standard claim type that can be runtime-verified
1306
- const standardTypes = [
1307
- 'file_created', 'file_modified', 'file_deleted', 'code_compiles',
1308
- 'tests_pass', 'git_committed', 'package_published', 'command_executed',
1309
- 'dependency_installed', 'service_running', 'url_accessible', 'content_contains'
1310
- ];
1311
- let test;
1312
- if (standardTypes.includes(claim.type)) {
1313
- test = generateVerificationTest(claim);
1314
- }
1315
- else {
1316
- test = generateExtendedVerificationTest(claim, context);
1317
- }
1129
+ // ALL claims are verified via LLM semantic analysis
1318
1130
  try {
1319
- const result = await test();
1131
+ const result = await verifyClaimWithLLM(claim, context);
1320
1132
  results.push(result);
1321
1133
  }
1322
1134
  catch (err) {
@@ -1324,7 +1136,7 @@ export async function verifyResponseComprehensive(response, context, responseId)
1324
1136
  claim,
1325
1137
  verified: false,
1326
1138
  confidence: 'low',
1327
- evidence: 'Verification test failed to execute',
1139
+ evidence: 'LLM verification failed',
1328
1140
  error: err instanceof Error ? err.message : 'Unknown error',
1329
1141
  timestamp: new Date().toISOString()
1330
1142
  });
@@ -1396,4 +1208,569 @@ export function getVerificationStrategy(claim) {
1396
1208
  return 'llm';
1397
1209
  }
1398
1210
  }
1211
/**
 * Prompt template asking the LLM to write a verification test for one claim.
 * The brace-delimited placeholders ({CLAIM_TYPE}, {CLAIM_DESCRIPTION},
 * {CLAIM_EVIDENCE}, {CLAIM_PARAMS}, {WORKING_DIR}) are filled in by
 * generateVerificationCode.
 */
const VERIFICATION_CODE_GENERATION_PROMPT = `You are a verification code generator. Given a claim that an AI assistant made, generate code to verify if the claim is TRUE.

CLAIM TO VERIFY:
Type: {CLAIM_TYPE}
Description: {CLAIM_DESCRIPTION}
Evidence: {CLAIM_EVIDENCE}
Parameters: {CLAIM_PARAMS}

WORKING DIRECTORY: {WORKING_DIR}

Generate a verification test. Choose the most appropriate approach:

1. SHELL COMMAND - For file operations, git, npm, system checks
2. JAVASCRIPT - For complex logic, API calls, JSON parsing
3. API - For HTTP endpoints, external services

IMPORTANT RULES:
- Code must be READ-ONLY and NON-DESTRUCTIVE (no writes, no deletes, no modifications)
- Code must complete quickly (under 10 seconds)
- Code must output a clear result that can be parsed
- For shell: output should be parseable (exit code 0 = verified, non-zero = failed)
- For JavaScript: must export/return { verified: boolean, evidence: string }
- Do NOT use interactive commands
- Do NOT access sensitive data or credentials

Respond with JSON:
{
"testType": "shell" | "javascript" | "api",
"code": "the verification code",
"description": "what this test does",
"expectedOutcome": "what success looks like",
"safeToRun": true | false,
"safetyReason": "why it's safe/unsafe"
}

Only output valid JSON, nothing else.`;
/**
 * Ask the LLM to generate a verification test for a single claim.
 *
 * @param {object} claim - Claim with `type`, `description`, `evidence` and `params`.
 * @param {object} context - Verification context; reads `llmVerifier` and `workingDirectory`.
 * @returns {Promise<object|null>} `{ claim, testType, code, description,
 *   expectedOutcome, safetyCheck }`, or `null` when no LLM verifier is
 *   configured, the reply holds no usable JSON object, or it has no code.
 */
export async function generateVerificationCode(claim, context) {
  if (!context.llmVerifier) {
    return null;
  }
  try {
    const substitutions = {
      CLAIM_TYPE: claim.type,
      CLAIM_DESCRIPTION: claim.description,
      CLAIM_EVIDENCE: claim.evidence,
      CLAIM_PARAMS: JSON.stringify(claim.params, null, 2),
      WORKING_DIR: context.workingDirectory,
    };
    // Single pass with a replacer function: a string replacement would expand
    // "$$" / "$&" found in claim text, and chained replace() calls would
    // re-scan already-inserted claim text for later placeholders.
    const prompt = VERIFICATION_CODE_GENERATION_PROMPT.replace(
      /\{(CLAIM_TYPE|CLAIM_DESCRIPTION|CLAIM_EVIDENCE|CLAIM_PARAMS|WORKING_DIR)\}/g,
      (_placeholder, key) => String(substitutions[key]),
    );
    const reply = await context.llmVerifier(prompt);
    // The model may wrap the JSON in prose; take the outermost {...} span.
    const jsonMatch = typeof reply === 'string' ? reply.match(/\{[\s\S]*\}/) : null;
    if (!jsonMatch) {
      return null;
    }
    const parsed = JSON.parse(jsonMatch[0]);
    if (typeof parsed.code !== 'string' || parsed.code.trim() === '') {
      // Nothing executable was produced; treat it like a failed generation.
      return null;
    }
    return {
      claim,
      testType: parsed.testType,
      code: parsed.code,
      description: parsed.description,
      expectedOutcome: parsed.expectedOutcome,
      // Only a literal boolean true counts: the string "false" is truthy.
      safetyCheck: parsed.safeToRun === true,
    };
  }
  catch (err) {
    console.error('Failed to generate verification code:', err);
    return null;
  }
}
1285
/**
 * Deny-list of patterns that must never appear in LLM-generated verification
 * code. This is a best-effort filter, not a sandbox.
 */
const DANGEROUS_PATTERNS = [
  /\brm\s+-rf?\b/i, // rm commands
  /\brmdir\b/i, // rmdir
  /\bdd\s+if=/i, // dd (disk destroyer)
  /\bmkfs\b/i, // format filesystem
  // Redirects into /dev. No leading \b: it required a word character right
  // before ">", so "echo x > /dev/sda" slipped through while "2>/dev/null"
  // was blocked. The harmless sinks are now explicitly allowed.
  />\s*\/dev\/(?!(?:null|stdout|stderr)\b)/i, // write to devices
  /\bchmod\s+777\b/i, // dangerous permissions
  /\bsudo\b/i, // sudo commands
  /\bcurl.*\|\s*sh\b/i, // pipe to shell
  /\bwget.*\|\s*sh\b/i, // pipe to shell
  /\beval\s*\(/i, // eval in JS
  /new\s+Function\s*\(/i, // Function constructor
  /child_process/i, // subprocess in JS (unless we control it)
  /\bexec\s*\(/i, // exec calls
  /\bspawn\s*\(/i, // spawn calls
  /writeFile/i, // file writes
  /appendFile/i, // file appends
  /unlink\s*\(/i, // file deletion
  /rmSync/i, // sync deletion
  /fs\.rm/i, // fs remove
  /DROP\s+TABLE/i, // SQL injection
  /DELETE\s+FROM/i, // SQL deletion
  /TRUNCATE/i, // SQL truncate
  /;\s*--/, // SQL comment injection
  /process\.exit/i, // process exit
  /require\s*\(\s*['"]child/i, // require child_process
];
/**
 * Report whether a lower-cased, trimmed shell command line begins with one of
 * the allow-listed read-only commands.
 *
 * A bare command prefix ("ls", "git log") must be followed by whitespace or
 * end of input, so "ss" does not admit "ssh" and "ps" does not admit "psql".
 * A prefix that ends in a flag ("curl -s", "wget -q") may be followed by more
 * flag characters ("curl -sS").
 */
function startsWithSafeShellCommand(commandLine) {
  const safeShellPrefixes = [
    'ls', 'cat', 'head', 'tail', 'grep', 'find', 'stat', 'file',
    'test', 'echo', 'pwd', 'wc', 'diff', 'cmp',
    'git log', 'git status', 'git show', 'git diff', 'git branch',
    'npm view', 'npm list', 'npm ls',
    'node -e', 'node --eval',
    'curl -s', 'curl --silent', 'wget -q',
    'jq', 'python -c', 'python3 -c',
    'lsof', 'netstat', 'ss', 'ps',
    'which', 'type', 'command -v',
  ];
  return safeShellPrefixes.some((prefix) => {
    if (!commandLine.startsWith(prefix)) {
      return false;
    }
    const lastToken = prefix.slice(prefix.lastIndexOf(' ') + 1);
    if (lastToken.startsWith('-')) {
      return true;
    }
    const following = commandLine[prefix.length];
    return following === undefined || /\s/.test(following);
  });
}
/**
 * Validate that generated code is safe to execute.
 *
 * @param {object} test - Generated test; reads `safetyCheck`, `code` and `testType`.
 * @returns {{safe: boolean, reason: string}} Verdict with a human-readable reason.
 */
export function validateGeneratedCode(test) {
  // First check the LLM's own safety assessment. Only a literal true passes,
  // because a JSON string such as "false" is truthy.
  if (test.safetyCheck !== true) {
    return { safe: false, reason: 'LLM marked code as unsafe' };
  }
  if (typeof test.code !== 'string' || test.code.trim() === '') {
    return { safe: false, reason: 'Generated test contains no code' };
  }
  // Check against dangerous patterns
  for (const pattern of DANGEROUS_PATTERNS) {
    if (pattern.test(test.code)) {
      return {
        safe: false,
        reason: `Dangerous pattern detected: ${pattern.source}`,
      };
    }
  }
  // 'api' tests are run through the shell exactly like 'shell' tests, so
  // both must pass the allow-list.
  if (test.testType === 'shell' || test.testType === 'api') {
    const trimmedCode = test.code.trim().toLowerCase();
    // Simple test/check constructs are accepted alongside the allow-list.
    const isInlineCheck = trimmedCode.startsWith('[') || trimmedCode.startsWith('if ');
    if (!isInlineCheck && !startsWithSafeShellCommand(trimmedCode)) {
      return {
        safe: false,
        reason: 'Shell command does not start with a known safe prefix',
      };
    }
  }
  // For JavaScript, very long code is treated as suspicious
  if (test.testType === 'javascript' && test.code.length > 2000) {
    return { safe: false, reason: 'JavaScript code too long' };
  }
  return { safe: true, reason: 'All safety checks passed' };
}
1367
/**
 * Decide whether an execution error is real evidence that the claim is false.
 *
 * Only a process that ran to completion and exited non-zero counts. Timeouts
 * and kills (`killed` / `signal`), spawn failures (string `code` such as
 * ENOENT), shell exit codes 126/127 (not executable / not found) and
 * non-process errors say nothing about the claim itself.
 */
function isConclusiveTestFailure(err) {
  if (!(err instanceof Error)) {
    return false;
  }
  if (err.killed === true || err.signal) {
    return false;
  }
  return typeof err.code === 'number' && err.code !== 126 && err.code !== 127;
}
/**
 * Execute a generated verification test.
 *
 * The test is re-validated with validateGeneratedCode before anything runs.
 * Shell and API tests run through `execAsync`. JavaScript tests are handed to
 * a child Node process as an argv entry, without a shell in between.
 *
 * @param {object} test - Generated test; reads `claim`, `testType` and `code`.
 * @param {object} context - Verification context; reads `workingDirectory`.
 * @returns {Promise<object>} Result with `claim`, `timestamp`, `verified`,
 *   `confidence`, `evidence` and, on failure, `error`. Never rejects.
 */
export async function executeGeneratedTest(test, context) {
  const baseResult = {
    claim: test.claim,
    timestamp: new Date().toISOString(),
  };
  // Validate safety first
  const safetyResult = validateGeneratedCode(test);
  if (!safetyResult.safe) {
    return {
      ...baseResult,
      verified: false,
      confidence: 'low',
      evidence: `Generated test blocked: ${safetyResult.reason}`,
      error: 'Safety validation failed',
    };
  }
  try {
    switch (test.testType) {
      case 'shell': {
        const { stdout, stderr } = await execAsync(test.code, {
          cwd: context.workingDirectory,
          timeout: 10000, // 10 second timeout
          maxBuffer: 1024 * 1024, // 1MB max output
        });
        const output = (stdout + stderr).trim();
        // Shell convention: exit 0 = success
        return {
          ...baseResult,
          verified: true,
          confidence: 'high',
          evidence: `Test passed. Output: ${output.slice(0, 500)}`,
        };
      }
      case 'javascript': {
        // Local imports keep this block self-contained.
        const { execFile } = await import('node:child_process');
        const { promisify } = await import('node:util');
        const wrappedCode = `
const result = (async () => {
${test.code}
})();
result.then(r => console.log(JSON.stringify(r))).catch(e => {
console.log(JSON.stringify({ verified: false, evidence: e.message }));
});
`;
        // No shell here. Quoting the script with JSON.stringify inside a
        // shell command line left "\n" escapes unexpanded (so node got an
        // unparsable script) and let the shell expand $(...), backticks and
        // $VARS inside the generated code.
        const { stdout } = await promisify(execFile)(process.execPath, ['-e', wrappedCode], {
          cwd: context.workingDirectory,
          timeout: 10000,
        });
        try {
          const result = JSON.parse(stdout.trim());
          if (result === null || typeof result !== 'object') {
            throw new TypeError('Generated test did not return a result object');
          }
          return {
            ...baseResult,
            verified: result.verified === true,
            confidence: 'high',
            evidence: result.evidence,
          };
        }
        catch {
          // Output was not the expected JSON result; report it as inconclusive.
          return {
            ...baseResult,
            verified: false,
            confidence: 'medium',
            evidence: `JavaScript output: ${stdout.slice(0, 500)}`,
          };
        }
      }
      case 'api': {
        // For API tests, use curl
        const { stdout } = await execAsync(test.code, {
          cwd: context.workingDirectory,
          timeout: 15000,
        });
        // Try to parse as JSON result
        try {
          const result = JSON.parse(stdout.trim());
          return {
            ...baseResult,
            verified: Boolean(result.verified ?? result.success ?? result.ok),
            confidence: 'high',
            evidence: `API response: ${JSON.stringify(result).slice(0, 500)}`,
          };
        }
        catch {
          // Non-JSON response - look for whole-word success indicators so
          // that "token" or "broken" no longer count as "ok".
          const isSuccess = /\b(?:200|success|ok)\b/i.test(stdout);
          return {
            ...baseResult,
            verified: isSuccess,
            confidence: 'medium',
            evidence: `API output: ${stdout.slice(0, 500)}`,
          };
        }
      }
      default:
        return {
          ...baseResult,
          verified: false,
          confidence: 'low',
          evidence: `Unknown test type: ${test.testType}`,
        };
    }
  }
  catch (err) {
    const message = err instanceof Error ? err.message : 'Unknown error';
    // A non-zero exit is a failed verification. A timeout or a test that
    // could not be started is only inconclusive.
    return {
      ...baseResult,
      verified: false,
      confidence: isConclusiveTestFailure(err) ? 'high' : 'low',
      evidence: `Test failed: ${message}`,
      error: message,
    };
  }
}
1482
/**
 * Check one claim by having the LLM write a runtime test and then running it.
 *
 * @param {object} claim - The claim to verify.
 * @param {object} context - Verification context, passed through to the
 *   generation and execution helpers.
 * @returns {Promise<object>} The executed test's result, or a low-confidence
 *   unverified result when no test could be generated.
 */
export async function verifyWithGeneratedTest(claim, context) {
  // Captured before the LLM call, so the stamp marks when verification began.
  const startedAt = new Date().toISOString();
  const generated = await generateVerificationCode(claim, context);
  if (generated) {
    return executeGeneratedTest(generated, context);
  }
  return {
    claim,
    timestamp: startedAt,
    verified: false,
    confidence: 'low',
    evidence: 'Failed to generate verification test',
  };
}
1503
/**
 * Verify a whole response by generating and running a custom test per claim.
 *
 * Claims come from the LLM when `context.llmVerifier` is set and from
 * pattern-based extraction otherwise. They are verified one at a time, in order.
 *
 * @param {string} response - Assistant response text to check.
 * @param {object} context - Verification context; reads `llmVerifier`.
 * @param {string} [responseId] - Optional id; defaults to `response-<epoch ms>`.
 * @returns {Promise<object>} Report with `claims`, `results`, `summary` counts
 *   and an `overallVerdict`.
 */
export async function verifyResponseWithGeneratedTests(response, context, responseId) {
  let claims;
  if (context.llmVerifier) {
    claims = await extractClaimsWithLLM(response, context.llmVerifier);
  }
  else {
    claims = extractClaims(response);
  }
  const results = [];
  for (const claim of claims) {
    results.push(await verifyWithGeneratedTest(claim, context));
  }
  // Each result lands in exactly one bucket: verified, failed (unverified
  // with high confidence) or inconclusive (every other unverified result).
  let verified = 0;
  let failed = 0;
  let inconclusive = 0;
  for (const outcome of results) {
    if (outcome.verified) {
      verified += 1;
    }
    else if (outcome.confidence === 'high') {
      failed += 1;
    }
    else {
      inconclusive += 1;
    }
  }
  const everyClaimVerified = claims.length > 0 && verified === claims.length;
  const overallVerdict = failed > 0
    ? 'contradicted'
    : everyClaimVerified
      ? 'verified'
      : verified > 0
        ? 'partially_verified'
        : 'unverified';
  return {
    responseId: responseId || `response-${Date.now()}`,
    timestamp: new Date().toISOString(),
    claims,
    results,
    summary: { total: claims.length, verified, failed, inconclusive },
    overallVerdict,
  };
}
1548
/**
 * Hybrid verification: prefer an LLM-generated test for each claim and fall
 * back to the predefined verification tests when no LLM is configured, no test
 * could be generated, or the generated test fails safety validation.
 *
 * @param {string} response - Assistant response text to check.
 * @param {object} context - Verification context; reads `llmVerifier`.
 * @param {string} [responseId] - Optional id; defaults to `response-<epoch ms>`.
 * @returns {Promise<object>} Report with `claims`, `results`, `summary` counts
 *   and an `overallVerdict`.
 */
export async function verifyResponseHybrid(response, context, responseId) {
  // Claim types handled by generateVerificationTest; every other type goes
  // through generateExtendedVerificationTest.
  const standardTypes = [
    'file_created', 'file_modified', 'file_deleted', 'code_compiles',
    'tests_pass', 'git_committed', 'package_published', 'command_executed',
    'dependency_installed', 'service_running', 'url_accessible', 'content_contains',
  ];
  let claims;
  if (context.llmVerifier) {
    claims = await extractClaimsWithLLM(response, context.llmVerifier);
  }
  else {
    claims = extractClaims(response);
  }
  const results = [];
  for (const claim of claims) {
    // Try an LLM-generated test first if an LLM is available
    if (context.llmVerifier) {
      const generatedTest = await generateVerificationCode(claim, context);
      if (generatedTest && validateGeneratedCode(generatedTest).safe) {
        results.push(await executeGeneratedTest(generatedTest, context));
        continue;
      }
    }
    // Fall back to predefined verification
    const test = standardTypes.includes(claim.type)
      ? generateVerificationTest(claim)
      : generateExtendedVerificationTest(claim, context);
    let result;
    try {
      result = await test();
    }
    catch (err) {
      result = {
        claim,
        verified: false,
        confidence: 'low',
        evidence: 'Verification failed',
        error: err instanceof Error ? err.message : 'Unknown error',
        timestamp: new Date().toISOString(),
      };
    }
    results.push(result);
  }
  // Each result lands in exactly one bucket: verified, failed (unverified
  // with high confidence) or inconclusive (every other unverified result).
  let verified = 0;
  let failed = 0;
  let inconclusive = 0;
  for (const outcome of results) {
    if (outcome.verified) {
      verified += 1;
    }
    else if (outcome.confidence === 'high') {
      failed += 1;
    }
    else {
      inconclusive += 1;
    }
  }
  const everyClaimVerified = claims.length > 0 && verified === claims.length;
  const overallVerdict = failed > 0
    ? 'contradicted'
    : everyClaimVerified
      ? 'verified'
      : verified > 0
        ? 'partially_verified'
        : 'unverified';
  return {
    responseId: responseId || `response-${Date.now()}`,
    timestamp: new Date().toISOString(),
    claims,
    results,
    summary: { total: claims.length, verified, failed, inconclusive },
    overallVerdict,
  };
}
1629
/** Prompt template for claim extraction; fills {RESPONSE}, {CONTEXT} and {WORKING_DIR}. */
const UNIVERSAL_EXTRACT = `Extract ALL verifiable claims from this AI response. Include explicit claims, implicit claims, state changes, results, assertions.

RESPONSE:
---
{RESPONSE}
---
CONTEXT: {CONTEXT}
DIR: {WORKING_DIR}

Return JSON array: [{"id":"c1","statement":"claim","category":"file_op|code|state|data|behavior|fact|other","verifiable":true/false,"verificationApproach":"how","priority":"critical|high|medium|low","context":{}}]
Output ONLY valid JSON.`;
/**
 * Prompt template for a per-claim verification plan; fills {STATEMENT},
 * {CATEGORY}, {APPROACH}, {CONTEXT}, {WORKING_DIR} and {PLATFORM}.
 */
const UNIVERSAL_GEN = `Generate verification code for: {STATEMENT}
Category: {CATEGORY} | Approach: {APPROACH} | Context: {CONTEXT} | Dir: {WORKING_DIR} | Platform: {PLATFORM}

Use shell/javascript/python. READ-ONLY only.
Return JSON: {"steps":[{"type":"shell|javascript|python","code":"code","desc":"what"}],"success":"success criteria","failure":"failure criteria","confPass":0-100,"confFail":0-100,"safe":{"ok":true/false,"why":"reason"}}
Output ONLY valid JSON.`;
/** Prompt template for the overall trust assessment; fills {RESPONSE}, {CLAIMS} and {RESULTS}. */
const UNIVERSAL_ASSESS = `Assess: RESPONSE:{RESPONSE} CLAIMS:{CLAIMS} RESULTS:{RESULTS}
Return JSON: {"trust":0-100,"summary":"text","concerns":[]}
Output ONLY valid JSON.`;
/**
 * Deny-list applied to every generated verification step. The first group is
 * the original list. The second group adds the file-write, spawn and delete
 * patterns that the stricter DANGEROUS_PATTERNS list already blocks, since
 * steps are required to be read-only. Best-effort filter, not a sandbox.
 */
const UNSAFE = [
  /\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /exec\s*\(/i, /child_process/i,
  /os\.system/i, /subprocess/i, /curl.*\|.*sh/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i, /kill/i,
  /\bspawn\s*\(/i, /writeFile/i, /appendFile/i, /unlink\s*\(/i, /rmSync/i, /fs\.rm/i,
  /\bmkfs\b/i, /\bdd\s+if=/i, /os\.remove/i, /shutil\.rmtree/i,
];
/**
 * Check a generated code snippet against the UNSAFE deny-list and a length cap.
 *
 * @param {string} c - Code of a single verification step.
 * @returns {{safe: boolean, reason: string}} `reason` is the source of the
 *   matching pattern, 'not a string', 'too long' (over 5000 characters) or 'ok'.
 */
export function validateUniversalCode(c) {
  // A plan step without a string `code` would otherwise throw on c.length.
  if (typeof c !== 'string') {
    return { safe: false, reason: 'not a string' };
  }
  for (const p of UNSAFE) {
    if (p.test(c)) {
      return { safe: false, reason: p.source };
    }
  }
  return c.length > 5000 ? { safe: false, reason: 'too long' } : { safe: true, reason: 'ok' };
}
1656
/**
 * Run one step of a universal verification plan.
 *
 * @param {{type: string, code: string}} s - Step; `type` is 'shell',
 *   'javascript' or 'python'.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{ok: boolean, out: string}>} `ok` is false when the code is
 *   rejected by validateUniversalCode, the type is unknown, the process fails
 *   or a JavaScript step throws. `out` holds the captured output or the reason.
 */
async function runUniversalStep(s, cwd) {
  const v = validateUniversalCode(s.code);
  if (!v.safe) {
    return { ok: false, out: v.reason };
  }
  try {
    if (s.type === 'shell') {
      const { stdout, stderr } = await execAsync(s.code, { cwd, timeout: 30000, maxBuffer: 5 * 1024 * 1024 });
      return { ok: true, out: stdout + stderr };
    }
    if (s.type === 'javascript' || s.type === 'python') {
      // Interpreters get the script as an argv entry, with no shell involved.
      // Quoting it with JSON.stringify inside a shell command line left "\n"
      // escapes unexpanded (breaking every multi-line script) and let the
      // shell expand $(...), backticks and $VARS inside the generated code.
      const { execFile } = await import('node:child_process');
      const { promisify } = await import('node:util');
      const execFileAsync = promisify(execFile);
      if (s.type === 'python') {
        const { stdout, stderr } = await execFileAsync('python3', ['-c', s.code], { cwd, timeout: 30000 });
        return { ok: true, out: stdout + stderr };
      }
      // The step code sits on its own lines so that a trailing "//" comment
      // cannot swallow the rest of the wrapper.
      const w = [
        "(async()=>{try{const fs=require('fs').promises;const r=await(async()=>{",
        s.code,
        '})();console.log(JSON.stringify({ok:1,r}))}catch(e){console.log(JSON.stringify({ok:0,e:e.message}))}})()',
      ].join('\n');
      const { stdout } = await execFileAsync(process.execPath, ['-e', w], { cwd, timeout: 30000 });
      // The wrapper prints {ok:0,...} and still exits 0 when the step throws,
      // so the exit status alone would report a throwing step as passed.
      let threw = false;
      const lastLine = stdout.trim().split('\n').pop() ?? '';
      try {
        threw = JSON.parse(lastLine)?.ok === 0;
      }
      catch {
        // Last line is not the wrapper's JSON; fall back to the exit status.
      }
      return { ok: !threw, out: stdout };
    }
    return { ok: false, out: 'unknown type' };
  }
  catch (e) {
    return { ok: false, out: e instanceof Error ? e.message : 'err' };
  }
}
1680
/**
 * Extract verifiable claims from a response.
 *
 * With an LLM verifier, the model is asked for a JSON array of claims. Without
 * one, or when the model's reply cannot be used, the pattern-based
 * `extractClaims` output is converted to the universal claim shape instead.
 *
 * @param {string} r - Assistant response text.
 * @param {object} ctx - Context; reads `llmVerifier`, `conversationHistory`
 *   and `workingDirectory`.
 * @returns {Promise<object[]>} Claims shaped `{ id, statement, category,
 *   verifiable, verificationApproach, priority, context }`.
 */
export async function extractUniversalClaims(r, ctx) {
  const fromPatterns = () => extractClaims(r).map((c, i) => ({
    id: `c${i}`,
    statement: c.description,
    category: c.type,
    verifiable: true,
    verificationApproach: 'runtime',
    priority: 'medium',
    context: c.params,
  }));
  if (!ctx.llmVerifier) {
    return fromPatterns();
  }
  try {
    const values = {
      RESPONSE: r.slice(0, 8000),
      CONTEXT: ctx.conversationHistory?.slice(-3).join('\n') || '',
      WORKING_DIR: ctx.workingDirectory,
    };
    // Single pass with a replacer function: a string replacement would expand
    // "$&" / "$$" found in the response, and chained replace() calls would
    // substitute a "{CONTEXT}" that merely appears inside the response text.
    const p = UNIVERSAL_EXTRACT.replace(
      /\{(RESPONSE|CONTEXT|WORKING_DIR)\}/g,
      (_placeholder, key) => String(values[key]),
    );
    const res = await ctx.llmVerifier(p);
    const m = res.match(/\[[\s\S]*\]/);
    if (m) {
      const parsed = JSON.parse(m[0]);
      if (Array.isArray(parsed)) {
        // Keep only entries with a string `statement`, so malformed items
        // are not handed on as claims.
        return parsed.filter((entry) => entry !== null
          && typeof entry === 'object'
          && typeof entry.statement === 'string');
      }
    }
  }
  catch {
    // Best effort: any LLM or parse failure falls through to pattern extraction.
  }
  return fromPatterns();
}
1693
/**
 * Verify one universal claim by asking the LLM for a step plan and running it.
 *
 * @param {object} claim - Reads `verifiable`, `statement`, `category`,
 *   `verificationApproach` and `context`.
 * @param {object} ctx - Context; reads `llmVerifier` and `workingDirectory`.
 * @returns {Promise<object>} Result with `verified`, numeric `confidence`
 *   (0-100), `method`, `evidence`, `reasoning` and, when steps ran,
 *   `executedCode` and `rawOutput`. Never rejects.
 */
export async function verifyUniversalClaim(claim, ctx) {
  const base = { claim, timestamp: new Date().toISOString() };
  if (!claim.verifiable) {
    return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'Not verifiable', reasoning: 'Cannot verify' };
  }
  if (!ctx.llmVerifier) {
    return { ...base, verified: false, confidence: 0, method: 'skip', evidence: 'No LLM', reasoning: 'Needs LLM' };
  }
  // LLM-supplied confidences are untrusted: clamp numbers to 0-100 and treat
  // anything else as a neutral 50.
  const toConfidence = (value) => (typeof value === 'number' && Number.isFinite(value)
    ? Math.min(100, Math.max(0, value))
    : 50);
  try {
    const values = {
      STATEMENT: claim.statement,
      CATEGORY: claim.category,
      APPROACH: claim.verificationApproach,
      CONTEXT: JSON.stringify(claim.context),
      WORKING_DIR: ctx.workingDirectory,
      PLATFORM: process.platform,
    };
    // Single pass with a replacer function, so "$$" / "$&" in claim text are
    // kept verbatim and inserted text is never re-scanned for placeholders.
    const p = UNIVERSAL_GEN.replace(
      /\{(STATEMENT|CATEGORY|APPROACH|CONTEXT|WORKING_DIR|PLATFORM)\}/g,
      (_placeholder, key) => String(values[key]),
    );
    const res = await ctx.llmVerifier(p);
    const m = res.match(/\{[\s\S]*\}/);
    if (!m) {
      throw new Error('bad');
    }
    const plan = JSON.parse(m[0]);
    // Only a literal true unblocks execution; the string "false" is truthy.
    if (plan.safe?.ok !== true) {
      return { ...base, verified: false, confidence: 0, method: 'blocked', evidence: plan.safe?.why ?? 'Plan was not marked safe', reasoning: 'Unsafe' };
    }
    // With no steps the loop below would leave allOk true and report the
    // claim as verified without running anything.
    if (!Array.isArray(plan.steps) || plan.steps.length === 0) {
      throw new Error('Verification plan contained no steps');
    }
    let allOk = true;
    let out = '';
    let code = '';
    for (const s of plan.steps) {
      code += s.code + '\n';
      const r = await runUniversalStep(s, ctx.workingDirectory);
      out += r.out + '\n';
      if (!r.ok) {
        allOk = false;
      }
    }
    return {
      ...base,
      verified: allOk,
      confidence: toConfidence(allOk ? plan.confPass : plan.confFail),
      method: plan.steps.map((s) => s.type).join('+'),
      evidence: allOk ? plan.success : plan.failure,
      reasoning: allOk ? 'All passed' : 'Some failed',
      executedCode: code,
      rawOutput: out.slice(0, 2000),
    };
  }
  catch (e) {
    return { ...base, verified: false, confidence: 10, method: 'error', evidence: 'Failed', reasoning: e instanceof Error ? e.message : 'err' };
  }
}
1722
/**
 * Run universal verification over a whole response and build a trust report.
 *
 * @param {string} response - Assistant response text.
 * @param {object} ctx - Context; reads `llmVerifier`.
 * @param {string} [id] - Optional report id; defaults to `u-<epoch ms>`.
 * @returns {Promise<object>} Report with `claims`, `results`, `summary`,
 *   `overallAssessment` and `trustScore` (0-100 when it comes from the LLM).
 */
export async function verifyResponseUniversal(response, ctx, id) {
  const claims = await extractUniversalClaims(response, ctx);
  const results = [];
  for (const c of claims) {
    const shouldVerify = c.verifiable || c.priority === 'critical' || c.priority === 'high';
    results.push(shouldVerify
      ? await verifyUniversalClaim(c, ctx)
      : { claim: c, verified: false, confidence: 0, method: 'skip', evidence: 'Low priority', reasoning: 'Skipped', timestamp: new Date().toISOString() });
  }
  const vClaims = claims.filter((c) => c.verifiable).length;
  const verified = results.filter((r) => r.verified).length;
  const failed = results.filter((r) => !r.verified && r.confidence > 50).length;
  const inconclusive = results.filter((r) => !r.verified && r.confidence <= 50 && r.method !== 'skip').length;
  // A non-numeric confidence counts as 0 so it cannot turn the average into NaN.
  const confidenceSum = results.reduce((sum, r) => sum + (Number.isFinite(r.confidence) ? r.confidence : 0), 0);
  const avgConf = results.length ? confidenceSum / results.length : 0;
  // Start from the locally computed figures. The LLM assessment only
  // overrides them when it returns usable values. Previously a reply with no
  // JSON left trust at 0 and the assessment empty.
  let trust = Math.round(avgConf * verified / Math.max(vClaims, 1));
  let assessment = `${verified}/${vClaims} verified`;
  if (ctx.llmVerifier) {
    try {
      const values = {
        RESPONSE: response.slice(0, 4000),
        CLAIMS: JSON.stringify(claims.slice(0, 15)),
        RESULTS: JSON.stringify(results.slice(0, 15)),
      };
      // Single pass with a replacer function, so "$&" / "$$" in the response
      // stay verbatim and inserted text is not re-scanned for placeholders.
      const p = UNIVERSAL_ASSESS.replace(
        /\{(RESPONSE|CLAIMS|RESULTS)\}/g,
        (_placeholder, key) => String(values[key]),
      );
      const r = await ctx.llmVerifier(p);
      const m = r.match(/\{[\s\S]*\}/);
      if (m) {
        const a = JSON.parse(m[0]);
        if (typeof a.trust === 'number' && Number.isFinite(a.trust)) {
          trust = Math.min(100, Math.max(0, a.trust));
        }
        if (typeof a.summary === 'string') {
          const concerns = Array.isArray(a.concerns) && a.concerns.length
            ? ` Concerns: ${a.concerns.join('; ')}`
            : '';
          assessment = a.summary + concerns;
        }
      }
    }
    catch {
      // Best effort: keep the locally computed trust and assessment.
    }
  }
  return {
    responseId: id || `u-${Date.now()}`,
    originalResponse: response,
    timestamp: new Date().toISOString(),
    claims,
    results,
    summary: { totalClaims: claims.length, verifiableClaims: vClaims, verified, failed, inconclusive, averageConfidence: Math.round(avgConf) },
    overallAssessment: assessment,
    trustScore: trust,
  };
}
1754
/**
 * Fast trust estimate: verify at most five verifiable claims whose priority is
 * 'critical' or 'high' and report the share that passed.
 *
 * @param {string} r - Assistant response text.
 * @param {object} ctx - Verification context, passed to the claim helpers.
 * @returns {Promise<{trustScore: number, summary: string}>} Score 0-100, or a
 *   neutral 50 when the response has no such claims.
 */
export async function quickUniversalVerify(r, ctx) {
  const allClaims = await extractUniversalClaims(r, ctx);
  const important = allClaims
    .filter((candidate) => candidate.verifiable && (candidate.priority === 'critical' || candidate.priority === 'high'))
    .slice(0, 5);
  if (important.length === 0) {
    return { trustScore: 50, summary: 'No critical claims' };
  }
  let confirmed = 0;
  for (const candidate of important) {
    const outcome = await verifyUniversalClaim(candidate, ctx);
    if (outcome.verified) {
      confirmed += 1;
    }
  }
  const trustScore = Math.round(confirmed / important.length * 100);
  return { trustScore, summary: `${confirmed}/${important.length} critical verified` };
}
1765
/**
 * Render a universal verification report as plain text.
 *
 * @param {object} r - Report; reads `trustScore`, `overallAssessment`,
 *   `summary` and `results`.
 * @returns {string} Header, trust bar, claim counts and up to eight result
 *   lines, followed by a "+N more" line when results were omitted.
 */
export function formatUniversalReport(r) {
  // Clamp before drawing: a score above 100 would give repeat() a negative
  // count and throw RangeError. A non-numeric score is shown as 0.
  const score = Number.isFinite(r.trustScore) ? Math.min(100, Math.max(0, r.trustScore)) : 0;
  const filled = Math.round(score / 10);
  const bar = '█'.repeat(filled) + '░'.repeat(10 - filled);
  const icon = score >= 80 ? '✅' : score >= 50 ? '⚠️' : '❌';
  let out = `╔════════════════════════════════════════════════════════════╗\n║ UNIVERSAL VERIFICATION REPORT ║\n╚════════════════════════════════════════════════════════════╝\n\n`;
  out += `Trust: ${icon} ${score}/100 [${bar}]\n${r.overallAssessment}\n\nClaims: ${r.summary.totalClaims} | ✅ ${r.summary.verified} | ❌ ${r.summary.failed} | ❓ ${r.summary.inconclusive}\n\n`;
  for (const x of r.results.slice(0, 8)) {
    const marker = x.verified ? '✅' : x.confidence > 50 ? '❌' : '❓';
    const statement = String(x.claim?.statement ?? '');
    // The ellipsis is added only when the statement was actually cut.
    const shown = statement.length > 55 ? `${statement.slice(0, 55)}...` : statement;
    out += `${marker} [${x.confidence}%] ${shown}\n`;
  }
  if (r.results.length > 8) {
    out += `... +${r.results.length - 8} more\n`;
  }
  return out;
}
1399
1776
  //# sourceMappingURL=responseVerifier.js.map