@goldensheepai/toknxr-cli 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,538 @@
1
+ /**
2
+ * Execution-Based Hallucination Detection
3
+ * Integrates execution sandbox with hallucination detection for runtime analysis
4
+ */
5
+ import { ExecutionSandbox } from './execution-sandbox.js';
6
/**
 * Default execution analysis configuration.
 *
 * The three `enable*` flags switch analysis stages on or off; the remaining
 * values are the limits above which a measurement is reported as excessive
 * (memory in MB, execution time in ms, CPU usage in percent).
 *
 * Frozen so that no consumer can change the defaults for every detector
 * created afterwards; callers override values by passing their own config,
 * which is merged over a copy of this object.
 */
const DEFAULT_EXECUTION_CONFIG = Object.freeze({
    enableResourceMonitoring: true,
    enableLogicValidation: true,
    enablePerformanceAnalysis: true,
    memoryThresholdMB: 64,
    executionTimeThresholdMs: 3000,
    cpuUsageThreshold: 80,
});
17
/**
 * Execution-based hallucination detector.
 *
 * Converts the result of running a piece of code (resource usage, output,
 * errors) together with text heuristics over the code itself into a list of
 * hallucination category records. The loop heuristics look for
 * `while ...:` / `for ...:` headers and indentation-delimited bodies.
 */
export class ExecutionBasedDetector {
    /**
     * @param {object} [config] - Overrides merged over DEFAULT_EXECUTION_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...DEFAULT_EXECUTION_CONFIG, ...config };
        // Allow headroom: the sandbox limits are twice the reporting thresholds.
        this.sandbox = new ExecutionSandbox({
            maxMemoryMB: this.config.memoryThresholdMB * 2,
            maxExecutionTimeMs: this.config.executionTimeThresholdMs * 2,
        });
    }

    /**
     * Detect resource-related hallucinations from execution results.
     *
     * Reports, in this order: excessive memory, excessive execution time,
     * excessive CPU usage, and a timeout. The timeout is reported even when
     * the result carries no `resourceUsage` measurements.
     *
     * @param {object} executionResult - Reads `resourceUsage` and `timedOut`.
     * @returns {Promise<object[]>} Category records; empty when resource
     *   monitoring is disabled. Analysis failures are logged with
     *   `console.warn` and whatever was collected so far is returned.
     */
    async detectResourceHallucinations(executionResult) {
        const categories = [];
        if (!this.config.enableResourceMonitoring) {
            return categories;
        }
        try {
            const usage = executionResult.resourceUsage;
            // A result without measurements must not prevent the timeout
            // check below from running.
            if (usage) {
                const { memoryUsage, executionTime, cpuUsage } = this.analyzeResourceUsage(usage);
                // Memory usage hallucinations
                if (memoryUsage.isExcessive) {
                    const devTimeWasted = this.calculateMemoryImpact(memoryUsage.current);
                    categories.push({
                        type: 'resource',
                        subtype: 'physical_constraint',
                        severity: this.getMemorySeverity(memoryUsage.current),
                        confidence: 0.9,
                        description: `Excessive memory usage detected: ${memoryUsage.current.toFixed(2)}MB`,
                        evidence: [
                            {
                                type: 'resource_usage',
                                content: `Memory: ${memoryUsage.current}MB (threshold: ${memoryUsage.threshold}MB)`,
                                confidence: 1.0,
                            },
                            {
                                type: 'resource_usage',
                                content: `Peak memory: ${memoryUsage.peak}MB`,
                                confidence: 0.9,
                            },
                        ],
                        suggestedFix: 'Optimize data structures, use generators, or implement memory-efficient algorithms',
                        businessImpact: {
                            estimatedDevTimeWasted: devTimeWasted,
                            costMultiplier: 1.5,
                            qualityImpact: 30,
                            costOfHallucinations: devTimeWasted * 100,
                        },
                    });
                }
                // Execution time hallucinations
                if (executionTime.isExcessive) {
                    const devTimeWasted = this.calculateTimeImpact(executionTime.actual);
                    categories.push({
                        type: 'resource',
                        subtype: 'computational_boundary',
                        severity: this.getTimeSeverity(executionTime.actual),
                        confidence: 0.85,
                        description: `Excessive execution time: ${executionTime.actual}ms`,
                        evidence: [
                            {
                                type: 'performance_metric',
                                content: `Execution time: ${executionTime.actual}ms (threshold: ${executionTime.threshold}ms)`,
                                confidence: 1.0,
                            },
                        ],
                        suggestedFix: 'Optimize algorithm complexity, add caching, or use more efficient data structures',
                        businessImpact: {
                            estimatedDevTimeWasted: devTimeWasted,
                            costMultiplier: 1.3,
                            qualityImpact: 20,
                            costOfHallucinations: devTimeWasted * 100,
                        },
                    });
                }
                // CPU usage hallucinations
                if (cpuUsage.isExcessive) {
                    categories.push({
                        type: 'resource',
                        subtype: 'computational_boundary',
                        severity: 'medium',
                        confidence: 0.7,
                        description: `High CPU usage detected: ${cpuUsage.percentage}%`,
                        evidence: [
                            {
                                type: 'resource_usage',
                                content: `CPU usage: ${cpuUsage.percentage}% (threshold: ${cpuUsage.threshold}%)`,
                                confidence: 0.8,
                            },
                        ],
                        suggestedFix: 'Optimize computational complexity or add CPU usage limits',
                        businessImpact: {
                            estimatedDevTimeWasted: 1.5,
                            costMultiplier: 1.2,
                            qualityImpact: 15,
                            costOfHallucinations: 150.0,
                        },
                    });
                }
            }
            // Timeout hallucinations
            if (executionResult.timedOut) {
                categories.push({
                    type: 'resource',
                    subtype: 'computational_boundary',
                    severity: 'high',
                    confidence: 1.0,
                    description: 'Code execution timed out, indicating potential infinite loop or excessive computation',
                    evidence: [
                        {
                            type: 'timeout',
                            content: 'Execution exceeded maximum allowed time',
                            confidence: 1.0,
                        },
                    ],
                    suggestedFix: 'Add proper termination conditions, optimize loops, or reduce computational complexity',
                    businessImpact: {
                        estimatedDevTimeWasted: 3.0,
                        costMultiplier: 1.6,
                        qualityImpact: 35,
                        costOfHallucinations: 300.0,
                    },
                });
            }
        }
        catch (error) {
            console.warn('Resource analysis failed:', error);
        }
        return categories;
    }

    /**
     * Detect logic-related hallucinations from code and execution results.
     *
     * Runs three checks: infinite-loop risk in the code text, output
     * correctness (only when `expectedOutput` is not `undefined`), and
     * logic-type execution errors (only when `errors` is a non-empty array).
     *
     * @param {string} code - Source text that was executed.
     * @param {object} executionResult - Reads `output` and `errors`.
     * @param {*} [expectedOutput] - Expected result; `undefined` skips the
     *   correctness check.
     * @returns {Promise<object[]>} Category records; empty when logic
     *   validation is disabled. Analysis failures are logged with
     *   `console.warn` and whatever was collected so far is returned.
     */
    async detectLogicHallucinations(code, executionResult, expectedOutput) {
        const categories = [];
        if (!this.config.enableLogicValidation) {
            return categories;
        }
        try {
            // 1. Analyze infinite loop risks
            const loopAnalysis = this.detectInfiniteLoops(code);
            if (loopAnalysis.hasInfiniteLoopRisk) {
                categories.push({
                    type: 'logic',
                    subtype: 'logic_deviation',
                    severity: loopAnalysis.confidence > 0.8 ? 'high' : 'medium',
                    confidence: loopAnalysis.confidence,
                    description: 'Potential infinite loop or inadequate termination conditions detected',
                    evidence: [
                        {
                            type: 'code_pattern',
                            content: `Loop complexity: ${loopAnalysis.loopComplexity}`,
                            confidence: 0.8,
                        },
                        {
                            type: 'code_pattern',
                            content: `Termination conditions: ${loopAnalysis.terminationConditions.join(', ')}`,
                            confidence: 0.7,
                        },
                    ],
                    suggestedFix: 'Add proper loop termination conditions and bounds checking',
                    businessImpact: {
                        estimatedDevTimeWasted: 2.5,
                        costMultiplier: 1.4,
                        qualityImpact: 30,
                        costOfHallucinations: 250.0,
                    },
                });
            }
            // 2. Validate output correctness
            if (expectedOutput !== undefined) {
                const correctness = this.validateOutputCorrectness(executionResult, expectedOutput);
                if (!correctness.outputMatches && correctness.confidence > 0.7) {
                    const devTimeWasted = this.calculateCorrectnessImpact(correctness.similarity);
                    categories.push({
                        type: 'logic',
                        subtype: 'logic_breakdown',
                        severity: this.getCorrectnessSeverity(correctness.similarity),
                        confidence: correctness.confidence,
                        description: 'Output does not match expected result, indicating logic errors',
                        evidence: [
                            {
                                type: 'output_comparison',
                                content: `Expected type: ${correctness.expectedType}, Actual type: ${correctness.actualType}`,
                                confidence: 0.9,
                            },
                            {
                                type: 'output_comparison',
                                content: `Similarity score: ${correctness.similarity.toFixed(2)}`,
                                confidence: 0.8,
                            },
                        ],
                        suggestedFix: 'Review algorithm logic and test with various input scenarios',
                        businessImpact: {
                            estimatedDevTimeWasted: devTimeWasted,
                            costMultiplier: 1.3,
                            qualityImpact: 25,
                            costOfHallucinations: devTimeWasted * 100,
                        },
                    });
                }
            }
            // 3. Analyze execution errors for logic issues
            const { errors } = executionResult;
            if (Array.isArray(errors) && errors.length > 0) {
                categories.push(...this.analyzeExecutionErrorsForLogic(errors));
            }
        }
        catch (error) {
            console.warn('Logic analysis failed:', error);
        }
        return categories;
    }

    /**
     * Analyze resource usage patterns.
     *
     * @param {object} resourceUsage - Reads `memoryMB`, `peakMemoryMB`,
     *   `executionTimeMs` and `cpuUsage`.
     * @returns {object} Each measurement next to its configured threshold
     *   and an `isExcessive` flag (strictly greater than the threshold).
     */
    analyzeResourceUsage(resourceUsage) {
        const { memoryThresholdMB, executionTimeThresholdMs, cpuUsageThreshold } = this.config;
        return {
            memoryUsage: {
                current: resourceUsage.memoryMB,
                // A missing or zero peak falls back to the current reading.
                peak: resourceUsage.peakMemoryMB || resourceUsage.memoryMB,
                threshold: memoryThresholdMB,
                isExcessive: resourceUsage.memoryMB > memoryThresholdMB,
            },
            executionTime: {
                actual: resourceUsage.executionTimeMs,
                threshold: executionTimeThresholdMs,
                isExcessive: resourceUsage.executionTimeMs > executionTimeThresholdMs,
            },
            cpuUsage: {
                percentage: resourceUsage.cpuUsage,
                threshold: cpuUsageThreshold,
                isExcessive: resourceUsage.cpuUsage > cpuUsageThreshold,
            },
        };
    }

    /**
     * Detect potential infinite loops in code.
     *
     * Only a `while True:` / `while 1:` loop whose own block contains neither
     * `break` nor `return` sets `hasInfiniteLoopRisk`. Conditions using
     * `!=`, `>=` or `<=`, and code containing two or more loop headers, only
     * raise `confidence`.
     *
     * @param {string} code - Source text to scan.
     * @returns {{hasInfiniteLoopRisk: boolean, loopComplexity: number,
     *   terminationConditions: string[], confidence: number}}
     */
    detectInfiniteLoops(code) {
        const analysis = {
            hasInfiniteLoopRisk: false,
            loopComplexity: 0,
            terminationConditions: [],
            confidence: 0,
        };
        // matchAll keeps each header's position, so textually identical loops
        // are analysed individually instead of all resolving to the first one.
        const whileLoops = [...code.matchAll(/\bwhile\s+([^:]+):/g)];
        const forLoops = code.match(/\bfor\s+[^:]+:/g) || [];
        analysis.loopComplexity = whileLoops.length + forLoops.length;
        // Check for dangerous while patterns
        for (const loop of whileLoops) {
            const rawCondition = loop[1];
            const condition = rawCondition.trim();
            // Check for "while True" without break
            if (condition === 'True' || condition === '1') {
                const loopBlock = this.extractLoopBlock(code, loop[0], loop.index);
                // Whole-word match so identifiers such as `breakpoint` do not count.
                if (/\b(?:break|return)\b/.test(loopBlock)) {
                    analysis.terminationConditions.push('break/return statement');
                }
                else {
                    analysis.hasInfiniteLoopRisk = true;
                    analysis.confidence = Math.max(analysis.confidence, 0.9);
                }
            }
            // Check for complex conditions that might not terminate
            if (['!=', '>=', '<='].some((operator) => rawCondition.includes(operator))) {
                analysis.confidence = Math.max(analysis.confidence, 0.6);
                analysis.terminationConditions.push(`condition: ${rawCondition}`);
            }
        }
        // Approximation of nesting: matches any loop header followed later by
        // another loop header, regardless of indentation.
        const nestedLoopPattern = /\b(?:for|while)\b[^:]*:[\s\S]*?\b(?:for|while)\b[^:]*:/g;
        const nestedLoops = code.match(nestedLoopPattern) || [];
        if (nestedLoops.length > 0) {
            analysis.loopComplexity += nestedLoops.length;
            analysis.confidence = Math.max(analysis.confidence, 0.5);
        }
        return analysis;
    }

    /**
     * Validate output correctness against expected results.
     *
     * The last line of the trimmed output is taken as the result. It is
     * JSON-parsed only when the expected value is not a string, so an
     * expected `'5'` is compared with the text `5` rather than the number.
     *
     * @param {object} executionResult - Reads `output`.
     * @param {*} expectedOutput - Value the code was expected to produce.
     * @returns {{outputMatches: boolean, similarity: number,
     *   expectedType: string, actualType: string, confidence: number}}
     *   `outputMatches` is true when `similarity` exceeds 0.9; `actualType`
     *   is `'null'` when there is no output.
     */
    validateOutputCorrectness(executionResult, expectedOutput) {
        const analysis = {
            outputMatches: false,
            similarity: 0,
            expectedType: typeof expectedOutput,
            actualType: 'undefined',
            confidence: 0.8,
        };
        const rawOutput = executionResult.output;
        if (!rawOutput) {
            analysis.actualType = 'null';
            return analysis;
        }
        // Non-string output is compared as-is.
        let actualOutput = rawOutput;
        if (typeof rawOutput === 'string') {
            const outputLines = rawOutput.trim().split('\n');
            actualOutput = outputLines[outputLines.length - 1];
            if (analysis.expectedType !== 'string') {
                try {
                    actualOutput = JSON.parse(actualOutput);
                }
                catch {
                    // Not JSON: keep the raw last line as a string.
                }
            }
        }
        analysis.actualType = typeof actualOutput;
        analysis.similarity = this.scoreOutputSimilarity(expectedOutput, actualOutput);
        analysis.outputMatches = analysis.similarity > 0.9;
        return analysis;
    }

    /**
     * Score how closely an actual value matches the expected one.
     *
     * Identical values, and objects/arrays with the same key-order-insensitive
     * JSON form, score 1.0. Otherwise a matching `typeof` contributes 0.3 and
     * the value comparison contributes up to 0.7 (two strings or two numbers)
     * or up to 0.5 (text comparison of anything else).
     *
     * @param {*} expected
     * @param {*} actual
     * @returns {number} Similarity between 0 and 1.
     */
    scoreOutputSimilarity(expected, actual) {
        if (Object.is(expected, actual)) {
            return 1.0;
        }
        const expectedType = typeof expected;
        const actualType = typeof actual;
        const typeBonus = expectedType === actualType ? 0.3 : 0;
        if (expectedType === 'string' && actualType === 'string') {
            return typeBonus + this.calculateStringSimilarity(expected, actual) * 0.7;
        }
        if (expectedType === 'number' && actualType === 'number') {
            const diff = Math.abs(expected - actual);
            const maxValue = Math.max(Math.abs(expected), Math.abs(actual), 1);
            const closeness = 1 - (diff / maxValue);
            // NaN or infinite operands yield a non-finite ratio; count as no match.
            return typeBonus + (Number.isFinite(closeness) ? Math.max(0, closeness) : 0) * 0.7;
        }
        const expectedText = this.serializeForComparison(expected);
        const actualText = this.serializeForComparison(actual);
        const bothObjects = expectedType === 'object' && actualType === 'object'
            && expected !== null && actual !== null;
        if (bothObjects && expectedText === actualText) {
            return 1.0;
        }
        // String() instead of .toString() so null/undefined cannot throw.
        return typeBonus + this.calculateStringSimilarity(expectedText, actualText) * 0.5;
    }

    /**
     * Convert a value to text for comparison.
     *
     * Objects and arrays become JSON with plain-object keys sorted, so key
     * order does not affect equality; everything else uses `String(value)`.
     *
     * @param {*} value
     * @returns {string}
     */
    serializeForComparison(value) {
        if (typeof value === 'string') {
            return value;
        }
        if (value !== null && typeof value === 'object') {
            try {
                const json = JSON.stringify(value, (_key, nested) => {
                    if (nested !== null && typeof nested === 'object' && !Array.isArray(nested)) {
                        const sortedEntries = Object.entries(nested)
                            .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
                        return Object.fromEntries(sortedEntries);
                    }
                    return nested;
                });
                if (json !== undefined) {
                    return json;
                }
            }
            catch {
                // Not serialisable (e.g. circular): fall back to String() below.
            }
        }
        return String(value);
    }

    /**
     * Analyze execution errors for logic-related issues.
     *
     * @param {object[]} errors - Each entry is read for `type`, `message`
     *   and `lineNumber`.
     * @returns {object[]} One category per error whose `type` is one of
     *   ZeroDivisionError, ValueError, AssertionError, LogicError or
     *   RuntimeError; other errors are ignored.
     */
    analyzeExecutionErrorsForLogic(errors) {
        const logicErrorTypes = [
            'ZeroDivisionError',
            'ValueError',
            'AssertionError',
            'LogicError',
            'RuntimeError',
        ];
        const impactBySeverity = {
            critical: { devTime: 4.0, costMultiplier: 2.0, qualityImpact: 50 },
            high: { devTime: 2.5, costMultiplier: 1.5, qualityImpact: 35 },
        };
        // Any severity other than critical/high uses this impact.
        const defaultImpact = { devTime: 1.5, costMultiplier: 1.2, qualityImpact: 20 };
        const categories = [];
        for (const error of errors) {
            if (!logicErrorTypes.includes(error.type)) {
                continue;
            }
            const severity = this.getLogicErrorSeverity(error.type);
            const impact = impactBySeverity[severity] || defaultImpact;
            categories.push({
                type: 'logic',
                subtype: error.type === 'ZeroDivisionError' ? 'logic_deviation' : 'logic_breakdown',
                severity,
                confidence: 0.9,
                description: `Logic error detected: ${error.type}`,
                evidence: [
                    {
                        type: 'execution_error',
                        content: error.message,
                        lineNumber: error.lineNumber,
                        confidence: 1.0,
                    },
                ],
                suggestedFix: this.getLogicErrorFix(error.type),
                businessImpact: {
                    estimatedDevTimeWasted: impact.devTime,
                    costMultiplier: impact.costMultiplier,
                    qualityImpact: impact.qualityImpact,
                    costOfHallucinations: impact.devTime * 100,
                },
            });
        }
        return categories;
    }

    /**
     * Extract a loop header together with its indented body.
     *
     * The body is every following non-blank line indented deeper than the
     * line holding the loop header; blank lines are skipped and the first
     * line at or below the header's indentation ends the block.
     *
     * @param {string} code - Full source text.
     * @param {string} loopStatement - Header text, used to locate the loop
     *   when `startIndex` is not given.
     * @param {number} [startIndex] - Offset of the header in `code`; pass it
     *   to select one occurrence among identical headers.
     * @returns {string} Header and body joined with newlines, or `''` when
     *   the loop cannot be located.
     */
    extractLoopBlock(code, loopStatement, startIndex) {
        const loopIndex = Number.isInteger(startIndex) ? startIndex : code.indexOf(loopStatement);
        if (loopIndex < 0 || loopIndex > code.length) {
            return '';
        }
        // Indentation is measured on the header's own line; the body is
        // whatever is indented deeper than that.
        const headerLineStart = code.lastIndexOf('\n', loopIndex - 1) + 1;
        const headerPrefix = code.slice(headerLineStart, loopIndex);
        const headerIndent = headerPrefix.length - headerPrefix.trimStart().length;
        const [headerLine, ...followingLines] = code.substring(loopIndex).split('\n');
        const loopBlock = [headerLine];
        for (const line of followingLines) {
            if (line.trim() === '') {
                continue;
            }
            const currentIndent = line.length - line.trimStart().length;
            if (currentIndent <= headerIndent) {
                break; // End of loop block
            }
            loopBlock.push(line);
        }
        return loopBlock.join('\n');
    }

    /**
     * Similarity of two strings based on edit distance.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number} 1.0 for identical strings, otherwise
     *   `(longerLength - editDistance) / longerLength`.
     */
    calculateStringSimilarity(str1, str2) {
        if (str1 === str2) {
            return 1.0;
        }
        const [longer, shorter] = str1.length > str2.length ? [str1, str2] : [str2, str1];
        if (longer.length === 0) {
            return 1.0;
        }
        const distance = this.levenshteinDistance(longer, shorter);
        return (longer.length - distance) / longer.length;
    }

    /**
     * Levenshtein edit distance (insert, delete, substitute each cost 1).
     *
     * Keeps only the previous and current rows of the distance table.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number}
     */
    levenshteinDistance(str1, str2) {
        let previousRow = Array.from({ length: str1.length + 1 }, (_, i) => i);
        for (let j = 1; j <= str2.length; j++) {
            const currentRow = [j];
            for (let i = 1; i <= str1.length; i++) {
                const substitutionCost = str1[i - 1] === str2[j - 1] ? 0 : 1;
                currentRow[i] = Math.min(
                    currentRow[i - 1] + 1,
                    previousRow[i] + 1,
                    previousRow[i - 1] + substitutionCost,
                );
            }
            previousRow = currentRow;
        }
        return previousRow[str1.length];
    }

    /**
     * Severity for a memory reading, using fixed cut-offs of 64, 128 and
     * 200 MB that do not depend on the configured threshold.
     *
     * @param {number} memoryMB
     * @returns {'critical'|'high'|'medium'|'low'}
     */
    getMemorySeverity(memoryMB) {
        if (memoryMB > 200) {
            return 'critical';
        }
        if (memoryMB > 128) {
            return 'high';
        }
        if (memoryMB > 64) {
            return 'medium';
        }
        return 'low';
    }

    /**
     * Severity for an execution time, using fixed cut-offs of 3000, 5000 and
     * 10000 ms that do not depend on the configured threshold.
     *
     * @param {number} timeMs
     * @returns {'critical'|'high'|'medium'|'low'}
     */
    getTimeSeverity(timeMs) {
        if (timeMs > 10000) {
            return 'critical';
        }
        if (timeMs > 5000) {
            return 'high';
        }
        if (timeMs > 3000) {
            return 'medium';
        }
        return 'low';
    }

    /**
     * Severity for an output similarity score; lower similarity is worse.
     *
     * @param {number} similarity
     * @returns {'critical'|'high'|'medium'|'low'}
     */
    getCorrectnessSeverity(similarity) {
        if (similarity < 0.3) {
            return 'critical';
        }
        if (similarity < 0.5) {
            return 'high';
        }
        if (similarity < 0.7) {
            return 'medium';
        }
        return 'low';
    }

    /**
     * Severity assigned to a logic error type; unlisted types are 'medium'.
     *
     * @param {string} errorType
     * @returns {string}
     */
    getLogicErrorSeverity(errorType) {
        const severityMap = {
            'ZeroDivisionError': 'high',
            'ValueError': 'medium',
            'AssertionError': 'high',
            'LogicError': 'critical',
            'RuntimeError': 'medium',
        };
        return severityMap[errorType] || 'medium';
    }

    /**
     * Suggested fix text for a logic error type, with a generic fallback.
     *
     * @param {string} errorType
     * @returns {string}
     */
    getLogicErrorFix(errorType) {
        const fixes = {
            'ZeroDivisionError': 'Add zero division check before division operations',
            'ValueError': 'Validate input values and add proper error handling',
            'AssertionError': 'Review assertion conditions and fix logic',
            'LogicError': 'Restructure the algorithm logic',
            'RuntimeError': 'Add proper error handling and resource management',
        };
        return fixes[errorType] || 'Review and fix the logic error';
    }

    /**
     * Impact figure for a memory reading; reported by callers as
     * `estimatedDevTimeWasted`.
     *
     * @param {number} memoryMB
     * @returns {number}
     */
    calculateMemoryImpact(memoryMB) {
        if (memoryMB > 200) {
            return 4.0;
        }
        if (memoryMB > 128) {
            return 2.5;
        }
        if (memoryMB > 64) {
            return 1.5;
        }
        return 0.5;
    }

    /**
     * Impact figure for an execution time; reported by callers as
     * `estimatedDevTimeWasted`.
     *
     * @param {number} timeMs
     * @returns {number}
     */
    calculateTimeImpact(timeMs) {
        if (timeMs > 10000) {
            return 3.5;
        }
        if (timeMs > 5000) {
            return 2.0;
        }
        if (timeMs > 3000) {
            return 1.0;
        }
        return 0.5;
    }

    /**
     * Impact figure for an output similarity score; reported by callers as
     * `estimatedDevTimeWasted`.
     *
     * @param {number} similarity
     * @returns {number}
     */
    calculateCorrectnessImpact(similarity) {
        if (similarity < 0.3) {
            return 4.0;
        }
        if (similarity < 0.5) {
            return 3.0;
        }
        if (similarity < 0.7) {
            return 2.0;
        }
        return 1.0;
    }
}
518
/**
 * Factory for {@link ExecutionBasedDetector}.
 *
 * @param {object} [config] - Passed unchanged to the detector constructor.
 * @returns {ExecutionBasedDetector} A newly constructed detector.
 */
export function createExecutionBasedDetector(config) {
    const detector = new ExecutionBasedDetector(config);
    return detector;
}
524
/**
 * Utility function for quick execution-based analysis.
 *
 * Creates a detector from `config`, executes `code` in that detector's own
 * sandbox, and runs the resource and logic analyses on the result. Using the
 * detector's sandbox (rather than a separately constructed default one)
 * makes the limits derived from `config` apply to the execution itself.
 *
 * @param {string} code - Source text to execute and analyse.
 * @param {*} [expectedOutput] - Expected result; `undefined` skips the
 *   output correctness check.
 * @param {object} [config] - Detector configuration overrides.
 * @returns {Promise<{executionResult: object, resourceHallucinations: object[],
 *   logicHallucinations: object[]}>}
 */
export async function analyzeExecutionForHallucinations(code, expectedOutput, config) {
    const detector = createExecutionBasedDetector(config);
    const executionResult = await detector.sandbox.execute(code);
    const resourceHallucinations = await detector.detectResourceHallucinations(executionResult);
    const logicHallucinations = await detector.detectLogicHallucinations(code, executionResult, expectedOutput);
    return {
        executionResult,
        resourceHallucinations,
        logicHallucinations,
    };
}