erosolar-cli 1.7.90 → 1.7.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/erosolar.js +47 -0
- package/dist/bin/erosolar.js.map +1 -1
- package/dist/core/agent.d.ts +16 -0
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +57 -0
- package/dist/core/agent.js.map +1 -1
- package/dist/core/isolatedVerifier.d.ts +36 -14
- package/dist/core/isolatedVerifier.d.ts.map +1 -1
- package/dist/core/isolatedVerifier.js +111 -583
- package/dist/core/isolatedVerifier.js.map +1 -1
- package/dist/shell/bracketedPasteManager.d.ts.map +1 -1
- package/dist/shell/bracketedPasteManager.js +5 -13
- package/dist/shell/bracketedPasteManager.js.map +1 -1
- package/dist/shell/interactiveShell.d.ts.map +1 -1
- package/dist/shell/interactiveShell.js +25 -23
- package/dist/shell/interactiveShell.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,601 +1,129 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
1
|
/**
|
|
3
|
-
* Isolated Verification
|
|
2
|
+
* Isolated Verification System
|
|
4
3
|
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
* outputs results via stdout.
|
|
8
|
-
*
|
|
9
|
-
* This ensures verification is completely isolated from the main CLI process:
|
|
10
|
-
* - Separate memory space
|
|
11
|
-
* - Separate event loop
|
|
12
|
-
* - Independent error handling
|
|
13
|
-
* - No shared state with main process
|
|
14
|
-
*
|
|
15
|
-
* @license MIT
|
|
4
|
+
* Provides runtime isolation for verification tasks by running them in separate processes.
|
|
5
|
+
* This ensures that verification failures don't affect the main CLI process.
|
|
16
6
|
*/
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
---
|
|
31
|
-
|
|
32
|
-
CONTEXT: {CONTEXT}
|
|
33
|
-
WORKING_DIR: {WORKING_DIR}
|
|
34
|
-
|
|
35
|
-
For each claim, determine:
|
|
36
|
-
1. What specific assertion is being made
|
|
37
|
-
2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
|
|
38
|
-
3. How it can be verified (shell command, file check, etc.)
|
|
39
|
-
4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)
|
|
40
|
-
|
|
41
|
-
Return JSON array:
|
|
42
|
-
[{
|
|
43
|
-
"id": "c1",
|
|
44
|
-
"statement": "the specific claim",
|
|
45
|
-
"category": "file_op|code|command|state|behavior|fact",
|
|
46
|
-
"verifiable": true,
|
|
47
|
-
"priority": "critical|high|medium|low",
|
|
48
|
-
"context": {"path": "/path/if/relevant", "command": "if relevant"}
|
|
49
|
-
}]
|
|
50
|
-
|
|
51
|
-
Output ONLY valid JSON array.`;
|
|
52
|
-
async function extractClaims(response, workingDir, conversationHistory, llmQuery) {
|
|
53
|
-
// Try LLM-based extraction first
|
|
54
|
-
try {
|
|
55
|
-
const prompt = EXTRACT_CLAIMS_PROMPT
|
|
56
|
-
.replace('{RESPONSE}', response.slice(0, 8000))
|
|
57
|
-
.replace('{CONTEXT}', conversationHistory.slice(-3).join('\n') || '')
|
|
58
|
-
.replace('{WORKING_DIR}', workingDir);
|
|
59
|
-
const result = await llmQuery(prompt);
|
|
60
|
-
const match = result.match(/\[[\s\S]*\]/);
|
|
61
|
-
if (match) {
|
|
62
|
-
const claims = JSON.parse(match[0]);
|
|
63
|
-
if (claims.length > 0) {
|
|
64
|
-
return claims;
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
catch (err) {
|
|
69
|
-
// LLM extraction failed, fall through to pattern-based
|
|
70
|
-
process.stderr.write(`LLM claim extraction failed: ${err instanceof Error ? err.message : 'unknown'}\n`);
|
|
71
|
-
}
|
|
72
|
-
// Fallback: Pattern-based claim extraction
|
|
73
|
-
return extractClaimsWithPatterns(response, workingDir);
|
|
74
|
-
}
|
|
75
|
-
/**
|
|
76
|
-
* Fallback pattern-based claim extraction when LLM is unavailable
|
|
77
|
-
*/
|
|
78
|
-
function extractClaimsWithPatterns(response, workingDir) {
|
|
79
|
-
const claims = [];
|
|
80
|
-
let claimId = 1;
|
|
81
|
-
// Pattern 1: File creation/modification claims
|
|
82
|
-
// "created src/foo.ts", "wrote to file.js", "Updated package.json"
|
|
83
|
-
const filePatterns = [
|
|
84
|
-
/(?:created|wrote|updated|modified|edited|added)\s+(?:file\s+)?[`"']?([\/\w\-\.]+\.\w+)[`"']?/gi,
|
|
85
|
-
/(?:src|dist|lib|test)\/[\w\-\/]+\.\w+/g,
|
|
86
|
-
];
|
|
87
|
-
for (const pattern of filePatterns) {
|
|
88
|
-
const matches = response.matchAll(pattern);
|
|
89
|
-
for (const match of matches) {
|
|
90
|
-
const filePath = match[1] || match[0];
|
|
91
|
-
if (filePath && !claims.some(c => c.context['path'] === filePath)) {
|
|
92
|
-
claims.push({
|
|
93
|
-
id: `c${claimId++}`,
|
|
94
|
-
statement: `File exists: ${filePath}`,
|
|
95
|
-
category: 'file_op',
|
|
96
|
-
verifiable: true,
|
|
97
|
-
priority: 'high',
|
|
98
|
-
context: { path: filePath.startsWith('/') ? filePath : `${workingDir}/${filePath}` }
|
|
99
|
-
});
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
// Pattern 2: Test success claims
|
|
104
|
-
// "all tests pass", "12/12 tests successful", "tests passing"
|
|
105
|
-
const testPatterns = [
|
|
106
|
-
/(\d+)\/\1\s+tests?\s+(?:pass|success)/i,
|
|
107
|
-
/all\s+tests?\s+pass/i,
|
|
108
|
-
/tests?\s+(?:are\s+)?passing/i,
|
|
109
|
-
/✅.*tests?\s+pass/i,
|
|
110
|
-
];
|
|
111
|
-
for (const pattern of testPatterns) {
|
|
112
|
-
if (pattern.test(response)) {
|
|
113
|
-
claims.push({
|
|
114
|
-
id: `c${claimId++}`,
|
|
115
|
-
statement: 'All tests are passing',
|
|
116
|
-
category: 'code',
|
|
117
|
-
verifiable: true,
|
|
118
|
-
priority: 'critical',
|
|
119
|
-
context: { command: 'npm test' }
|
|
120
|
-
});
|
|
121
|
-
break;
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
// Pattern 3: Build success claims
|
|
125
|
-
// "build succeeded", "compiled successfully"
|
|
126
|
-
const buildPatterns = [
|
|
127
|
-
/build\s+succeed/i,
|
|
128
|
-
/compil(?:ed|ation)\s+success/i,
|
|
129
|
-
/✅.*build/i,
|
|
130
|
-
];
|
|
131
|
-
for (const pattern of buildPatterns) {
|
|
132
|
-
if (pattern.test(response)) {
|
|
133
|
-
claims.push({
|
|
134
|
-
id: `c${claimId++}`,
|
|
135
|
-
statement: 'Build succeeds',
|
|
136
|
-
category: 'code',
|
|
137
|
-
verifiable: true,
|
|
138
|
-
priority: 'critical',
|
|
139
|
-
context: { command: 'npm run build' }
|
|
140
|
-
});
|
|
141
|
-
break;
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
// Pattern 4: Implementation complete claims
|
|
145
|
-
// "implemented", "created", "added feature"
|
|
146
|
-
const implementPatterns = [
|
|
147
|
-
/(?:successfully\s+)?implement(?:ed|ation)/i,
|
|
148
|
-
/feature\s+(?:is\s+)?(?:now\s+)?(?:complete|ready|working)/i,
|
|
149
|
-
/integration\s+ready/i,
|
|
150
|
-
/refactor\s+(?:successfully\s+)?completed?/i,
|
|
151
|
-
/delivered/i,
|
|
152
|
-
];
|
|
153
|
-
for (const pattern of implementPatterns) {
|
|
154
|
-
if (pattern.test(response)) {
|
|
155
|
-
claims.push({
|
|
156
|
-
id: `c${claimId++}`,
|
|
157
|
-
statement: 'Implementation is complete and working',
|
|
158
|
-
category: 'behavior',
|
|
159
|
-
verifiable: true,
|
|
160
|
-
priority: 'high',
|
|
161
|
-
context: {}
|
|
162
|
-
});
|
|
163
|
-
break;
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
// Pattern 5: Paste/input handling claims
|
|
167
|
-
// "paste", "multi-line input", "input handling"
|
|
168
|
-
const pastePatterns = [
|
|
169
|
-
/(?:paste|pasted|pasting).*(?:work|handle|process)/i,
|
|
170
|
-
/multi[\s-]?line\s+(?:paste|input)/i,
|
|
171
|
-
/(?:block|chunk)\s+description/i,
|
|
172
|
-
/input\s+handling/i,
|
|
173
|
-
/graceful(?:ly)?\s+(?:handle|sent|submit)/i,
|
|
174
|
-
];
|
|
175
|
-
for (const pattern of pastePatterns) {
|
|
176
|
-
if (pattern.test(response)) {
|
|
177
|
-
claims.push({
|
|
178
|
-
id: `c${claimId++}`,
|
|
179
|
-
statement: 'Paste handling feature works correctly',
|
|
180
|
-
category: 'cli_behavior',
|
|
181
|
-
verifiable: true,
|
|
182
|
-
priority: 'high',
|
|
183
|
-
context: {
|
|
184
|
-
testType: 'cli_interactive',
|
|
185
|
-
feature: 'paste_handling'
|
|
186
|
-
}
|
|
7
|
+
export class IsolatedVerifier {
|
|
8
|
+
/**
|
|
9
|
+
* Verify task completion in an isolated process
|
|
10
|
+
*/
|
|
11
|
+
async verifyTaskCompletion(taskName, config) {
|
|
12
|
+
const startTime = Date.now();
|
|
13
|
+
try {
|
|
14
|
+
const scriptPath = await this.createVerificationScript(config);
|
|
15
|
+
const { exec } = await import('node:child_process');
|
|
16
|
+
const { promisify } = await import('node:util');
|
|
17
|
+
const execAsync = promisify(exec);
|
|
18
|
+
const result = await execAsync(`node "${scriptPath}"`, {
|
|
19
|
+
timeout: 30000
|
|
187
20
|
});
|
|
188
|
-
|
|
21
|
+
const duration = Date.now() - startTime;
|
|
22
|
+
return {
|
|
23
|
+
success: true,
|
|
24
|
+
report: this.generateReport(taskName, config, result.stdout, result.stderr, duration),
|
|
25
|
+
duration
|
|
26
|
+
};
|
|
189
27
|
}
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
id: `c${claimId++}`,
|
|
199
|
-
statement: `Referenced file exists: ${filePath}`,
|
|
200
|
-
category: 'file_op',
|
|
201
|
-
verifiable: true,
|
|
202
|
-
priority: 'medium',
|
|
203
|
-
context: { path: `${workingDir}/${filePath}` }
|
|
204
|
-
});
|
|
28
|
+
catch (error) {
|
|
29
|
+
const duration = Date.now() - startTime;
|
|
30
|
+
return {
|
|
31
|
+
success: false,
|
|
32
|
+
report: this.generateErrorReport(taskName, config, error, duration),
|
|
33
|
+
error: error.message,
|
|
34
|
+
duration
|
|
35
|
+
};
|
|
205
36
|
}
|
|
206
37
|
}
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
const
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
38
|
+
/**
|
|
39
|
+
* Create a verification script that runs in isolation
|
|
40
|
+
*/
|
|
41
|
+
async createVerificationScript(config) {
|
|
42
|
+
const fs = await import('node:fs');
|
|
43
|
+
const os = await import('node:os');
|
|
44
|
+
const path = await import('node:path');
|
|
45
|
+
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'isolated-verification-'));
|
|
46
|
+
const scriptPath = path.join(tempDir, 'verification.js');
|
|
47
|
+
const scriptContent = this.generateVerificationScript(config);
|
|
48
|
+
fs.writeFileSync(scriptPath, scriptContent, 'utf8');
|
|
49
|
+
return scriptPath;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Generate the verification script content
|
|
53
|
+
*/
|
|
54
|
+
generateVerificationScript(config) {
|
|
55
|
+
const fileChecks = config.expectedFiles.map(file => ` if (!require('fs').existsSync("${file}")) {
|
|
56
|
+
console.error('MISSING FILE: ${file}');
|
|
57
|
+
process.exit(1);
|
|
58
|
+
}`).join('\n');
|
|
59
|
+
const commandExecutions = config.verificationCommands.map(cmd => ` try {
|
|
60
|
+
const { execSync } = require('child_process');
|
|
61
|
+
execSync("${cmd}", { stdio: 'inherit' });
|
|
62
|
+
} catch (error) {
|
|
63
|
+
console.error('COMMAND FAILED: ${cmd}');
|
|
64
|
+
console.error(error.message);
|
|
65
|
+
process.exit(1);
|
|
66
|
+
}`).join('\n');
|
|
67
|
+
return `#!/usr/bin/env node
|
|
68
|
+
/**
|
|
69
|
+
* Isolated Verification Script
|
|
70
|
+
* Generated by Erosolar CLI
|
|
71
|
+
*/
|
|
216
72
|
|
|
217
|
-
|
|
218
|
-
|
|
73
|
+
console.log('ISOLATED VERIFICATION REPORT');
|
|
74
|
+
console.log('=============================');
|
|
75
|
+
console.log('Task: ${config.taskDescription}');
|
|
76
|
+
console.log('');
|
|
219
77
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
- Expected output patterns
|
|
78
|
+
// File existence checks
|
|
79
|
+
${fileChecks}
|
|
223
80
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
"id": "test-1",
|
|
227
|
-
"description": "what we're testing",
|
|
228
|
-
"shellCommands": ["ls -la path", "cat file"],
|
|
229
|
-
"expectedOutputs": ["pattern1", "pattern2"],
|
|
230
|
-
"expectedBehavior": "description for LLM assessment",
|
|
231
|
-
"timeout": 30000
|
|
232
|
-
}]
|
|
81
|
+
// Command executions
|
|
82
|
+
${commandExecutions}
|
|
233
83
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
catch {
|
|
251
|
-
// Fall through to basic tests
|
|
252
|
-
}
|
|
253
|
-
// Fallback: generate basic tests for all verifiable claims
|
|
254
|
-
return generateBasicTests(claims, workingDir);
|
|
255
|
-
}
|
|
256
|
-
/**
|
|
257
|
-
* Generate basic tests without LLM assistance
|
|
258
|
-
*/
|
|
259
|
-
function generateBasicTests(claims, workingDir) {
|
|
260
|
-
const tests = [];
|
|
261
|
-
for (const claim of claims.filter(c => c.verifiable)) {
|
|
262
|
-
const test = {
|
|
263
|
-
id: `test-${tests.length + 1}`,
|
|
264
|
-
description: claim.statement,
|
|
265
|
-
shellCommands: [],
|
|
266
|
-
expectedOutputs: [],
|
|
267
|
-
timeout: 30000
|
|
268
|
-
};
|
|
269
|
-
switch (claim.category) {
|
|
270
|
-
case 'file_op': {
|
|
271
|
-
const filePath = claim.context['path'];
|
|
272
|
-
if (filePath) {
|
|
273
|
-
// Check if file exists
|
|
274
|
-
test.shellCommands = [`test -f "${filePath}" && echo "FILE_EXISTS" || echo "FILE_NOT_FOUND"`];
|
|
275
|
-
test.expectedOutputs = ['FILE_EXISTS'];
|
|
276
|
-
}
|
|
277
|
-
break;
|
|
278
|
-
}
|
|
279
|
-
case 'code': {
|
|
280
|
-
const command = claim.context['command'];
|
|
281
|
-
if (command === 'npm test') {
|
|
282
|
-
test.shellCommands = [`cd "${workingDir}" && npm test 2>&1 | tail -20`];
|
|
283
|
-
test.expectedOutputs = ['pass', 'passing', '0 fail'];
|
|
284
|
-
}
|
|
285
|
-
else if (command === 'npm run build') {
|
|
286
|
-
test.shellCommands = [`cd "${workingDir}" && npm run build 2>&1`];
|
|
287
|
-
test.expectedOutputs = ['postbuild', 'success']; // tsc typically outputs nothing on success
|
|
288
|
-
}
|
|
289
|
-
else {
|
|
290
|
-
test.shellCommands = [`cd "${workingDir}" && npm run build 2>&1 | tail -10`];
|
|
291
|
-
}
|
|
292
|
-
break;
|
|
293
|
-
}
|
|
294
|
-
case 'behavior': {
|
|
295
|
-
// For behavior claims, check that build passes as a proxy
|
|
296
|
-
test.shellCommands = [`cd "${workingDir}" && npm run build 2>&1 | tail -5`];
|
|
297
|
-
test.expectedOutputs = ['postbuild'];
|
|
298
|
-
break;
|
|
299
|
-
}
|
|
300
|
-
case 'cli_behavior': {
|
|
301
|
-
// For CLI behavior claims, run relevant tests based on feature
|
|
302
|
-
const feature = claim.context['feature'];
|
|
303
|
-
if (feature === 'paste_handling') {
|
|
304
|
-
// Run paste-specific tests via npm test with filter
|
|
305
|
-
test.shellCommands = [
|
|
306
|
-
// First run any paste/input related tests
|
|
307
|
-
`cd "${workingDir}" && npm test -- --testPathPattern="(paste|input|multiLine)" --passWithNoTests 2>&1 | tail -30`,
|
|
308
|
-
// Also run the RobustInputProcessor tests specifically
|
|
309
|
-
`cd "${workingDir}" && npm test -- --testPathPattern="robustInputProcessor" --passWithNoTests 2>&1 | tail -20`,
|
|
310
|
-
];
|
|
311
|
-
test.expectedOutputs = ['pass', 'passing'];
|
|
312
|
-
test.timeout = 60000;
|
|
313
|
-
test.expectedBehavior = 'Paste handling tests pass and verify multi-line input is processed correctly';
|
|
314
|
-
}
|
|
315
|
-
else {
|
|
316
|
-
// Generic CLI behavior - run build
|
|
317
|
-
test.shellCommands = [`cd "${workingDir}" && npm run build 2>&1 | tail -5`];
|
|
318
|
-
test.expectedOutputs = ['postbuild'];
|
|
319
|
-
}
|
|
320
|
-
break;
|
|
321
|
-
}
|
|
322
|
-
default: {
|
|
323
|
-
// Generic check - just verify the project builds
|
|
324
|
-
test.shellCommands = [`cd "${workingDir}" && npm run build 2>&1 | tail -5`];
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
if (test.shellCommands && test.shellCommands.length > 0) {
|
|
328
|
-
tests.push(test);
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
return tests.slice(0, 5); // Limit to 5 tests for performance
|
|
332
|
-
}
|
|
333
|
-
// ============================================================================
|
|
334
|
-
// TEST EXECUTION
|
|
335
|
-
// ============================================================================
|
|
336
|
-
/**
|
|
337
|
-
* Run PTY-based CLI verification for behavior claims
|
|
338
|
-
* @internal - Reserved for future use when PTY-based verification is needed
|
|
339
|
-
*/
|
|
340
|
-
async function _runCLIBehaviorTest(claim, cwd) {
|
|
341
|
-
const feature = claim.context['feature'] || 'generic';
|
|
342
|
-
try {
|
|
343
|
-
const verification = await runVerificationTests(feature, cwd);
|
|
344
|
-
// Convert PTY test results to our TestResult format
|
|
345
|
-
const test = {
|
|
346
|
-
id: `cli-behavior-${feature}`,
|
|
347
|
-
description: claim.statement,
|
|
348
|
-
timeout: 60000,
|
|
349
|
-
};
|
|
350
|
-
const result = {
|
|
351
|
-
test,
|
|
352
|
-
success: verification.passed,
|
|
353
|
-
output: verification.results.map(r => r.output).join('\n---\n'),
|
|
354
|
-
errors: verification.results.flatMap(r => r.errors).join('\n'),
|
|
355
|
-
matchedPatterns: verification.passed ? [verification.summary] : [],
|
|
356
|
-
unmatchedPatterns: verification.passed ? [] : [verification.summary],
|
|
357
|
-
llmAssessment: `PTY Test: ${verification.summary}`,
|
|
358
|
-
};
|
|
359
|
-
return result;
|
|
360
|
-
}
|
|
361
|
-
catch (err) {
|
|
362
|
-
const test = {
|
|
363
|
-
id: `cli-behavior-${feature}`,
|
|
364
|
-
description: claim.statement,
|
|
365
|
-
};
|
|
366
|
-
return {
|
|
367
|
-
test,
|
|
368
|
-
success: false,
|
|
369
|
-
output: '',
|
|
370
|
-
errors: err instanceof Error ? err.message : String(err),
|
|
371
|
-
matchedPatterns: [],
|
|
372
|
-
unmatchedPatterns: ['PTY test failed to run'],
|
|
373
|
-
};
|
|
374
|
-
}
|
|
375
|
-
}
|
|
376
|
-
async function runShellCommand(cmd, cwd) {
|
|
377
|
-
// Safety check - block dangerous commands
|
|
378
|
-
const dangerous = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i];
|
|
379
|
-
for (const p of dangerous) {
|
|
380
|
-
if (p.test(cmd))
|
|
381
|
-
return { ok: false, out: `Blocked dangerous command: ${p.source}` };
|
|
382
|
-
}
|
|
383
|
-
try {
|
|
384
|
-
const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
|
|
385
|
-
return { ok: true, out: stdout + stderr };
|
|
386
|
-
}
|
|
387
|
-
catch (e) {
|
|
388
|
-
return { ok: false, out: e instanceof Error ? e.message : 'Command failed' };
|
|
389
|
-
}
|
|
390
|
-
}
|
|
391
|
-
async function runTest(test, cwd, llmQuery) {
|
|
392
|
-
const result = {
|
|
393
|
-
test,
|
|
394
|
-
success: false,
|
|
395
|
-
output: '',
|
|
396
|
-
errors: '',
|
|
397
|
-
matchedPatterns: [],
|
|
398
|
-
unmatchedPatterns: []
|
|
399
|
-
};
|
|
400
|
-
try {
|
|
401
|
-
// Run shell commands
|
|
402
|
-
if (test.shellCommands && test.shellCommands.length > 0) {
|
|
403
|
-
for (const cmd of test.shellCommands) {
|
|
404
|
-
const shellResult = await runShellCommand(cmd, cwd);
|
|
405
|
-
result.output += `$ ${cmd}\n${shellResult.out}\n`;
|
|
406
|
-
if (!shellResult.ok) {
|
|
407
|
-
result.errors += `${shellResult.out}\n`;
|
|
408
|
-
}
|
|
409
|
-
}
|
|
410
|
-
}
|
|
411
|
-
// Check expected output patterns
|
|
412
|
-
if (test.expectedOutputs) {
|
|
413
|
-
for (const pattern of test.expectedOutputs) {
|
|
414
|
-
if (result.output.includes(pattern) || new RegExp(pattern, 'i').test(result.output)) {
|
|
415
|
-
result.matchedPatterns.push(pattern);
|
|
416
|
-
}
|
|
417
|
-
else {
|
|
418
|
-
result.unmatchedPatterns.push(pattern);
|
|
419
|
-
}
|
|
420
|
-
}
|
|
421
|
-
}
|
|
422
|
-
// LLM assessment of behavior
|
|
423
|
-
if (test.expectedBehavior) {
|
|
424
|
-
const assessPrompt = `Assess if this output demonstrates the expected behavior.
|
|
84
|
+
console.log('');
|
|
85
|
+
console.log('VERIFICATION COMPLETED SUCCESSFULLY');
|
|
86
|
+
`;
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Generate a success report
|
|
90
|
+
*/
|
|
91
|
+
generateReport(taskName, config, stdout, stderr, duration) {
|
|
92
|
+
return `ISOLATED VERIFICATION REPORT
|
|
93
|
+
=============================
|
|
94
|
+
Task: ${taskName}
|
|
95
|
+
Description: ${config.taskDescription}
|
|
96
|
+
Duration: ${duration}ms
|
|
97
|
+
Status: SUCCESS
|
|
425
98
|
|
|
426
|
-
|
|
99
|
+
VERIFICATION STEPS:
|
|
100
|
+
${config.verificationCommands.map((cmd, i) => ` ${i + 1}. ${cmd}`).join('\n')}
|
|
427
101
|
|
|
428
102
|
OUTPUT:
|
|
429
|
-
|
|
430
|
-
${
|
|
431
|
-
|
|
103
|
+
${stdout}
|
|
104
|
+
${stderr ? `ERRORS:\n${stderr}` : ''}
|
|
105
|
+
`;
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Generate an error report
|
|
109
|
+
*/
|
|
110
|
+
generateErrorReport(taskName, config, error, duration) {
|
|
111
|
+
return `ISOLATED VERIFICATION REPORT
|
|
112
|
+
=============================
|
|
113
|
+
Task: ${taskName}
|
|
114
|
+
Description: ${config.taskDescription}
|
|
115
|
+
Duration: ${duration}ms
|
|
116
|
+
Status: FAILED
|
|
432
117
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
}
|
|
443
|
-
else {
|
|
444
|
-
result.matchedPatterns.push(`behavior: ${test.expectedBehavior}`);
|
|
445
|
-
}
|
|
446
|
-
}
|
|
447
|
-
}
|
|
448
|
-
catch {
|
|
449
|
-
result.llmAssessment = 'LLM assessment failed';
|
|
450
|
-
}
|
|
451
|
-
}
|
|
452
|
-
// Determine success
|
|
453
|
-
result.success = result.unmatchedPatterns.length === 0 &&
|
|
454
|
-
(result.matchedPatterns.length > 0 || (!test.expectedOutputs?.length && !test.expectedBehavior));
|
|
455
|
-
}
|
|
456
|
-
catch (err) {
|
|
457
|
-
result.errors = err instanceof Error ? err.message : 'Unknown error';
|
|
458
|
-
}
|
|
459
|
-
return result;
|
|
460
|
-
}
|
|
461
|
-
// ============================================================================
|
|
462
|
-
// MAIN VERIFICATION
|
|
463
|
-
// ============================================================================
|
|
464
|
-
async function verify(request) {
|
|
465
|
-
const timestamp = new Date().toISOString();
|
|
466
|
-
const id = `verify-${Date.now()}`;
|
|
467
|
-
// Create LLM query function using isolated provider
|
|
468
|
-
const provider = createProvider({
|
|
469
|
-
provider: request.provider,
|
|
470
|
-
model: request.model,
|
|
471
|
-
});
|
|
472
|
-
const llmQuery = async (prompt) => {
|
|
473
|
-
const response = await provider.generate([{ role: 'user', content: prompt }], [] // No tools for verification queries
|
|
474
|
-
);
|
|
475
|
-
if (response.type === 'message' && response.content) {
|
|
476
|
-
return response.content;
|
|
477
|
-
}
|
|
478
|
-
return '';
|
|
479
|
-
};
|
|
480
|
-
// Extract claims
|
|
481
|
-
const claims = await extractClaims(request.response, request.workingDirectory, request.conversationHistory, llmQuery);
|
|
482
|
-
if (claims.length === 0) {
|
|
483
|
-
return {
|
|
484
|
-
responseId: id,
|
|
485
|
-
timestamp,
|
|
486
|
-
claims: [],
|
|
487
|
-
results: [],
|
|
488
|
-
summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
|
|
489
|
-
overallVerdict: 'unverified',
|
|
490
|
-
trustScore: 50
|
|
491
|
-
};
|
|
492
|
-
}
|
|
493
|
-
// Generate tests for non-CLI claims
|
|
494
|
-
const nonCliBehaviorClaims = claims.filter(c => c.category !== 'cli_behavior');
|
|
495
|
-
const cliBehaviorClaims = claims.filter(c => c.category === 'cli_behavior');
|
|
496
|
-
const tests = await generateTests(nonCliBehaviorClaims, request.workingDirectory, llmQuery);
|
|
497
|
-
// Run shell-based tests for non-CLI claims
|
|
498
|
-
const testResults = [];
|
|
499
|
-
for (const test of tests) {
|
|
500
|
-
const result = await runTest(test, request.workingDirectory, llmQuery);
|
|
501
|
-
testResults.push(result);
|
|
502
|
-
}
|
|
503
|
-
// Run PTY-based tests for CLI behavior claims
|
|
504
|
-
for (const claim of cliBehaviorClaims) {
|
|
505
|
-
const result = await _runCLIBehaviorTest(claim, request.workingDirectory);
|
|
506
|
-
testResults.push(result);
|
|
507
|
-
}
|
|
508
|
-
// Rebuild claims array to match test results order
|
|
509
|
-
const orderedClaims = [...nonCliBehaviorClaims, ...cliBehaviorClaims];
|
|
510
|
-
// Map results to claims
|
|
511
|
-
const results = orderedClaims.map((claim, i) => {
|
|
512
|
-
const testResult = testResults[i];
|
|
513
|
-
if (!testResult) {
|
|
514
|
-
return {
|
|
515
|
-
claim,
|
|
516
|
-
verified: false,
|
|
517
|
-
confidence: 'low',
|
|
518
|
-
evidence: 'No test generated',
|
|
519
|
-
method: 'skip',
|
|
520
|
-
timestamp
|
|
521
|
-
};
|
|
522
|
-
}
|
|
523
|
-
// Determine method based on claim category
|
|
524
|
-
const method = claim.category === 'cli_behavior' ? 'pty-runtime' : 'isolated-process';
|
|
525
|
-
return {
|
|
526
|
-
claim,
|
|
527
|
-
verified: testResult.success,
|
|
528
|
-
confidence: testResult.success ? 'high' : (testResult.matchedPatterns.length > 0 ? 'medium' : 'low'),
|
|
529
|
-
evidence: testResult.success
|
|
530
|
-
? `Verified: ${testResult.matchedPatterns.join(', ')}`
|
|
531
|
-
: `Failed: ${testResult.unmatchedPatterns.join(', ')}`,
|
|
532
|
-
method,
|
|
533
|
-
reasoning: testResult.llmAssessment,
|
|
534
|
-
executedCode: (testResult.test.shellCommands || []).join('\n'),
|
|
535
|
-
rawOutput: testResult.output.slice(0, 2000),
|
|
536
|
-
error: testResult.errors || undefined,
|
|
537
|
-
timestamp
|
|
538
|
-
};
|
|
539
|
-
});
|
|
540
|
-
// Calculate summary
|
|
541
|
-
const verified = results.filter(r => r.verified).length;
|
|
542
|
-
const failed = results.filter(r => !r.verified && r.confidence === 'high').length;
|
|
543
|
-
const inconclusive = results.filter(r => !r.verified && r.confidence !== 'high').length;
|
|
544
|
-
// Determine verdict
|
|
545
|
-
let overallVerdict;
|
|
546
|
-
if (failed > 0) {
|
|
547
|
-
overallVerdict = 'contradicted';
|
|
548
|
-
}
|
|
549
|
-
else if (verified === claims.length && claims.length > 0) {
|
|
550
|
-
overallVerdict = 'verified';
|
|
551
|
-
}
|
|
552
|
-
else if (verified > 0) {
|
|
553
|
-
overallVerdict = 'partially_verified';
|
|
554
|
-
}
|
|
555
|
-
else {
|
|
556
|
-
overallVerdict = 'unverified';
|
|
557
|
-
}
|
|
558
|
-
const trustScore = claims.length > 0
|
|
559
|
-
? Math.round((verified / claims.length) * 100)
|
|
560
|
-
: 50;
|
|
561
|
-
return {
|
|
562
|
-
responseId: id,
|
|
563
|
-
timestamp,
|
|
564
|
-
claims,
|
|
565
|
-
results,
|
|
566
|
-
summary: { total: claims.length, verified, failed, inconclusive },
|
|
567
|
-
overallVerdict,
|
|
568
|
-
trustScore
|
|
569
|
-
};
|
|
570
|
-
}
|
|
571
|
-
// ============================================================================
|
|
572
|
-
// PROCESS ENTRY POINT
|
|
573
|
-
// ============================================================================
|
|
574
|
-
async function main() {
|
|
575
|
-
// Read request from stdin
|
|
576
|
-
let input = '';
|
|
577
|
-
process.stdin.setEncoding('utf8');
|
|
578
|
-
for await (const chunk of process.stdin) {
|
|
579
|
-
input += chunk;
|
|
580
|
-
}
|
|
581
|
-
try {
|
|
582
|
-
const request = JSON.parse(input);
|
|
583
|
-
if (request.type !== 'verify') {
|
|
584
|
-
throw new Error(`Unknown request type: ${request.type}`);
|
|
585
|
-
}
|
|
586
|
-
const report = await verify(request);
|
|
587
|
-
// Output result as JSON to stdout
|
|
588
|
-
process.stdout.write(JSON.stringify(report));
|
|
589
|
-
process.exit(0);
|
|
590
|
-
}
|
|
591
|
-
catch (error) {
|
|
592
|
-
// Output error as JSON
|
|
593
|
-
process.stdout.write(JSON.stringify({
|
|
594
|
-
error: true,
|
|
595
|
-
message: error instanceof Error ? error.message : 'Unknown error'
|
|
596
|
-
}));
|
|
597
|
-
process.exit(1);
|
|
118
|
+
VERIFICATION STEPS:
|
|
119
|
+
${config.verificationCommands.map((cmd, i) => ` ${i + 1}. ${cmd}`).join('\n')}
|
|
120
|
+
|
|
121
|
+
ERROR:
|
|
122
|
+
${error.message}
|
|
123
|
+
|
|
124
|
+
STACK TRACE:
|
|
125
|
+
${error.stack}
|
|
126
|
+
`;
|
|
598
127
|
}
|
|
599
128
|
}
|
|
600
|
-
main();
|
|
601
129
|
//# sourceMappingURL=isolatedVerifier.js.map
|