workflow-ai 1.0.62 → 1.0.63
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -0
- package/agent-templates/CLAUDE.md.tpl +2 -0
- package/agent-templates/QWEN.md.tpl +2 -0
- package/package.json +2 -1
- package/src/init.mjs +5 -4
- package/src/lib/agent-spawner.mjs +338 -0
- package/src/runner.mjs +15 -14
- package/src/scripts/get-next-test-id.js +94 -0
- package/src/scripts/migrate-backlog-to-tests.js +406 -0
- package/src/scripts/run-skill-tests.js +1491 -0
- package/src/scripts/scan-fixtures-for-secrets.js +248 -0
- package/src/scripts/tests/timeout-cascade.test.js +28 -0
- package/templates/plan-template.md +1 -0
|
@@ -0,0 +1,1491 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import fs from 'fs';
|
|
5
|
+
import crypto from 'crypto';
|
|
6
|
+
import { fileURLToPath } from 'url';
|
|
7
|
+
import { spawn } from 'child_process';
|
|
8
|
+
import YAML from '../lib/js-yaml.mjs';
|
|
9
|
+
import { findProjectRoot } from '../lib/find-root.mjs';
|
|
10
|
+
import { spawnAgent } from '../lib/agent-spawner.mjs';
|
|
11
|
+
|
|
12
|
+
// ESM equivalents of CommonJS __filename/__dirname for this script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Project root resolved once at module load; all relative path lookups below hang off it.
const projectRoot = findProjectRoot(process.cwd());
|
|
15
|
+
|
|
16
|
+
/**
 * Parse process.argv into the runner's option object.
 *
 * Flags: --calibrate, --all, --primary-only, --skip-secret-scan, --fast,
 * --yes, --establish-baseline (booleans); --skill, --case, --tag, --layer,
 * --relevant, --baseline-ref, --agent, --pipeline, --severity (take a value).
 * Unknown arguments are ignored. Value flags are only honored when a next
 * argument exists.
 *
 * @returns {object} options with every key initialized (null/false defaults)
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const opts = {
    skill: null,
    caseId: null,
    tag: null,
    layer: null,
    relevant: null,
    all: false,
    agent: null,
    primaryOnly: false,
    skipSecretScan: false,
    fast: false,
    yes: false,
    baselineRef: null,
    // Bug fix: previously missing from the defaults, so opts.pipeline was
    // undefined (not null) when --pipeline was not passed.
    pipeline: null,
    establishBaseline: false,
    calibrate: false,
    severity: null
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === '--calibrate') {
      opts.calibrate = true;
    } else if (arg === '--skill' && args[i + 1]) {
      opts.skill = args[i + 1];
      i++;
    } else if (arg === '--case' && args[i + 1]) {
      opts.caseId = args[i + 1];
      i++;
    } else if (arg === '--tag' && args[i + 1]) {
      opts.tag = args[i + 1];
      i++;
    } else if (arg === '--layer' && args[i + 1]) {
      opts.layer = args[i + 1];
      i++;
    } else if (arg === '--relevant' && args[i + 1]) {
      opts.relevant = args[i + 1];
      i++;
    } else if (arg === '--baseline-ref' && args[i + 1]) {
      opts.baselineRef = args[i + 1];
      i++;
    } else if (arg === '--all') {
      opts.all = true;
    } else if (arg === '--agent' && args[i + 1]) {
      opts.agent = args[i + 1];
      i++;
    } else if (arg === '--primary-only') {
      opts.primaryOnly = true;
    } else if (arg === '--skip-secret-scan') {
      opts.skipSecretScan = true;
    } else if (arg === '--fast') {
      opts.fast = true;
    } else if (arg === '--yes') {
      opts.yes = true;
    } else if (arg === '--establish-baseline') {
      opts.establishBaseline = true;
    } else if (arg === '--pipeline' && args[i + 1]) {
      opts.pipeline = args[i + 1];
      i++;
    } else if (arg === '--severity' && args[i + 1]) {
      opts.severity = args[i + 1];
      i++;
    }
  }

  return opts;
}
|
|
84
|
+
|
|
85
|
+
/** Absolute path of the skills directory: <projectRoot>/src/skills. */
function findSkillsDir() {
  const segments = ['src', 'skills'];
  return path.join(projectRoot, ...segments);
}
|
|
88
|
+
|
|
89
|
+
/** Absolute path of a skill's tests directory: <skillsDir>/<skillName>/tests. */
function findSkillTestsDir(skillName) {
  const testsPath = path.join(findSkillsDir(), skillName, 'tests');
  return testsPath;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Load and parse a skill's tests/index.yaml.
 * @throws {Error} when the index file does not exist.
 */
function loadIndexYaml(skillName) {
  const indexPath = path.join(findSkillTestsDir(skillName), 'index.yaml');
  if (!fs.existsSync(indexPath)) {
    throw new Error(`index.yaml not found for skill: ${skillName}`);
  }
  return YAML.load(fs.readFileSync(indexPath, 'utf8'));
}
|
|
104
|
+
|
|
105
|
+
/**
 * Resolve the git ref to compare against: an explicit override wins,
 * then the skill's index.yaml `baseline_ref`, then 'origin/main'.
 */
function getBaselineRef(skillName, explicitRef) {
  if (explicitRef) return explicitRef;
  const { baseline_ref } = loadIndexYaml(skillName);
  return baseline_ref || 'origin/main';
}
|
|
113
|
+
|
|
114
|
+
/**
 * Read a file's content at a given git ref (`git show ref:path`).
 *
 * Resolves to the file content as a string, or null when the path does not
 * exist at that ref. Rejects on other git failures or spawn errors.
 *
 * When TEST_GIT_MOCK is set it must point at a JSON fixture; results are
 * served from it instead of invoking git. Fixture keys are "ref:path"
 * (forward slashes); an optional `__error` map of key → message makes the
 * lookup reject.
 *
 * @param {string} baselineRef - git ref (branch, tag, SHA)
 * @param {string} filePath - repo-relative path
 * @returns {Promise<string|null>}
 */
function gitShow(baselineRef, filePath) {
  if (process.env.TEST_GIT_MOCK) {
    return new Promise((resolve, reject) => {
      let mocks;
      try {
        mocks = JSON.parse(fs.readFileSync(process.env.TEST_GIT_MOCK, 'utf8'));
      } catch (e) {
        // Unreadable/invalid fixture behaves like "file absent at ref".
        resolve(null);
        return;
      }
      // Normalize the path for cross-platform use (Windows uses \, mocks use /).
      const normalizedPath = filePath.replace(/\\/g, '/');
      const key = `${baselineRef}:${normalizedPath}`;
      if (mocks[key]) {
        resolve(mocks[key]);
      } else if (mocks.__error && mocks.__error[key]) {
        // Bug fix: this used to `throw` inside a try/catch whose handler did
        // resolve(null), so error mocks silently looked like missing files.
        // Error mocks now reject as intended.
        reject(new Error(mocks.__error[key]));
      } else {
        resolve(null);
      }
    });
  }

  return new Promise((resolve, reject) => {
    const proc = spawn('git', ['show', `${baselineRef}:${filePath}`], {
      cwd: projectRoot,
      stdio: ['ignore', 'pipe', 'pipe']
    });

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (data) => { stdout += data; });
    proc.stderr.on('data', (data) => { stderr += data; });

    proc.on('close', (code) => {
      if (code === 0) {
        resolve(stdout);
      } else if (stderr.includes('does not exist') || code === 128) {
        // Missing path at the ref is an expected "no baseline" case, not an error.
        resolve(null);
      } else {
        reject(new Error(`git show failed: ${stderr}`));
      }
    });

    proc.on('error', (err) => {
      reject(err);
    });
  });
}
|
|
162
|
+
|
|
163
|
+
/**
 * Load a test case's current/meta.json as it existed at the baseline git ref.
 * Returns the parsed object, or null when the file is absent or not valid JSON.
 */
async function loadBaselineMeta(skillName, caseId, baselineRef) {
  const metaPath = path.join(
    'src', 'skills', skillName, 'tests', 'cases', caseId, 'current', 'meta.json'
  );

  const raw = await gitShow(baselineRef, metaPath);
  if (!raw) return null;

  try {
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
179
|
+
|
|
180
|
+
/**
 * Compare this run's per-case statuses against the statuses recorded at the
 * baseline git ref (via each case's current/meta.json at that ref).
 *
 * @param {string} skillName
 * @param {Array<{id:string}>} cases - case definitions to check
 * @param {string} baselineRef - git ref to read baseline meta from
 * @param {Object<string,string>} currentRunStatuses - caseId → status from THIS run
 * @returns {Promise<{comparison: object, mode: 'no-regression'|'no-baseline'}>}
 *   comparison holds green/red transition counters plus new_cases;
 *   mode is 'no-baseline' when no case had baseline history at the ref.
 */
async function analyzeGitHeadComparison(skillName, cases, baselineRef, currentRunStatuses = {}) {
  // NOTE(review): looks like a leftover debug trace on stderr — confirm it should stay.
  console.error(`[DEBUG] analyzeGitHeadComparison called`);
  console.log(`[Runner] analyzeGitHeadComparison called with ${cases.length} cases, skillName=${skillName}`);

  const comparison = {
    previously_green: 0,
    previously_green_still_green: 0,
    previously_green_now_red: 0,
    previously_red: 0,
    previously_red_still_red: 0,
    previously_red_now_green: 0,
    new_cases: 0
  };

  // Flips to true as soon as any case has meta.json at the baseline ref.
  let hasBaselineHistory = false;

  console.log(`[Runner] Starting to iterate ${cases.length} cases`);
  for (const caseDef of cases) {
    console.log(`[Runner] Checking case ${caseDef.id} for git history`);
    let baselineMeta = null;
    try {
      baselineMeta = await loadBaselineMeta(skillName, caseDef.id, baselineRef);
      console.log(`[Runner] loadBaselineMeta result for ${caseDef.id}:`, baselineMeta ? 'found' : 'not found');

      if (!baselineMeta) {
        // No meta at the baseline ref: case did not exist there.
        comparison.new_cases++;
        continue;
      }

      hasBaselineHistory = true;

      const prevStatus = baselineMeta.status;
      // Use the current status from memory (this run's result), not from disk
      const currentStatus = currentRunStatuses[caseDef.id] || 'unknown';

      // 'unknown' current statuses fall through both branches and are not
      // counted as still_green/now_red etc. — only as previously_green/red.
      if (prevStatus === 'passed') {
        comparison.previously_green++;
        if (currentStatus === 'passed') {
          comparison.previously_green_still_green++;
        } else if (currentStatus === 'failed' || currentStatus === 'error') {
          comparison.previously_green_now_red++;
        }
      } else if (prevStatus === 'failed' || prevStatus === 'error') {
        comparison.previously_red++;
        if (currentStatus === 'failed' || currentStatus === 'error') {
          comparison.previously_red_still_red++;
        } else if (currentStatus === 'passed') {
          comparison.previously_red_now_green++;
        }
      }
    } catch (err) {
      // Baseline lookup errors abort the whole comparison (logged, then rethrown).
      console.error(`[Runner] Error loading baseline meta for ${caseDef.id}:`, err.message);
      throw err;
    }
  }

  const mode = hasBaselineHistory ? 'no-regression' : 'no-baseline';
  // NOTE(review): cases_checked sums ALL counters, including overlapping ones
  // (previously_green AND previously_green_still_green), so it can exceed
  // cases.length — confirm whether that is the intended metric.
  console.log(`[Runner] analyzeGitHeadComparison: hasBaselineHistory=${hasBaselineHistory}, mode=${mode}, cases_checked=${Object.keys(comparison).reduce((sum, key) => sum + (comparison[key] || 0), 0)}`);

  return { comparison, mode };
}
|
|
241
|
+
|
|
242
|
+
/**
 * Derive the run verdict from the baseline comparison, in priority order:
 * a non-passing relevant case, then any green→red regression, then the
 * no-baseline outcome (established vs. failed), else ready for review.
 */
function computeVerdict(comparison, mode, relevantCaseStatus, establishBaseline) {
  const relevantFailed = relevantCaseStatus !== null && relevantCaseStatus !== 'passed';
  if (relevantFailed) {
    return 'relevant_case_failed';
  }

  if (comparison.previously_green_now_red > 0) {
    return 'regression_detected';
  }

  if (mode === 'no-baseline') {
    return establishBaseline ? 'baseline_established' : 'no_baseline_failures';
  }

  return 'ready_for_user_review';
}
|
|
264
|
+
|
|
265
|
+
/**
 * Render a one-line human-readable summary of a run result:
 * verdict, mode, green/red transitions, new cases, and (when present)
 * the relevant case's status.
 */
function generateOutcomeMessage(result) {
  const { verdict, comparison, mode, relevantCase } = result;

  const parts = [`Verdict: ${verdict}. `];

  parts.push(
    mode === 'no-baseline'
      ? `Mode: no-baseline (no baseline history found). `
      : `Mode: no-regression. `
  );

  parts.push(`Green→Red: ${comparison.previously_green_now_red}/${comparison.previously_green}. `);
  parts.push(`Red→Green: ${comparison.previously_red_now_green}/${comparison.previously_red}. `);
  parts.push(`New cases: ${comparison.new_cases}.`);

  if (relevantCase) {
    parts.push(` Relevant case (${relevantCase.id}): ${relevantCase.status}.`);
  }

  return parts.join('');
}
|
|
286
|
+
|
|
287
|
+
/**
 * Locate pipeline.yaml. An explicit override path must exist or we throw;
 * otherwise the project's .workflow/config/ copy wins over the package-level
 * configs/ copy (one directory above the project root).
 * @throws {Error} when no candidate exists.
 */
function resolvePipelineYaml(overridePath = null) {
  if (overridePath) {
    const resolved = path.resolve(overridePath);
    if (!fs.existsSync(resolved)) {
      throw new Error(`Pipeline not found: ${overridePath}`);
    }
    return resolved;
  }

  const rootDir = findProjectRoot(process.cwd());
  const candidates = [
    path.join(rootDir, '.workflow', 'config', 'pipeline.yaml'),
    path.join(path.dirname(rootDir), 'configs', 'pipeline.yaml')
  ];

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) {
      return candidate;
    }
  }

  throw new Error('pipeline.yaml not found in .workflow/config/ or configs/');
}
|
|
311
|
+
|
|
312
|
+
/**
 * Load the resolved pipeline.yaml and return its `pipeline` section
 * (falling back to the whole document when that key is absent).
 */
function loadPipelineConfig(pipelinePath = null) {
  const resolvedPath = resolvePipelineYaml(pipelinePath);
  const parsed = YAML.load(fs.readFileSync(resolvedPath, 'utf8'));
  console.log(`[Runner] Using pipeline.yaml: ${resolvedPath}`);
  return parsed.pipeline || parsed;
}
|
|
319
|
+
|
|
320
|
+
/**
 * Ensure every requested agent id is declared in pipeline.yaml → agents[].
 * @returns {true} when all ids are known.
 * @throws {Error} listing the unknown ids otherwise.
 */
function validateAgents(agentIds, pipelineConfig) {
  const known = new Set(Object.keys(pipelineConfig.agents || {}));
  const invalid = agentIds.filter((agentId) => !known.has(agentId));

  if (invalid.length > 0) {
    throw new Error(`Agent(s) '${invalid.join(', ')}' from target_agents[] not found in pipeline.yaml → agents[]`);
  }

  return true;
}
|
|
336
|
+
|
|
337
|
+
/**
 * Load and parse a single test-case YAML file from a skill's tests dir.
 * @throws {Error} when the case file does not exist.
 */
function loadTestCase(skillName, caseFile) {
  const casePath = path.join(findSkillTestsDir(skillName), caseFile);
  if (!fs.existsSync(casePath)) {
    throw new Error(`Test case not found: ${casePath}`);
  }
  return YAML.load(fs.readFileSync(casePath, 'utf8'));
}
|
|
348
|
+
|
|
349
|
+
/** Keep only cases whose tags include `tag`; a falsy tag disables filtering. */
function filterCasesByTag(cases, tag) {
  if (!tag) {
    return cases;
  }
  return cases.filter((testCase) => Boolean(testCase.tags) && testCase.tags.includes(tag));
}
|
|
353
|
+
|
|
354
|
+
/** Keep only cases with an exactly matching severity; falsy severity disables filtering. */
function filterCasesBySeverity(cases, severity) {
  if (!severity) {
    return cases;
  }
  const kept = [];
  for (const testCase of cases) {
    if (testCase.severity === severity) {
      kept.push(testCase);
    }
  }
  return kept;
}
|
|
358
|
+
|
|
359
|
+
/**
 * Scan the skills directory and return the names of all skill subdirectories
 * that contain a tests/index.yaml. Unreadable entries are skipped silently
 * (best-effort scan).
 */
function getAllSkillNamesWithTests() {
  const skillsDir = findSkillsDir();
  const names = [];

  for (const entry of fs.readdirSync(skillsDir)) {
    const entryPath = path.join(skillsDir, entry);
    try {
      if (!fs.statSync(entryPath).isDirectory()) {
        continue;
      }
      if (fs.existsSync(path.join(entryPath, 'tests', 'index.yaml'))) {
        names.push(entry);
      }
    } catch (e) {
      // ignore entries we cannot stat
    }
  }

  return names;
}
|
|
379
|
+
|
|
380
|
+
/**
 * Run the fixture secret scanner (scan-fixtures-for-secrets.js) as a child
 * process before L2 evaluation.
 *
 * Resolves to { passed: true } on a clean scan, or
 * { passed: false, output } when secrets were detected. Never rejects.
 */
function runSecretScan() {
  return new Promise((resolve) => {
    const scannerPath = path.join(projectRoot, 'src', 'scripts', 'scan-fixtures-for-secrets.js');
    console.log('[Runner] Running secret scan before L2...');

    // Run the scanner with the same Node binary as this process.
    const proc = spawn(process.execPath, [scannerPath], {
      cwd: projectRoot,
      stdio: ['ignore', 'pipe', 'pipe']
    });

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (data) => { stdout += data; });
    proc.stderr.on('data', (data) => { stderr += data; });

    proc.on('close', (code) => {
      // Exit code 0 OR an explicit "status: passed" marker in stdout counts as clean.
      if (code === 0 || stdout.includes('status: passed')) {
        console.log('[Runner] Secret scan passed');
        resolve({ passed: true });
      } else {
        console.log('[Runner] Secret scan FAILED - secrets detected:');
        console.log(stdout);
        if (stderr) console.error(stderr);
        resolve({ passed: false, output: stdout });
      }
    });

    proc.on('error', (err) => {
      // NOTE(review): fail-open — if the scanner itself cannot start, the run
      // proceeds as if the scan passed. Confirm this is intended.
      console.error('[Runner] Secret scan error:', err.message);
      resolve({ passed: true });
    });
  });
}
|
|
414
|
+
|
|
415
|
+
/**
 * Run layer-0 (static) assertions: currently only 'skill_contains', which
 * case-insensitively regex-matches a pattern against a skill file
 * (default SKILL.md). Assertions of other kinds are silently ignored.
 * @returns {Array<object>} one result per executed assertion.
 */
function runL0Assertions(skillName, testCase) {
  const results = [];
  const staticAssertions = testCase.assertions?.static || [];

  for (const assertion of staticAssertions) {
    if (assertion.kind !== 'skill_contains') {
      continue;
    }

    const skillFile = path.join(findSkillsDir(), skillName, assertion.file || 'SKILL.md');

    if (!fs.existsSync(skillFile)) {
      results.push({
        passed: false,
        kind: assertion.kind,
        reason: assertion.reason,
        error: `Skill file not found: ${skillFile}`
      });
      continue;
    }

    const skillContent = fs.readFileSync(skillFile, 'utf8');
    const passed = new RegExp(assertion.pattern, 'i').test(skillContent);

    results.push({
      passed,
      kind: assertion.kind,
      reason: assertion.reason,
      pattern: assertion.pattern
    });
  }

  return results;
}
|
|
448
|
+
|
|
449
|
+
/**
 * Run layer-1 (deterministic) assertions against a target agent's raw output.
 *
 * Supported kinds: output_contains_all, output_matches (regex),
 * output_does_not_contain, output_yaml_shape (required top-level keys),
 * is_json. Unknown kinds produce a failed result.
 *
 * When there is no agent output (L2 was not run) and any assertion depends on
 * output, every assertion is reported as passed+skipped instead of evaluated.
 *
 * @param {string|null} output - raw agent output (null/empty when L2 skipped)
 * @param {object} testCase - case definition with assertions.deterministic[]
 * @returns {Array<object>} one result per assertion
 */
function runL1Assertions(output, testCase) {
  const assertions = testCase.assertions?.deterministic || [];
  const results = [];

  // Every kind that needs real agent output to be meaningful.
  // Bug fix: 'is_json' was missing from this list, so with no output it was
  // evaluated against null instead of being skipped like its peers.
  const outputDependentKinds = ['output_contains_all', 'output_matches', 'output_does_not_contain', 'output_yaml_shape', 'is_json'];
  if (!output && assertions.some(a => outputDependentKinds.includes(a.kind))) {
    return assertions.map(a => ({
      passed: true,
      skipped: true,
      kind: a.kind,
      reason: 'No agent output available (L2 not run)'
    }));
  }

  for (const assertion of assertions) {
    if (assertion.kind === 'output_contains_all') {
      // Every listed value must appear verbatim in the output.
      const missing = [];
      for (const val of assertion.values || []) {
        if (!output.includes(val)) {
          missing.push(val);
        }
      }
      results.push({
        passed: missing.length === 0,
        kind: assertion.kind,
        missing,
        values: assertion.values
      });
    } else if (assertion.kind === 'output_matches') {
      const regex = new RegExp(assertion.regex);
      const matches = regex.test(output);
      results.push({
        passed: matches,
        kind: assertion.kind,
        regex: assertion.regex
      });
    } else if (assertion.kind === 'output_does_not_contain') {
      // None of the listed values may appear in the output.
      const found = [];
      for (const val of assertion.values || []) {
        if (output.includes(val)) {
          found.push(val);
        }
      }
      results.push({
        passed: found.length === 0,
        kind: assertion.kind,
        found,
        values: assertion.values
      });
    } else if (assertion.kind === 'output_yaml_shape') {
      // Output must parse as YAML and define every required top-level key.
      try {
        const parsed = YAML.load(output);
        const hasKeys = assertion.required_keys?.every(k => parsed && typeof parsed[k] !== 'undefined');
        results.push({
          passed: hasKeys,
          kind: assertion.kind,
          required_keys: assertion.required_keys
        });
      } catch (e) {
        results.push({
          passed: false,
          kind: assertion.kind,
          error: e.message
        });
      }
    } else if (assertion.kind === 'is_json') {
      // Output must be valid JSON (value is discarded).
      try {
        JSON.parse(output);
        results.push({
          passed: true,
          kind: assertion.kind
        });
      } catch (e) {
        results.push({
          passed: false,
          kind: assertion.kind,
          error: e.message
        });
      }
    } else {
      results.push({
        passed: false,
        kind: assertion.kind,
        error: `Unknown assertion kind: ${assertion.kind}`
      });
    }
  }

  return results;
}
|
|
539
|
+
|
|
540
|
+
/**
 * Short content hash (first 7 hex chars of sha256) of a skill's SKILL.md,
 * or 'unknown' when the file does not exist.
 */
function getSkillSha(skillName) {
  const skillFile = path.join(findSkillsDir(), skillName, 'SKILL.md');

  if (!fs.existsSync(skillFile)) {
    return 'unknown';
  }

  const digest = crypto
    .createHash('sha256')
    .update(fs.readFileSync(skillFile, 'utf8'))
    .digest('hex');
  return digest.slice(0, 7);
}
|
|
551
|
+
|
|
552
|
+
/** Create `dir` (and any missing parents) if it does not already exist. */
function ensureDir(dir) {
  if (fs.existsSync(dir)) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
|
|
557
|
+
|
|
558
|
+
/**
 * Read a rubric markdown file from the skill's tests/rubrics directory.
 * @throws {Error} when the rubric file does not exist.
 */
function loadRubric(skillName, rubricName) {
  const rubricsDir = path.join(findSkillsDir(), skillName, 'tests', 'rubrics');
  const rubricPath = path.join(rubricsDir, `${rubricName}.md`);

  if (!fs.existsSync(rubricPath)) {
    throw new Error(`Rubric not found: ${rubricPath}`);
  }

  return fs.readFileSync(rubricPath, 'utf8');
}
|
|
565
|
+
|
|
566
|
+
/**
 * Discover calibration fixture pairs for a skill's rubrics.
 *
 * Looks in tests/rubrics/calibration for <name>-good.md files; a rubric is
 * included only when the matching <name>-bad.md AND the rubric file
 * tests/rubrics/<name>.md both exist.
 *
 * @returns {Object<string,{good:string,bad:string,rubric:string}>} rubricName → file paths.
 *   Bug fix: previously returned [] (an array) when the calibration directory
 *   was absent but an object otherwise; now always returns an object.
 */
function findCalibrationFiles(skillName) {
  const rubricsRoot = path.join(findSkillsDir(), skillName, 'tests', 'rubrics');
  const calibrationDir = path.join(rubricsRoot, 'calibration');
  if (!fs.existsSync(calibrationDir)) {
    return {};
  }

  const calibrationMap = {};

  for (const file of fs.readdirSync(calibrationDir)) {
    const match = file.match(/^(.+)-good\.md$/);
    if (!match) continue;

    const rubricName = match[1];
    const goodPath = path.join(calibrationDir, file);
    const badPath = path.join(calibrationDir, `${rubricName}-bad.md`);
    const rubricPath = path.join(rubricsRoot, `${rubricName}.md`);

    // Only complete triples (good, bad, rubric) participate in calibration.
    if (fs.existsSync(badPath) && fs.existsSync(rubricPath)) {
      calibrationMap[rubricName] = {
        good: goodPath,
        bad: badPath,
        rubric: rubricPath
      };
    }
  }

  return calibrationMap;
}
|
|
595
|
+
|
|
596
|
+
/**
 * Extract the numeric pass threshold from rubric text, e.g. "score ≥ 4".
 * Bug fix / generalization: also accepts the ASCII form "score >= 4" —
 * previously only the Unicode '≥' glyph matched.
 * @returns {number} the threshold, defaulting to 4 when none is stated.
 */
function extractPassThreshold(rubricContent) {
  const match = rubricContent.match(/score\s*(?:≥|>=)\s*(\d+)/i);
  if (match) {
    return parseInt(match[1], 10);
  }
  return 4;
}
|
|
603
|
+
|
|
604
|
+
/**
 * Verify the judge agent is calibrated on one rubric: it must score the known
 * 'good' calibration fixture at or above the rubric's threshold and the known
 * 'bad' fixture below it.
 *
 * @param {string} skillName
 * @param {string} rubricName
 * @param {{good:string,bad:string,rubric:string}} calibrationFiles - fixture paths
 * @param {object} pipelineConfig - must contain agents[judgeAgentId]
 * @param {string} judgeAgentId
 * @returns {Promise<{rubricName,threshold,goodScore,badScore,goodPassed,badPassed}>}
 * @throws {Error} when the judge agent id is not in pipelineConfig.agents
 */
async function runCalibrationCheck(skillName, rubricName, calibrationFiles, pipelineConfig, judgeAgentId) {
  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
  if (!judgeAgentConfig) {
    throw new Error(`Judge agent not found: ${judgeAgentId}`);
  }

  const rubricContent = fs.readFileSync(calibrationFiles.rubric, 'utf8');
  const threshold = extractPassThreshold(rubricContent);

  const goodContent = fs.readFileSync(calibrationFiles.good, 'utf8');
  const badContent = fs.readFileSync(calibrationFiles.bad, 'utf8');

  // Prompt template the judge sees; the ---RESULT--- framing is what the
  // downstream parser looks for.
  const judgePrompt = (agentOutput, task) => `You are a judge evaluating the output of an AI agent.

## Rubric
${rubricContent}

## Target Agent Output
${agentOutput}

## Task
${task}

Please evaluate the output according to the rubric and provide a score from 1 to 5.
Output format:
---RESULT---
score: <number 1-5>
reason: <brief explanation>
---RESULT---`;

  // Pull just the agent-response section out of a calibration fixture.
  // The regex matches the Russian heading "## Ответ агента" ("agent response")
  // up to a terminating '---' line; falls back to the whole file when absent.
  const extractGoodResponse = (content) => {
    const match = content.match(/## Ответ агента[\s\S]*?^---$/m);
    return match ? match[0] : content;
  };

  const goodOutput = extractGoodResponse(goodContent);
  const badOutput = extractGoodResponse(badContent);

  // Judge both fixtures in parallel; each call capped at 60s.
  const [goodResult, badResult] = await Promise.all([
    spawnAgent(judgeAgentConfig, judgePrompt(goodOutput, 'Evaluate the good response'), { timeout: 60 }),
    spawnAgent(judgeAgentConfig, judgePrompt(badOutput, 'Evaluate the bad response'), { timeout: 60 })
  ]);

  // NOTE(review): when the judge output cannot be parsed, the score silently
  // defaults to the midpoint 3 — confirm that is the intended fallback.
  const goodScore = parseJudgeResult(goodResult.output)?.score || 3;
  const badScore = parseJudgeResult(badResult.output)?.score || 3;

  return {
    rubricName,
    threshold,
    goodScore,
    badScore,
    goodPassed: goodScore >= threshold,
    badPassed: badScore < threshold
  };
}
|
|
659
|
+
|
|
660
|
+
/**
 * Calibration gate: before real evaluation, confirm the configured judge
 * agent scores every rubric's known-good fixture above threshold and the
 * known-bad fixture below it. Skips (passing) when no judge agent or no
 * calibration files are configured; aborts on the first miscalibrated rubric.
 */
async function runCalibrationGate(skillName, pipelineConfig) {
  const judgeAgent = loadIndexYaml(skillName).execution?.judge_agent;
  if (!judgeAgent) {
    console.log('[Runner] No judge_agent configured, skipping calibration gate');
    return { passed: true, calibrations: [] };
  }

  const calibrationMap = findCalibrationFiles(skillName);
  if (Object.keys(calibrationMap).length === 0) {
    console.log('[Runner] No calibration files found, skipping calibration gate');
    return { passed: true, calibrations: [], warnings: ['calibration files absent'] };
  }

  const results = [];
  const warnings = [];

  // Log the abort and build the failure payload in one place.
  const fail = (message) => {
    console.error(`[Runner] ABORT: ${message}`);
    return { passed: false, calibrations: results, error: message };
  };

  for (const [rubricName, files] of Object.entries(calibrationMap)) {
    console.log(`[Runner] Calibrating rubric: ${rubricName}`);
    const result = await runCalibrationCheck(skillName, rubricName, files, pipelineConfig, judgeAgent);
    results.push(result);

    if (!result.goodPassed) {
      return fail(`judge miscalibrated — rubric '${rubricName}' requires fix (good score=${result.goodScore}, expected ≥${result.threshold})`);
    }
    if (!result.badPassed) {
      return fail(`judge miscalibrated — rubric '${rubricName}' requires fix (bad score=${result.badScore}, expected <${result.threshold})`);
    }

    console.log(`[Runner] ${rubricName}: good=${result.goodScore} (≥${result.threshold}), bad=${result.badScore} (<${result.threshold}) ✓`);
  }

  return { passed: true, calibrations: results, warnings };
}
|
|
705
|
+
|
|
706
|
+
/**
 * Persist one trial's raw agent output to
 * <skills>/<skill>/tests/cases/<caseId>/current/<agentId>/trial-<n>.md.
 *
 * Fixes: the file path was previously built with a hard-coded '/' inside a
 * template string (`${agentId}/trial-${trialNum}.md`) instead of path.join
 * segments, and the parent dir was ensured twice (recursive mkdir of the
 * agent dir already creates it).
 *
 * @returns {Promise<string>} the path of the written trial file.
 */
async function writeTrialOutput(skillName, caseId, agentId, trialNum, output) {
  const skillsDir = findSkillsDir();
  const agentDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current', agentId);
  ensureDir(agentDir);

  const trialFile = path.join(agentDir, `trial-${trialNum}.md`);
  fs.writeFileSync(trialFile, output, 'utf8');
  return trialFile;
}
|
|
718
|
+
|
|
719
|
+
/**
 * Write aggregated judge results to the case's current/judge.json.
 * Per-agent data is reduced to pass_count/total plus a trimmed trial list
 * ({trial, score, passed}); a timestamp is stamped at write time.
 */
async function writeJudgeResults(skillName, caseId, results) {
  const caseDir = path.join(findSkillsDir(), skillName, 'tests', 'cases', caseId, 'current');
  ensureDir(caseDir);

  const perModel = {};
  for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
    const trials = (modelData.trials || []).map(({ trial, score, passed }) => ({ trial, score, passed }));
    perModel[agentId] = {
      pass_count: modelData.pass_count,
      total: modelData.total,
      trials
    };
  }

  const judgeData = {
    per_model: perModel,
    rubric_scores: results.rubric_scores || [],
    timestamp: new Date().toISOString()
  };

  fs.writeFileSync(
    path.join(caseDir, 'judge.json'),
    JSON.stringify(judgeData, null, 2),
    'utf8'
  );
}
|
|
748
|
+
|
|
749
|
+
/**
 * Estimate LLM call volume/cost for a run and, unless --yes was passed,
 * prompt on stdin for confirmation. Exits the process when the user declines.
 *
 * @param {number} numCases
 * @param {number} numModels
 * @param {number} trials - trials per case per model
 * @param {number} [judgeAgentCost=0.02] - estimated $ per judge call
 * @param {number} [targetAgentCost=0.01] - estimated $ per target call
 * @returns {Promise<true>} resolves true when approved (or --yes given)
 */
async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
  const totalLlms = numCases * numModels * trials;
  // NOTE(review): judgeCalls and targetCalls each equal totalLlms, so the
  // actual number of LLM invocations is 2 * totalLlms while the messages
  // report totalLlms — confirm which figure "Estimated LLM calls" should show.
  const judgeCalls = numCases * numModels * trials;
  const targetCalls = numCases * numModels * trials;
  const estimatedCost = (judgeCalls * judgeAgentCost) + (targetCalls * targetAgentCost);

  console.log(`[Runner] Estimated LLM calls: ${totalLlms} (target: ${targetCalls}, judge: ${judgeCalls})`);
  console.log(`[Runner] Estimated cost: ~$${estimatedCost.toFixed(2)}`);

  // NOTE(review): reads --yes straight from process.argv rather than the
  // parsed opts object — works, but bypasses parseArgs; confirm intentional.
  if (!process.argv.includes('--yes')) {
    const readline = await import('readline');
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

    return new Promise((resolve) => {
      rl.question(`Estimated ${totalLlms} LLM calls ($${estimatedCost.toFixed(2)}). Continue? [y/N] `, (answer) => {
        rl.close();
        if (answer.toLowerCase() === 'y' || answer.toLowerCase() === 'yes') {
          resolve(true);
        } else {
          // Declining aborts the whole process, not just this promise.
          console.log('[Runner] Aborted by user');
          process.exit(0);
        }
      });
    });
  }

  return true;
}
|
|
777
|
+
|
|
778
|
+
/**
 * Run the L2 (LLM-as-judge) evaluation for a single test case.
 *
 * For each agent id in `targetAgents`, runs `trials` independent trials in
 * fixed-size batches of `concurrency`: the target agent is prompted with the
 * case scenario, then the judge agent scores the output against the case's
 * rubric on a 1-5 scale. A trial passes when its score is >= 4.
 *
 * @param {string}   skillName      - skill under test
 * @param {object}   testCase       - parsed test case definition (scenario, assertions, ...)
 * @param {object}   caseDef        - index.yaml case entry (reads .id and .file)
 * @param {string[]} targetAgents   - agent ids to evaluate
 * @param {string}   judgeAgentId   - agent id used as the judge
 * @param {object}   pipelineConfig - pipeline config (reads .agents map)
 * @param {object}   [options]      - { trials = 3, concurrency = 2, timeout = 300 }
 * @returns {Promise<object>} { per_model, rubric_scores, tokens }
 * @throws {Error} when the judge or a target agent id is missing from pipelineConfig.agents
 */
async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
  const { trials = 3, concurrency = 2, timeout = 300 } = options;

  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
  if (!judgeAgentConfig) {
    throw new Error(`Judge agent not found: ${judgeAgentId}`);
  }

  // Rubric selection: the first rubric assertion's rubric_file wins
  // (basename without the .md extension); otherwise use the 'default' rubric.
  let rubricName = 'default';
  if (testCase.assertions?.rubric && testCase.assertions.rubric.length > 0) {
    const rubricPath = testCase.assertions.rubric[0].rubric_file;
    if (rubricPath) {
      rubricName = path.basename(rubricPath, '.md');
    }
  }

  const rubric = loadRubric(skillName, rubricName);
  const results = {
    per_model: {},      // agentId -> { trials: [...], pass_count, total }
    rubric_scores: [],  // flat list of { agentId, trial, score }
    tokens: null        // NOTE(review): never populated in this function — confirm token accounting is wired up elsewhere
  };

  const caseId = caseDef?.id || 'unknown';

  // Assembles the prompt for the target agent from the case scenario:
  // optional system prompt file, then extra_instructions, then any file
  // inputs (each under a "## <as>" heading). Missing fixture files are
  // silently skipped. Falls back to testCase.prompt / testCase.input when
  // the scenario yields an empty prompt.
  function buildTargetPrompt() {
    let targetPrompt = '';
    const testsDir = findSkillTestsDir(skillName);
    const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';

    if (testCase.scenario?.system_prompt_file) {
      const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
      if (fs.existsSync(systemPromptPath)) {
        targetPrompt += fs.readFileSync(systemPromptPath, 'utf8') + '\n\n';
      }
    }

    if (testCase.scenario?.extra_instructions) {
      targetPrompt += testCase.scenario.extra_instructions + '\n\n';
    }

    if (testCase.scenario?.inputs) {
      for (const input of testCase.scenario.inputs) {
        if (input.kind === 'file') {
          const fixturePath = path.join(testsDir, caseDir, input.path);
          if (fs.existsSync(fixturePath)) {
            targetPrompt += `## ${input.as || 'Input'}\n`;
            targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
          }
        }
      }
    }

    if (!targetPrompt.trim()) {
      targetPrompt = testCase.prompt || testCase.input || '';
    }

    return targetPrompt;
  }

  for (const agentId of targetAgents) {
    const agentConfig = pipelineConfig.agents[agentId];
    if (!agentConfig) {
      throw new Error(`Target agent not found: ${agentId}`);
    }

    results.per_model[agentId] = {
      trials: [],
      pass_count: 0,
      total: trials
    };

    const tasks = [];
    for (let trial = 1; trial <= trials; trial++) {
      tasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
    }

    // Run trials in fixed-size batches of `concurrency`; each batch
    // completes fully before the next one starts.
    for (let i = 0; i < tasks.length; i += concurrency) {
      const batch = tasks.slice(i, i + concurrency);
      const batchResults = await Promise.all(
        batch.map(async (task) => {
          try {
            const targetPrompt = buildTargetPrompt();
            const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
              timeout,
              stageId: `${caseId}-${task.agentId}-trial-${task.trial}`
            });

            const judgePrompt = `You are a judge evaluating the output of an AI agent.

## Rubric
${rubric}

## Target Agent Output
${targetOutput.output || targetOutput.status || 'No output'}

## Task
${testCase.description || testCase.name || 'Evaluate the response'}

Please evaluate the output according to the rubric and provide a score from 1 to 5.
Output format:
---RESULT---
score: <number 1-5>
reason: <brief explanation>
---RESULT---`;

            // Judges get a shorter fixed 60s budget, independent of the
            // (possibly much longer) target-agent timeout.
            const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
              timeout: 60,
              stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
            });

            // Unparseable judge output defaults to 3, which is below the
            // pass threshold of 4, so the trial does not pass.
            let score = 3;
            const parsed = parseJudgeResult(judgeResult.output);
            if (parsed && parsed.score) {
              score = parsed.score;
            }

            await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');

            return {
              trial: task.trial,
              agentId: task.agentId,
              score,
              output: targetOutput.output || '',
              judge_output: judgeResult.output || '',
              passed: score >= 4
            };
          } catch (err) {
            // A failed trial (spawn error, timeout, ...) is recorded with
            // the minimum score instead of aborting the whole evaluation.
            console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
            return {
              trial: task.trial,
              agentId: task.agentId,
              score: 1,
              error: err.message,
              passed: false
            };
          }
        })
      );

      // Fold the batch into the per-model tallies and the flat score list.
      for (const result of batchResults) {
        results.per_model[result.agentId].trials.push(result);
        if (result.passed) {
          results.per_model[result.agentId].pass_count++;
        }
        results.rubric_scores.push({
          agentId: result.agentId,
          trial: result.trial,
          score: result.score
        });
      }
    }
  }

  return results;
}
|
|
934
|
+
|
|
935
|
+
/**
 * Parse the judge agent's free-form reply for the structured
 * "score: N" / "reason: ..." fields requested by the judge prompt.
 *
 * @param {string|null|undefined} output - raw judge output
 * @returns {{score: number, reason: string}|null} parsed result, or null
 *   when the output is empty or contains no score line.
 */
function parseJudgeResult(output) {
  if (!output) return null;

  const scoreMatch = output.match(/score:\s*(\d+)/i);
  const reasonMatch = output.match(/reason:\s*(.+)/i);

  if (!scoreMatch) {
    return null;
  }

  // The rubric scale is 1-5; clamp out-of-range judge output (e.g.
  // "score: 99") so a malformed reply cannot inflate the pass rate
  // (callers treat score >= 4 as a pass).
  const raw = Number.parseInt(scoreMatch[1], 10);
  const score = Math.min(5, Math.max(1, raw));

  return {
    score,
    reason: reasonMatch ? reasonMatch[1].trim() : ''
  };
}
|
|
950
|
+
|
|
951
|
+
/**
 * Fold per-trial L2 results into a per-model pass/fail verdict.
 *
 * Policy: aggregate mode 'all' requires every trial to pass; any other mode
 * requires a majority (ceil(total / 2)). Mode 'auto' (the default) behaves
 * like 'all' for critical-severity cases and like majority otherwise.
 *
 * @param {object} results  - output of runL2Evaluation (reads .per_model)
 * @param {object} testCase - case definition (reads .aggregate, .severity)
 * @returns {{per_model: object, overall_passed: boolean}}
 */
function aggregateResults(results, testCase) {
  const mode = testCase.aggregate || 'auto';
  const severity = testCase.severity || 'normal';
  const requireAll = mode === 'all' || (mode === 'auto' && severity === 'critical');

  const perModel = {};
  for (const [agentId, data] of Object.entries(results.per_model)) {
    const { pass_count: passCount, total } = data;
    const majority = Math.ceil(total / 2);
    perModel[agentId] = {
      passed: requireAll ? passCount === total : passCount >= majority,
      pass_count: passCount,
      total,
      threshold: requireAll ? total : majority
    };
  }

  return {
    per_model: perModel,
    // Vacuously true when there are no models — matches Array.every on [].
    overall_passed: Object.values(perModel).every((m) => m.passed)
  };
}
|
|
989
|
+
|
|
990
|
+
/**
 * Persist the outcome of a single test case to
 * <skills>/<skillName>/tests/cases/<caseId>/current/meta.json.
 *
 * @param {string} caseId       - test case id (directory name)
 * @param {string} skillName    - skill under test
 * @param {string} status       - 'passed' | 'failed' | 'error'
 * @param {number} durationMs   - wall-clock duration of the case run
 * @param {object|null} l2Results  - optional runL2Evaluation output
 * @param {boolean|null} l1_skipped - flag recorded when any L1 assertion was skipped
 */
async function writeMetaJson(caseId, skillName, status, durationMs, l2Results = null, l1_skipped = null) {
  const targetDir = path.join(findSkillsDir(), skillName, 'tests', 'cases', caseId, 'current');
  ensureDir(targetDir);

  const meta = {
    date: new Date().toISOString(),
    skill_sha: getSkillSha(skillName),
    status,
    duration_ms: durationMs
  };

  if (l1_skipped) {
    meta.l1_skipped = true;
  }

  if (l2Results) {
    // Re-aggregate with an empty test case, i.e. 'auto' mode + 'normal'
    // severity — a majority threshold per model.
    const { per_model } = aggregateResults(l2Results, {});
    meta.per_model = per_model;
    meta.rubric_scores = l2Results.rubric_scores;
    if (l2Results.tokens) {
      meta.tokens = l2Results.tokens;
    }
  }

  fs.writeFileSync(path.join(targetDir, 'meta.json'), JSON.stringify(meta, null, 2), 'utf8');
}
|
|
1021
|
+
|
|
1022
|
+
/**
 * Run all configured test layers (L0 static, L1 deterministic, L2
 * LLM-judged) for a single skill and record per-case results.
 *
 * Layer selection via opts.layer: 'static' runs L0 only, 'deterministic'
 * runs L0+L1, 'l2' runs L2 only, unset runs everything. Each case's
 * outcome is written to its meta.json via writeMetaJson and mirrored in
 * the returned currentRunStatuses map.
 *
 * @param {string} skillName - skill whose tests to run
 * @param {object} opts      - parsed CLI options (agent, primaryOnly, tag,
 *   severity, caseId, layer, fast, calibrate, skipSecretScan, pipeline, ...)
 * @returns {Promise<object>} summary with status, counts, target_agents,
 *   judge_agent, plus the filtered `cases` list and `currentRunStatuses`
 *   keyed by case id ('passed' | 'failed' | 'error')
 */
async function runTestsForSkill(skillName, opts) {
  const result = {
    skill: skillName,
    status: 'passed',
    total: 0,
    current_run: { passed: 0, failed: 0 },
    baseline_ref: 'origin/main',
    target_agents: [],
    judge_agent: null
  };
  let cases = [];
  const currentRunStatuses = {};

  try {
    const index = loadIndexYaml(skillName);
    const pipelineConfig = loadPipelineConfig(opts.pipeline || null);

    // Agents come from index.yaml by default and are validated against the
    // pipeline config before use.
    const defaultTargetAgents = index.execution?.target_agents || [];
    const judgeAgent = index.execution?.judge_agent || null;

    if (defaultTargetAgents.length > 0) {
      validateAgents(defaultTargetAgents, pipelineConfig);
      console.log(`[Runner] target_agents from index.yaml: ${defaultTargetAgents.join(', ')}`);
    }

    if (judgeAgent) {
      validateAgents([judgeAgent], pipelineConfig);
      console.log(`[Runner] judge_agent from index.yaml: ${judgeAgent}`);
    }

    // CLI overrides: --agent pins a single model; --primary-only keeps the
    // first configured model.
    let effectiveTargetAgents = defaultTargetAgents;

    if (opts.agent) {
      validateAgents([opts.agent], pipelineConfig);
      effectiveTargetAgents = [opts.agent];
      console.log(`[Runner] Override target_agents via --agent: ${opts.agent}`);
    } else if (opts.primaryOnly && defaultTargetAgents.length > 0) {
      effectiveTargetAgents = [defaultTargetAgents[0]];
      console.log(`[Runner] Using only primary agent: ${effectiveTargetAgents[0]}`);
    }

    result.target_agents = effectiveTargetAgents;
    result.judge_agent = judgeAgent;

    // --calibrate: run the calibration gate only and return immediately,
    // without executing the test suite.
    if (opts.calibrate) {
      console.log(`[Runner] Running calibration gate only...`);
      const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);

      if (!calibrationResult.passed) {
        console.error(`[Runner] Calibration FAILED: ${calibrationResult.error}`);
        result.status = 'calibration_failed';
        result.error = calibrationResult.error;
        result.calibration = calibrationResult;
        return result;
      }

      console.log('[Runner] Calibration gate PASSED');
      result.calibration = calibrationResult;
      result.status = 'calibration_passed';
      return result;
    }

    cases = index.cases || [];

    // Case selection: optional tag/severity filters, then --case narrows to
    // one case (which may override agents per its own execution block).
    if (opts.tag) {
      cases = filterCasesByTag(cases, opts.tag);
    }

    if (opts.severity) {
      cases = filterCasesBySeverity(cases, opts.severity);
    }

    if (opts.caseId) {
      const caseDef = cases.find(c => c.id === opts.caseId);
      if (caseDef) {
        const testCase = loadTestCase(skillName, caseDef.file);
        if (testCase.execution?.target_agents) {
          validateAgents(testCase.execution.target_agents, pipelineConfig);
          effectiveTargetAgents = testCase.execution.target_agents;
          console.log(`[Runner] Override target_agents in case ${opts.caseId}: ${effectiveTargetAgents.join(', ')}`);
        }
        if (testCase.execution?.judge_agent) {
          // NOTE(review): the case-level judge agent is validated and logged
          // but `judgeAgent` is not reassigned, so the index.yaml judge is
          // still the one used below — confirm whether this is intentional.
          const caseJudgeAgent = testCase.execution.judge_agent;
          validateAgents([caseJudgeAgent], pipelineConfig);
          console.log(`[Runner] Override judge_agent in case ${opts.caseId}: ${caseJudgeAgent}`);
        }
        cases = [caseDef];
      } else {
        throw new Error(`Case not found: ${opts.caseId}`);
      }
    }

    result.total = cases.length;

    // NOTE(review): startTime is never read — candidate for removal.
    const startTime = Date.now();

    const runL2 = !opts.layer || opts.layer === 'l2';

    // Pre-flight cost approval before any L2 work (skippable with --yes
    // inside preFlightApproval).
    if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent) {
      const trials = opts.fast ? 1 : 3;
      const totalModels = effectiveTargetAgents.length;
      // NOTE(review): llmEstimate is computed but never used — the estimate
      // shown to the user is computed inside preFlightApproval instead.
      const llmEstimate = cases.length * totalModels * trials * 2;
      await preFlightApproval(cases.length, totalModels, trials);
    }

    for (const caseDef of cases) {
      const caseStart = Date.now();

      try {
        const testCase = loadTestCase(skillName, caseDef.file);

        const hasRubric = testCase.assertions?.rubric && testCase.assertions.rubric.length > 0;

        // Which layers apply to this case, given opts.layer.
        const runL0 = !opts.layer || opts.layer === 'static' || opts.layer === 'deterministic';
        const runL1 = !opts.layer || opts.layer === 'deterministic';
        const runL2 = !opts.layer || opts.layer === 'l2';

        // Secret scan (only for deterministic layer)
        if (runL1 && !opts.skipSecretScan) {
          const scanResult = await runSecretScan();
          if (!scanResult.passed) {
            result.current_run.failed++;
            result.status = 'failed';
            result.error = 'Secret scan failed - secrets detected in fixtures';
            currentRunStatuses[caseDef.id] = 'failed';
            await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
            continue;
          }
        }

        // L0 static assertions
        if (runL0) {
          const l0Results = runL0Assertions(skillName, testCase);
          const l0Failed = l0Results.filter(r => !r.passed);
          if (l0Failed.length > 0) {
            result.current_run.failed++;
            result.status = 'failed';
            currentRunStatuses[caseDef.id] = 'failed';
            await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
            continue;
          }
        }

        if (runL1) {
          // L1 deterministic assertions run against an empty mock output;
          // assertions that need real output should report themselves as
          // skipped (tracked via l1Skipped below).
          const mockOutput = '';
          const l1Results = runL1Assertions(mockOutput, testCase);
          const l1Failed = l1Results.filter(r => !r.passed);
          const l1Skipped = l1Results.some(r => r.skipped);

          const caseStatus = l1Failed.length === 0 ? 'passed' : 'failed';
          currentRunStatuses[caseDef.id] = caseStatus;

          if (l1Failed.length > 0) {
            result.current_run.failed++;
            result.status = 'failed';
          } else {
            result.current_run.passed++;
          }

          if (l1Skipped) {
            result.l1_skipped = true;
          }

          // Calibration gate must pass before any L2 judging for this case.
          if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
            const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);

            if (!calibrationResult.passed) {
              console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
              result.status = 'calibration_failed';
              result.error = calibrationResult.error;
              result.calibration = calibrationResult;
              return result;
            }

            if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
              console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
            }

            console.log('[Runner] Calibration gate PASSED');
          }

          let l2Results = null;
          if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
            const trials = opts.fast ? 1 : 3;
            // Re-load index for the per-case timeout; case-level timeout_s
            // overrides the execution-level default (300s fallback).
            const index = loadIndexYaml(skillName);
            const defaultTimeout = index.execution?.default_timeout_s || 300;
            const timeout = testCase.execution?.timeout_s || defaultTimeout;
            try {
              l2Results = await runL2Evaluation(
                skillName,
                testCase,
                caseDef,
                effectiveTargetAgents,
                judgeAgent,
                pipelineConfig,
                { trials, concurrency: 2, timeout }
              );

              const aggregated = aggregateResults(l2Results, testCase);
              console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));

              await writeJudgeResults(skillName, caseDef.id, l2Results);

              if (!aggregated.overall_passed) {
                result.status = 'failed';
                currentRunStatuses[caseDef.id] = 'failed';
              }
            } catch (l2Err) {
              console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
              result.status = 'failed';
              currentRunStatuses[caseDef.id] = 'failed';
            }
          }

          // NOTE(review): an L2 failure above updates currentRunStatuses but
          // meta.json is still written with the L1-derived caseStatus, so the
          // two can disagree for this case — confirm intended behavior.
          await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results, result.l1_skipped);
        } else if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
          // L2-only path (--layer l2): no L1 result exists, so the case
          // status is derived purely from the L2 aggregate.
          const trials = opts.fast ? 1 : 3;
          const defaultTimeout = index.execution?.default_timeout_s || 300;
          const timeout = testCase.execution?.timeout_s || defaultTimeout;
          let l2Results = null;
          let caseStatus = 'passed';
          try {
            l2Results = await runL2Evaluation(
              skillName,
              testCase,
              caseDef,
              effectiveTargetAgents,
              judgeAgent,
              pipelineConfig,
              { trials, concurrency: 2, timeout }
            );

            const aggregated = aggregateResults(l2Results, testCase);
            console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));

            await writeJudgeResults(skillName, caseDef.id, l2Results);

            if (!aggregated.overall_passed) {
              result.status = 'failed';
              result.current_run.failed++;
              caseStatus = 'failed';
            } else {
              result.current_run.passed++;
            }
          } catch (l2Err) {
            console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
            result.status = 'failed';
            result.current_run.failed++;
            caseStatus = 'failed';
          }

          currentRunStatuses[caseDef.id] = caseStatus;
          await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results);
        } else {
          // No applicable layer produced a verdict: count the case as passed.
          result.current_run.passed++;
          currentRunStatuses[caseDef.id] = 'passed';
          await writeMetaJson(caseDef.id, skillName, 'passed', Date.now() - caseStart);
        }
      } catch (e) {
        // Per-case failures (load errors, meta write errors, ...) are
        // recorded as 'error' without aborting the remaining cases.
        result.current_run.failed++;
        result.status = 'failed';
        currentRunStatuses[caseDef.id] = 'error';
        await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
      }
    }
  } catch (e) {
    result.status = 'error';
    result.error = e.message;
  }

  return {
    ...result,
    cases,
    currentRunStatuses
  };
}
|
|
1298
|
+
|
|
1299
|
+
/**
 * Top-level test driver: run one skill (opts.skill) or every skill with
 * tests (opts.all), then — for the single-skill path — compare the current
 * run against the git baseline and compute a verdict/outcome message.
 *
 * @param {object} opts - parsed CLI options
 * @returns {Promise<object>} summary consumed by printResult; status is
 *   'error' when the run itself failed, otherwise mirrors the skill result.
 * @throws {Error} only before the try block, when neither --skill nor --all
 *   was supplied; all later failures are folded into results.status/error.
 */
async function runSkillTests(opts) {
  // Validate options
  if (!opts.all && !opts.skill) {
    throw new Error('Either --skill or --all must be specified');
  }

  const results = {
    status: 'passed',
    skill: opts.skill || 'unknown',
    mode: 'deterministic',
    total: 0,
    current_run: { passed: 0, failed: 0 },
    baseline_ref: 'origin/main',
    git_head_comparison: null,
    verdict: 'ready_for_user_review',
    outcome_message: ''
  };

  try {
    if (opts.skill) {
      const skillResult = await runTestsForSkill(opts.skill, opts);

      // Merge skill results
      results.skill = skillResult.skill;
      results.total = skillResult.total;
      results.current_run.passed = skillResult.current_run.passed;
      results.current_run.failed = skillResult.current_run.failed;
      results.status = skillResult.status;
      results.target_agents = skillResult.target_agents;
      results.judge_agent = skillResult.judge_agent;
      if (skillResult.error) results.error = skillResult.error;
      if (skillResult.calibration) results.calibration = skillResult.calibration;

      // Prepare for git comparison (if applicable)
      const cases = skillResult.cases;
      const currentRunStatuses = skillResult.currentRunStatuses;

      // Git comparison and verdict (skip for calibration or no cases)
      if (cases && cases.length > 0 && !opts.calibrate && !skillResult.status.startsWith('calibration_')) {
        try {
          const baselineRef = getBaselineRef(opts.skill, opts.baselineRef);
          results.baseline_ref = baselineRef;

          console.log(`[Runner] Computing git head comparison for ${cases.length} cases with baselineRef=${baselineRef}`);
          const gitResult = await analyzeGitHeadComparison(opts.skill, cases, baselineRef, currentRunStatuses);
          const { comparison, mode } = gitResult;
          results.mode = mode;
          results.git_head_comparison = comparison;
          console.log(`[Runner] Git head comparison complete: mode=${mode}`);

          // --relevant: surface the status of one coach-designated case,
          // read back from its freshly written meta.json. Unreadable or
          // missing meta is tolerated (status stays null).
          let relevantCaseStatus = null;
          if (opts.relevant) {
            const relevantCaseDir = path.join(findSkillTestsDir(opts.skill), 'cases', opts.relevant, 'current', 'meta.json');
            if (fs.existsSync(relevantCaseDir)) {
              try {
                const meta = JSON.parse(fs.readFileSync(relevantCaseDir, 'utf8'));
                relevantCaseStatus = meta.status;
              } catch {}
            }
          }

          if (relevantCaseStatus) {
            results.relevant_case_status = relevantCaseStatus;
          }

          results.verdict = computeVerdict(comparison, mode, relevantCaseStatus, opts.establishBaseline);
          results.outcome_message = generateOutcomeMessage({
            verdict: results.verdict,
            comparison,
            mode,
            relevantCase: opts.relevant ? { id: opts.relevant, status: relevantCaseStatus } : null
          });
        } catch (verdictErr) {
          // Verdict computation is best-effort: a failure here leaves the
          // default verdict in place rather than failing the whole run.
          console.error('[Runner] Verdict computation failed:', verdictErr.message);
          console.error('[Runner] Stack:', verdictErr.stack);
        }
      }
    } else if (opts.all) {
      // --all: run every skill and aggregate counts; no git comparison or
      // per-skill verdicts in this mode.
      const skillNames = getAllSkillNamesWithTests();
      let total = 0;
      let passed = 0;
      let failed = 0;
      let overallStatus = 'passed';

      for (const skillName of skillNames) {
        const skillResult = await runTestsForSkill(skillName, opts);
        total += skillResult.total;
        passed += skillResult.current_run.passed;
        failed += skillResult.current_run.failed;
        if (skillResult.status !== 'passed') {
          overallStatus = 'failed';
        }
      }

      results.total = total;
      results.current_run.passed = passed;
      results.current_run.failed = failed;
      results.status = overallStatus;
      results.skill = 'all';
      results.mode = 'aggregated';
      results.verdict = overallStatus === 'passed' ? 'all_passed' : 'aggregated_failed';
      results.outcome_message = overallStatus === 'passed' ? 'All skills passed' : 'Some skills failed';
      results.baseline_ref = null;
    }
  } catch (e) {
    results.status = 'error';
    results.error = e.message;
  }

  return results;
}
|
|
1410
|
+
|
|
1411
|
+
/**
 * Print the run summary to stdout between ---RESULT--- sentinel lines so a
 * calling agent can extract the structured block. Optional fields
 * (baseline_ref, git_head_comparison, relevant_case_status, verdict,
 * outcome_message) are emitted only when present.
 *
 * @param {object} result - summary produced by runSkillTests
 */
function printResult(result) {
  const lines = [];
  lines.push('---RESULT---');
  lines.push(`status: ${result.status}`);
  lines.push(`skill: ${result.skill}`);
  lines.push(`mode: ${result.mode}`);
  lines.push(`total: ${result.total}`);
  lines.push(`current_run.passed: ${result.current_run.passed}`);
  lines.push(`current_run.failed: ${result.current_run.failed}`);

  if (result.baseline_ref) {
    lines.push(`baseline_ref: ${result.baseline_ref}`);
  }

  if (result.git_head_comparison) {
    const comparison = result.git_head_comparison;
    const comparisonKeys = [
      'previously_green',
      'previously_green_still_green',
      'previously_green_now_red',
      'previously_red',
      'previously_red_still_red',
      'previously_red_now_green',
      'new_cases'
    ];
    for (const key of comparisonKeys) {
      lines.push(`git_head_comparison.${key}: ${comparison[key]}`);
    }
  }

  if (result.relevant_case_status) {
    lines.push(`relevant_case_status: ${result.relevant_case_status}`);
  }

  if (result.verdict) {
    lines.push(`verdict: ${result.verdict}`);
  }

  if (result.outcome_message) {
    lines.push(`outcome_message: ${result.outcome_message}`);
  }

  lines.push('---RESULT---');

  for (const line of lines) {
    console.log(line);
  }
}
|
|
1449
|
+
|
|
1450
|
+
/**
 * Print CLI usage for run-skill-tests.js, one line per option.
 */
function showHelp() {
  const usage = [
    'run-skill-tests.js - Runner for skill tests',
    '',
    'Usage:',
    ' node run-skill-tests.js --skill <name> Run all tests for a skill',
    ' node run-skill-tests.js --case TC-XXX-NNN Run a single test case',
    ' node run-skill-tests.js --tag <tag> Filter tests by tag',
    ' node run-skill-tests.js --severity <level> Filter tests by severity (e.g., critical, normal)',
    ' node run-skill-tests.js --layer static|deterministic|l2 Run only L0, L1 or L2',
    ' node run-skill-tests.js --relevant TC-XXX-NNN Mark relevant case for coach',
    ' node run-skill-tests.js --baseline-ref <ref> Override baseline ref (default: origin/main)',
    ' node run-skill-tests.js --establish-baseline Allow reds in no-baseline mode',
    ' node run-skill-tests.js --all Run all skills',
    ' node run-skill-tests.js --agent <id> Run only on specific model from target_agents[]',
    ' node run-skill-tests.js --primary-only Run only on first model from target_agents[]',
    ' node run-skill-tests.js --skip-secret-scan Skip secret scanning before L2',
    ' node run-skill-tests.js --fast Run with trials=1 for all cases',
    ' node run-skill-tests.js --yes Skip pre-flight approval gate',
    ' node run-skill-tests.js --calibrate Run only calibration gate (no full suite)'
  ];

  for (const line of usage) {
    console.log(line);
  }
}
|
|
1470
|
+
|
|
1471
|
+
/**
 * CLI entry point: show help on --help/-h, otherwise parse options, run the
 * test suite, print the structured summary, and exit non-zero only when the
 * run itself errored (test failures are reported inside the summary block).
 */
async function main() {
  const cliArgs = process.argv.slice(2);

  const wantsHelp = cliArgs.includes('--help') || cliArgs.includes('-h');
  if (wantsHelp) {
    showHelp();
    return;
  }

  const result = await runSkillTests(parseArgs());
  printResult(result);

  if (result.status === 'error') {
    process.exit(1);
  }
}
|
|
1487
|
+
|
|
1488
|
+
// Kick off the CLI; any unhandled rejection is reported and mapped to a
// non-zero exit code so callers can detect fatal failures.
main().catch(e => {
  console.error('Fatal error:', e.message);
  process.exit(1);
});
|