workflow-ai 1.0.62 → 1.0.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1491 @@
+ #!/usr/bin/env node
+
+ import path from 'path';
+ import fs from 'fs';
+ import crypto from 'crypto';
+ import { fileURLToPath } from 'url';
+ import { spawn } from 'child_process';
+ import YAML from '../lib/js-yaml.mjs';
+ import { findProjectRoot } from '../lib/find-root.mjs';
+ import { spawnAgent } from '../lib/agent-spawner.mjs';
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = path.dirname(__filename);
+ const projectRoot = findProjectRoot(process.cwd());
+
+ function parseArgs() {
+   const args = process.argv.slice(2);
+   const opts = {
+     skill: null,
+     caseId: null,
+     tag: null,
+     layer: null,
+     relevant: null,
+     all: false,
+     agent: null,
+     primaryOnly: false,
+     skipSecretScan: false,
+     fast: false,
+     yes: false,
+     baselineRef: null,
+     establishBaseline: false,
+     calibrate: false,
+     pipeline: null,
+     severity: null
+   };
+
+   for (let i = 0; i < args.length; i++) {
+     const arg = args[i];
+     if (arg === '--calibrate') {
+       opts.calibrate = true;
+     } else if (arg === '--skill' && args[i + 1]) {
+       opts.skill = args[i + 1];
+       i++;
+     } else if (arg === '--case' && args[i + 1]) {
+       opts.caseId = args[i + 1];
+       i++;
+     } else if (arg === '--tag' && args[i + 1]) {
+       opts.tag = args[i + 1];
+       i++;
+     } else if (arg === '--layer' && args[i + 1]) {
+       opts.layer = args[i + 1];
+       i++;
+     } else if (arg === '--relevant' && args[i + 1]) {
+       opts.relevant = args[i + 1];
+       i++;
+     } else if (arg === '--baseline-ref' && args[i + 1]) {
+       opts.baselineRef = args[i + 1];
+       i++;
+     } else if (arg === '--all') {
+       opts.all = true;
+     } else if (arg === '--agent' && args[i + 1]) {
+       opts.agent = args[i + 1];
+       i++;
+     } else if (arg === '--primary-only') {
+       opts.primaryOnly = true;
+     } else if (arg === '--skip-secret-scan') {
+       opts.skipSecretScan = true;
+     } else if (arg === '--fast') {
+       opts.fast = true;
+     } else if (arg === '--yes') {
+       opts.yes = true;
+     } else if (arg === '--establish-baseline') {
+       opts.establishBaseline = true;
+     } else if (arg === '--pipeline' && args[i + 1]) {
+       opts.pipeline = args[i + 1];
+       i++;
+     } else if (arg === '--severity' && args[i + 1]) {
+       opts.severity = args[i + 1];
+       i++;
+     }
+   }
+
+   return opts;
+ }
+
+ function findSkillsDir() {
+   return path.join(projectRoot, 'src', 'skills');
+ }
+
+ function findSkillTestsDir(skillName) {
+   return path.join(findSkillsDir(), skillName, 'tests');
+ }
+
+ function loadIndexYaml(skillName) {
+   const testsDir = findSkillTestsDir(skillName);
+   const indexPath = path.join(testsDir, 'index.yaml');
+
+   if (!fs.existsSync(indexPath)) {
+     throw new Error(`index.yaml not found for skill: ${skillName}`);
+   }
+
+   const content = fs.readFileSync(indexPath, 'utf8');
+   return YAML.load(content);
+ }
+
+ function getBaselineRef(skillName, explicitRef) {
+   if (explicitRef) {
+     return explicitRef;
+   }
+
+   const index = loadIndexYaml(skillName);
+   return index.baseline_ref || 'origin/main';
+ }
+
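+ // Reads a file's contents at a given git ref via `git show <ref>:<path>`.
+ // Resolves null when the file does not exist at that ref. When TEST_GIT_MOCK
+ // is set, answers are served from a JSON fixture keyed by `ref:path` instead
+ // of spawning git.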
+ function gitShow(baselineRef, filePath) {
+   if (process.env.TEST_GIT_MOCK) {
+     return new Promise((resolve, reject) => {
+       try {
+         const mocks = JSON.parse(fs.readFileSync(process.env.TEST_GIT_MOCK, 'utf8'));
+         // Normalize the path for cross-platform use (Windows uses \, but mocks use /)
+         const normalizedPath = filePath.replace(/\\/g, '/');
+         const key = `${baselineRef}:${normalizedPath}`;
+         if (mocks[key]) {
+           resolve(mocks[key]);
+         } else if (mocks.__error && mocks.__error[key]) {
+           reject(new Error(mocks.__error[key]));
+         } else {
+           resolve(null);
+         }
+       } catch (e) {
+         resolve(null);
+       }
+     });
+   }
+
+   return new Promise((resolve, reject) => {
+     const proc = spawn('git', ['show', `${baselineRef}:${filePath}`], {
+       cwd: projectRoot,
+       stdio: ['ignore', 'pipe', 'pipe']
+     });
+
+     let stdout = '';
+     let stderr = '';
+
+     proc.stdout.on('data', (data) => { stdout += data; });
+     proc.stderr.on('data', (data) => { stderr += data; });
+
+     proc.on('close', (code) => {
+       if (code === 0) {
+         resolve(stdout);
+       } else if (stderr.includes('does not exist') || code === 128) {
+         resolve(null);
+       } else {
+         reject(new Error(`git show failed: ${stderr}`));
+       }
+     });
+
+     proc.on('error', (err) => {
+       reject(err);
+     });
+   });
+ }
+
+ async function loadBaselineMeta(skillName, caseId, baselineRef) {
+   const casesDir = path.join('src', 'skills', skillName, 'tests', 'cases', caseId);
+   const metaPath = path.join(casesDir, 'current', 'meta.json');
+
+   const gitMetaContent = await gitShow(baselineRef, metaPath);
+
+   if (!gitMetaContent) {
+     return null;
+   }
+
+   try {
+     return JSON.parse(gitMetaContent);
+   } catch {
+     return null;
+   }
+ }
+
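+ // Compares this run's per-case statuses against the statuses recorded in git
+ // at the baseline ref. Buckets every case into green/red transitions so the
+ // verdict can distinguish regressions (green→red) from fixes (red→green) and
+ // cases with no baseline history.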
+ async function analyzeGitHeadComparison(skillName, cases, baselineRef, currentRunStatuses = {}) {
+   console.log(`[Runner] analyzeGitHeadComparison called with ${cases.length} cases, skillName=${skillName}`);
+
+   const comparison = {
+     previously_green: 0,
+     previously_green_still_green: 0,
+     previously_green_now_red: 0,
+     previously_red: 0,
+     previously_red_still_red: 0,
+     previously_red_now_green: 0,
+     new_cases: 0
+   };
+
+   let hasBaselineHistory = false;
+
+   for (const caseDef of cases) {
+     console.log(`[Runner] Checking case ${caseDef.id} for git history`);
+     let baselineMeta = null;
+     try {
+       baselineMeta = await loadBaselineMeta(skillName, caseDef.id, baselineRef);
+       console.log(`[Runner] loadBaselineMeta result for ${caseDef.id}:`, baselineMeta ? 'found' : 'not found');
+
+       if (!baselineMeta) {
+         comparison.new_cases++;
+         continue;
+       }
+
+       hasBaselineHistory = true;
+
+       const prevStatus = baselineMeta.status;
+       // Use the current status from this run (in memory), not the one on disk
+       const currentStatus = currentRunStatuses[caseDef.id] || 'unknown';
+
+       if (prevStatus === 'passed') {
+         comparison.previously_green++;
+         if (currentStatus === 'passed') {
+           comparison.previously_green_still_green++;
+         } else if (currentStatus === 'failed' || currentStatus === 'error') {
+           comparison.previously_green_now_red++;
+         }
+       } else if (prevStatus === 'failed' || prevStatus === 'error') {
+         comparison.previously_red++;
+         if (currentStatus === 'failed' || currentStatus === 'error') {
+           comparison.previously_red_still_red++;
+         } else if (currentStatus === 'passed') {
+           comparison.previously_red_now_green++;
+         }
+       }
+     } catch (err) {
+       console.error(`[Runner] Error loading baseline meta for ${caseDef.id}:`, err.message);
+       throw err;
+     }
+   }
+
+   const mode = hasBaselineHistory ? 'no-regression' : 'no-baseline';
+   console.log(`[Runner] analyzeGitHeadComparison: hasBaselineHistory=${hasBaselineHistory}, mode=${mode}, cases_checked=${cases.length}`);
+
+   return { comparison, mode };
+ }
+
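+ // Derives the final verdict in strict priority order: a failing --relevant
+ // case beats everything, then any green→red regression, then the no-baseline
+ // special cases; only a clean run falls through to 'ready_for_user_review'.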
+ function computeVerdict(comparison, mode, relevantCaseStatus, establishBaseline) {
+   // Priority 1: Check relevant case status first
+   if (relevantCaseStatus !== null && relevantCaseStatus !== 'passed') {
+     return 'relevant_case_failed';
+   }
+
+   // Priority 2: Check for regression
+   if (comparison.previously_green_now_red > 0) {
+     return 'regression_detected';
+   }
+
+   // Priority 3: Check for no-baseline mode
+   if (mode === 'no-baseline') {
+     if (establishBaseline) {
+       return 'baseline_established';
+     }
+     return 'no_baseline_failures';
+   }
+
+   // Default: ready for user review
+   return 'ready_for_user_review';
+ }
+
+ function generateOutcomeMessage(result) {
+   const { verdict, comparison, mode, relevantCase } = result;
+
+   let msg = `Verdict: ${verdict}. `;
+
+   if (mode === 'no-baseline') {
+     msg += `Mode: no-baseline (no baseline history found). `;
+   } else {
+     msg += `Mode: no-regression. `;
+   }
+
+   msg += `Green→Red: ${comparison.previously_green_now_red}/${comparison.previously_green}. `;
+   msg += `Red→Green: ${comparison.previously_red_now_green}/${comparison.previously_red}. `;
+   msg += `New cases: ${comparison.new_cases}.`;
+
+   if (relevantCase) {
+     msg += ` Relevant case (${relevantCase.id}): ${relevantCase.status}.`;
+   }
+
+   return msg;
+ }
+
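+ // Resolution order for pipeline.yaml: an explicit --pipeline path wins, then
+ // the project's .workflow/config/pipeline.yaml, then the packaged default in
+ // configs/ next to the project root.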
+ function resolvePipelineYaml(overridePath = null) {
+   if (overridePath) {
+     const resolved = path.resolve(overridePath);
+     if (fs.existsSync(resolved)) {
+       return resolved;
+     }
+     throw new Error(`Pipeline not found: ${overridePath}`);
+   }
+
+   const projectRootDir = findProjectRoot(process.cwd());
+   const workflowConfigPath = path.join(projectRootDir, '.workflow', 'config', 'pipeline.yaml');
+   const packageRoot = path.dirname(projectRootDir);
+   const packageConfigPath = path.join(packageRoot, 'configs', 'pipeline.yaml');
+
+   if (fs.existsSync(workflowConfigPath)) {
+     return workflowConfigPath;
+   }
+
+   if (fs.existsSync(packageConfigPath)) {
+     return packageConfigPath;
+   }
+
+   throw new Error('pipeline.yaml not found in .workflow/config/ or configs/');
+ }
+
+ function loadPipelineConfig(pipelinePath = null) {
+   const resolvedPath = resolvePipelineYaml(pipelinePath);
+   const content = fs.readFileSync(resolvedPath, 'utf8');
+   const config = YAML.load(content);
+   console.log(`[Runner] Using pipeline.yaml: ${resolvedPath}`);
+   return config.pipeline || config;
+ }
+
+ function validateAgents(agentIds, pipelineConfig) {
+   const availableAgents = Object.keys(pipelineConfig.agents || {});
+   const invalid = [];
+
+   for (const agentId of agentIds) {
+     if (!availableAgents.includes(agentId)) {
+       invalid.push(agentId);
+     }
+   }
+
+   if (invalid.length > 0) {
+     throw new Error(`Agent(s) '${invalid.join(', ')}' from target_agents[] not found in pipeline.yaml → agents[]`);
+   }
+
+   return true;
+ }
+
+ function loadTestCase(skillName, caseFile) {
+   const testsDir = findSkillTestsDir(skillName);
+   const casePath = path.join(testsDir, caseFile);
+
+   if (!fs.existsSync(casePath)) {
+     throw new Error(`Test case not found: ${casePath}`);
+   }
+
+   const content = fs.readFileSync(casePath, 'utf8');
+   return YAML.load(content);
+ }
+
+ function filterCasesByTag(cases, tag) {
+   if (!tag) return cases;
+   return cases.filter(c => c.tags && c.tags.includes(tag));
+ }
+
+ function filterCasesBySeverity(cases, severity) {
+   if (!severity) return cases;
+   return cases.filter(c => c.severity === severity);
+ }
+
+ function getAllSkillNamesWithTests() {
+   const skillsDir = findSkillsDir();
+   const entries = fs.readdirSync(skillsDir);
+   const skillNames = [];
+   for (const entry of entries) {
+     const fullPath = path.join(skillsDir, entry);
+     try {
+       const stat = fs.statSync(fullPath);
+       if (stat.isDirectory()) {
+         const indexPath = path.join(fullPath, 'tests', 'index.yaml');
+         if (fs.existsSync(indexPath)) {
+           skillNames.push(entry);
+         }
+       }
+     } catch (e) {
+       // ignore
+     }
+   }
+   return skillNames;
+ }
+
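+ // Runs scan-fixtures-for-secrets.js in a child process before any L2 agent
+ // calls. A scanner crash is treated as a pass so a missing scanner never
+ // blocks the suite; only detected secrets fail the run.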
+ function runSecretScan() {
+   return new Promise((resolve) => {
+     const scannerPath = path.join(projectRoot, 'src', 'scripts', 'scan-fixtures-for-secrets.js');
+     console.log('[Runner] Running secret scan before L2...');
+
+     const proc = spawn(process.execPath, [scannerPath], {
+       cwd: projectRoot,
+       stdio: ['ignore', 'pipe', 'pipe']
+     });
+
+     let stdout = '';
+     let stderr = '';
+
+     proc.stdout.on('data', (data) => { stdout += data; });
+     proc.stderr.on('data', (data) => { stderr += data; });
+
+     proc.on('close', (code) => {
+       if (code === 0 || stdout.includes('status: passed')) {
+         console.log('[Runner] Secret scan passed');
+         resolve({ passed: true });
+       } else {
+         console.log('[Runner] Secret scan FAILED - secrets detected:');
+         console.log(stdout);
+         if (stderr) console.error(stderr);
+         resolve({ passed: false, output: stdout });
+       }
+     });
+
+     proc.on('error', (err) => {
+       console.error('[Runner] Secret scan error:', err.message);
+       resolve({ passed: true });
+     });
+   });
+ }
+
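+ // L0 (static) layer: checks assertions against the skill's source files
+ // without running any agent. Currently supports the 'skill_contains' kind,
+ // a case-insensitive regex match against SKILL.md (or assertion.file).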
+ function runL0Assertions(skillName, testCase) {
+   const assertions = testCase.assertions?.static || [];
+   const results = [];
+
+   for (const assertion of assertions) {
+     if (assertion.kind === 'skill_contains') {
+       const skillFile = path.join(findSkillsDir(), skillName, assertion.file || 'SKILL.md');
+
+       if (!fs.existsSync(skillFile)) {
+         results.push({
+           passed: false,
+           kind: assertion.kind,
+           reason: assertion.reason,
+           error: `Skill file not found: ${skillFile}`
+         });
+         continue;
+       }
+
+       const skillContent = fs.readFileSync(skillFile, 'utf8');
+       const regex = new RegExp(assertion.pattern, 'i');
+       const matches = regex.test(skillContent);
+
+       results.push({
+         passed: matches,
+         kind: assertion.kind,
+         reason: assertion.reason,
+         pattern: assertion.pattern
+       });
+     }
+   }
+
+   return results;
+ }
+
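+ // L1 (deterministic) layer: checks assertions against captured agent output.
+ // If no output is available (L2 was not run) and any assertion needs output,
+ // the whole set is marked skipped-but-passed rather than failed.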
+ function runL1Assertions(output, testCase) {
+   const assertions = testCase.assertions?.deterministic || [];
+   const results = [];
+
+   const outputDependentKinds = ['output_contains_all', 'output_matches', 'output_does_not_contain', 'output_yaml_shape'];
+   if (!output && assertions.some(a => outputDependentKinds.includes(a.kind))) {
+     return assertions.map(a => ({
+       passed: true,
+       skipped: true,
+       kind: a.kind,
+       reason: 'No agent output available (L2 not run)'
+     }));
+   }
+
+   for (const assertion of assertions) {
+     if (assertion.kind === 'output_contains_all') {
+       const missing = [];
+       for (const val of assertion.values || []) {
+         if (!output.includes(val)) {
+           missing.push(val);
+         }
+       }
+       results.push({
+         passed: missing.length === 0,
+         kind: assertion.kind,
+         missing,
+         values: assertion.values
+       });
+     } else if (assertion.kind === 'output_matches') {
+       const regex = new RegExp(assertion.regex);
+       const matches = regex.test(output);
+       results.push({
+         passed: matches,
+         kind: assertion.kind,
+         regex: assertion.regex
+       });
+     } else if (assertion.kind === 'output_does_not_contain') {
+       const found = [];
+       for (const val of assertion.values || []) {
+         if (output.includes(val)) {
+           found.push(val);
+         }
+       }
+       results.push({
+         passed: found.length === 0,
+         kind: assertion.kind,
+         found,
+         values: assertion.values
+       });
+     } else if (assertion.kind === 'output_yaml_shape') {
+       try {
+         const parsed = YAML.load(output);
+         const hasKeys = assertion.required_keys?.every(k => parsed && typeof parsed[k] !== 'undefined');
+         results.push({
+           passed: hasKeys,
+           kind: assertion.kind,
+           required_keys: assertion.required_keys
+         });
+       } catch (e) {
+         results.push({
+           passed: false,
+           kind: assertion.kind,
+           error: e.message
+         });
+       }
+     } else if (assertion.kind === 'is_json') {
+       try {
+         JSON.parse(output);
+         results.push({
+           passed: true,
+           kind: assertion.kind
+         });
+       } catch (e) {
+         results.push({
+           passed: false,
+           kind: assertion.kind,
+           error: e.message
+         });
+       }
+     } else {
+       results.push({
+         passed: false,
+         kind: assertion.kind,
+         error: `Unknown assertion kind: ${assertion.kind}`
+       });
+     }
+   }
+
+   return results;
+ }
+
+ function getSkillSha(skillName) {
+   const skillsDir = findSkillsDir();
+   const skillFile = path.join(skillsDir, skillName, 'SKILL.md');
+
+   if (!fs.existsSync(skillFile)) {
+     return 'unknown';
+   }
+
+   const content = fs.readFileSync(skillFile, 'utf8');
+   return crypto.createHash('sha256').update(content).digest('hex').slice(0, 7);
+ }
+
+ function ensureDir(dir) {
+   if (!fs.existsSync(dir)) {
+     fs.mkdirSync(dir, { recursive: true });
+   }
+ }
+
+ function loadRubric(skillName, rubricName) {
+   const rubricPath = path.join(findSkillsDir(), skillName, 'tests', 'rubrics', `${rubricName}.md`);
+   if (!fs.existsSync(rubricPath)) {
+     throw new Error(`Rubric not found: ${rubricPath}`);
+   }
+   return fs.readFileSync(rubricPath, 'utf8');
+ }
+
+ function findCalibrationFiles(skillName) {
+   const rubricsDir = path.join(findSkillsDir(), skillName, 'tests', 'rubrics', 'calibration');
+   if (!fs.existsSync(rubricsDir)) {
+     return {};
+   }
+
+   const files = fs.readdirSync(rubricsDir);
+   const calibrationMap = {};
+
+   for (const file of files) {
+     const match = file.match(/^(.+)-good\.md$/);
+     if (match) {
+       const rubricName = match[1];
+       const goodPath = path.join(rubricsDir, file);
+       const badPath = path.join(rubricsDir, `${rubricName}-bad.md`);
+       const rubricPath = path.join(findSkillsDir(), skillName, 'tests', 'rubrics', `${rubricName}.md`);
+
+       if (fs.existsSync(badPath) && fs.existsSync(rubricPath)) {
+         calibrationMap[rubricName] = {
+           good: goodPath,
+           bad: badPath,
+           rubric: rubricPath
+         };
+       }
+     }
+   }
+
+   return calibrationMap;
+ }
+
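+ // Pulls the pass threshold out of the rubric text (e.g. "score ≥ 4");
+ // defaults to 4 when the rubric does not state one.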
+ function extractPassThreshold(rubricContent) {
+   const match = rubricContent.match(/score\s*≥\s*(\d+)/i);
+   if (match) {
+     return parseInt(match[1], 10);
+   }
+   return 4;
+ }
+
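+ // Calibration check for one rubric: the judge scores a known-good and a
+ // known-bad reference answer. The judge counts as calibrated only if the
+ // good sample scores at or above the rubric threshold and the bad one below.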
+ async function runCalibrationCheck(skillName, rubricName, calibrationFiles, pipelineConfig, judgeAgentId) {
+   const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
+   if (!judgeAgentConfig) {
+     throw new Error(`Judge agent not found: ${judgeAgentId}`);
+   }
+
+   const rubricContent = fs.readFileSync(calibrationFiles.rubric, 'utf8');
+   const threshold = extractPassThreshold(rubricContent);
+
+   const goodContent = fs.readFileSync(calibrationFiles.good, 'utf8');
+   const badContent = fs.readFileSync(calibrationFiles.bad, 'utf8');
+
+   const judgePrompt = (agentOutput, task) => `You are a judge evaluating the output of an AI agent.
+
+ ## Rubric
+ ${rubricContent}
+
+ ## Target Agent Output
+ ${agentOutput}
+
+ ## Task
+ ${task}
+
+ Please evaluate the output according to the rubric and provide a score from 1 to 5.
+ Output format:
+ ---RESULT---
+ score: <number 1-5>
+ reason: <brief explanation>
+ ---RESULT---`;
+
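+   // The calibration fixtures are Markdown files with a Russian
+   // "## Ответ агента" ("Agent response") heading terminated by a "---" line;
+   // extract just that section, falling back to the whole file if absent.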
+   const extractGoodResponse = (content) => {
+     const match = content.match(/## Ответ агента[\s\S]*?^---$/m);
+     return match ? match[0] : content;
+   };
+
+   const goodOutput = extractGoodResponse(goodContent);
+   const badOutput = extractGoodResponse(badContent);
+
+   const [goodResult, badResult] = await Promise.all([
+     spawnAgent(judgeAgentConfig, judgePrompt(goodOutput, 'Evaluate the good response'), { timeout: 60 }),
+     spawnAgent(judgeAgentConfig, judgePrompt(badOutput, 'Evaluate the bad response'), { timeout: 60 })
+   ]);
+
+   const goodScore = parseJudgeResult(goodResult.output)?.score || 3;
+   const badScore = parseJudgeResult(badResult.output)?.score || 3;
+
+   return {
+     rubricName,
+     threshold,
+     goodScore,
+     badScore,
+     goodPassed: goodScore >= threshold,
+     badPassed: badScore < threshold
+   };
+ }
+
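+ // Calibration gate: before trusting the judge on real trials, verify it on
+ // every rubric that has good/bad calibration fixtures. Any miscalibrated
+ // rubric aborts the run; a missing judge or missing fixtures only warns.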
+ async function runCalibrationGate(skillName, pipelineConfig) {
+   const judgeAgent = loadIndexYaml(skillName).execution?.judge_agent;
+   if (!judgeAgent) {
+     console.log('[Runner] No judge_agent configured, skipping calibration gate');
+     return { passed: true, calibrations: [] };
+   }
+
+   const calibrationMap = findCalibrationFiles(skillName);
+
+   if (Object.keys(calibrationMap).length === 0) {
+     console.log('[Runner] No calibration files found, skipping calibration gate');
+     return { passed: true, calibrations: [], warnings: ['calibration files absent'] };
+   }
+
+   const results = [];
+   const warnings = [];
+
+   for (const [rubricName, files] of Object.entries(calibrationMap)) {
+     console.log(`[Runner] Calibrating rubric: ${rubricName}`);
+     const result = await runCalibrationCheck(skillName, rubricName, files, pipelineConfig, judgeAgent);
+     results.push(result);
+
+     if (!result.goodPassed) {
+       console.error(`[Runner] ABORT: judge miscalibrated — rubric '${rubricName}' requires fix (good score=${result.goodScore}, expected ≥${result.threshold})`);
+       return {
+         passed: false,
+         calibrations: results,
+         error: `judge miscalibrated — rubric '${rubricName}' requires fix (good score=${result.goodScore}, expected ≥${result.threshold})`
+       };
+     }
+
+     if (!result.badPassed) {
+       console.error(`[Runner] ABORT: judge miscalibrated — rubric '${rubricName}' requires fix (bad score=${result.badScore}, expected <${result.threshold})`);
+       return {
+         passed: false,
+         calibrations: results,
+         error: `judge miscalibrated — rubric '${rubricName}' requires fix (bad score=${result.badScore}, expected <${result.threshold})`
+       };
+     }
+
+     console.log(`[Runner] ${rubricName}: good=${result.goodScore} (≥${result.threshold}), bad=${result.badScore} (<${result.threshold}) ✓`);
+   }
+
+   return { passed: true, calibrations: results, warnings };
+ }
+
+ async function writeTrialOutput(skillName, caseId, agentId, trialNum, output) {
+   const skillsDir = findSkillsDir();
+   const trialDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
+   const agentDir = path.join(trialDir, agentId);
+   ensureDir(agentDir);
+
+   const trialFile = path.join(agentDir, `trial-${trialNum}.md`);
+   fs.writeFileSync(trialFile, output, 'utf8');
+   return trialFile;
+ }
+
+ async function writeJudgeResults(skillName, caseId, results) {
+   const skillsDir = findSkillsDir();
+   const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
+   ensureDir(caseDir);
+
+   const judgeData = {
+     per_model: {},
+     rubric_scores: results.rubric_scores || [],
+     timestamp: new Date().toISOString()
+   };
+
+   for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
+     judgeData.per_model[agentId] = {
+       pass_count: modelData.pass_count,
+       total: modelData.total,
+       trials: (modelData.trials || []).map(t => ({
+         trial: t.trial,
+         score: t.score,
+         passed: t.passed
+       }))
+     };
+   }
+
+   fs.writeFileSync(
+     path.join(caseDir, 'judge.json'),
+     JSON.stringify(judgeData, null, 2),
+     'utf8'
+   );
+ }
+
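+ // Pre-flight gate: prints an LLM call/cost estimate (cases × models × trials
+ // for the target agent, and the same again for the judge) and asks for
+ // confirmation on stdin unless --yes was passed.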
+ async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
+   const judgeCalls = numCases * numModels * trials;
+   const targetCalls = numCases * numModels * trials;
+   const totalLlms = judgeCalls + targetCalls;
+   const estimatedCost = (judgeCalls * judgeAgentCost) + (targetCalls * targetAgentCost);
+
+   console.log(`[Runner] Estimated LLM calls: ${totalLlms} (target: ${targetCalls}, judge: ${judgeCalls})`);
+   console.log(`[Runner] Estimated cost: ~$${estimatedCost.toFixed(2)}`);
+
+   if (!process.argv.includes('--yes')) {
+     const readline = await import('readline');
+     const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+
+     return new Promise((resolve) => {
+       rl.question(`Estimated ${totalLlms} LLM calls ($${estimatedCost.toFixed(2)}). Continue? [y/N] `, (answer) => {
+         rl.close();
+         if (answer.toLowerCase() === 'y' || answer.toLowerCase() === 'yes') {
+           resolve(true);
+         } else {
+           console.log('[Runner] Aborted by user');
+           process.exit(0);
+         }
+       });
+     });
+   }
+
+   return true;
+ }
+
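+ // L2 (LLM-judged) layer: for each target agent, runs `trials` generations in
+ // batches of `concurrency`, has the judge agent score each one against the
+ // rubric, and records trial outputs and scores. A trial passes at score ≥ 4.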
+ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
+   const { trials = 3, concurrency = 2, timeout = 300 } = options;
+
+   const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
+   if (!judgeAgentConfig) {
+     throw new Error(`Judge agent not found: ${judgeAgentId}`);
+   }
+
+   let rubricName = 'default';
+   if (testCase.assertions?.rubric && testCase.assertions.rubric.length > 0) {
+     const rubricPath = testCase.assertions.rubric[0].rubric_file;
+     if (rubricPath) {
+       rubricName = path.basename(rubricPath, '.md');
+     }
+   }
+
+   const rubric = loadRubric(skillName, rubricName);
+   const results = {
+     per_model: {},
+     rubric_scores: [],
+     tokens: null
+   };
+
+   const caseId = caseDef?.id || 'unknown';
+
+   function buildTargetPrompt() {
+     let targetPrompt = '';
+     const testsDir = findSkillTestsDir(skillName);
+     const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';
+
+     if (testCase.scenario?.system_prompt_file) {
+       const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
+       if (fs.existsSync(systemPromptPath)) {
+         targetPrompt += fs.readFileSync(systemPromptPath, 'utf8') + '\n\n';
+       }
+     }
+
+     if (testCase.scenario?.extra_instructions) {
+       targetPrompt += testCase.scenario.extra_instructions + '\n\n';
+     }
+
+     if (testCase.scenario?.inputs) {
+       for (const input of testCase.scenario.inputs) {
+         if (input.kind === 'file') {
+           const fixturePath = path.join(testsDir, caseDir, input.path);
+           if (fs.existsSync(fixturePath)) {
+             targetPrompt += `## ${input.as || 'Input'}\n`;
+             targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
+           }
+         }
+       }
+     }
+
+     if (!targetPrompt.trim()) {
+       targetPrompt = testCase.prompt || testCase.input || '';
+     }
+
+     return targetPrompt;
+   }
+
+   for (const agentId of targetAgents) {
+     const agentConfig = pipelineConfig.agents[agentId];
+     if (!agentConfig) {
+       throw new Error(`Target agent not found: ${agentId}`);
+     }
+
+     results.per_model[agentId] = {
+       trials: [],
+       pass_count: 0,
+       total: trials
+     };
+
+     const tasks = [];
+     for (let trial = 1; trial <= trials; trial++) {
+       tasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
+     }
+
+     for (let i = 0; i < tasks.length; i += concurrency) {
+       const batch = tasks.slice(i, i + concurrency);
+       const batchResults = await Promise.all(
+         batch.map(async (task) => {
+           try {
+             const targetPrompt = buildTargetPrompt();
+             const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
+               timeout,
+               stageId: `${caseId}-${task.agentId}-trial-${task.trial}`
+             });
+
+             const judgePrompt = `You are a judge evaluating the output of an AI agent.
+
+ ## Rubric
+ ${rubric}
+
+ ## Target Agent Output
+ ${targetOutput.output || targetOutput.status || 'No output'}
+
+ ## Task
+ ${testCase.description || testCase.name || 'Evaluate the response'}
+
+ Please evaluate the output according to the rubric and provide a score from 1 to 5.
+ Output format:
+ ---RESULT---
+ score: <number 1-5>
+ reason: <brief explanation>
+ ---RESULT---`;
+
+             const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
+               timeout: 60,
+               stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
+             });
+
+             let score = 3;
+             const parsed = parseJudgeResult(judgeResult.output);
+             if (parsed && parsed.score) {
+               score = parsed.score;
+             }
+
+             await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
+
+             return {
+               trial: task.trial,
+               agentId: task.agentId,
+               score,
+               output: targetOutput.output || '',
+               judge_output: judgeResult.output || '',
+               passed: score >= 4
+             };
+           } catch (err) {
+             console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
+             return {
+               trial: task.trial,
+               agentId: task.agentId,
+               score: 1,
+               error: err.message,
+               passed: false
+             };
+           }
+         })
+       );
+
+       for (const result of batchResults) {
+         results.per_model[result.agentId].trials.push(result);
+         if (result.passed) {
+           results.per_model[result.agentId].pass_count++;
+         }
+         results.rubric_scores.push({
+           agentId: result.agentId,
+           trial: result.trial,
+           score: result.score
+         });
+       }
+     }
+   }
+
+   return results;
+ }
+
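+ // Parses the judge's ---RESULT--- block: the first `score: N` and `reason:`
+ // lines found anywhere in the output. Returns null when no score is present.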
+ function parseJudgeResult(output) {
+   if (!output) return null;
+
+   const scoreMatch = output.match(/score:\s*(\d+)/i);
+   const reasonMatch = output.match(/reason:\s*(.+)/i);
+
+   if (scoreMatch) {
+     return {
+       score: parseInt(scoreMatch[1], 10),
+       reason: reasonMatch ? reasonMatch[1].trim() : ''
+     };
+   }
+
+   return null;
+ }
+
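+ // Aggregates per-model trial results into a pass/fail. 'all' requires every
+ // trial to pass; the default majority rule requires ceil(total/2) passes.
+ // With aggregate 'auto', critical-severity cases get the stricter 'all' rule.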
+ function aggregateResults(results, testCase) {
+   const aggregate = testCase.aggregate || 'auto';
+   const severity = testCase.severity || 'normal';
+
+   let useAll = aggregate === 'all';
+   if (aggregate === 'auto') {
+     useAll = severity === 'critical';
+   }
+
+   const perModelResults = {};
+
+   for (const [agentId, modelData] of Object.entries(results.per_model)) {
+     const passCount = modelData.pass_count;
+     const total = modelData.total;
+     const threshold = Math.ceil(total / 2);
+
+     let passed;
+     if (useAll) {
+       passed = passCount === total;
+     } else {
+       passed = passCount >= threshold;
+     }
+
+     perModelResults[agentId] = {
+       passed,
+       pass_count: passCount,
+       total,
+       threshold: useAll ? total : threshold
+     };
+   }
+
+   const allModelsPassed = Object.values(perModelResults).every(m => m.passed);
+
+   return {
+     per_model: perModelResults,
+     overall_passed: allModelsPassed
+   };
+ }
+
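+ // Writes cases/<id>/current/meta.json: run date, a short SHA-256 of SKILL.md,
+ // status, duration, and (when L2 ran) per-model aggregates and rubric scores.
+ // This is the file that later runs diff against the baseline ref.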
+ async function writeMetaJson(caseId, skillName, status, durationMs, l2Results = null, l1_skipped = null) {
+   const skillsDir = findSkillsDir();
+   const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
+   ensureDir(caseDir);
+
+   const meta = {
+     date: new Date().toISOString(),
+     skill_sha: getSkillSha(skillName),
+     status,
+     duration_ms: durationMs
+   };
+
+   if (l1_skipped) {
+     meta.l1_skipped = true;
+   }
+
+   if (l2Results) {
+     const aggregated = aggregateResults(l2Results, {});
+     meta.per_model = aggregated.per_model;
+     meta.rubric_scores = l2Results.rubric_scores;
+     if (l2Results.tokens) {
+       meta.tokens = l2Results.tokens;
+     }
+   }
+
+   fs.writeFileSync(
+     path.join(caseDir, 'meta.json'),
+     JSON.stringify(meta, null, 2),
+     'utf8'
+   );
+ }
+
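+ // Runs the full layered suite for one skill: resolve agents from index.yaml
+ // (with --agent/--primary-only overrides), optionally run the calibration
+ // gate alone, then per case run secret scan, L0, L1 and L2 as selected by
+ // --layer, writing meta.json after each case.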
+ async function runTestsForSkill(skillName, opts) {
+   const result = {
+     skill: skillName,
+     status: 'passed',
+     total: 0,
+     current_run: { passed: 0, failed: 0 },
+     baseline_ref: 'origin/main',
+     target_agents: [],
+     judge_agent: null
+   };
+   let cases = [];
+   const currentRunStatuses = {};
+
+   try {
+     const index = loadIndexYaml(skillName);
+     const pipelineConfig = loadPipelineConfig(opts.pipeline || null);
+
+     const defaultTargetAgents = index.execution?.target_agents || [];
+     const judgeAgent = index.execution?.judge_agent || null;
+
+     if (defaultTargetAgents.length > 0) {
+       validateAgents(defaultTargetAgents, pipelineConfig);
+       console.log(`[Runner] target_agents from index.yaml: ${defaultTargetAgents.join(', ')}`);
+     }
+
+     if (judgeAgent) {
+       validateAgents([judgeAgent], pipelineConfig);
+       console.log(`[Runner] judge_agent from index.yaml: ${judgeAgent}`);
+     }
+
+     let effectiveTargetAgents = defaultTargetAgents;
+
+     if (opts.agent) {
+       validateAgents([opts.agent], pipelineConfig);
+       effectiveTargetAgents = [opts.agent];
+       console.log(`[Runner] Override target_agents via --agent: ${opts.agent}`);
+     } else if (opts.primaryOnly && defaultTargetAgents.length > 0) {
+       effectiveTargetAgents = [defaultTargetAgents[0]];
+       console.log(`[Runner] Using only primary agent: ${effectiveTargetAgents[0]}`);
+     }
+
+     result.target_agents = effectiveTargetAgents;
+     result.judge_agent = judgeAgent;
+
+     if (opts.calibrate) {
+       console.log(`[Runner] Running calibration gate only...`);
+       const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
+
+       if (!calibrationResult.passed) {
+         console.error(`[Runner] Calibration FAILED: ${calibrationResult.error}`);
+         result.status = 'calibration_failed';
+         result.error = calibrationResult.error;
+         result.calibration = calibrationResult;
+         return result;
+       }
+
+       console.log('[Runner] Calibration gate PASSED');
+       result.calibration = calibrationResult;
+       result.status = 'calibration_passed';
+       return result;
+     }
+
+     cases = index.cases || [];
+
+     if (opts.tag) {
+       cases = filterCasesByTag(cases, opts.tag);
+     }
+
+     if (opts.severity) {
+       cases = filterCasesBySeverity(cases, opts.severity);
+     }
+
+     if (opts.caseId) {
+       const caseDef = cases.find(c => c.id === opts.caseId);
+       if (caseDef) {
+         const testCase = loadTestCase(skillName, caseDef.file);
+         if (testCase.execution?.target_agents) {
+           validateAgents(testCase.execution.target_agents, pipelineConfig);
+           effectiveTargetAgents = testCase.execution.target_agents;
+           console.log(`[Runner] Override target_agents in case ${opts.caseId}: ${effectiveTargetAgents.join(', ')}`);
+         }
+         if (testCase.execution?.judge_agent) {
+           const caseJudgeAgent = testCase.execution.judge_agent;
+           validateAgents([caseJudgeAgent], pipelineConfig);
+           console.log(`[Runner] Override judge_agent in case ${opts.caseId}: ${caseJudgeAgent}`);
+         }
+         cases = [caseDef];
+       } else {
+         throw new Error(`Case not found: ${opts.caseId}`);
+       }
+     }
+
+     result.total = cases.length;
+
+     const runL2 = !opts.layer || opts.layer === 'l2';
+
+     if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent) {
+       const trials = opts.fast ? 1 : 3;
+       const totalModels = effectiveTargetAgents.length;
+       await preFlightApproval(cases.length, totalModels, trials);
+     }
+
+     for (const caseDef of cases) {
+       const caseStart = Date.now();
+
+       try {
+         const testCase = loadTestCase(skillName, caseDef.file);
+
+         const hasRubric = testCase.assertions?.rubric && testCase.assertions.rubric.length > 0;
+
+         const runL0 = !opts.layer || opts.layer === 'static' || opts.layer === 'deterministic';
+         const runL1 = !opts.layer || opts.layer === 'deterministic';
+         const runL2 = !opts.layer || opts.layer === 'l2';
+
+         // Secret scan (only for deterministic layer)
+         if (runL1 && !opts.skipSecretScan) {
+           const scanResult = await runSecretScan();
+           if (!scanResult.passed) {
+             result.current_run.failed++;
+             result.status = 'failed';
+             result.error = 'Secret scan failed - secrets detected in fixtures';
+             currentRunStatuses[caseDef.id] = 'failed';
+             await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
+             continue;
+           }
+         }
+
+         // L0 static assertions
+         if (runL0) {
+           const l0Results = runL0Assertions(skillName, testCase);
+           const l0Failed = l0Results.filter(r => !r.passed);
+           if (l0Failed.length > 0) {
+             result.current_run.failed++;
+             result.status = 'failed';
+             currentRunStatuses[caseDef.id] = 'failed';
+             await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
+             continue;
+           }
+         }
+
+         if (runL1) {
+           const mockOutput = '';
+           const l1Results = runL1Assertions(mockOutput, testCase);
+           const l1Failed = l1Results.filter(r => !r.passed);
+           const l1Skipped = l1Results.some(r => r.skipped);
+
+           let caseStatus = l1Failed.length === 0 ? 'passed' : 'failed';
+           currentRunStatuses[caseDef.id] = caseStatus;
+
+           if (l1Failed.length > 0) {
+             result.current_run.failed++;
+             result.status = 'failed';
+           } else {
+             result.current_run.passed++;
+           }
+
+           if (l1Skipped) {
+             result.l1_skipped = true;
+           }
+
+           if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
+             const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
+
+             if (!calibrationResult.passed) {
+               console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
+               result.status = 'calibration_failed';
+               result.error = calibrationResult.error;
+               result.calibration = calibrationResult;
+               return result;
+             }
+
+             if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
+               console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
+             }
+
+             console.log('[Runner] Calibration gate PASSED');
+           }
+
+           let l2Results = null;
+           if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
+             const trials = opts.fast ? 1 : 3;
+             const defaultTimeout = index.execution?.default_timeout_s || 300;
+             const timeout = testCase.execution?.timeout_s || defaultTimeout;
+             try {
+               l2Results = await runL2Evaluation(
+                 skillName,
+                 testCase,
+                 caseDef,
+                 effectiveTargetAgents,
+                 judgeAgent,
+                 pipelineConfig,
+                 { trials, concurrency: 2, timeout }
+               );
+
+               const aggregated = aggregateResults(l2Results, testCase);
+               console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));
+
+               await writeJudgeResults(skillName, caseDef.id, l2Results);
+
+               if (!aggregated.overall_passed) {
+                 result.status = 'failed';
+                 caseStatus = 'failed';
+                 currentRunStatuses[caseDef.id] = 'failed';
+               }
+             } catch (l2Err) {
+               console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
+               result.status = 'failed';
+               caseStatus = 'failed';
+               currentRunStatuses[caseDef.id] = 'failed';
+             }
+           }
+
+           await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results, result.l1_skipped);
+         } else if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
+           const trials = opts.fast ? 1 : 3;
+           const defaultTimeout = index.execution?.default_timeout_s || 300;
+           const timeout = testCase.execution?.timeout_s || defaultTimeout;
+           let l2Results = null;
+           let caseStatus = 'passed';
+           try {
+             l2Results = await runL2Evaluation(
+               skillName,
+               testCase,
+               caseDef,
+               effectiveTargetAgents,
+               judgeAgent,
+               pipelineConfig,
+               { trials, concurrency: 2, timeout }
+             );
+
+             const aggregated = aggregateResults(l2Results, testCase);
+             console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));
+
+             await writeJudgeResults(skillName, caseDef.id, l2Results);
+
+             if (!aggregated.overall_passed) {
+               result.status = 'failed';
+               result.current_run.failed++;
+               caseStatus = 'failed';
+             } else {
+               result.current_run.passed++;
+             }
+           } catch (l2Err) {
+             console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
+             result.status = 'failed';
+             result.current_run.failed++;
+             caseStatus = 'failed';
+           }
+
+           currentRunStatuses[caseDef.id] = caseStatus;
+           await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results);
+         } else {
+           result.current_run.passed++;
+           currentRunStatuses[caseDef.id] = 'passed';
+           await writeMetaJson(caseDef.id, skillName, 'passed', Date.now() - caseStart);
+         }
+       } catch (e) {
+         result.current_run.failed++;
+         result.status = 'failed';
+         currentRunStatuses[caseDef.id] = 'error';
+         await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
+       }
+     }
+   } catch (e) {
+     result.status = 'error';
+     result.error = e.message;
+   }
+
+   return {
+     ...result,
+     cases,
+     currentRunStatuses
+   };
+ }
+
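+ // Top-level entry: runs one skill (with git-baseline comparison and verdict)
+ // or, with --all, every skill that has a tests/index.yaml, aggregating counts.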
+ async function runSkillTests(opts) {
+   // Validate options
+   if (!opts.all && !opts.skill) {
+     throw new Error('Either --skill or --all must be specified');
+   }
+
+   const results = {
+     status: 'passed',
+     skill: opts.skill || 'unknown',
+     mode: 'deterministic',
+     total: 0,
+     current_run: { passed: 0, failed: 0 },
+     baseline_ref: 'origin/main',
+     git_head_comparison: null,
+     verdict: 'ready_for_user_review',
+     outcome_message: ''
+   };
+
+   try {
+     if (opts.skill) {
+       const skillResult = await runTestsForSkill(opts.skill, opts);
+
+       // Merge skill results
+       results.skill = skillResult.skill;
+       results.total = skillResult.total;
+       results.current_run.passed = skillResult.current_run.passed;
+       results.current_run.failed = skillResult.current_run.failed;
+       results.status = skillResult.status;
+       results.target_agents = skillResult.target_agents;
+       results.judge_agent = skillResult.judge_agent;
+       if (skillResult.error) results.error = skillResult.error;
+       if (skillResult.calibration) results.calibration = skillResult.calibration;
+
+       // Prepare for git comparison (if applicable)
+       const cases = skillResult.cases;
+       const currentRunStatuses = skillResult.currentRunStatuses;
+
+       // Git comparison and verdict (skip for calibration or no cases)
+       if (cases && cases.length > 0 && !opts.calibrate && !skillResult.status.startsWith('calibration_')) {
+         try {
+           const baselineRef = getBaselineRef(opts.skill, opts.baselineRef);
+           results.baseline_ref = baselineRef;
+
+           console.log(`[Runner] Computing git head comparison for ${cases.length} cases with baselineRef=${baselineRef}`);
+           const gitResult = await analyzeGitHeadComparison(opts.skill, cases, baselineRef, currentRunStatuses);
+           const { comparison, mode } = gitResult;
+           results.mode = mode;
+           results.git_head_comparison = comparison;
+           console.log(`[Runner] Git head comparison complete: mode=${mode}`);
+
+           let relevantCaseStatus = null;
+           if (opts.relevant) {
+             const relevantMetaPath = path.join(findSkillTestsDir(opts.skill), 'cases', opts.relevant, 'current', 'meta.json');
+             if (fs.existsSync(relevantMetaPath)) {
+               try {
+                 const meta = JSON.parse(fs.readFileSync(relevantMetaPath, 'utf8'));
+                 relevantCaseStatus = meta.status;
+               } catch {}
+             }
+           }
+
+           if (relevantCaseStatus) {
+             results.relevant_case_status = relevantCaseStatus;
+           }
+
+           results.verdict = computeVerdict(comparison, mode, relevantCaseStatus, opts.establishBaseline);
+           results.outcome_message = generateOutcomeMessage({
+             verdict: results.verdict,
+             comparison,
+             mode,
+             relevantCase: opts.relevant ? { id: opts.relevant, status: relevantCaseStatus } : null
+           });
+         } catch (verdictErr) {
+           console.error('[Runner] Verdict computation failed:', verdictErr.message);
+           console.error('[Runner] Stack:', verdictErr.stack);
+         }
+       }
+     } else if (opts.all) {
+       const skillNames = getAllSkillNamesWithTests();
+       let total = 0;
+       let passed = 0;
+       let failed = 0;
+       let overallStatus = 'passed';
+
+       for (const skillName of skillNames) {
+         const skillResult = await runTestsForSkill(skillName, opts);
+         total += skillResult.total;
+         passed += skillResult.current_run.passed;
+         failed += skillResult.current_run.failed;
+         if (skillResult.status !== 'passed') {
+           overallStatus = 'failed';
+         }
+       }
+
+       results.total = total;
+       results.current_run.passed = passed;
+       results.current_run.failed = failed;
+       results.status = overallStatus;
+       results.skill = 'all';
+       results.mode = 'aggregated';
+       results.verdict = overallStatus === 'passed' ? 'all_passed' : 'aggregated_failed';
+       results.outcome_message = overallStatus === 'passed' ? 'All skills passed' : 'Some skills failed';
+       results.baseline_ref = null;
+     }
+   } catch (e) {
+     results.status = 'error';
+     results.error = e.message;
+   }
+
+   return results;
+ }
+
+ function printResult(result) {
+   console.log('---RESULT---');
+   console.log(`status: ${result.status}`);
+   console.log(`skill: ${result.skill}`);
+   console.log(`mode: ${result.mode}`);
+   console.log(`total: ${result.total}`);
+   console.log(`current_run.passed: ${result.current_run.passed}`);
+   console.log(`current_run.failed: ${result.current_run.failed}`);
+
+   if (result.baseline_ref) {
+     console.log(`baseline_ref: ${result.baseline_ref}`);
+   }
+
+   if (result.git_head_comparison) {
+     const c = result.git_head_comparison;
+     console.log(`git_head_comparison.previously_green: ${c.previously_green}`);
+     console.log(`git_head_comparison.previously_green_still_green: ${c.previously_green_still_green}`);
+     console.log(`git_head_comparison.previously_green_now_red: ${c.previously_green_now_red}`);
+     console.log(`git_head_comparison.previously_red: ${c.previously_red}`);
+     console.log(`git_head_comparison.previously_red_still_red: ${c.previously_red_still_red}`);
+     console.log(`git_head_comparison.previously_red_now_green: ${c.previously_red_now_green}`);
+     console.log(`git_head_comparison.new_cases: ${c.new_cases}`);
+   }
+
+   if (result.relevant_case_status) {
+     console.log(`relevant_case_status: ${result.relevant_case_status}`);
+   }
+
+   if (result.verdict) {
+     console.log(`verdict: ${result.verdict}`);
+   }
+
+   if (result.outcome_message) {
+     console.log(`outcome_message: ${result.outcome_message}`);
+   }
+
+   console.log('---RESULT---');
+ }
+
+ function showHelp() {
+   console.log('run-skill-tests.js - Runner for skill tests');
+   console.log('');
+   console.log('Usage:');
+   console.log('  node run-skill-tests.js --skill <name>            Run all tests for a skill');
+   console.log('  node run-skill-tests.js --case TC-XXX-NNN         Run a single test case');
+   console.log('  node run-skill-tests.js --tag <tag>               Filter tests by tag');
+   console.log('  node run-skill-tests.js --severity <level>        Filter tests by severity (e.g., critical, normal)');
+   console.log('  node run-skill-tests.js --layer static|deterministic|l2   Run only L0, L1 or L2');
+   console.log('  node run-skill-tests.js --relevant TC-XXX-NNN     Mark relevant case for coach');
+   console.log('  node run-skill-tests.js --baseline-ref <ref>      Override baseline ref (default: origin/main)');
+   console.log('  node run-skill-tests.js --establish-baseline      Allow reds in no-baseline mode');
+   console.log('  node run-skill-tests.js --pipeline <path>         Override pipeline.yaml location');
+   console.log('  node run-skill-tests.js --all                     Run all skills');
+   console.log('  node run-skill-tests.js --agent <id>              Run only on specific model from target_agents[]');
+   console.log('  node run-skill-tests.js --primary-only            Run only on first model from target_agents[]');
+   console.log('  node run-skill-tests.js --skip-secret-scan        Skip secret scanning before L2');
+   console.log('  node run-skill-tests.js --fast                    Run with trials=1 for all cases');
+   console.log('  node run-skill-tests.js --yes                     Skip pre-flight approval gate');
+   console.log('  node run-skill-tests.js --calibrate               Run only calibration gate (no full suite)');
+ }
+
+ async function main() {
+   const args = process.argv.slice(2);
+
+   if (args.includes('--help') || args.includes('-h')) {
+     showHelp();
+     return;
+   }
+
+   const opts = parseArgs();
+   const result = await runSkillTests(opts);
+   printResult(result);
+
+   if (result.status === 'error') {
+     process.exit(1);
+   }
+ }
+
+ main().catch(e => {
+   console.error('Fatal error:', e.message);
+   process.exit(1);
+ });