npm - workflow-ai - Versions diffs - 1.0.63 → 1.0.64 - Mend

workflow-ai 1.0.63 → 1.0.64

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (494) hide show

package/src/scripts/run-skill-tests.js CHANGED Viewed

@@ -13,6 +13,55 @@ const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
 const projectRoot = findProjectRoot(process.cwd());
+import os from 'os';
+import { execSync } from 'child_process';
+function createTestWorkdir(skillName, suffix = '') { // Builds a throwaway project root under os.tmpdir() containing a seeded `.workflow` tree and returns its path.
+  const prefix = suffix ? `wf-test-${skillName}-${suffix}-` : `wf-test-${skillName}-`; // NOTE(review): skillName/suffix go into the directory name verbatim — presumably free of path separators; confirm.
+  const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); // mkdtempSync appends random characters to the prefix, so each call gets its own directory.
+  const workflowDir = path.join(tmpRoot, '.workflow');
+  fs.mkdirSync(workflowDir, { recursive: true });
+  for (const sub of ['tickets/backlog', 'tickets/ready', 'tickets/in-progress', 'tickets/review', 'tickets/done', 'tickets/archive', 'plans/current', 'plans/archive', 'reports', 'logs']) { // Empty directory skeleton created under .workflow.
+    fs.mkdirSync(path.join(workflowDir, sub), { recursive: true });
+  }
+  fs.writeFileSync(path.join(workflowDir, 'coach-backlog.yaml'), 'version: 1\nanalyzed_tickets: []\naudited_skills: {}\n', 'utf8'); // Seed an empty coach backlog file.
+  const srcDir = path.join(workflowDir, 'src');
+  fs.mkdirSync(srcDir, { recursive: true });
+  const realSkills = path.join(projectRoot, 'src', 'skills'); // Sources are resolved against the module-level projectRoot.
+  const realScripts = path.join(projectRoot, 'src', 'scripts');
+  const linkSkills = path.join(srcDir, 'skills'); // Despite the name, this is the destination of a copy, not a link (see cpSync below).
+  const linkScripts = path.join(srcDir, 'scripts');
+  const configDir = path.join(workflowDir, 'config');
+  const realConfigs = path.join(projectRoot, 'configs');
+  // Skills are COPIED (not junctioned) so that agents cannot write to real source files.
+  fs.cpSync(realSkills, linkSkills, { recursive: true, dereference: true }); // Not wrapped in try/catch: a failed copy throws to the caller.
+  // Scripts and configs are linked (junction on Windows, symlink elsewhere) instead of copied. NOTE(review): the original note calls them read-only for agents in practice, but nothing here enforces that — confirm.
+  if (process.platform === 'win32') {
+    try { execSync(`mklink /J "${linkScripts}" "${realScripts}"`, { stdio: 'pipe', shell: true }); } catch {} // Failure is swallowed, so the link may silently be absent.
+    try { execSync(`mklink /J "${configDir}" "${realConfigs}"`, { stdio: 'pipe', shell: true }); } catch {} // Same best-effort handling for the config link.
+  } else {
+    try { fs.symlinkSync(realScripts, linkScripts, 'dir'); } catch {} // Best effort: errors are ignored.
+    try { fs.symlinkSync(realConfigs, configDir, 'dir'); } catch {} // Best effort: errors are ignored.
+  }
+  return tmpRoot; // Caller owns the directory; cleanupTestWorkdir(tmpRoot) removes it.
+}
+function cleanupTestWorkdir(tmpRoot) { // Best-effort removal of a temporary workdir; uses the same .workflow link paths that createTestWorkdir sets up. Returns nothing.
+  if (!tmpRoot || !fs.existsSync(tmpRoot)) return; // Nothing to do for an empty argument or a path that no longer exists.
+  // Remove junctions first so that their targets are not touched by rmSync.
+  if (process.platform === 'win32') {
+    for (const link of ['src/scripts', 'config']) { // The two link locations under .workflow (scripts and config).
+      const p = path.join(tmpRoot, '.workflow', link);
+      try { execSync(`rmdir "${p}"`, { stdio: 'pipe', shell: true }); } catch {} // Runs `rmdir` through the shell on the link path; errors (e.g. link never created) are ignored.
+    }
+  }
+  try { fs.rmSync(tmpRoot, { recursive: true, force: true }); } catch {} // Failures are swallowed, so a leftover directory is not reported.
+}
 function parseArgs() {
   const args = process.argv.slice(2);
   const opts = {
@@ -720,13 +769,23 @@ async function writeJudgeResults(skillName, caseId, results) {
   const skillsDir = findSkillsDir();
   const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
   ensureDir(caseDir);
-  const judgeData = {
-    per_model: {},
-    rubric_scores: results.rubric_scores || [],
-    timestamp: new Date().toISOString()
-  };
+  const judgePath = path.join(caseDir, 'judge.json');
+  let judgeData = { per_model: {}, rubric_scores: [], timestamp: new Date().toISOString() };
+  if (fs.existsSync(judgePath)) {
+    try {
+      const existing = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
+      judgeData.per_model = existing.per_model || {};
+      judgeData.rubric_scores = existing.rubric_scores || [];
+    } catch {}
+  }
+  const newAgentIds = new Set(Object.keys(results.per_model || {}));
+  judgeData.rubric_scores = judgeData.rubric_scores.filter(r => !newAgentIds.has(r.agentId));
+  for (const r of (results.rubric_scores || [])) {
+    judgeData.rubric_scores.push(r);
+  }
   for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
     judgeData.per_model[agentId] = {
       pass_count: modelData.pass_count,
@@ -738,12 +797,10 @@ async function writeJudgeResults(skillName, caseId, results) {
       }))
     };
   }
-  fs.writeFileSync(
-    path.join(caseDir, 'judge.json'),
-    JSON.stringify(judgeData, null, 2),
-    'utf8'
-  );
+  judgeData.timestamp = new Date().toISOString();
+  fs.writeFileSync(judgePath, JSON.stringify(judgeData, null, 2), 'utf8');
 }
 async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
@@ -776,7 +833,7 @@ async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0
 }
 async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
-  const { trials = 3, concurrency = 2, timeout = 300 } = options;
+  const { trials = 3, timeout = 300 } = options;
   const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
   if (!judgeAgentConfig) {
@@ -799,12 +856,12 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
   };
   const caseId = caseDef?.id || 'unknown';
-  function buildTargetPrompt() {
+  function buildTargetPrompt(taskWorkdir) {
     let targetPrompt = '';
     const testsDir = findSkillTestsDir(skillName);
     const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';
     if (testCase.scenario?.system_prompt_file) {
       const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
       if (fs.existsSync(systemPromptPath)) {
@@ -824,6 +881,28 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
             targetPrompt += `## ${input.as || 'Input'}\n`;
             targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
           }
+        } else if (input.kind === 'inline') {
+          if (input.content) {
+            targetPrompt += `## ${input.as || 'Input'}\n`;
+            targetPrompt += input.content + '\n\n';
+          }
+        } else if (input.kind === 'ticket_file') {
+          const fixturePath = path.join(testsDir, caseDir, input.path);
+          const destDir = input.dest_dir || 'in-progress';
+          const ticketId = input.ticket_id;
+          if (!ticketId) {
+            throw new Error(`ticket_file input requires ticket_id (case ${caseId})`);
+          }
+          if (!taskWorkdir) {
+            throw new Error(`ticket_file input requires task workdir (case ${caseId})`);
+          }
+          if (!fs.existsSync(fixturePath)) {
+            throw new Error(`ticket_file fixture not found: ${fixturePath}`);
+          }
+          const destPath = path.join(taskWorkdir, '.workflow', 'tickets', destDir, `${ticketId}.md`);
+          fs.mkdirSync(path.dirname(destPath), { recursive: true });
+          fs.copyFileSync(fixturePath, destPath);
+          targetPrompt += `## Context\nticket_id: ${ticketId}\n\n`;
         }
       }
     }
@@ -831,46 +910,65 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
     if (!targetPrompt.trim()) {
       targetPrompt = testCase.prompt || testCase.input || '';
     }
     return targetPrompt;
   }
+  const allTasks = [];
   for (const agentId of targetAgents) {
     const agentConfig = pipelineConfig.agents[agentId];
     if (!agentConfig) {
       throw new Error(`Target agent not found: ${agentId}`);
     }
     results.per_model[agentId] = {
       trials: [],
       pass_count: 0,
       total: trials
     };
-    const tasks = [];
     for (let trial = 1; trial <= trials; trial++) {
-      tasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
+      allTasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
     }
-    for (let i = 0; i < tasks.length; i += concurrency) {
-      const batch = tasks.slice(i, i + concurrency);
-      const batchResults = await Promise.all(
-        batch.map(async (task) => {
-          try {
-            const targetPrompt = buildTargetPrompt();
-            const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
-              timeout,
-              stageId: `${caseId}-${task.agentId}-trial-${task.trial}`
-            });
-            const judgePrompt = `You are a judge evaluating the output of an AI agent.
+  }
+  const allResults = await Promise.all(
+    allTasks.map(async (task) => {
+      const taskSuffix = `${caseId}-${task.agentId}-t${task.trial}`;
+      let taskWorkdir = null;
+      try {
+        taskWorkdir = createTestWorkdir(skillName, taskSuffix);
+        const targetPrompt = buildTargetPrompt(taskWorkdir);
+        const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
+          timeout,
+          stageId: `${caseId}-${task.agentId}-trial-${task.trial}`,
+          projectRoot: taskWorkdir
+        });
+        // Snapshot ticket files after target-run (for judge to inspect actual file state).
+        let ticketFilesSection = '';
+        const ticketInputs = (testCase.scenario?.inputs || []).filter(i => i.kind === 'ticket_file');
+        for (const input of ticketInputs) {
+          const ticketPath = path.join(
+            taskWorkdir,
+            '.workflow', 'tickets',
+            input.dest_dir || 'in-progress',
+            `${input.ticket_id}.md`
+          );
+          if (fs.existsSync(ticketPath)) {
+            const content = fs.readFileSync(ticketPath, 'utf8');
+            ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id} (${input.dest_dir || 'in-progress'}/)\n\n\`\`\`markdown\n${content}\n\`\`\`\n`;
+          } else {
+            ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id}\n\n(file missing at ${input.dest_dir || 'in-progress'}/${input.ticket_id}.md)\n`;
+          }
+        }
+        const judgePrompt = `You are a judge evaluating the output of an AI agent.
 ## Rubric
 ${rubric}
 ## Target Agent Output
 ${targetOutput.output || targetOutput.status || 'No output'}
+${ticketFilesSection}
 ## Task
 ${testCase.description || testCase.name || 'Evaluate the response'}
@@ -881,54 +979,77 @@ score: <number 1-5>
 reason: <brief explanation>
 ---RESULT---`;
-            const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
-              timeout: 60,
-              stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
-            });
-            let score = 3;
-            const parsed = parseJudgeResult(judgeResult.output);
-            if (parsed && parsed.score) {
-              score = parsed.score;
-            }
-            await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
-            return {
-              trial: task.trial,
-              agentId: task.agentId,
-              score,
-              output: targetOutput.output || '',
-              judge_output: judgeResult.output || '',
-              passed: score >= 4
-            };
-          } catch (err) {
-            console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
-            return {
-              trial: task.trial,
-              agentId: task.agentId,
-              score: 1,
-              error: err.message,
-              passed: false
-            };
-          }
-        })
-      );
-      for (const result of batchResults) {
-        results.per_model[result.agentId].trials.push(result);
-        if (result.passed) {
-          results.per_model[result.agentId].pass_count++;
-        }
-        results.rubric_scores.push({
-          agentId: result.agentId,
-          trial: result.trial,
-          score: result.score
+        const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
+          timeout: 60,
+          stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
         });
+        let score = 3;
+        const parsed = parseJudgeResult(judgeResult.output);
+        if (parsed && parsed.score) {
+          score = parsed.score;
+        }
+        await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
+        return {
+          trial: task.trial,
+          agentId: task.agentId,
+          score,
+          output: targetOutput.output || '',
+          judge_output: judgeResult.output || '',
+          passed: score >= 4,
+          errored: false
+        };
+      } catch (err) {
+        console.error(`[Runner] Trial errored: ${task.agentId} trial ${task.trial} — ${err.message}`);
+        try {
+          await writeTrialOutput(
+            skillName,
+            caseId,
+            task.agentId,
+            task.trial,
+            `# TRIAL ERRORED\n\nagent: ${task.agentId}\ntrial: ${task.trial}\nerror: ${err.message}\n`
+          );
+        } catch {}
+        return {
+          trial: task.trial,
+          agentId: task.agentId,
+          score: null,
+          error: err.message,
+          passed: false,
+          errored: true
+        };
+      } finally {
+        if (taskWorkdir) {
+          cleanupTestWorkdir(taskWorkdir);
+        }
       }
+    })
+  );
+  for (const result of allResults) {
+    results.per_model[result.agentId].trials.push(result);
+    if (result.errored) {
+      results.per_model[result.agentId].error_count = (results.per_model[result.agentId].error_count || 0) + 1;
+    } else if (result.passed) {
+      results.per_model[result.agentId].pass_count++;
     }
+    results.rubric_scores.push({
+      agentId: result.agentId,
+      trial: result.trial,
+      score: result.score,
+      errored: !!result.errored,
+      error: result.error || undefined
+    });
   }
+  for (const agentId of Object.keys(results.per_model)) {
+    results.per_model[agentId].trials.sort((a, b) => a.trial - b.trial);
+  }
+  results.rubric_scores.sort((a, b) =>
+    a.agentId === b.agentId ? a.trial - b.trial : a.agentId.localeCompare(b.agentId)
+  );
   return results;
 }
@@ -961,19 +1082,27 @@ function aggregateResults(results, testCase) {
   for (const [agentId, modelData] of Object.entries(results.per_model)) {
     const passCount = modelData.pass_count;
+    const errorCount = modelData.error_count || 0;
     const total = modelData.total;
+    const effective = total - errorCount;
     const threshold = Math.ceil(total / 2);
     let passed;
-    if (useAll) {
+    let errored = false;
+    if (effective === 0) {
+      passed = false;
+      errored = true;
+    } else if (useAll) {
       passed = passCount === total;
     } else {
       passed = passCount >= threshold;
     }
     perModelResults[agentId] = {
       passed,
+      errored,
       pass_count: passCount,
+      error_count: errorCount,
       total,
       threshold: useAll ? total : threshold
     };
@@ -991,40 +1120,66 @@ async function writeMetaJson(caseId, skillName, status, durationMs, l2Results =
   const skillsDir = findSkillsDir();
   const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
   ensureDir(caseDir);
+  const metaPath = path.join(caseDir, 'meta.json');
+  let existing = null;
+  if (fs.existsSync(metaPath)) {
+    try {
+      existing = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
+    } catch {}
+  }
   const meta = {
     date: new Date().toISOString(),
     skill_sha: getSkillSha(skillName),
     status,
     duration_ms: durationMs
   };
   if (l1_skipped) {
     meta.l1_skipped = true;
   }
+  const mergedPerModel = (existing && existing.per_model) ? { ...existing.per_model } : {};
+  let mergedRubricScores = (existing && existing.rubric_scores) ? [...existing.rubric_scores] : [];
   if (l2Results) {
     const aggregated = aggregateResults(l2Results, {});
-    meta.per_model = aggregated.per_model;
-    meta.rubric_scores = l2Results.rubric_scores;
+    const newAgentIds = new Set(Object.keys(aggregated.per_model || {}));
+    for (const [agentId, data] of Object.entries(aggregated.per_model || {})) {
+      mergedPerModel[agentId] = data;
+    }
+    mergedRubricScores = mergedRubricScores.filter(r => !newAgentIds.has(r.agentId));
+    for (const r of (l2Results.rubric_scores || [])) {
+      mergedRubricScores.push(r);
+    }
     if (l2Results.tokens) {
       meta.tokens = l2Results.tokens;
     }
   }
-  fs.writeFileSync(
-    path.join(caseDir, 'meta.json'),
-    JSON.stringify(meta, null, 2),
-    'utf8'
-  );
+  if (Object.keys(mergedPerModel).length > 0) {
+    meta.per_model = mergedPerModel;
+  }
+  if (mergedRubricScores.length > 0) {
+    meta.rubric_scores = mergedRubricScores;
+  }
+  const allPassed = Object.values(mergedPerModel).every(m => m.passed);
+  if (Object.keys(mergedPerModel).length > 0) {
+    meta.status = allPassed ? 'passed' : 'failed';
+  }
+  fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2), 'utf8');
 }
 async function runTestsForSkill(skillName, opts) {
+  console.log(`[Runner] Per-task isolated workdirs will be created for each (case × agent × trial)`);
   const result = {
     skill: skillName,
     status: 'passed',
     total: 0,
-    current_run: { passed: 0, failed: 0 },
+    current_run: { passed: 0, failed: 0, no_coverage: 0 },
     baseline_ref: 'origin/main',
     target_agents: [],
     judge_agent: null
@@ -1117,14 +1272,56 @@ async function runTestsForSkill(skillName, opts) {
     const runL2 = !opts.layer || opts.layer === 'l2';
-    if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent) {
+    const casesWithRubric = cases.filter(cd => {
+      try {
+        const tc = loadTestCase(skillName, cd.file);
+        return tc.assertions?.rubric && tc.assertions.rubric.length > 0;
+      } catch { return false; }
+    });
+    const anyHasRubric = casesWithRubric.length > 0;
+    if (casesWithRubric.length < cases.length) {
+      const missing = cases.length - casesWithRubric.length;
+      console.log(`[Runner] ${missing}/${cases.length} cases have no rubric — L2 will be skipped for them`);
+    }
+    if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric) {
       const trials = opts.fast ? 1 : 3;
       const totalModels = effectiveTargetAgents.length;
-      const llmEstimate = cases.length * totalModels * trials * 2;
-      await preFlightApproval(cases.length, totalModels, trials);
+      await preFlightApproval(casesWithRubric.length, totalModels, trials);
+    }
+    let secretScanFailed = false;
+    let calibrationFailedResult = null;
+    const anyRunL1 = !opts.layer || opts.layer === 'deterministic';
+    const anyRunL2 = !opts.layer || opts.layer === 'l2';
+    if (anyRunL1 && !opts.skipSecretScan) {
+      const scanResult = await runSecretScan();
+      if (!scanResult.passed) {
+        secretScanFailed = true;
+        result.error = 'Secret scan failed - secrets detected in fixtures';
+      }
     }
-    for (const caseDef of cases) {
+    if (anyRunL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric && !secretScanFailed) {
+      const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
+      if (!calibrationResult.passed) {
+        console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
+        calibrationFailedResult = calibrationResult;
+        result.status = 'calibration_failed';
+        result.error = calibrationResult.error;
+        result.calibration = calibrationResult;
+        return { ...result, cases, currentRunStatuses };
+      }
+      if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
+        console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
+      }
+      console.log('[Runner] Calibration gate PASSED');
+    }
+    await Promise.all(cases.map(async (caseDef) => {
       const caseStart = Date.now();
       try {
@@ -1136,17 +1333,13 @@ async function runTestsForSkill(skillName, opts) {
         const runL1 = !opts.layer || opts.layer === 'deterministic';
         const runL2 = !opts.layer || opts.layer === 'l2';
-        // Secret scan (only for deterministic layer)
-        if (runL1 && !opts.skipSecretScan) {
-          const scanResult = await runSecretScan();
-          if (!scanResult.passed) {
-            result.current_run.failed++;
-            result.status = 'failed';
-            result.error = 'Secret scan failed - secrets detected in fixtures';
-            currentRunStatuses[caseDef.id] = 'failed';
-            await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
-            continue;
-          }
+        // Secret scan result propagated from pre-loop
+        if (runL1 && !opts.skipSecretScan && secretScanFailed) {
+          result.current_run.failed++;
+          result.status = 'failed';
+          currentRunStatuses[caseDef.id] = 'failed';
+          await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
+          return;
         }
         // L0 static assertions
@@ -1158,7 +1351,7 @@ async function runTestsForSkill(skillName, opts) {
             result.status = 'failed';
             currentRunStatuses[caseDef.id] = 'failed';
             await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
-            continue;
+            return;
           }
         }
@@ -1167,13 +1360,28 @@ async function runTestsForSkill(skillName, opts) {
           const l1Results = runL1Assertions(mockOutput, testCase);
           const l1Failed = l1Results.filter(r => !r.passed);
           const l1Skipped = l1Results.some(r => r.skipped);
+          const l1Declared = (testCase.assertions?.deterministic || []).length;
+          const l1Executed = l1Results.filter(r => !r.skipped).length;
-          const caseStatus = l1Failed.length === 0 ? 'passed' : 'failed';
-          currentRunStatuses[caseDef.id] = caseStatus;
+          const willRunL2 = runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric;
+          const noCoverage = l1Declared > 0 && l1Executed === 0 && !willRunL2;
+          let caseStatus;
           if (l1Failed.length > 0) {
+            caseStatus = 'failed';
+          } else if (noCoverage) {
+            caseStatus = 'no_coverage';
+          } else {
+            caseStatus = 'passed';
+          }
+          currentRunStatuses[caseDef.id] = caseStatus;
+          if (caseStatus === 'failed') {
             result.current_run.failed++;
             result.status = 'failed';
+          } else if (caseStatus === 'no_coverage') {
+            result.current_run.no_coverage = (result.current_run.no_coverage || 0) + 1;
+            console.log(`[Runner] ${caseDef.id}: no_coverage — L1 assertions require agent output but L2 is not configured (no rubric or no agents)`);
           } else {
             result.current_run.passed++;
           }
@@ -1182,36 +1390,25 @@ async function runTestsForSkill(skillName, opts) {
             result.l1_skipped = true;
           }
-          if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
-            const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
-            if (!calibrationResult.passed) {
-              console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
-              result.status = 'calibration_failed';
-              result.error = calibrationResult.error;
-              result.calibration = calibrationResult;
-              return result;
-            }
-            if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
-              console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
-            }
-            console.log('[Runner] Calibration gate PASSED');
-          }
           let l2Results = null;
-          if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
+          if (willRunL2) {
             const trials = opts.fast ? 1 : 3;
             const index = loadIndexYaml(skillName);
             const defaultTimeout = index.execution?.default_timeout_s || 300;
             const timeout = testCase.execution?.timeout_s || defaultTimeout;
+            const caseTargetAgents = testCase.execution?.target_agents;
+            const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
+              ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
+              : effectiveTargetAgents;
+            if (caseTargetAgents && caseTargetAgents.length > 0) {
+              console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
+            }
             try {
               l2Results = await runL2Evaluation(
                 skillName,
                 testCase,
                 caseDef,
-                effectiveTargetAgents,
+                perCaseAgents,
                 judgeAgent,
                 pipelineConfig,
                 { trials, concurrency: 2, timeout }
@@ -1238,6 +1435,13 @@ async function runTestsForSkill(skillName, opts) {
           const trials = opts.fast ? 1 : 3;
           const defaultTimeout = index.execution?.default_timeout_s || 300;
           const timeout = testCase.execution?.timeout_s || defaultTimeout;
+          const caseTargetAgents = testCase.execution?.target_agents;
+          const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
+            ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
+            : effectiveTargetAgents;
+          if (caseTargetAgents && caseTargetAgents.length > 0) {
+            console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
+          }
           let l2Results = null;
           let caseStatus = 'passed';
           try {
@@ -1245,7 +1449,7 @@ async function runTestsForSkill(skillName, opts) {
               skillName,
               testCase,
               caseDef,
-              effectiveTargetAgents,
+              perCaseAgents,
               judgeAgent,
               pipelineConfig,
               { trials, concurrency: 2, timeout }
@@ -1283,6 +1487,10 @@ async function runTestsForSkill(skillName, opts) {
         currentRunStatuses[caseDef.id] = 'error';
         await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
       }
+    }));
+    if (result.status === 'passed' && result.current_run.no_coverage > 0 && result.current_run.passed === 0) {
+      result.status = 'no_coverage';
     }
   } catch (e) {
     result.status = 'error';
@@ -1307,7 +1515,7 @@ async function runSkillTests(opts) {
     skill: opts.skill || 'unknown',
     mode: 'deterministic',
     total: 0,
-    current_run: { passed: 0, failed: 0 },
+    current_run: { passed: 0, failed: 0, no_coverage: 0 },
     baseline_ref: 'origin/main',
     git_head_comparison: null,
     verdict: 'ready_for_user_review',
@@ -1323,6 +1531,7 @@ async function runSkillTests(opts) {
       results.total = skillResult.total;
       results.current_run.passed = skillResult.current_run.passed;
       results.current_run.failed = skillResult.current_run.failed;
+      results.current_run.no_coverage = skillResult.current_run.no_coverage || 0;
       results.status = skillResult.status;
       results.target_agents = skillResult.target_agents;
       results.judge_agent = skillResult.judge_agent;
@@ -1406,7 +1615,7 @@ async function runSkillTests(opts) {
   }
   return results;
-}
+}
 function printResult(result) {
   console.log('---RESULT---');
@@ -1416,6 +1625,9 @@ function printResult(result) {
   console.log(`total: ${result.total}`);
   console.log(`current_run.passed: ${result.current_run.passed}`);
   console.log(`current_run.failed: ${result.current_run.failed}`);
+  if (result.current_run.no_coverage) {
+    console.log(`current_run.no_coverage: ${result.current_run.no_coverage}`);
+  }
   if (result.baseline_ref) {
     console.log(`baseline_ref: ${result.baseline_ref}`);