workflow-ai 1.0.63 → 1.0.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (576)
  1. package/README.md +239 -145
  2. package/configs/agent-health-rules.yaml +64 -0
  3. package/configs/config.yaml +134 -0
  4. package/configs/pipeline.yaml +901 -0
  5. package/configs/ticket-movement-rules.yaml +80 -0
  6. package/package.json +1 -1
  7. package/src/global-dir.mjs +25 -1
  8. package/src/init.mjs +20 -3
  9. package/src/lib/agent-health-registry.mjs +245 -0
  10. package/src/lib/artifact-snapshot.mjs +233 -0
  11. package/src/lib/error-classifier.mjs +274 -0
  12. package/src/lib/test-error-classifier.mjs +60 -0
  13. package/src/lib/test-extends.mjs +58 -0
  14. package/src/lib/test-version.mjs +21 -0
  15. package/src/scripts/move-to-review.js +5 -7
  16. package/src/scripts/reset-agent-health.js +62 -0
  17. package/src/scripts/run-skill-tests.js +348 -136
  18. package/src/skills/analyze-report/README.md +44 -0
  19. package/src/skills/analyze-report/SKILL.md +121 -0
  20. package/src/skills/analyze-report/algorithms/progress-assessment.md +108 -0
  21. package/src/skills/analyze-report/knowledge/analysis-frameworks.md +66 -0
  22. package/src/skills/analyze-report/knowledge/report-structure.md +61 -0
  23. package/src/skills/analyze-report/scripts/calc-plan-metrics.js +234 -0
  24. package/src/skills/analyze-report/templates/analysis-report.md +80 -0
  25. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-1.md +69 -0
  26. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-2.md +103 -0
  27. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-3.md +99 -0
  28. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/judge.json +163 -0
  29. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-1.md +89 -0
  30. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-2.md +88 -0
  31. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-3.md +100 -0
  32. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-1.md +77 -0
  33. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-2.md +64 -0
  34. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-3.md +110 -0
  35. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-1.md +74 -0
  36. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-2.md +38 -0
  37. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-3.md +61 -0
  38. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/meta.json +115 -0
  39. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001-evidence-from-log.yaml +60 -0
  40. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-1.md +90 -0
  41. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-2.md +89 -0
  42. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-3.md +77 -0
  43. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/judge.json +163 -0
  44. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-1.md +84 -0
  45. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-2.md +77 -0
  46. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-3.md +89 -0
  47. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-1.md +103 -0
  48. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-2.md +103 -0
  49. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-3.md +103 -0
  50. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-1.md +93 -0
  51. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-2.md +93 -0
  52. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-3.md +86 -0
  53. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/meta.json +115 -0
  54. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002-result-block-format.yaml +44 -0
  55. package/src/skills/analyze-report/tests/fixtures/REPORT-002-incorrect-attribution.md +27 -0
  56. package/src/skills/analyze-report/tests/fixtures/pipeline-2026-04-06_qa-001-skip.log +32 -0
  57. package/src/skills/analyze-report/tests/index.yaml +25 -0
  58. package/src/skills/analyze-report/tests/rubrics/evidence-from-log.md +22 -0
  59. package/src/skills/analyze-report/tests/rubrics/result-block-format.md +22 -0
  60. package/src/skills/analyze-report/workflows/progress.md +158 -0
  61. package/src/skills/analyze-report/workflows/retrospective.md +143 -0
  62. package/src/skills/coach/README.md +43 -0
  63. package/src/skills/coach/SKILL.md +167 -0
  64. package/src/skills/coach/SKILL.md.legacy +157 -0
  65. package/src/skills/coach/algorithms/gap-analysis.md +69 -0
  66. package/src/skills/coach/algorithms/improvement-prioritization.md +62 -0
  67. package/src/skills/coach/algorithms/skill-scoring.md +80 -0
  68. package/src/skills/coach/knowledge/audit-applied-changes-clean.txt +11 -0
  69. package/src/skills/coach/knowledge/backlog-management.md +67 -0
  70. package/src/skills/coach/knowledge/backlog-management.md.legacy +90 -0
  71. package/src/skills/coach/knowledge/common-antipatterns.md +76 -0
  72. package/src/skills/coach/knowledge/prompt-engineering.md +45 -0
  73. package/src/skills/coach/knowledge/shared-knowledge-guide.md +44 -0
  74. package/src/skills/coach/knowledge/skill-anatomy.md +49 -0
  75. package/src/skills/coach/knowledge/test-authorship.md +141 -0
  76. package/src/skills/coach/templates/audit-report.md +39 -0
  77. package/src/skills/coach/templates/coach-backlog-init.yaml +14 -0
  78. package/src/skills/coach/templates/coach-backlog-init.yaml.legacy +10 -0
  79. package/src/skills/coach/templates/improvement-plan.md +42 -0
  80. package/src/skills/coach/templates/new-skill.md +95 -0
  81. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-1.md +58 -0
  82. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-2.md +65 -0
  83. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-3.md +58 -0
  84. package/src/skills/coach/tests/cases/TC-COACH-001/current/judge.json +151 -0
  85. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-1.md +46 -0
  86. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-2.md +0 -0
  87. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-3.md +75 -0
  88. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-1.md +81 -0
  89. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-2.md +101 -0
  90. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-3.md +91 -0
  91. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-1.md +48 -0
  92. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-2.md +30 -0
  93. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-3.md +55 -0
  94. package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +94 -0
  95. package/src/skills/coach/tests/cases/TC-COACH-001-evidence-based-temporal-diagram.yaml +53 -0
  96. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-1.md +46 -0
  97. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-2.md +50 -0
  98. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-3.md +48 -0
  99. package/src/skills/coach/tests/cases/TC-COACH-002/current/judge.json +151 -0
  100. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-1.md +0 -0
  101. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-2.md +37 -0
  102. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-3.md +30 -0
  103. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-1.md +23 -0
  104. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-2.md +29 -0
  105. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-3.md +35 -0
  106. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-1.md +13 -0
  107. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-2.md +19 -0
  108. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-3.md +33 -0
  109. package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +94 -0
  110. package/src/skills/coach/tests/cases/TC-COACH-002-root-cause-first.yaml +57 -0
  111. package/src/skills/coach/tests/fixtures/pipeline-2026-04-06_id-collision.log +77 -0
  112. package/src/skills/coach/tests/index.yaml +29 -0
  113. package/src/skills/coach/tests/rubrics/calibration/evidence-based-bad.md +13 -0
  114. package/src/skills/coach/tests/rubrics/calibration/evidence-based-good.md +29 -0
  115. package/src/skills/coach/tests/rubrics/evidence-based.md +26 -0
  116. package/src/skills/coach/tests/rubrics/root-cause-first.md +21 -0
  117. package/src/skills/coach/workflows/analyze.md +79 -0
  118. package/src/skills/coach/workflows/analyze.md.legacy +64 -0
  119. package/src/skills/coach/workflows/audit.md +74 -0
  120. package/src/skills/coach/workflows/audit.md.legacy +59 -0
  121. package/src/skills/coach/workflows/create.md +80 -0
  122. package/src/skills/coach/workflows/create.md.legacy +67 -0
  123. package/src/skills/coach/workflows/improve.md +71 -0
  124. package/src/skills/coach/workflows/improve.md.legacy +60 -0
  125. package/src/skills/coach/workflows/research.md +55 -0
  126. package/src/skills/coach/workflows/review.md +52 -0
  127. package/src/skills/coach/workflows/review.md.legacy +48 -0
  128. package/src/skills/coach/workflows/test.md +97 -0
  129. package/src/skills/create-plan/README.md +39 -0
  130. package/src/skills/create-plan/SKILL.md +104 -0
  131. package/src/skills/create-plan/algorithms/risk-assessment.md +73 -0
  132. package/src/skills/create-plan/knowledge/plan-completeness.md +67 -0
  133. package/src/skills/create-plan/knowledge/plan-lifecycle.md +33 -0
  134. package/src/skills/create-plan/knowledge/task-verification-pairs.md +151 -0
  135. package/src/skills/create-plan/scripts/validate-completeness.js +182 -0
  136. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-1.md +5 -0
  137. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-2.md +39 -0
  138. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-3.md +35 -0
  139. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/judge.json +167 -0
  140. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-1.md +5 -0
  141. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-2.md +10 -0
  142. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-3.md +5 -0
  143. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-1.md +26 -0
  144. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-2.md +86 -0
  145. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-3.md +5 -0
  146. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-1.md +11 -0
  147. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-2.md +15 -0
  148. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-3.md +14 -0
  149. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/meta.json +119 -0
  150. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001-validate-completeness.yaml +41 -0
  151. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-1.md +25 -0
  152. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-2.md +30 -0
  153. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-3.md +37 -0
  154. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/judge.json +164 -0
  155. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-1.md +3 -0
  156. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-2.md +11 -0
  157. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-3.md +13 -0
  158. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-1.md +44 -0
  159. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-2.md +5 -0
  160. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-3.md +49 -0
  161. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-1.md +6 -0
  162. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-2.md +11 -0
  163. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-3.md +16 -0
  164. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/meta.json +116 -0
  165. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002-task-granularity.yaml +39 -0
  166. package/src/skills/create-plan/tests/index.yaml +25 -0
  167. package/src/skills/create-plan/tests/rubrics/task-granularity.md +21 -0
  168. package/src/skills/create-plan/tests/rubrics/validate-completeness.md +21 -0
  169. package/src/skills/create-plan/workflows/create.md +136 -0
  170. package/src/skills/create-report/README.md +40 -0
  171. package/src/skills/create-report/SKILL.md +73 -0
  172. package/src/skills/create-report/algorithms/metric-calculation.md +93 -0
  173. package/src/skills/create-report/knowledge/report-metrics.md +82 -0
  174. package/src/skills/create-report/scripts/calc-metrics.js +383 -0
  175. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-1.md +25 -0
  176. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-2.md +26 -0
  177. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-3.md +28 -0
  178. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/judge.json +163 -0
  179. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-1.md +4 -0
  180. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-2.md +3 -0
  181. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-3.md +6 -0
  182. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-1.md +8 -0
  183. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-2.md +12 -0
  184. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-3.md +7 -0
  185. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-1.md +12 -0
  186. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-2.md +22 -0
  187. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-3.md +13 -0
  188. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/meta.json +115 -0
  189. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001-root-cause-attribution.yaml +57 -0
  190. package/src/skills/create-report/tests/index.yaml +20 -0
  191. package/src/skills/create-report/tests/rubrics/root-cause-attribution.md +21 -0
  192. package/src/skills/create-report/workflows/standard.md +175 -0
  193. package/src/skills/decompose-gaps/README.md +39 -0
  194. package/src/skills/decompose-gaps/SKILL.md +78 -0
  195. package/src/skills/decompose-gaps/algorithms/scope-check.md +110 -0
  196. package/src/skills/decompose-gaps/knowledge/scope-validation.md +65 -0
  197. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-1.md +41 -0
  198. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-2.md +41 -0
  199. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-3.md +56 -0
  200. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/judge.json +164 -0
  201. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-1.md +25 -0
  202. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-2.md +17 -0
  203. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-3.md +22 -0
  204. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-1.md +25 -0
  205. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-2.md +5 -0
  206. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-3.md +29 -0
  207. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-1.md +27 -0
  208. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-2.md +35 -0
  209. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-3.md +18 -0
  210. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/meta.json +116 -0
  211. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001-scope-exclusion.yaml +46 -0
  212. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-1.md +27 -0
  213. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-2.md +30 -0
  214. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-3.md +27 -0
  215. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/judge.json +163 -0
  216. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-1.md +0 -0
  217. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-2.md +15 -0
  218. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-3.md +7 -0
  219. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-1.md +21 -0
  220. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-2.md +38 -0
  221. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-3.md +16 -0
  222. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-1.md +5 -0
  223. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-2.md +10 -0
  224. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-3.md +9 -0
  225. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/meta.json +115 -0
  226. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002-glob-before-write.yaml +36 -0
  227. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-1.md +30 -0
  228. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-2.md +30 -0
  229. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/claude-sonnet/trial-3.md +30 -0
  230. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/judge.json +165 -0
  231. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-1.md +5 -0
  232. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-2.md +26 -0
  233. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-deepseek/trial-3.md +5 -0
  234. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-1.md +39 -0
  235. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-2.md +37 -0
  236. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-glm/trial-3.md +45 -0
  237. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-1.md +26 -0
  238. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-2.md +27 -0
  239. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/kilo-minimax/trial-3.md +7 -0
  240. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003/current/meta.json +117 -0
  241. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-003-parent-plan-mandatory.yaml +41 -0
  242. package/src/skills/decompose-gaps/tests/index.yaml +30 -0
  243. package/src/skills/decompose-gaps/tests/rubrics/glob-before-write.md +21 -0
  244. package/src/skills/decompose-gaps/tests/rubrics/parent-plan-mandatory.md +22 -0
  245. package/src/skills/decompose-gaps/tests/rubrics/scope-exclusion.md +21 -0
  246. package/src/skills/decompose-gaps/workflows/decompose.md +123 -0
  247. package/src/skills/decompose-plan/README.md +43 -0
  248. package/src/skills/decompose-plan/SKILL.md +87 -0
  249. package/src/skills/decompose-plan/algorithms/deduplication.md +101 -0
  250. package/src/skills/decompose-plan/knowledge/atomicity-checklist.md +139 -0
  251. package/src/skills/decompose-plan/knowledge/capabilities.md +68 -0
  252. package/src/skills/decompose-plan/knowledge/human-task-rules.md +82 -0
  253. package/src/skills/decompose-plan/knowledge/scope-guard-checklist.md +73 -0
  254. package/src/skills/decompose-plan/scripts/check-atomicity-limit.js +47 -0
  255. package/src/skills/decompose-plan/scripts/check-duplicates.js +323 -0
  256. package/src/skills/decompose-plan/scripts/verify-atomicity.js +408 -0
  257. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-1.md +30 -0
  258. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-2.md +36 -0
  259. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-3.md +37 -0
  260. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/judge.json +163 -0
  261. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-1.md +20 -0
  262. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-2.md +17 -0
  263. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-3.md +28 -0
  264. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-1.md +114 -0
  265. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-2.md +137 -0
  266. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-3.md +188 -0
  267. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-1.md +0 -0
  268. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-2.md +32 -0
  269. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-3.md +110 -0
  270. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/meta.json +115 -0
  271. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001-atomicity-no-1to1.yaml +56 -0
  272. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-1.md +47 -0
  273. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-2.md +54 -0
  274. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-3.md +43 -0
  275. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/judge.json +163 -0
  276. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-1.md +15 -0
  277. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-2.md +5 -0
  278. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-3.md +12 -0
  279. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-1.md +34 -0
  280. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-2.md +30 -0
  281. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-3.md +35 -0
  282. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-1.md +0 -0
  283. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-2.md +31 -0
  284. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-3.md +0 -0
  285. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/meta.json +115 -0
  286. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002-get-next-id-mandatory.yaml +44 -0
  287. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-1.md +21 -0
  288. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-2.md +38 -0
  289. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-3.md +30 -0
  290. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/judge.json +163 -0
  291. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-1.md +31 -0
  292. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-2.md +35 -0
  293. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-3.md +48 -0
  294. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-1.md +167 -0
  295. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-2.md +62 -0
  296. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-3.md +174 -0
  297. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-1.md +0 -0
  298. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-2.md +0 -0
  299. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-3.md +0 -0
  300. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/meta.json +115 -0
  301. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003-verbatim-dod-transfer.yaml +42 -0
  302. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-1.md +55 -0
  303. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-2.md +49 -0
  304. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/claude-sonnet/trial-3.md +49 -0
  305. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/judge.json +163 -0
  306. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-1.md +104 -0
  307. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-2.md +45 -0
  308. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-deepseek/trial-3.md +58 -0
  309. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-1.md +193 -0
  310. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-2.md +202 -0
  311. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-glm/trial-3.md +155 -0
  312. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-1.md +52 -0
  313. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-2.md +17 -0
  314. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/kilo-minimax/trial-3.md +0 -0
  315. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004/current/meta.json +115 -0
  316. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-004-executor-atomicity.yaml +64 -0
  317. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-1.md +59 -0
  318. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-2.md +204 -0
  319. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/claude-sonnet/trial-3.md +213 -0
  320. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/judge.json +163 -0
  321. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-1.md +0 -0
  322. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-2.md +57 -0
  323. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-deepseek/trial-3.md +54 -0
  324. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-1.md +147 -0
  325. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-2.md +165 -0
  326. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-glm/trial-3.md +133 -0
  327. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-1.md +81 -0
  328. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-2.md +108 -0
  329. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/kilo-minimax/trial-3.md +3 -0
  330. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005/current/meta.json +114 -0
  331. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-005-capabilities-registry.yaml +78 -0
  332. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-1.md +225 -0
  333. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-2.md +66 -0
  334. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/claude-sonnet/trial-3.md +36 -0
  335. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/judge.json +163 -0
  336. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-1.md +42 -0
  337. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-2.md +67 -0
  338. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-deepseek/trial-3.md +40 -0
  339. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-1.md +122 -0
  340. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-2.md +131 -0
  341. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-glm/trial-3.md +138 -0
  342. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-1.md +41 -0
  343. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-2.md +88 -0
  344. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/kilo-minimax/trial-3.md +0 -0
  345. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006/current/meta.json +115 -0
  346. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-006-dod-threshold.yaml +72 -0
  347. package/src/skills/decompose-plan/tests/index.yaml +45 -0
  348. package/src/skills/decompose-plan/tests/rubrics/atomicity-no-1to1.md +21 -0
  349. package/src/skills/decompose-plan/tests/rubrics/capabilities-registry.md +21 -0
  350. package/src/skills/decompose-plan/tests/rubrics/dod-threshold.md +21 -0
  351. package/src/skills/decompose-plan/tests/rubrics/executor-atomicity.md +21 -0
  352. package/src/skills/decompose-plan/tests/rubrics/get-next-id-mandatory.md +21 -0
  353. package/src/skills/decompose-plan/tests/rubrics/verbatim-dod-transfer.md +21 -0
  354. package/src/skills/decompose-plan/workflows/decompose.md +305 -0
  355. package/src/skills/deep-research/README.md +36 -0
  356. package/src/skills/deep-research/SKILL.md +106 -0
  357. package/src/skills/deep-research/algorithms/source-scoring.md +63 -0
  358. package/src/skills/deep-research/algorithms/synthesis.md +67 -0
  359. package/src/skills/deep-research/knowledge/data-validation.md +44 -0
  360. package/src/skills/deep-research/knowledge/perplexity-config.md +30 -0
  361. package/src/skills/deep-research/knowledge/research-methodology.md +54 -0
  362. package/src/skills/deep-research/knowledge/source-evaluation.md +33 -0
  363. package/src/skills/deep-research/scripts/perplexity-research.js +315 -0
  364. package/src/skills/deep-research/templates/brief-summary.md +25 -0
  365. package/src/skills/deep-research/templates/research-report.md +76 -0
  366. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-1.md +48 -0
  367. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-2.md +88 -0
  368. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-3.md +56 -0
  369. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/judge.json +163 -0
  370. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-1.md +58 -0
  371. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-2.md +249 -0
  372. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-3.md +44 -0
  373. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-1.md +96 -0
  374. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-2.md +56 -0
  375. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-3.md +94 -0
  376. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-1.md +11 -0
  377. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-2.md +1 -0
  378. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-3.md +1 -0
  379. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/meta.json +115 -0
  380. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001-self-check-url.yaml +58 -0
  381. package/src/skills/deep-research/tests/index.yaml +20 -0
  382. package/src/skills/deep-research/tests/rubrics/self-check-url.md +34 -0
  383. package/src/skills/deep-research/workflows/base-checklist.md +19 -0
  384. package/src/skills/deep-research/workflows/benchmark.md +38 -0
  385. package/src/skills/deep-research/workflows/competitor.md +44 -0
  386. package/src/skills/deep-research/workflows/custom.md +32 -0
  387. package/src/skills/deep-research/workflows/market.md +44 -0
  388. package/src/skills/deep-research/workflows/technology.md +40 -0
  389. package/src/skills/deep-research/workflows/trend.md +40 -0
  390. package/src/skills/execute-task/README.md +44 -0
  391. package/src/skills/execute-task/SKILL.md +292 -0
  392. package/src/skills/execute-task/algorithms/execution-strategy.md +136 -0
  393. package/src/skills/execute-task/knowledge/context-checkpoints.md +75 -0
  394. package/src/skills/execute-task/knowledge/ticket-structure.md +70 -0
  395. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-1.md +5 -0
  396. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-2.md +5 -0
  397. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-3.md +5 -0
  398. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/judge.json +124 -0
  399. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-1.md +4 -0
  400. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-2.md +4 -0
  401. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-3.md +4 -0
  402. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-1.md +4 -0
  403. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-2.md +4 -0
  404. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-3.md +11 -0
  405. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +88 -0
  406. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001-no-ticket-creation.yaml +48 -0
  407. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-1.md +5 -0
  408. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-2.md +6 -0
  409. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-3.md +5 -0
  410. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/judge.json +124 -0
  411. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-1.md +4 -0
  412. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-2.md +4 -0
  413. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-3.md +8 -0
  414. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-1.md +9 -0
  415. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-2.md +26 -0
  416. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-3.md +4 -0
  417. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/meta.json +89 -0
  418. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002-no-duplicate-dod.yaml +44 -0
  419. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-1.md +5 -0
  420. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-2.md +5 -0
  421. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-3.md +5 -0
  422. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/judge.json +46 -0
  423. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/meta.json +37 -0
  424. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003-verification-proportionality.yaml +46 -0
  425. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-1.md +18 -0
  426. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-2.md +16 -0
  427. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-3.md +14 -0
  428. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/judge.json +124 -0
  429. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-1.md +5 -0
  430. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-2.md +5 -0
  431. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-3.md +1 -0
  432. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-1.md +8 -0
  433. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-2.md +5 -0
  434. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-3.md +4 -0
  435. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/meta.json +89 -0
  436. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004-no-foreign-ticket-edit.yaml +50 -0
  437. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-1.md +5 -0
  438. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-2.md +5 -0
  439. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-3.md +5 -0
  440. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/judge.json +124 -0
  441. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-1.md +15 -0
  442. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-2.md +4 -0
  443. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-3.md +5 -0
  444. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-1.md +11 -0
  445. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-2.md +11 -0
  446. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-3.md +4 -0
  447. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +88 -0
  448. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005-ticket-fields-updated.yaml +39 -0
  449. package/src/skills/execute-task/tests/fixtures/IMPL-902-create-file.md +41 -0
  450. package/src/skills/execute-task/tests/fixtures/IMPL-904-current-task.md +40 -0
  451. package/src/skills/execute-task/tests/fixtures/IMPL-906-fill-ticket.md +42 -0
  452. package/src/skills/execute-task/tests/fixtures/QA-901-button-click.md +41 -0
  453. package/src/skills/execute-task/tests/fixtures/QA-903-visual-figma.md +40 -0
  454. package/src/skills/execute-task/tests/fixtures/TASK-905-done-with-typo.md +36 -0
  455. package/src/skills/execute-task/tests/index.yaml +39 -0
  456. package/src/skills/execute-task/tests/rubrics/no-duplicate-dod.md +22 -0
  457. package/src/skills/execute-task/tests/rubrics/no-foreign-ticket-edit.md +20 -0
  458. package/src/skills/execute-task/tests/rubrics/no-ticket-creation.md +21 -0
  459. package/src/skills/execute-task/tests/rubrics/ticket-fields-updated.md +23 -0
  460. package/src/skills/execute-task/tests/rubrics/verification-proportionality.md +22 -0
  461. package/src/skills/execute-task/workflows/execute.md +104 -0
  462. package/src/skills/manual-testing/README.md +63 -0
  463. package/src/skills/manual-testing/SKILL.md +176 -0
  464. package/src/skills/manual-testing/algorithms/blocked-tool-strategy.md +74 -0
  465. package/src/skills/manual-testing/algorithms/bug-severity.md +73 -0
  466. package/src/skills/manual-testing/algorithms/mcp-budget.md +97 -0
  467. package/src/skills/manual-testing/algorithms/test-prioritization.md +69 -0
  468. package/src/skills/manual-testing/knowledge/browser-extension-testing.md +102 -0
  469. package/src/skills/manual-testing/knowledge/browser-tools.md +114 -0
  470. package/src/skills/manual-testing/knowledge/desktop-tools-advanced.md +92 -0
  471. package/src/skills/manual-testing/knowledge/desktop-tools-core.md +76 -0
  472. package/src/skills/manual-testing/knowledge/sandbox-advanced.md +83 -0
  473. package/src/skills/manual-testing/knowledge/sandbox-core.md +67 -0
  474. package/src/skills/manual-testing/knowledge/stateful-edge-cases.md +69 -0
  475. package/src/skills/manual-testing/knowledge/test-case-design.md +107 -0
  476. package/src/skills/manual-testing/knowledge/testing-types.md +45 -0
  477. package/src/skills/manual-testing/templates/bug-report.md +52 -0
  478. package/src/skills/manual-testing/templates/test-case.md +34 -0
  479. package/src/skills/manual-testing/templates/test-plan.md +97 -0
  480. package/src/skills/manual-testing/templates/test-session-report.md +56 -0
  481. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-1.md +34 -0
  482. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-2.md +32 -0
  483. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-3.md +30 -0
  484. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/judge.json +163 -0
  485. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-1.md +0 -0
  486. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-2.md +7 -0
  487. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-3.md +0 -0
  488. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-1.md +4 -0
  489. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-2.md +15 -0
  490. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-3.md +8 -0
  491. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-1.md +5 -0
  492. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-2.md +7 -0
  493. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-3.md +7 -0
  494. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/meta.json +114 -0
  495. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001-sandbox-mandatory.yaml +38 -0
  496. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-1.md +44 -0
  497. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-2.md +32 -0
  498. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-3.md +47 -0
  499. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/judge.json +163 -0
  500. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-1.md +19 -0
  501. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-2.md +15 -0
  502. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-3.md +24 -0
  503. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-1.md +19 -0
  504. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-2.md +13 -0
  505. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-3.md +18 -0
  506. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-1.md +21 -0
  507. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-2.md +15 -0
  508. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-3.md +14 -0
  509. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/meta.json +114 -0
  510. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002-visual-tc-screenshot.yaml +37 -0
  511. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-1.md +76 -0
  512. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-2.md +71 -0
  513. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/claude-sonnet/trial-3.md +85 -0
  514. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/judge.json +46 -0
  515. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003/current/meta.json +36 -0
  516. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-003-qa-non-ui-assertion.yaml +65 -0
  517. package/src/skills/manual-testing/tests/index.yaml +30 -0
  518. package/src/skills/manual-testing/tests/last-run-tc001-sonnet.log +140 -0
  519. package/src/skills/manual-testing/tests/last-run-tc002.log +1 -0
  520. package/src/skills/manual-testing/tests/last-run.log +1469 -0
  521. package/src/skills/manual-testing/tests/rubrics/qa-non-ui-assertion.md +31 -0
  522. package/src/skills/manual-testing/tests/rubrics/sandbox-mandatory.md +20 -0
  523. package/src/skills/manual-testing/tests/rubrics/visual-tc-screenshot.md +21 -0
  524. package/src/skills/manual-testing/workflows/acceptance.md +80 -0
  525. package/src/skills/manual-testing/workflows/exploratory.md +84 -0
  526. package/src/skills/manual-testing/workflows/regression.md +76 -0
  527. package/src/skills/manual-testing/workflows/smoke.md +109 -0
  528. package/src/skills/manual-testing/workflows/test-plan.md +75 -0
  529. package/src/skills/review-result/README.md +59 -0
  530. package/src/skills/review-result/SKILL.md +138 -0
  531. package/src/skills/review-result/algorithms/verification.md +112 -0
  532. package/src/skills/review-result/knowledge/dod-patterns.md +115 -0
  533. package/src/skills/review-result/scripts/verify-artifacts.js +384 -0
  534. package/src/skills/review-result/templates/verdict.md +153 -0
  535. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-1.md +22 -0
  536. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-2.md +7 -0
  537. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-3.md +21 -0
  538. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-1.md +6 -0
  539. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-2.md +6 -0
  540. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-3.md +18 -0
  541. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/judge.json +164 -0
  542. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-1.md +5 -0
  543. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-2.md +7 -0
  544. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-3.md +6 -0
  545. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-1.md +49 -0
  546. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-2.md +28 -0
  547. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-3.md +37 -0
  548. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-1.md +22 -0
  549. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-2.md +13 -0
  550. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-3.md +21 -0
  551. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/meta.json +116 -0
  552. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001-visual-tc-trigger.yaml +51 -0
  553. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-1.md +23 -0
  554. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-2.md +22 -0
  555. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-3.md +28 -0
  556. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-1.md +4 -0
  557. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-2.md +36 -0
  558. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-3.md +4 -0
  559. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/judge.json +163 -0
  560. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-1.md +4 -0
  561. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-2.md +0 -0
  562. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-3.md +4 -0
  563. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-1.md +39 -0
  564. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-2.md +25 -0
  565. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-3.md +32 -0
  566. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-1.md +34 -0
  567. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-2.md +8 -0
  568. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-3.md +23 -0
  569. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/meta.json +115 -0
  570. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002-path-line-suffix.yaml +39 -0
  571. package/src/skills/review-result/tests/fixtures/IMPL-902-path-with-line.md +43 -0
  572. package/src/skills/review-result/tests/fixtures/QA-901-visual-button.md +46 -0
  573. package/src/skills/review-result/tests/index.yaml +25 -0
  574. package/src/skills/review-result/tests/rubrics/path-line-suffix.md +19 -0
  575. package/src/skills/review-result/tests/rubrics/visual-tc-trigger.md +19 -0
  576. package/src/skills/review-result/workflows/review.md +209 -0
@@ -13,6 +13,55 @@ const __filename = fileURLToPath(import.meta.url);
13
13
  const __dirname = path.dirname(__filename);
14
14
  const projectRoot = findProjectRoot(process.cwd());
15
15
 
16
+ import os from 'os';
17
+ import { execSync } from 'child_process';
18
+
19
+ function createTestWorkdir(skillName, suffix = '') {
20
+ const prefix = suffix ? `wf-test-${skillName}-${suffix}-` : `wf-test-${skillName}-`;
21
+ const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
22
+ const workflowDir = path.join(tmpRoot, '.workflow');
23
+ fs.mkdirSync(workflowDir, { recursive: true });
24
+ for (const sub of ['tickets/backlog', 'tickets/ready', 'tickets/in-progress', 'tickets/review', 'tickets/done', 'tickets/archive', 'plans/current', 'plans/archive', 'reports', 'logs']) {
25
+ fs.mkdirSync(path.join(workflowDir, sub), { recursive: true });
26
+ }
27
+ fs.writeFileSync(path.join(workflowDir, 'coach-backlog.yaml'), 'version: 1\nanalyzed_tickets: []\naudited_skills: {}\n', 'utf8');
28
+
29
+ const srcDir = path.join(workflowDir, 'src');
30
+ fs.mkdirSync(srcDir, { recursive: true });
31
+ const realSkills = path.join(projectRoot, 'src', 'skills');
32
+ const realScripts = path.join(projectRoot, 'src', 'scripts');
33
+ const linkSkills = path.join(srcDir, 'skills');
34
+ const linkScripts = path.join(srcDir, 'scripts');
35
+ const configDir = path.join(workflowDir, 'config');
36
+ const realConfigs = path.join(projectRoot, 'configs');
37
+
38
+ // Skills are COPIED (not junctioned) so that agents cannot write to real source files.
39
+ fs.cpSync(realSkills, linkSkills, { recursive: true, dereference: true });
40
+
41
+ // Scripts and configs are junctioned — read-only for agents in practice.
42
+ if (process.platform === 'win32') {
43
+ try { execSync(`mklink /J "${linkScripts}" "${realScripts}"`, { stdio: 'pipe', shell: true }); } catch {}
44
+ try { execSync(`mklink /J "${configDir}" "${realConfigs}"`, { stdio: 'pipe', shell: true }); } catch {}
45
+ } else {
46
+ try { fs.symlinkSync(realScripts, linkScripts, 'dir'); } catch {}
47
+ try { fs.symlinkSync(realConfigs, configDir, 'dir'); } catch {}
48
+ }
49
+
50
+ return tmpRoot;
51
+ }
52
+
53
+ function cleanupTestWorkdir(tmpRoot) {
54
+ if (!tmpRoot || !fs.existsSync(tmpRoot)) return;
55
+ // Remove junctions first so that their targets are not touched by rmSync.
56
+ if (process.platform === 'win32') {
57
+ for (const link of ['src/scripts', 'config']) {
58
+ const p = path.join(tmpRoot, '.workflow', link);
59
+ try { execSync(`rmdir "${p}"`, { stdio: 'pipe', shell: true }); } catch {}
60
+ }
61
+ }
62
+ try { fs.rmSync(tmpRoot, { recursive: true, force: true }); } catch {}
63
+ }
64
+
16
65
  function parseArgs() {
17
66
  const args = process.argv.slice(2);
18
67
  const opts = {
@@ -720,13 +769,23 @@ async function writeJudgeResults(skillName, caseId, results) {
720
769
  const skillsDir = findSkillsDir();
721
770
  const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
722
771
  ensureDir(caseDir);
723
-
724
- const judgeData = {
725
- per_model: {},
726
- rubric_scores: results.rubric_scores || [],
727
- timestamp: new Date().toISOString()
728
- };
729
-
772
+
773
+ const judgePath = path.join(caseDir, 'judge.json');
774
+ let judgeData = { per_model: {}, rubric_scores: [], timestamp: new Date().toISOString() };
775
+ if (fs.existsSync(judgePath)) {
776
+ try {
777
+ const existing = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
778
+ judgeData.per_model = existing.per_model || {};
779
+ judgeData.rubric_scores = existing.rubric_scores || [];
780
+ } catch {}
781
+ }
782
+
783
+ const newAgentIds = new Set(Object.keys(results.per_model || {}));
784
+ judgeData.rubric_scores = judgeData.rubric_scores.filter(r => !newAgentIds.has(r.agentId));
785
+ for (const r of (results.rubric_scores || [])) {
786
+ judgeData.rubric_scores.push(r);
787
+ }
788
+
730
789
  for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
731
790
  judgeData.per_model[agentId] = {
732
791
  pass_count: modelData.pass_count,
@@ -738,12 +797,10 @@ async function writeJudgeResults(skillName, caseId, results) {
738
797
  }))
739
798
  };
740
799
  }
741
-
742
- fs.writeFileSync(
743
- path.join(caseDir, 'judge.json'),
744
- JSON.stringify(judgeData, null, 2),
745
- 'utf8'
746
- );
800
+
801
+ judgeData.timestamp = new Date().toISOString();
802
+
803
+ fs.writeFileSync(judgePath, JSON.stringify(judgeData, null, 2), 'utf8');
747
804
  }
748
805
 
749
806
  async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
@@ -776,7 +833,7 @@ async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0
776
833
  }
777
834
 
778
835
  async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
779
- const { trials = 3, concurrency = 2, timeout = 300 } = options;
836
+ const { trials = 3, timeout = 300 } = options;
780
837
 
781
838
  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
782
839
  if (!judgeAgentConfig) {
@@ -799,12 +856,12 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
799
856
  };
800
857
 
801
858
  const caseId = caseDef?.id || 'unknown';
802
-
803
- function buildTargetPrompt() {
859
+
860
+ function buildTargetPrompt(taskWorkdir) {
804
861
  let targetPrompt = '';
805
862
  const testsDir = findSkillTestsDir(skillName);
806
863
  const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';
807
-
864
+
808
865
  if (testCase.scenario?.system_prompt_file) {
809
866
  const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
810
867
  if (fs.existsSync(systemPromptPath)) {
@@ -824,6 +881,28 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
824
881
  targetPrompt += `## ${input.as || 'Input'}\n`;
825
882
  targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
826
883
  }
884
+ } else if (input.kind === 'inline') {
885
+ if (input.content) {
886
+ targetPrompt += `## ${input.as || 'Input'}\n`;
887
+ targetPrompt += input.content + '\n\n';
888
+ }
889
+ } else if (input.kind === 'ticket_file') {
890
+ const fixturePath = path.join(testsDir, caseDir, input.path);
891
+ const destDir = input.dest_dir || 'in-progress';
892
+ const ticketId = input.ticket_id;
893
+ if (!ticketId) {
894
+ throw new Error(`ticket_file input requires ticket_id (case ${caseId})`);
895
+ }
896
+ if (!taskWorkdir) {
897
+ throw new Error(`ticket_file input requires task workdir (case ${caseId})`);
898
+ }
899
+ if (!fs.existsSync(fixturePath)) {
900
+ throw new Error(`ticket_file fixture not found: ${fixturePath}`);
901
+ }
902
+ const destPath = path.join(taskWorkdir, '.workflow', 'tickets', destDir, `${ticketId}.md`);
903
+ fs.mkdirSync(path.dirname(destPath), { recursive: true });
904
+ fs.copyFileSync(fixturePath, destPath);
905
+ targetPrompt += `## Context\nticket_id: ${ticketId}\n\n`;
827
906
  }
828
907
  }
829
908
  }
@@ -831,46 +910,65 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
831
910
  if (!targetPrompt.trim()) {
832
911
  targetPrompt = testCase.prompt || testCase.input || '';
833
912
  }
834
-
913
+
835
914
  return targetPrompt;
836
915
  }
837
916
 
917
+ const allTasks = [];
838
918
  for (const agentId of targetAgents) {
839
919
  const agentConfig = pipelineConfig.agents[agentId];
840
920
  if (!agentConfig) {
841
921
  throw new Error(`Target agent not found: ${agentId}`);
842
922
  }
843
-
844
923
  results.per_model[agentId] = {
845
924
  trials: [],
846
925
  pass_count: 0,
847
926
  total: trials
848
927
  };
849
-
850
- const tasks = [];
851
928
  for (let trial = 1; trial <= trials; trial++) {
852
- tasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
929
+ allTasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
853
930
  }
854
-
855
- for (let i = 0; i < tasks.length; i += concurrency) {
856
- const batch = tasks.slice(i, i + concurrency);
857
- const batchResults = await Promise.all(
858
- batch.map(async (task) => {
859
- try {
860
- const targetPrompt = buildTargetPrompt();
861
- const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
862
- timeout,
863
- stageId: `${caseId}-${task.agentId}-trial-${task.trial}`
864
- });
865
-
866
- const judgePrompt = `You are a judge evaluating the output of an AI agent.
931
+ }
932
+
933
+ const allResults = await Promise.all(
934
+ allTasks.map(async (task) => {
935
+ const taskSuffix = `${caseId}-${task.agentId}-t${task.trial}`;
936
+ let taskWorkdir = null;
937
+ try {
938
+ taskWorkdir = createTestWorkdir(skillName, taskSuffix);
939
+ const targetPrompt = buildTargetPrompt(taskWorkdir);
940
+ const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
941
+ timeout,
942
+ stageId: `${caseId}-${task.agentId}-trial-${task.trial}`,
943
+ projectRoot: taskWorkdir
944
+ });
945
+
946
+ // Snapshot ticket files after target-run (for judge to inspect actual file state).
947
+ let ticketFilesSection = '';
948
+ const ticketInputs = (testCase.scenario?.inputs || []).filter(i => i.kind === 'ticket_file');
949
+ for (const input of ticketInputs) {
950
+ const ticketPath = path.join(
951
+ taskWorkdir,
952
+ '.workflow', 'tickets',
953
+ input.dest_dir || 'in-progress',
954
+ `${input.ticket_id}.md`
955
+ );
956
+ if (fs.existsSync(ticketPath)) {
957
+ const content = fs.readFileSync(ticketPath, 'utf8');
958
+ ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id} (${input.dest_dir || 'in-progress'}/)\n\n\`\`\`markdown\n${content}\n\`\`\`\n`;
959
+ } else {
960
+ ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id}\n\n(file missing at ${input.dest_dir || 'in-progress'}/${input.ticket_id}.md)\n`;
961
+ }
962
+ }
963
+
964
+ const judgePrompt = `You are a judge evaluating the output of an AI agent.
867
965
 
868
966
  ## Rubric
869
967
  ${rubric}
870
968
 
871
969
  ## Target Agent Output
872
970
  ${targetOutput.output || targetOutput.status || 'No output'}
873
-
971
+ ${ticketFilesSection}
874
972
  ## Task
875
973
  ${testCase.description || testCase.name || 'Evaluate the response'}
876
974
 
@@ -881,54 +979,77 @@ score: <number 1-5>
881
979
  reason: <brief explanation>
882
980
  ---RESULT---`;
883
981
 
884
- const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
885
- timeout: 60,
886
- stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
887
- });
888
-
889
- let score = 3;
890
- const parsed = parseJudgeResult(judgeResult.output);
891
- if (parsed && parsed.score) {
892
- score = parsed.score;
893
- }
894
-
895
- await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
896
-
897
- return {
898
- trial: task.trial,
899
- agentId: task.agentId,
900
- score,
901
- output: targetOutput.output || '',
902
- judge_output: judgeResult.output || '',
903
- passed: score >= 4
904
- };
905
- } catch (err) {
906
- console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
907
- return {
908
- trial: task.trial,
909
- agentId: task.agentId,
910
- score: 1,
911
- error: err.message,
912
- passed: false
913
- };
914
- }
915
- })
916
- );
917
-
918
- for (const result of batchResults) {
919
- results.per_model[result.agentId].trials.push(result);
920
- if (result.passed) {
921
- results.per_model[result.agentId].pass_count++;
922
- }
923
- results.rubric_scores.push({
924
- agentId: result.agentId,
925
- trial: result.trial,
926
- score: result.score
982
+ const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
983
+ timeout: 60,
984
+ stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
927
985
  });
986
+
987
+ let score = 3;
988
+ const parsed = parseJudgeResult(judgeResult.output);
989
+ if (parsed && parsed.score) {
990
+ score = parsed.score;
991
+ }
992
+
993
+ await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
994
+
995
+ return {
996
+ trial: task.trial,
997
+ agentId: task.agentId,
998
+ score,
999
+ output: targetOutput.output || '',
1000
+ judge_output: judgeResult.output || '',
1001
+ passed: score >= 4,
1002
+ errored: false
1003
+ };
1004
+ } catch (err) {
1005
+ console.error(`[Runner] Trial errored: ${task.agentId} trial ${task.trial} — ${err.message}`);
1006
+ try {
1007
+ await writeTrialOutput(
1008
+ skillName,
1009
+ caseId,
1010
+ task.agentId,
1011
+ task.trial,
1012
+ `# TRIAL ERRORED\n\nagent: ${task.agentId}\ntrial: ${task.trial}\nerror: ${err.message}\n`
1013
+ );
1014
+ } catch {}
1015
+ return {
1016
+ trial: task.trial,
1017
+ agentId: task.agentId,
1018
+ score: null,
1019
+ error: err.message,
1020
+ passed: false,
1021
+ errored: true
1022
+ };
1023
+ } finally {
1024
+ if (taskWorkdir) {
1025
+ cleanupTestWorkdir(taskWorkdir);
1026
+ }
928
1027
  }
1028
+ })
1029
+ );
1030
+
1031
+ for (const result of allResults) {
1032
+ results.per_model[result.agentId].trials.push(result);
1033
+ if (result.errored) {
1034
+ results.per_model[result.agentId].error_count = (results.per_model[result.agentId].error_count || 0) + 1;
1035
+ } else if (result.passed) {
1036
+ results.per_model[result.agentId].pass_count++;
929
1037
  }
1038
+ results.rubric_scores.push({
1039
+ agentId: result.agentId,
1040
+ trial: result.trial,
1041
+ score: result.score,
1042
+ errored: !!result.errored,
1043
+ error: result.error || undefined
1044
+ });
930
1045
  }
931
-
1046
+ for (const agentId of Object.keys(results.per_model)) {
1047
+ results.per_model[agentId].trials.sort((a, b) => a.trial - b.trial);
1048
+ }
1049
+ results.rubric_scores.sort((a, b) =>
1050
+ a.agentId === b.agentId ? a.trial - b.trial : a.agentId.localeCompare(b.agentId)
1051
+ );
1052
+
932
1053
  return results;
933
1054
  }
934
1055
 
@@ -961,19 +1082,27 @@ function aggregateResults(results, testCase) {
961
1082
 
962
1083
  for (const [agentId, modelData] of Object.entries(results.per_model)) {
963
1084
  const passCount = modelData.pass_count;
1085
+ const errorCount = modelData.error_count || 0;
964
1086
  const total = modelData.total;
1087
+ const effective = total - errorCount;
965
1088
  const threshold = Math.ceil(total / 2);
966
-
1089
+
967
1090
  let passed;
968
- if (useAll) {
1091
+ let errored = false;
1092
+ if (effective === 0) {
1093
+ passed = false;
1094
+ errored = true;
1095
+ } else if (useAll) {
969
1096
  passed = passCount === total;
970
1097
  } else {
971
1098
  passed = passCount >= threshold;
972
1099
  }
973
-
1100
+
974
1101
  perModelResults[agentId] = {
975
1102
  passed,
1103
+ errored,
976
1104
  pass_count: passCount,
1105
+ error_count: errorCount,
977
1106
  total,
978
1107
  threshold: useAll ? total : threshold
979
1108
  };
@@ -991,40 +1120,66 @@ async function writeMetaJson(caseId, skillName, status, durationMs, l2Results =
991
1120
  const skillsDir = findSkillsDir();
992
1121
  const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
993
1122
  ensureDir(caseDir);
994
-
1123
+
1124
+ const metaPath = path.join(caseDir, 'meta.json');
1125
+ let existing = null;
1126
+ if (fs.existsSync(metaPath)) {
1127
+ try {
1128
+ existing = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
1129
+ } catch {}
1130
+ }
1131
+
995
1132
  const meta = {
996
1133
  date: new Date().toISOString(),
997
1134
  skill_sha: getSkillSha(skillName),
998
1135
  status,
999
1136
  duration_ms: durationMs
1000
1137
  };
1001
-
1138
+
1002
1139
  if (l1_skipped) {
1003
1140
  meta.l1_skipped = true;
1004
1141
  }
1005
-
1142
+
1143
+ const mergedPerModel = (existing && existing.per_model) ? { ...existing.per_model } : {};
1144
+ let mergedRubricScores = (existing && existing.rubric_scores) ? [...existing.rubric_scores] : [];
1145
+
1006
1146
  if (l2Results) {
1007
1147
  const aggregated = aggregateResults(l2Results, {});
1008
- meta.per_model = aggregated.per_model;
1009
- meta.rubric_scores = l2Results.rubric_scores;
1148
+ const newAgentIds = new Set(Object.keys(aggregated.per_model || {}));
1149
+ for (const [agentId, data] of Object.entries(aggregated.per_model || {})) {
1150
+ mergedPerModel[agentId] = data;
1151
+ }
1152
+ mergedRubricScores = mergedRubricScores.filter(r => !newAgentIds.has(r.agentId));
1153
+ for (const r of (l2Results.rubric_scores || [])) {
1154
+ mergedRubricScores.push(r);
1155
+ }
1010
1156
  if (l2Results.tokens) {
1011
1157
  meta.tokens = l2Results.tokens;
1012
1158
  }
1013
1159
  }
1014
-
1015
- fs.writeFileSync(
1016
- path.join(caseDir, 'meta.json'),
1017
- JSON.stringify(meta, null, 2),
1018
- 'utf8'
1019
- );
1160
+
1161
+ if (Object.keys(mergedPerModel).length > 0) {
1162
+ meta.per_model = mergedPerModel;
1163
+ }
1164
+ if (mergedRubricScores.length > 0) {
1165
+ meta.rubric_scores = mergedRubricScores;
1166
+ }
1167
+
1168
+ const allPassed = Object.values(mergedPerModel).every(m => m.passed);
1169
+ if (Object.keys(mergedPerModel).length > 0) {
1170
+ meta.status = allPassed ? 'passed' : 'failed';
1171
+ }
1172
+
1173
+ fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2), 'utf8');
1020
1174
  }
1021
1175
 
1022
1176
  async function runTestsForSkill(skillName, opts) {
1177
+ console.log(`[Runner] Per-task isolated workdirs will be created for each (case × agent × trial)`);
1023
1178
  const result = {
1024
1179
  skill: skillName,
1025
1180
  status: 'passed',
1026
1181
  total: 0,
1027
- current_run: { passed: 0, failed: 0 },
1182
+ current_run: { passed: 0, failed: 0, no_coverage: 0 },
1028
1183
  baseline_ref: 'origin/main',
1029
1184
  target_agents: [],
1030
1185
  judge_agent: null
@@ -1117,14 +1272,56 @@ async function runTestsForSkill(skillName, opts) {
1117
1272
 
1118
1273
  const runL2 = !opts.layer || opts.layer === 'l2';
1119
1274
 
1120
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent) {
1275
+ const casesWithRubric = cases.filter(cd => {
1276
+ try {
1277
+ const tc = loadTestCase(skillName, cd.file);
1278
+ return tc.assertions?.rubric && tc.assertions.rubric.length > 0;
1279
+ } catch { return false; }
1280
+ });
1281
+ const anyHasRubric = casesWithRubric.length > 0;
1282
+
1283
+ if (casesWithRubric.length < cases.length) {
1284
+ const missing = cases.length - casesWithRubric.length;
1285
+ console.log(`[Runner] ${missing}/${cases.length} cases have no rubric — L2 will be skipped for them`);
1286
+ }
1287
+
1288
+ if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric) {
1121
1289
  const trials = opts.fast ? 1 : 3;
1122
1290
  const totalModels = effectiveTargetAgents.length;
1123
- const llmEstimate = cases.length * totalModels * trials * 2;
1124
- await preFlightApproval(cases.length, totalModels, trials);
1291
+ await preFlightApproval(casesWithRubric.length, totalModels, trials);
1292
+ }
1293
+
1294
+ let secretScanFailed = false;
1295
+ let calibrationFailedResult = null;
1296
+
1297
+ const anyRunL1 = !opts.layer || opts.layer === 'deterministic';
1298
+ const anyRunL2 = !opts.layer || opts.layer === 'l2';
1299
+
1300
+ if (anyRunL1 && !opts.skipSecretScan) {
1301
+ const scanResult = await runSecretScan();
1302
+ if (!scanResult.passed) {
1303
+ secretScanFailed = true;
1304
+ result.error = 'Secret scan failed - secrets detected in fixtures';
1305
+ }
1125
1306
  }
1126
1307
 
1127
- for (const caseDef of cases) {
1308
+ if (anyRunL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric && !secretScanFailed) {
1309
+ const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1310
+ if (!calibrationResult.passed) {
1311
+ console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
1312
+ calibrationFailedResult = calibrationResult;
1313
+ result.status = 'calibration_failed';
1314
+ result.error = calibrationResult.error;
1315
+ result.calibration = calibrationResult;
1316
+ return { ...result, cases, currentRunStatuses };
1317
+ }
1318
+ if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
1319
+ console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
1320
+ }
1321
+ console.log('[Runner] Calibration gate PASSED');
1322
+ }
1323
+
1324
+ await Promise.all(cases.map(async (caseDef) => {
1128
1325
  const caseStart = Date.now();
1129
1326
 
1130
1327
  try {
@@ -1136,17 +1333,13 @@ async function runTestsForSkill(skillName, opts) {
1136
1333
  const runL1 = !opts.layer || opts.layer === 'deterministic';
1137
1334
  const runL2 = !opts.layer || opts.layer === 'l2';
1138
1335
 
1139
- // Secret scan (only for deterministic layer)
1140
- if (runL1 && !opts.skipSecretScan) {
1141
- const scanResult = await runSecretScan();
1142
- if (!scanResult.passed) {
1143
- result.current_run.failed++;
1144
- result.status = 'failed';
1145
- result.error = 'Secret scan failed - secrets detected in fixtures';
1146
- currentRunStatuses[caseDef.id] = 'failed';
1147
- await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1148
- continue;
1149
- }
1336
+ // Secret scan result propagated from pre-loop
1337
+ if (runL1 && !opts.skipSecretScan && secretScanFailed) {
1338
+ result.current_run.failed++;
1339
+ result.status = 'failed';
1340
+ currentRunStatuses[caseDef.id] = 'failed';
1341
+ await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1342
+ return;
1150
1343
  }
1151
1344
 
1152
1345
  // L0 static assertions
@@ -1158,7 +1351,7 @@ async function runTestsForSkill(skillName, opts) {
1158
1351
  result.status = 'failed';
1159
1352
  currentRunStatuses[caseDef.id] = 'failed';
1160
1353
  await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1161
- continue;
1354
+ return;
1162
1355
  }
1163
1356
  }
1164
1357
 
@@ -1167,13 +1360,28 @@ async function runTestsForSkill(skillName, opts) {
1167
1360
  const l1Results = runL1Assertions(mockOutput, testCase);
1168
1361
  const l1Failed = l1Results.filter(r => !r.passed);
1169
1362
  const l1Skipped = l1Results.some(r => r.skipped);
1363
+ const l1Declared = (testCase.assertions?.deterministic || []).length;
1364
+ const l1Executed = l1Results.filter(r => !r.skipped).length;
1170
1365
 
1171
- const caseStatus = l1Failed.length === 0 ? 'passed' : 'failed';
1172
- currentRunStatuses[caseDef.id] = caseStatus;
1366
+ const willRunL2 = runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric;
1367
+ const noCoverage = l1Declared > 0 && l1Executed === 0 && !willRunL2;
1173
1368
 
1369
+ let caseStatus;
1174
1370
  if (l1Failed.length > 0) {
1371
+ caseStatus = 'failed';
1372
+ } else if (noCoverage) {
1373
+ caseStatus = 'no_coverage';
1374
+ } else {
1375
+ caseStatus = 'passed';
1376
+ }
1377
+ currentRunStatuses[caseDef.id] = caseStatus;
1378
+
1379
+ if (caseStatus === 'failed') {
1175
1380
  result.current_run.failed++;
1176
1381
  result.status = 'failed';
1382
+ } else if (caseStatus === 'no_coverage') {
1383
+ result.current_run.no_coverage = (result.current_run.no_coverage || 0) + 1;
1384
+ console.log(`[Runner] ${caseDef.id}: no_coverage — L1 assertions require agent output but L2 is not configured (no rubric or no agents)`);
1177
1385
  } else {
1178
1386
  result.current_run.passed++;
1179
1387
  }
@@ -1182,36 +1390,25 @@ async function runTestsForSkill(skillName, opts) {
1182
1390
  result.l1_skipped = true;
1183
1391
  }
1184
1392
 
1185
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
1186
- const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1187
-
1188
- if (!calibrationResult.passed) {
1189
- console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
1190
- result.status = 'calibration_failed';
1191
- result.error = calibrationResult.error;
1192
- result.calibration = calibrationResult;
1193
- return result;
1194
- }
1195
-
1196
- if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
1197
- console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
1198
- }
1199
-
1200
- console.log('[Runner] Calibration gate PASSED');
1201
- }
1202
-
1203
1393
  let l2Results = null;
1204
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
1394
+ if (willRunL2) {
1205
1395
  const trials = opts.fast ? 1 : 3;
1206
1396
  const index = loadIndexYaml(skillName);
1207
1397
  const defaultTimeout = index.execution?.default_timeout_s || 300;
1208
1398
  const timeout = testCase.execution?.timeout_s || defaultTimeout;
1399
+ const caseTargetAgents = testCase.execution?.target_agents;
1400
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1401
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1402
+ : effectiveTargetAgents;
1403
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1404
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1405
+ }
1209
1406
  try {
1210
1407
  l2Results = await runL2Evaluation(
1211
1408
  skillName,
1212
1409
  testCase,
1213
1410
  caseDef,
1214
- effectiveTargetAgents,
1411
+ perCaseAgents,
1215
1412
  judgeAgent,
1216
1413
  pipelineConfig,
1217
1414
  { trials, concurrency: 2, timeout }
@@ -1238,6 +1435,13 @@ async function runTestsForSkill(skillName, opts) {
1238
1435
  const trials = opts.fast ? 1 : 3;
1239
1436
  const defaultTimeout = index.execution?.default_timeout_s || 300;
1240
1437
  const timeout = testCase.execution?.timeout_s || defaultTimeout;
1438
+ const caseTargetAgents = testCase.execution?.target_agents;
1439
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1440
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1441
+ : effectiveTargetAgents;
1442
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1443
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1444
+ }
1241
1445
  let l2Results = null;
1242
1446
  let caseStatus = 'passed';
1243
1447
  try {
@@ -1245,7 +1449,7 @@ async function runTestsForSkill(skillName, opts) {
1245
1449
  skillName,
1246
1450
  testCase,
1247
1451
  caseDef,
1248
- effectiveTargetAgents,
1452
+ perCaseAgents,
1249
1453
  judgeAgent,
1250
1454
  pipelineConfig,
1251
1455
  { trials, concurrency: 2, timeout }
@@ -1283,6 +1487,10 @@ async function runTestsForSkill(skillName, opts) {
1283
1487
  currentRunStatuses[caseDef.id] = 'error';
1284
1488
  await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
1285
1489
  }
1490
+ }));
1491
+
1492
+ if (result.status === 'passed' && result.current_run.no_coverage > 0 && result.current_run.passed === 0) {
1493
+ result.status = 'no_coverage';
1286
1494
  }
1287
1495
  } catch (e) {
1288
1496
  result.status = 'error';
@@ -1307,7 +1515,7 @@ async function runSkillTests(opts) {
1307
1515
  skill: opts.skill || 'unknown',
1308
1516
  mode: 'deterministic',
1309
1517
  total: 0,
1310
- current_run: { passed: 0, failed: 0 },
1518
+ current_run: { passed: 0, failed: 0, no_coverage: 0 },
1311
1519
  baseline_ref: 'origin/main',
1312
1520
  git_head_comparison: null,
1313
1521
  verdict: 'ready_for_user_review',
@@ -1323,6 +1531,7 @@ async function runSkillTests(opts) {
1323
1531
  results.total = skillResult.total;
1324
1532
  results.current_run.passed = skillResult.current_run.passed;
1325
1533
  results.current_run.failed = skillResult.current_run.failed;
1534
+ results.current_run.no_coverage = skillResult.current_run.no_coverage || 0;
1326
1535
  results.status = skillResult.status;
1327
1536
  results.target_agents = skillResult.target_agents;
1328
1537
  results.judge_agent = skillResult.judge_agent;
@@ -1406,7 +1615,7 @@ async function runSkillTests(opts) {
1406
1615
  }
1407
1616
 
1408
1617
  return results;
1409
- }
1618
+ }
1410
1619
 
1411
1620
  function printResult(result) {
1412
1621
  console.log('---RESULT---');
@@ -1416,6 +1625,9 @@ function printResult(result) {
1416
1625
  console.log(`total: ${result.total}`);
1417
1626
  console.log(`current_run.passed: ${result.current_run.passed}`);
1418
1627
  console.log(`current_run.failed: ${result.current_run.failed}`);
1628
+ if (result.current_run.no_coverage) {
1629
+ console.log(`current_run.no_coverage: ${result.current_run.no_coverage}`);
1630
+ }
1419
1631
 
1420
1632
  if (result.baseline_ref) {
1421
1633
  console.log(`baseline_ref: ${result.baseline_ref}`);