workflow-ai 1.0.62 → 1.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (505)
  1. package/README.md +61 -0
  2. package/agent-templates/CLAUDE.md.tpl +2 -0
  3. package/agent-templates/QWEN.md.tpl +2 -0
  4. package/configs/config.yaml +134 -0
  5. package/configs/pipeline.yaml +884 -0
  6. package/configs/ticket-movement-rules.yaml +80 -0
  7. package/package.json +2 -1
  8. package/src/global-dir.mjs +25 -1
  9. package/src/init.mjs +5 -4
  10. package/src/lib/agent-spawner.mjs +338 -0
  11. package/src/runner.mjs +15 -14
  12. package/src/scripts/get-next-test-id.js +94 -0
  13. package/src/scripts/migrate-backlog-to-tests.js +406 -0
  14. package/src/scripts/run-skill-tests.js +1703 -0
  15. package/src/scripts/scan-fixtures-for-secrets.js +248 -0
  16. package/src/scripts/tests/timeout-cascade.test.js +28 -0
  17. package/src/skills/analyze-report/README.md +44 -0
  18. package/src/skills/analyze-report/SKILL.md +121 -0
  19. package/src/skills/analyze-report/algorithms/progress-assessment.md +108 -0
  20. package/src/skills/analyze-report/knowledge/analysis-frameworks.md +66 -0
  21. package/src/skills/analyze-report/knowledge/report-structure.md +61 -0
  22. package/src/skills/analyze-report/scripts/calc-plan-metrics.js +234 -0
  23. package/src/skills/analyze-report/templates/analysis-report.md +80 -0
  24. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-1.md +69 -0
  25. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-2.md +103 -0
  26. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-3.md +99 -0
  27. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/judge.json +163 -0
  28. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-1.md +89 -0
  29. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-2.md +88 -0
  30. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-3.md +100 -0
  31. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-1.md +77 -0
  32. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-2.md +64 -0
  33. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-3.md +110 -0
  34. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-1.md +74 -0
  35. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-2.md +38 -0
  36. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-3.md +61 -0
  37. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/meta.json +115 -0
  38. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001-evidence-from-log.yaml +60 -0
  39. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-1.md +90 -0
  40. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-2.md +89 -0
  41. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-3.md +77 -0
  42. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/judge.json +163 -0
  43. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-1.md +84 -0
  44. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-2.md +77 -0
  45. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-3.md +89 -0
  46. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-1.md +103 -0
  47. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-2.md +103 -0
  48. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-3.md +103 -0
  49. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-1.md +93 -0
  50. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-2.md +93 -0
  51. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-3.md +86 -0
  52. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/meta.json +115 -0
  53. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002-result-block-format.yaml +44 -0
  54. package/src/skills/analyze-report/tests/fixtures/REPORT-002-incorrect-attribution.md +27 -0
  55. package/src/skills/analyze-report/tests/fixtures/pipeline-2026-04-06_qa-001-skip.log +32 -0
  56. package/src/skills/analyze-report/tests/index.yaml +25 -0
  57. package/src/skills/analyze-report/tests/rubrics/evidence-from-log.md +22 -0
  58. package/src/skills/analyze-report/tests/rubrics/result-block-format.md +22 -0
  59. package/src/skills/analyze-report/workflows/progress.md +158 -0
  60. package/src/skills/analyze-report/workflows/retrospective.md +143 -0
  61. package/src/skills/coach/README.md +43 -0
  62. package/src/skills/coach/SKILL.md +166 -0
  63. package/src/skills/coach/SKILL.md.legacy +157 -0
  64. package/src/skills/coach/algorithms/gap-analysis.md +69 -0
  65. package/src/skills/coach/algorithms/improvement-prioritization.md +62 -0
  66. package/src/skills/coach/algorithms/skill-scoring.md +80 -0
  67. package/src/skills/coach/knowledge/audit-applied-changes-clean.txt +11 -0
  68. package/src/skills/coach/knowledge/backlog-management.md +67 -0
  69. package/src/skills/coach/knowledge/backlog-management.md.legacy +90 -0
  70. package/src/skills/coach/knowledge/common-antipatterns.md +76 -0
  71. package/src/skills/coach/knowledge/prompt-engineering.md +45 -0
  72. package/src/skills/coach/knowledge/shared-knowledge-guide.md +44 -0
  73. package/src/skills/coach/knowledge/skill-anatomy.md +49 -0
  74. package/src/skills/coach/knowledge/test-authorship.md +141 -0
  75. package/src/skills/coach/templates/audit-report.md +39 -0
  76. package/src/skills/coach/templates/coach-backlog-init.yaml +14 -0
  77. package/src/skills/coach/templates/coach-backlog-init.yaml.legacy +10 -0
  78. package/src/skills/coach/templates/improvement-plan.md +42 -0
  79. package/src/skills/coach/templates/new-skill.md +95 -0
  80. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-1.md +58 -0
  81. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-2.md +65 -0
  82. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-3.md +58 -0
  83. package/src/skills/coach/tests/cases/TC-COACH-001/current/judge.json +151 -0
  84. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-1.md +46 -0
  85. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-2.md +0 -0
  86. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-3.md +75 -0
  87. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-1.md +81 -0
  88. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-2.md +101 -0
  89. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-3.md +91 -0
  90. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-1.md +48 -0
  91. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-2.md +30 -0
  92. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-3.md +55 -0
  93. package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +95 -0
  94. package/src/skills/coach/tests/cases/TC-COACH-001-evidence-based-temporal-diagram.yaml +53 -0
  95. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-1.md +46 -0
  96. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-2.md +50 -0
  97. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-3.md +48 -0
  98. package/src/skills/coach/tests/cases/TC-COACH-002/current/judge.json +151 -0
  99. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-1.md +0 -0
  100. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-2.md +37 -0
  101. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-3.md +30 -0
  102. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-1.md +23 -0
  103. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-2.md +29 -0
  104. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-3.md +35 -0
  105. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-1.md +13 -0
  106. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-2.md +19 -0
  107. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-3.md +33 -0
  108. package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +95 -0
  109. package/src/skills/coach/tests/cases/TC-COACH-002-root-cause-first.yaml +57 -0
  110. package/src/skills/coach/tests/fixtures/pipeline-2026-04-06_id-collision.log +77 -0
  111. package/src/skills/coach/tests/index.yaml +29 -0
  112. package/src/skills/coach/tests/rubrics/calibration/evidence-based-bad.md +13 -0
  113. package/src/skills/coach/tests/rubrics/calibration/evidence-based-good.md +29 -0
  114. package/src/skills/coach/tests/rubrics/evidence-based.md +26 -0
  115. package/src/skills/coach/tests/rubrics/root-cause-first.md +21 -0
  116. package/src/skills/coach/workflows/analyze.md +79 -0
  117. package/src/skills/coach/workflows/analyze.md.legacy +64 -0
  118. package/src/skills/coach/workflows/audit.md +74 -0
  119. package/src/skills/coach/workflows/audit.md.legacy +59 -0
  120. package/src/skills/coach/workflows/create.md +80 -0
  121. package/src/skills/coach/workflows/create.md.legacy +67 -0
  122. package/src/skills/coach/workflows/improve.md +71 -0
  123. package/src/skills/coach/workflows/improve.md.legacy +60 -0
  124. package/src/skills/coach/workflows/research.md +55 -0
  125. package/src/skills/coach/workflows/review.md +52 -0
  126. package/src/skills/coach/workflows/review.md.legacy +48 -0
  127. package/src/skills/coach/workflows/test.md +97 -0
  128. package/src/skills/create-plan/README.md +39 -0
  129. package/src/skills/create-plan/SKILL.md +104 -0
  130. package/src/skills/create-plan/algorithms/risk-assessment.md +73 -0
  131. package/src/skills/create-plan/knowledge/plan-completeness.md +67 -0
  132. package/src/skills/create-plan/knowledge/plan-lifecycle.md +33 -0
  133. package/src/skills/create-plan/knowledge/task-verification-pairs.md +151 -0
  134. package/src/skills/create-plan/scripts/validate-completeness.js +182 -0
  135. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-1.md +5 -0
  136. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-2.md +39 -0
  137. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-3.md +35 -0
  138. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/judge.json +167 -0
  139. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-1.md +5 -0
  140. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-2.md +10 -0
  141. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-3.md +5 -0
  142. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-1.md +26 -0
  143. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-2.md +86 -0
  144. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-3.md +5 -0
  145. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-1.md +11 -0
  146. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-2.md +15 -0
  147. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-3.md +14 -0
  148. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/meta.json +119 -0
  149. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001-validate-completeness.yaml +41 -0
  150. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-1.md +25 -0
  151. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-2.md +30 -0
  152. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-3.md +37 -0
  153. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/judge.json +164 -0
  154. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-1.md +3 -0
  155. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-2.md +11 -0
  156. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-3.md +13 -0
  157. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-1.md +44 -0
  158. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-2.md +5 -0
  159. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-3.md +49 -0
  160. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-1.md +6 -0
  161. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-2.md +11 -0
  162. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-3.md +16 -0
  163. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/meta.json +116 -0
  164. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002-task-granularity.yaml +39 -0
  165. package/src/skills/create-plan/tests/index.yaml +25 -0
  166. package/src/skills/create-plan/tests/rubrics/task-granularity.md +21 -0
  167. package/src/skills/create-plan/tests/rubrics/validate-completeness.md +21 -0
  168. package/src/skills/create-plan/workflows/create.md +136 -0
  169. package/src/skills/create-report/README.md +40 -0
  170. package/src/skills/create-report/SKILL.md +73 -0
  171. package/src/skills/create-report/algorithms/metric-calculation.md +93 -0
  172. package/src/skills/create-report/knowledge/report-metrics.md +82 -0
  173. package/src/skills/create-report/scripts/calc-metrics.js +383 -0
  174. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-1.md +25 -0
  175. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-2.md +26 -0
  176. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-3.md +28 -0
  177. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/judge.json +163 -0
  178. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-1.md +4 -0
  179. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-2.md +3 -0
  180. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-3.md +6 -0
  181. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-1.md +8 -0
  182. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-2.md +12 -0
  183. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-3.md +7 -0
  184. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-1.md +12 -0
  185. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-2.md +22 -0
  186. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-3.md +13 -0
  187. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/meta.json +115 -0
  188. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001-root-cause-attribution.yaml +57 -0
  189. package/src/skills/create-report/tests/index.yaml +20 -0
  190. package/src/skills/create-report/tests/rubrics/root-cause-attribution.md +21 -0
  191. package/src/skills/create-report/workflows/standard.md +175 -0
  192. package/src/skills/decompose-gaps/README.md +39 -0
  193. package/src/skills/decompose-gaps/SKILL.md +78 -0
  194. package/src/skills/decompose-gaps/algorithms/scope-check.md +110 -0
  195. package/src/skills/decompose-gaps/knowledge/scope-validation.md +65 -0
  196. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-1.md +49 -0
  197. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-2.md +56 -0
  198. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-3.md +39 -0
  199. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/judge.json +164 -0
  200. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-1.md +25 -0
  201. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-2.md +11 -0
  202. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-3.md +26 -0
  203. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-1.md +19 -0
  204. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-2.md +5 -0
  205. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-3.md +28 -0
  206. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-1.md +23 -0
  207. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-2.md +27 -0
  208. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-3.md +25 -0
  209. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/meta.json +116 -0
  210. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001-scope-exclusion.yaml +46 -0
  211. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-1.md +32 -0
  212. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-2.md +20 -0
  213. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-3.md +26 -0
  214. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/judge.json +164 -0
  215. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-1.md +7 -0
  216. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-2.md +16 -0
  217. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-3.md +7 -0
  218. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-1.md +5 -0
  219. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-2.md +11 -0
  220. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-3.md +13 -0
  221. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-1.md +13 -0
  222. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-2.md +12 -0
  223. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-3.md +5 -0
  224. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/meta.json +116 -0
  225. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002-glob-before-write.yaml +36 -0
  226. package/src/skills/decompose-gaps/tests/index.yaml +25 -0
  227. package/src/skills/decompose-gaps/tests/rubrics/glob-before-write.md +21 -0
  228. package/src/skills/decompose-gaps/tests/rubrics/scope-exclusion.md +21 -0
  229. package/src/skills/decompose-gaps/workflows/decompose.md +120 -0
  230. package/src/skills/decompose-plan/README.md +43 -0
  231. package/src/skills/decompose-plan/SKILL.md +87 -0
  232. package/src/skills/decompose-plan/algorithms/deduplication.md +101 -0
  233. package/src/skills/decompose-plan/knowledge/atomicity-checklist.md +113 -0
  234. package/src/skills/decompose-plan/knowledge/capabilities.md +44 -0
  235. package/src/skills/decompose-plan/knowledge/human-task-rules.md +67 -0
  236. package/src/skills/decompose-plan/knowledge/scope-guard-checklist.md +73 -0
  237. package/src/skills/decompose-plan/scripts/check-atomicity-limit.js +47 -0
  238. package/src/skills/decompose-plan/scripts/check-duplicates.js +323 -0
  239. package/src/skills/decompose-plan/scripts/verify-atomicity.js +408 -0
  240. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-1.md +30 -0
  241. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-2.md +36 -0
  242. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-3.md +37 -0
  243. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/judge.json +163 -0
  244. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-1.md +20 -0
  245. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-2.md +17 -0
  246. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-3.md +28 -0
  247. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-1.md +114 -0
  248. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-2.md +137 -0
  249. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-3.md +188 -0
  250. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-1.md +0 -0
  251. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-2.md +32 -0
  252. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-3.md +110 -0
  253. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/meta.json +115 -0
  254. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001-atomicity-no-1to1.yaml +56 -0
  255. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-1.md +47 -0
  256. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-2.md +54 -0
  257. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-3.md +43 -0
  258. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/judge.json +163 -0
  259. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-1.md +15 -0
  260. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-2.md +5 -0
  261. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-3.md +12 -0
  262. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-1.md +34 -0
  263. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-2.md +30 -0
  264. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-3.md +35 -0
  265. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-1.md +0 -0
  266. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-2.md +31 -0
  267. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-3.md +0 -0
  268. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/meta.json +115 -0
  269. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002-get-next-id-mandatory.yaml +44 -0
  270. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-1.md +21 -0
  271. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-2.md +38 -0
  272. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-3.md +30 -0
  273. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/judge.json +163 -0
  274. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-1.md +31 -0
  275. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-2.md +35 -0
  276. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-3.md +48 -0
  277. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-1.md +167 -0
  278. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-2.md +62 -0
  279. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-3.md +174 -0
  280. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-1.md +0 -0
  281. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-2.md +0 -0
  282. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-3.md +0 -0
  283. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/meta.json +115 -0
  284. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003-verbatim-dod-transfer.yaml +42 -0
  285. package/src/skills/decompose-plan/tests/index.yaml +30 -0
  286. package/src/skills/decompose-plan/tests/rubrics/atomicity-no-1to1.md +21 -0
  287. package/src/skills/decompose-plan/tests/rubrics/get-next-id-mandatory.md +21 -0
  288. package/src/skills/decompose-plan/tests/rubrics/verbatim-dod-transfer.md +21 -0
  289. package/src/skills/decompose-plan/workflows/decompose.md +272 -0
  290. package/src/skills/deep-research/README.md +36 -0
  291. package/src/skills/deep-research/SKILL.md +106 -0
  292. package/src/skills/deep-research/algorithms/source-scoring.md +63 -0
  293. package/src/skills/deep-research/algorithms/synthesis.md +67 -0
  294. package/src/skills/deep-research/knowledge/data-validation.md +44 -0
  295. package/src/skills/deep-research/knowledge/perplexity-config.md +30 -0
  296. package/src/skills/deep-research/knowledge/research-methodology.md +54 -0
  297. package/src/skills/deep-research/knowledge/source-evaluation.md +33 -0
  298. package/src/skills/deep-research/scripts/perplexity-research.js +315 -0
  299. package/src/skills/deep-research/templates/brief-summary.md +25 -0
  300. package/src/skills/deep-research/templates/research-report.md +76 -0
  301. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-1.md +48 -0
  302. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-2.md +88 -0
  303. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-3.md +56 -0
  304. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/judge.json +163 -0
  305. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-1.md +58 -0
  306. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-2.md +249 -0
  307. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-3.md +44 -0
  308. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-1.md +96 -0
  309. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-2.md +56 -0
  310. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-3.md +94 -0
  311. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-1.md +11 -0
  312. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-2.md +1 -0
  313. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-3.md +1 -0
  314. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/meta.json +115 -0
  315. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001-self-check-url.yaml +58 -0
  316. package/src/skills/deep-research/tests/index.yaml +20 -0
  317. package/src/skills/deep-research/tests/rubrics/self-check-url.md +34 -0
  318. package/src/skills/deep-research/workflows/base-checklist.md +19 -0
  319. package/src/skills/deep-research/workflows/benchmark.md +38 -0
  320. package/src/skills/deep-research/workflows/competitor.md +44 -0
  321. package/src/skills/deep-research/workflows/custom.md +32 -0
  322. package/src/skills/deep-research/workflows/market.md +44 -0
  323. package/src/skills/deep-research/workflows/technology.md +40 -0
  324. package/src/skills/deep-research/workflows/trend.md +40 -0
  325. package/src/skills/execute-task/README.md +44 -0
  326. package/src/skills/execute-task/SKILL.md +292 -0
  327. package/src/skills/execute-task/algorithms/execution-strategy.md +136 -0
  328. package/src/skills/execute-task/knowledge/context-checkpoints.md +75 -0
  329. package/src/skills/execute-task/knowledge/ticket-structure.md +70 -0
  330. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-1.md +5 -0
  331. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-2.md +5 -0
  332. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-3.md +5 -0
  333. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/judge.json +124 -0
  334. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-1.md +4 -0
  335. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-2.md +4 -0
  336. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-3.md +4 -0
  337. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-1.md +4 -0
  338. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-2.md +4 -0
  339. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-3.md +11 -0
  340. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +89 -0
  341. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001-no-ticket-creation.yaml +48 -0
  342. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-1.md +5 -0
  343. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-2.md +6 -0
  344. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-3.md +5 -0
  345. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/judge.json +124 -0
  346. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-1.md +4 -0
  347. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-2.md +4 -0
  348. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-3.md +8 -0
  349. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-1.md +9 -0
  350. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-2.md +26 -0
  351. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-3.md +4 -0
  352. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/meta.json +89 -0
  353. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002-no-duplicate-dod.yaml +44 -0
  354. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-1.md +5 -0
  355. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-2.md +5 -0
  356. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-3.md +5 -0
  357. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/judge.json +46 -0
  358. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/meta.json +37 -0
  359. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003-verification-proportionality.yaml +46 -0
  360. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-1.md +18 -0
  361. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-2.md +16 -0
  362. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-3.md +14 -0
  363. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/judge.json +124 -0
  364. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-1.md +5 -0
  365. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-2.md +5 -0
  366. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-3.md +1 -0
  367. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-1.md +8 -0
  368. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-2.md +5 -0
  369. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-3.md +4 -0
  370. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/meta.json +89 -0
  371. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004-no-foreign-ticket-edit.yaml +50 -0
  372. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-1.md +5 -0
  373. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-2.md +5 -0
  374. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-3.md +5 -0
  375. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/judge.json +124 -0
  376. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-1.md +15 -0
  377. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-2.md +4 -0
  378. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-3.md +5 -0
  379. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-1.md +11 -0
  380. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-2.md +11 -0
  381. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-3.md +4 -0
  382. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +89 -0
  383. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005-ticket-fields-updated.yaml +39 -0
  384. package/src/skills/execute-task/tests/fixtures/IMPL-902-create-file.md +41 -0
  385. package/src/skills/execute-task/tests/fixtures/IMPL-904-current-task.md +40 -0
  386. package/src/skills/execute-task/tests/fixtures/IMPL-906-fill-ticket.md +42 -0
  387. package/src/skills/execute-task/tests/fixtures/QA-901-button-click.md +41 -0
  388. package/src/skills/execute-task/tests/fixtures/QA-903-visual-figma.md +40 -0
  389. package/src/skills/execute-task/tests/fixtures/TASK-905-done-with-typo.md +36 -0
  390. package/src/skills/execute-task/tests/index.yaml +39 -0
  391. package/src/skills/execute-task/tests/rubrics/no-duplicate-dod.md +22 -0
  392. package/src/skills/execute-task/tests/rubrics/no-foreign-ticket-edit.md +20 -0
  393. package/src/skills/execute-task/tests/rubrics/no-ticket-creation.md +21 -0
  394. package/src/skills/execute-task/tests/rubrics/ticket-fields-updated.md +23 -0
  395. package/src/skills/execute-task/tests/rubrics/verification-proportionality.md +22 -0
  396. package/src/skills/execute-task/workflows/execute.md +104 -0
  397. package/src/skills/manual-testing/README.md +63 -0
  398. package/src/skills/manual-testing/SKILL.md +174 -0
  399. package/src/skills/manual-testing/algorithms/blocked-tool-strategy.md +74 -0
  400. package/src/skills/manual-testing/algorithms/bug-severity.md +73 -0
  401. package/src/skills/manual-testing/algorithms/mcp-budget.md +97 -0
  402. package/src/skills/manual-testing/algorithms/test-prioritization.md +69 -0
  403. package/src/skills/manual-testing/knowledge/browser-extension-testing.md +102 -0
  404. package/src/skills/manual-testing/knowledge/browser-tools.md +114 -0
  405. package/src/skills/manual-testing/knowledge/desktop-tools-advanced.md +92 -0
  406. package/src/skills/manual-testing/knowledge/desktop-tools-core.md +76 -0
  407. package/src/skills/manual-testing/knowledge/sandbox-advanced.md +83 -0
  408. package/src/skills/manual-testing/knowledge/sandbox-core.md +67 -0
  409. package/src/skills/manual-testing/knowledge/stateful-edge-cases.md +69 -0
  410. package/src/skills/manual-testing/knowledge/test-case-design.md +107 -0
  411. package/src/skills/manual-testing/knowledge/testing-types.md +45 -0
  412. package/src/skills/manual-testing/templates/bug-report.md +52 -0
  413. package/src/skills/manual-testing/templates/test-case.md +34 -0
  414. package/src/skills/manual-testing/templates/test-plan.md +97 -0
  415. package/src/skills/manual-testing/templates/test-session-report.md +56 -0
  416. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-1.md +21 -0
  417. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-2.md +65 -0
  418. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-3.md +35 -0
  419. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/judge.json +163 -0
  420. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-1.md +0 -0
  421. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-2.md +7 -0
  422. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-3.md +0 -0
  423. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-1.md +4 -0
  424. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-2.md +15 -0
  425. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-3.md +8 -0
  426. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-1.md +5 -0
  427. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-2.md +7 -0
  428. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-3.md +7 -0
  429. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/meta.json +114 -0
  430. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001-sandbox-mandatory.yaml +38 -0
  431. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-1.md +47 -0
  432. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-2.md +39 -0
  433. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-3.md +40 -0
  434. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/judge.json +163 -0
  435. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-1.md +19 -0
  436. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-2.md +15 -0
  437. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-3.md +24 -0
  438. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-1.md +19 -0
  439. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-2.md +13 -0
  440. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-3.md +18 -0
  441. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-1.md +21 -0
  442. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-2.md +15 -0
  443. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-3.md +14 -0
  444. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/meta.json +114 -0
  445. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002-visual-tc-screenshot.yaml +37 -0
  446. package/src/skills/manual-testing/tests/index.yaml +25 -0
  447. package/src/skills/manual-testing/tests/last-run-tc001-sonnet.log +140 -0
  448. package/src/skills/manual-testing/tests/last-run-tc002.log +1 -0
  449. package/src/skills/manual-testing/tests/last-run.log +1469 -0
  450. package/src/skills/manual-testing/tests/rubrics/sandbox-mandatory.md +20 -0
  451. package/src/skills/manual-testing/tests/rubrics/visual-tc-screenshot.md +21 -0
  452. package/src/skills/manual-testing/workflows/acceptance.md +80 -0
  453. package/src/skills/manual-testing/workflows/exploratory.md +84 -0
  454. package/src/skills/manual-testing/workflows/regression.md +76 -0
  455. package/src/skills/manual-testing/workflows/smoke.md +109 -0
  456. package/src/skills/manual-testing/workflows/test-plan.md +75 -0
  457. package/src/skills/review-result/README.md +59 -0
  458. package/src/skills/review-result/SKILL.md +138 -0
  459. package/src/skills/review-result/algorithms/verification.md +112 -0
  460. package/src/skills/review-result/knowledge/dod-patterns.md +115 -0
  461. package/src/skills/review-result/scripts/verify-artifacts.js +354 -0
  462. package/src/skills/review-result/templates/verdict.md +153 -0
  463. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-1.md +22 -0
  464. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-2.md +7 -0
  465. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-3.md +21 -0
  466. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-1.md +6 -0
  467. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-2.md +6 -0
  468. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-3.md +18 -0
  469. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/judge.json +164 -0
  470. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-1.md +5 -0
  471. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-2.md +7 -0
  472. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-3.md +6 -0
  473. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-1.md +49 -0
  474. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-2.md +28 -0
  475. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-3.md +37 -0
  476. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-1.md +22 -0
  477. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-2.md +13 -0
  478. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-3.md +21 -0
  479. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/meta.json +116 -0
  480. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001-visual-tc-trigger.yaml +51 -0
  481. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-1.md +23 -0
  482. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-2.md +22 -0
  483. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-3.md +28 -0
  484. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-1.md +4 -0
  485. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-2.md +36 -0
  486. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-3.md +4 -0
  487. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/judge.json +163 -0
  488. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-1.md +4 -0
  489. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-2.md +0 -0
  490. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-3.md +4 -0
  491. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-1.md +39 -0
  492. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-2.md +25 -0
  493. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-3.md +32 -0
  494. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-1.md +34 -0
  495. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-2.md +8 -0
  496. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-3.md +23 -0
  497. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/meta.json +115 -0
  498. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002-path-line-suffix.yaml +39 -0
  499. package/src/skills/review-result/tests/fixtures/IMPL-902-path-with-line.md +43 -0
  500. package/src/skills/review-result/tests/fixtures/QA-901-visual-button.md +46 -0
  501. package/src/skills/review-result/tests/index.yaml +25 -0
  502. package/src/skills/review-result/tests/rubrics/path-line-suffix.md +19 -0
  503. package/src/skills/review-result/tests/rubrics/visual-tc-trigger.md +19 -0
  504. package/src/skills/review-result/workflows/review.md +209 -0
  505. package/templates/plan-template.md +1 -0
@@ -0,0 +1,1703 @@
1
+ #!/usr/bin/env node
2
+
3
+ import path from 'path';
4
+ import fs from 'fs';
5
+ import crypto from 'crypto';
6
+ import { fileURLToPath } from 'url';
7
+ import { spawn } from 'child_process';
8
+ import YAML from '../lib/js-yaml.mjs';
9
+ import { findProjectRoot } from '../lib/find-root.mjs';
10
+ import { spawnAgent } from '../lib/agent-spawner.mjs';
11
+
12
+ const __filename = fileURLToPath(import.meta.url);
13
+ const __dirname = path.dirname(__filename);
14
+ const projectRoot = findProjectRoot(process.cwd());
15
+
16
+ import os from 'os';
17
+ import { execSync } from 'child_process';
18
+
19
/**
 * Creates an isolated temporary project directory for a skill test run.
 *
 * Layout created under the OS temp directory:
 *   <tmp>/.workflow/tickets/*, plans/*, reports, logs
 *   <tmp>/.workflow/coach-backlog.yaml
 *   <tmp>/.workflow/src/skills   copy of <projectRoot>/src/skills
 *   <tmp>/.workflow/src/scripts  link to <projectRoot>/src/scripts
 *   <tmp>/.workflow/config       link to <projectRoot>/configs
 *
 * @param {string} skillName - Skill under test; becomes part of the temp dir prefix.
 * @param {string} [suffix=''] - Optional extra label inserted into the prefix.
 * @returns {string} Path of the created temp root.
 */
function createTestWorkdir(skillName, suffix = '') {
  const prefix = suffix ? `wf-test-${skillName}-${suffix}-` : `wf-test-${skillName}-`;
  const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
  const workflowDir = path.join(tmpRoot, '.workflow');
  fs.mkdirSync(workflowDir, { recursive: true });

  const workflowSubdirs = [
    'tickets/backlog',
    'tickets/ready',
    'tickets/in-progress',
    'tickets/review',
    'tickets/done',
    'tickets/archive',
    'plans/current',
    'plans/archive',
    'reports',
    'logs',
  ];
  for (const sub of workflowSubdirs) {
    fs.mkdirSync(path.join(workflowDir, sub), { recursive: true });
  }
  fs.writeFileSync(
    path.join(workflowDir, 'coach-backlog.yaml'),
    'version: 1\nanalyzed_tickets: []\naudited_skills: {}\n',
    'utf8'
  );

  const srcDir = path.join(workflowDir, 'src');
  fs.mkdirSync(srcDir, { recursive: true });
  const realSkills = path.join(projectRoot, 'src', 'skills');
  const realScripts = path.join(projectRoot, 'src', 'scripts');
  const linkSkills = path.join(srcDir, 'skills');
  const linkScripts = path.join(srcDir, 'scripts');
  const configDir = path.join(workflowDir, 'config');
  const realConfigs = path.join(projectRoot, 'configs');

  // Skills are COPIED (not linked) so that agents cannot write to real source files.
  fs.cpSync(realSkills, linkSkills, { recursive: true, dereference: true });

  // Scripts and configs are linked — read-only for agents in practice.
  // A junction is used on Windows (the type argument is ignored elsewhere).
  // Creating it through fs avoids building a shell command out of paths.
  const linkType = process.platform === 'win32' ? 'junction' : 'dir';
  const links = [
    [realScripts, linkScripts],
    [realConfigs, configDir],
  ];
  for (const [target, linkPath] of links) {
    try {
      fs.symlinkSync(target, linkPath, linkType);
    } catch (err) {
      // Linking stays best-effort, but the failure is reported instead of hidden.
      console.warn(`[Runner] Could not link ${linkPath} -> ${target}: ${err.message}`);
    }
  }

  return tmpRoot;
}
52
+
53
/**
 * Best-effort removal of a temporary test workdir.
 *
 * Does nothing when `tmpRoot` is falsy or does not exist. All removal errors
 * are ignored.
 *
 * @param {string} tmpRoot - Temp root to delete.
 * @returns {void}
 */
function cleanupTestWorkdir(tmpRoot) {
  const present = Boolean(tmpRoot) && fs.existsSync(tmpRoot);
  if (!present) {
    return;
  }

  if (process.platform === 'win32') {
    // Detach the junctions first so that rmSync below does not touch their targets.
    const junctions = ['src/scripts', 'config'].map((link) => path.join(tmpRoot, '.workflow', link));
    junctions.forEach((junctionPath) => {
      try {
        execSync(`rmdir "${junctionPath}"`, { stdio: 'pipe', shell: true });
      } catch {
        // Ignored: the junction may not exist.
      }
    });
  }

  try {
    fs.rmSync(tmpRoot, { recursive: true, force: true });
  } catch {
    // Ignored: cleanup is best-effort.
  }
}
64
+
65
/**
 * Parses the runner's command-line flags from `process.argv`.
 *
 * Value flags (e.g. `--skill foo`) consume the following argument; a value
 * flag with no (or an empty) following argument is ignored. Unknown arguments
 * are ignored. `pipeline` is only present on the result when `--pipeline` was
 * given with a value.
 *
 * @returns {object} Parsed options.
 */
function parseArgs() {
  // Flags that take the next argument as their value.
  const valueFlags = new Map([
    ['--skill', 'skill'],
    ['--case', 'caseId'],
    ['--tag', 'tag'],
    ['--layer', 'layer'],
    ['--relevant', 'relevant'],
    ['--baseline-ref', 'baselineRef'],
    ['--agent', 'agent'],
    ['--pipeline', 'pipeline'],
    ['--severity', 'severity'],
  ]);
  // Flags that simply switch an option on.
  const switchFlags = new Map([
    ['--calibrate', 'calibrate'],
    ['--all', 'all'],
    ['--primary-only', 'primaryOnly'],
    ['--skip-secret-scan', 'skipSecretScan'],
    ['--fast', 'fast'],
    ['--yes', 'yes'],
    ['--establish-baseline', 'establishBaseline'],
  ]);

  const opts = {
    skill: null,
    caseId: null,
    tag: null,
    layer: null,
    relevant: null,
    all: false,
    agent: null,
    primaryOnly: false,
    skipSecretScan: false,
    fast: false,
    yes: false,
    baselineRef: null,
    establishBaseline: false,
    calibrate: false,
    severity: null
  };

  const argv = process.argv.slice(2);
  let idx = 0;
  while (idx < argv.length) {
    const flag = argv[idx];
    const following = argv[idx + 1];
    if (switchFlags.has(flag)) {
      opts[switchFlags.get(flag)] = true;
    } else if (valueFlags.has(flag) && following) {
      opts[valueFlags.get(flag)] = following;
      idx += 1; // skip the consumed value
    }
    idx += 1;
  }

  return opts;
}
133
+
134
/**
 * Returns the directory that holds all skills: `<projectRoot>/src/skills`.
 *
 * @returns {string} Path to the skills directory.
 */
function findSkillsDir() {
  const segments = ['src', 'skills'];
  return path.join(projectRoot, ...segments);
}
137
+
138
/**
 * Returns the `tests` directory of the given skill.
 *
 * @param {string} skillName - Skill directory name.
 * @returns {string} Path to `<skills dir>/<skillName>/tests`.
 */
function findSkillTestsDir(skillName) {
  const skillRoot = path.join(findSkillsDir(), skillName);
  return path.join(skillRoot, 'tests');
}
141
+
142
/**
 * Reads and parses `index.yaml` from a skill's tests directory.
 *
 * @param {string} skillName - Skill directory name.
 * @returns {*} Whatever `YAML.load` produces for the file content.
 * @throws {Error} When the skill has no `index.yaml`.
 */
function loadIndexYaml(skillName) {
  const indexPath = path.join(findSkillTestsDir(skillName), 'index.yaml');

  if (fs.existsSync(indexPath)) {
    return YAML.load(fs.readFileSync(indexPath, 'utf8'));
  }

  throw new Error(`index.yaml not found for skill: ${skillName}`);
}
153
+
154
/**
 * Determines which git ref to use as the comparison baseline.
 *
 * Precedence: the explicit ref, then `baseline_ref` from the skill's
 * `index.yaml`, then `'origin/main'`.
 *
 * @param {string} skillName - Skill whose `index.yaml` is consulted.
 * @param {string} [explicitRef] - Ref supplied by the caller; wins when truthy.
 * @returns {string} Baseline git ref.
 * @throws {Error} Propagated from `loadIndexYaml` when `index.yaml` is missing.
 */
function getBaselineRef(skillName, explicitRef) {
  if (explicitRef) {
    return explicitRef;
  }

  const index = loadIndexYaml(skillName);
  // An empty index.yaml parses to a nullish document; fall back to the
  // default instead of throwing a TypeError on the property access.
  return index?.baseline_ref || 'origin/main';
}
162
+
163
/**
 * Reads a file's content at a given git ref via `git show <ref>:<path>`.
 *
 * When the `TEST_GIT_MOCK` environment variable is set, no git process is
 * started; instead the JSON file it points to is read and looked up by the
 * key `"<ref>:<path>"` (path with forward slashes).
 *
 * @param {string} baselineRef - Git ref to read from.
 * @param {string} filePath - Repository-relative file path.
 * @returns {Promise<string|null>} File content, or `null` when git reports the
 *   path as missing / exits with code 128, or when the mock has no entry.
 *   Rejects on any other git failure or when git cannot be spawned.
 */
function gitShow(baselineRef, filePath) {
  if (process.env.TEST_GIT_MOCK) {
    return new Promise((resolve) => {
      try {
        const mocks = JSON.parse(fs.readFileSync(process.env.TEST_GIT_MOCK, 'utf8'));
        // Normalize the path for cross-platform use (Windows uses \, mocks use /).
        const normalizedPath = filePath.replace(/\\/g, '/');
        const key = `${baselineRef}:${normalizedPath}`;
        if (mocks[key]) {
          resolve(mocks[key]);
        } else if (mocks.__error && mocks.__error[key]) {
          // NOTE(review): this throw is caught by the catch below, so a mocked
          // error currently resolves to null rather than rejecting — confirm
          // whether rejection was intended.
          throw new Error(mocks.__error[key]);
        } else {
          resolve(null);
        }
      } catch (e) {
        resolve(null);
      }
    });
  }

  // `<ref>:<path>` expects forward slashes; callers build the path with
  // path.join, which yields backslashes on Windows.
  const gitPath = path.sep === '\\' ? filePath.replace(/\\/g, '/') : filePath;

  return new Promise((resolve, reject) => {
    const proc = spawn('git', ['show', `${baselineRef}:${gitPath}`], {
      cwd: projectRoot,
      stdio: ['ignore', 'pipe', 'pipe']
    });

    // Decode as a stream so a multi-byte character split across two chunks
    // is not corrupted by per-chunk Buffer-to-string conversion.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (data) => { stdout += data; });
    proc.stderr.on('data', (data) => { stderr += data; });

    proc.on('close', (code) => {
      if (code === 0) {
        resolve(stdout);
      } else if (stderr.includes('does not exist') || code === 128) {
        resolve(null);
      } else {
        reject(new Error(`git show failed: ${stderr}`));
      }
    });

    proc.on('error', (err) => {
      reject(err);
    });
  });
}
211
+
212
/**
 * Loads a test case's `current/meta.json` as it exists at the baseline ref.
 *
 * @param {string} skillName - Skill directory name.
 * @param {string} caseId - Test case id (directory name under `tests/cases`).
 * @param {string} baselineRef - Git ref to read from.
 * @returns {Promise<object|null>} Parsed meta, or `null` when the file is
 *   absent at that ref or is not valid JSON.
 */
async function loadBaselineMeta(skillName, caseId, baselineRef) {
  // This is a path inside the git tree, not on the local filesystem, so it
  // must use forward slashes on every platform.
  const metaPath = path.posix.join(
    'src', 'skills', skillName, 'tests', 'cases', caseId, 'current', 'meta.json'
  );

  const gitMetaContent = await gitShow(baselineRef, metaPath);

  if (!gitMetaContent) {
    return null;
  }

  try {
    return JSON.parse(gitMetaContent);
  } catch {
    // Unparseable baseline meta is treated the same as having no baseline.
    return null;
  }
}
228
+
229
/**
 * Compares the statuses of the current run against the statuses recorded in
 * each case's baseline `meta.json`.
 *
 * A case with no baseline meta counts as new. Baseline status `passed` is
 * "green"; `failed` or `error` is "red". Other baseline statuses only mark
 * that baseline history exists and are not counted in any bucket.
 *
 * @param {string} skillName - Skill the cases belong to.
 * @param {Array<{id: string}>} cases - Case definitions to compare.
 * @param {string} baselineRef - Git ref holding the baseline.
 * @param {Object<string, string>} [currentRunStatuses={}] - Status per case id
 *   from the current run; a missing entry is treated as `'unknown'`.
 * @returns {Promise<{comparison: object, mode: string}>} Counters plus
 *   `'no-regression'` when any case had baseline meta, else `'no-baseline'`.
 * @throws Rethrows any error raised while loading baseline meta.
 */
async function analyzeGitHeadComparison(skillName, cases, baselineRef, currentRunStatuses = {}) {
  console.error(`[DEBUG] analyzeGitHeadComparison called`);
  console.log(`[Runner] analyzeGitHeadComparison called with ${cases.length} cases, skillName=${skillName}`);

  const comparison = {
    previously_green: 0,
    previously_green_still_green: 0,
    previously_green_now_red: 0,
    previously_red: 0,
    previously_red_still_red: 0,
    previously_red_now_green: 0,
    new_cases: 0
  };
  const isRed = (status) => status === 'failed' || status === 'error';

  let hasBaselineHistory = false;

  console.log(`[Runner] Starting to iterate ${cases.length} cases`);
  for (const caseDef of cases) {
    console.log(`[Runner] Checking case ${caseDef.id} for git history`);
    let baselineMeta = null;
    try {
      baselineMeta = await loadBaselineMeta(skillName, caseDef.id, baselineRef);
    } catch (err) {
      console.error(`[Runner] Error loading baseline meta for ${caseDef.id}:`, err.message);
      throw err;
    }
    console.log(`[Runner] loadBaselineMeta result for ${caseDef.id}:`, baselineMeta ? 'found' : 'not found');

    if (!baselineMeta) {
      comparison.new_cases++;
      continue;
    }

    hasBaselineHistory = true;

    const prevStatus = baselineMeta.status;
    // Use the in-memory status from this run, not the one stored on disk.
    const currentStatus = currentRunStatuses[caseDef.id] || 'unknown';

    if (prevStatus === 'passed') {
      comparison.previously_green++;
      if (currentStatus === 'passed') {
        comparison.previously_green_still_green++;
      } else if (isRed(currentStatus)) {
        comparison.previously_green_now_red++;
      }
    } else if (isRed(prevStatus)) {
      comparison.previously_red++;
      if (isRed(currentStatus)) {
        comparison.previously_red_still_red++;
      } else if (currentStatus === 'passed') {
        comparison.previously_red_now_green++;
      }
    }
  }

  const mode = hasBaselineHistory ? 'no-regression' : 'no-baseline';
  // Report the real number of cases; summing every counter would count a
  // case once per bucket it falls into.
  console.log(`[Runner] analyzeGitHeadComparison: hasBaselineHistory=${hasBaselineHistory}, mode=${mode}, cases_checked=${cases.length}`);

  return { comparison, mode };
}
290
+
291
/**
 * Derives the overall verdict of a run. Checks are applied in this order:
 *   1. a relevant case status that is not `null` and not `'passed'`
 *   2. any previously green case that is now red
 *   3. no-baseline mode (with or without `establishBaseline`)
 *   4. otherwise the run is ready for user review
 *
 * @param {object} comparison - Counters; `previously_green_now_red` is read.
 * @param {string} mode - `'no-baseline'` or another mode string.
 * @param {string|null} relevantCaseStatus - Status of the relevant case, or `null`.
 * @param {boolean} establishBaseline - Whether a baseline is being established.
 * @returns {string} Verdict identifier.
 */
function computeVerdict(comparison, mode, relevantCaseStatus, establishBaseline) {
  const hasRelevantCase = relevantCaseStatus !== null;
  if (hasRelevantCase && relevantCaseStatus !== 'passed') {
    return 'relevant_case_failed';
  }

  if (comparison.previously_green_now_red > 0) {
    return 'regression_detected';
  }

  if (mode !== 'no-baseline') {
    return 'ready_for_user_review';
  }

  return establishBaseline ? 'baseline_established' : 'no_baseline_failures';
}
313
+
314
/**
 * Builds the one-line human-readable summary of a run result.
 *
 * @param {object} result - Run result.
 * @param {string} result.verdict - Verdict identifier.
 * @param {object} result.comparison - Baseline comparison counters.
 * @param {string} result.mode - `'no-baseline'` or another mode string.
 * @param {{id: string, status: string}} [result.relevantCase] - Optional relevant case.
 * @returns {string} Summary sentence sequence.
 */
function generateOutcomeMessage(result) {
  const { verdict, comparison, mode, relevantCase } = result;

  const sentences = [
    `Verdict: ${verdict}.`,
    mode === 'no-baseline'
      ? 'Mode: no-baseline (no baseline history found).'
      : 'Mode: no-regression.',
    `Green→Red: ${comparison.previously_green_now_red}/${comparison.previously_green}.`,
    `Red→Green: ${comparison.previously_red_now_green}/${comparison.previously_red}.`,
    `New cases: ${comparison.new_cases}.`,
  ];

  if (relevantCase) {
    sentences.push(`Relevant case (${relevantCase.id}): ${relevantCase.status}.`);
  }

  return sentences.join(' ');
}
335
+
336
/**
 * Locates the `pipeline.yaml` to use.
 *
 * With `overridePath`, only that file is considered. Otherwise the first
 * existing file of these is returned:
 *   1. `<project root>/.workflow/config/pipeline.yaml`
 *   2. `<parent of project root>/configs/pipeline.yaml`
 *
 * @param {string|null} [overridePath=null] - Explicit pipeline file path.
 * @returns {string} Path of the pipeline file.
 * @throws {Error} When the override or both default locations are missing.
 */
function resolvePipelineYaml(overridePath = null) {
  if (overridePath) {
    const absoluteOverride = path.resolve(overridePath);
    if (!fs.existsSync(absoluteOverride)) {
      throw new Error(`Pipeline not found: ${overridePath}`);
    }
    return absoluteOverride;
  }

  const rootDir = findProjectRoot(process.cwd());
  const candidates = [
    path.join(rootDir, '.workflow', 'config', 'pipeline.yaml'),
    path.join(path.dirname(rootDir), 'configs', 'pipeline.yaml'),
  ];

  const firstExisting = candidates.find((candidate) => fs.existsSync(candidate));
  if (firstExisting !== undefined) {
    return firstExisting;
  }

  throw new Error('pipeline.yaml not found in .workflow/config/ or configs/');
}
360
+
361
/**
 * Loads and parses the pipeline configuration file.
 *
 * @param {string|null} [pipelinePath=null] - Optional explicit pipeline file.
 * @returns {object} The document's `pipeline` value when truthy, otherwise the
 *   whole parsed document.
 * @throws {Error} Propagated from `resolvePipelineYaml` when no file is found.
 */
function loadPipelineConfig(pipelinePath = null) {
  const yamlPath = resolvePipelineYaml(pipelinePath);
  const parsed = YAML.load(fs.readFileSync(yamlPath, 'utf8'));
  console.log(`[Runner] Using pipeline.yaml: ${yamlPath}`);

  if (parsed.pipeline) {
    return parsed.pipeline;
  }
  return parsed;
}
368
+
369
/**
 * Verifies that every requested agent id is a key of `pipelineConfig.agents`.
 *
 * @param {Iterable<string>} agentIds - Agent ids to check.
 * @param {object} pipelineConfig - Pipeline configuration; `agents` may be absent.
 * @returns {true} When all ids are known.
 * @throws {Error} Listing every unknown id, in input order.
 */
function validateAgents(agentIds, pipelineConfig) {
  const knownAgents = Object.keys(pipelineConfig.agents || {});
  const unknown = Array.from(agentIds).filter((agentId) => !knownAgents.includes(agentId));

  if (unknown.length === 0) {
    return true;
  }

  throw new Error(`Agent(s) '${unknown.join(', ')}' from target_agents[] not found in pipeline.yaml → agents[]`);
}
385
+
386
/**
 * Reads and parses one test case YAML file from a skill's tests directory.
 *
 * @param {string} skillName - Skill directory name.
 * @param {string} caseFile - Case file path relative to the skill's tests directory.
 * @returns {*} Whatever `YAML.load` produces for the file content.
 * @throws {Error} When the case file does not exist.
 */
function loadTestCase(skillName, caseFile) {
  const casePath = path.join(findSkillTestsDir(skillName), caseFile);

  if (fs.existsSync(casePath)) {
    return YAML.load(fs.readFileSync(casePath, 'utf8'));
  }

  throw new Error(`Test case not found: ${casePath}`);
}
397
+
398
/**
 * Keeps only the cases whose `tags` include the given tag.
 *
 * @param {Array<object>} cases - Case definitions.
 * @param {string|null} tag - Tag to match; a falsy tag returns `cases` itself.
 * @returns {Array<object>} Matching cases (cases without `tags` are dropped).
 */
function filterCasesByTag(cases, tag) {
  if (!tag) {
    return cases;
  }

  const carriesTag = (testCase) => Boolean(testCase.tags) && testCase.tags.includes(tag);
  return cases.filter(carriesTag);
}
402
+
403
/**
 * Keeps only the cases whose `severity` strictly equals the given value.
 *
 * @param {Array<object>} cases - Case definitions.
 * @param {string|null} severity - Severity to match; a falsy value returns `cases` itself.
 * @returns {Array<object>} Matching cases.
 */
function filterCasesBySeverity(cases, severity) {
  if (!severity) {
    return cases;
  }

  const hasSeverity = (testCase) => testCase.severity === severity;
  return cases.filter(hasSeverity);
}
407
+
408
/**
 * Lists the skills that ship a test suite, i.e. every directory under the
 * skills directory that contains `tests/index.yaml`.
 *
 * @returns {string[]} Skill directory names, in `readdirSync` order.
 */
function getAllSkillNamesWithTests() {
  const skillsDir = findSkillsDir();

  const hasTestIndex = (entry) => {
    const entryPath = path.join(skillsDir, entry);
    try {
      return fs.statSync(entryPath).isDirectory()
        && fs.existsSync(path.join(entryPath, 'tests', 'index.yaml'));
    } catch (e) {
      // Entries that cannot be stat'ed are skipped.
      return false;
    }
  };

  return fs.readdirSync(skillsDir).filter(hasTestIndex);
}
428
+
429
/**
 * Runs `src/scripts/scan-fixtures-for-secrets.js` in a child Node process.
 *
 * The scan counts as passed when the child exits with code 0 or its stdout
 * contains `status: passed`.
 *
 * @returns {Promise<{passed: boolean, output?: string}>} Never rejects.
 *   `output` (the child's stdout) is only present on failure.
 */
function runSecretScan() {
  return new Promise((resolve) => {
    const scannerPath = path.join(projectRoot, 'src', 'scripts', 'scan-fixtures-for-secrets.js');
    console.log('[Runner] Running secret scan before L2...');

    const proc = spawn(process.execPath, [scannerPath], {
      cwd: projectRoot,
      stdio: ['ignore', 'pipe', 'pipe']
    });

    // Decode as a stream so a multi-byte character split across two chunks
    // is not corrupted by per-chunk Buffer-to-string conversion.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    let stdout = '';
    let stderr = '';

    proc.stdout.on('data', (data) => { stdout += data; });
    proc.stderr.on('data', (data) => { stderr += data; });

    proc.on('close', (code) => {
      if (code === 0 || stdout.includes('status: passed')) {
        console.log('[Runner] Secret scan passed');
        resolve({ passed: true });
      } else {
        console.log('[Runner] Secret scan FAILED - secrets detected:');
        console.log(stdout);
        if (stderr) console.error(stderr);
        resolve({ passed: false, output: stdout });
      }
    });

    proc.on('error', (err) => {
      // NOTE(review): a scanner that cannot be started is reported as passed
      // (fail-open). Confirm this is intended for a security gate.
      console.error('[Runner] Secret scan error:', err.message);
      resolve({ passed: true });
    });
  });
}
463
+
464
/**
 * Evaluates the static (L0) assertions of a test case against skill files.
 *
 * Only assertions of kind `skill_contains` are evaluated; other kinds produce
 * no result entry. For each one, the target file (`assertion.file`, default
 * `SKILL.md`) is matched case-insensitively against `assertion.pattern`.
 *
 * @param {string} skillName - Skill directory name.
 * @param {object} testCase - Test case; `assertions.static` may be absent.
 * @returns {Array<object>} One result per evaluated assertion.
 */
function runL0Assertions(skillName, testCase) {
  const staticAssertions = testCase.assertions?.static || [];
  const results = [];

  for (const assertion of staticAssertions) {
    if (assertion.kind !== 'skill_contains') {
      continue;
    }

    const { kind, reason, pattern } = assertion;
    const skillFile = path.join(findSkillsDir(), skillName, assertion.file || 'SKILL.md');

    if (fs.existsSync(skillFile)) {
      const skillContent = fs.readFileSync(skillFile, 'utf8');
      const passed = new RegExp(pattern, 'i').test(skillContent);
      results.push({ passed, kind, reason, pattern });
    } else {
      results.push({
        passed: false,
        kind,
        reason,
        error: `Skill file not found: ${skillFile}`
      });
    }
  }

  return results;
}
497
+
498
/**
 * Evaluates the deterministic (L1) assertions of a test case against an
 * agent's output.
 *
 * When there is no output and at least one assertion needs it, every
 * assertion is reported as passed-but-skipped so the caller can tell that
 * nothing was actually checked.
 *
 * @param {string} output - Agent output text; may be empty.
 * @param {object} testCase - Parsed test case; reads `assertions.deterministic`.
 * @returns {Array<object>} One result per assertion with `passed`, `kind` and
 *   kind-specific details (`missing`, `found`, `regex`, `error`, ...).
 */
function runL1Assertions(output, testCase) {
  const assertions = testCase.assertions?.deterministic || [];
  const results = [];

  // `is_json` inspects the output just like the others, so it must take part
  // in the skip decision; otherwise it always fails on empty output.
  const outputDependentKinds = [
    'output_contains_all',
    'output_matches',
    'output_does_not_contain',
    'output_yaml_shape',
    'is_json'
  ];
  if (!output && assertions.some((a) => outputDependentKinds.includes(a.kind))) {
    return assertions.map((a) => ({
      passed: true,
      skipped: true,
      kind: a.kind,
      reason: 'No agent output available (L2 not run)'
    }));
  }

  for (const assertion of assertions) {
    if (assertion.kind === 'output_contains_all') {
      const missing = (assertion.values || []).filter((val) => !output.includes(val));
      results.push({
        passed: missing.length === 0,
        kind: assertion.kind,
        missing,
        values: assertion.values
      });
    } else if (assertion.kind === 'output_matches') {
      // A malformed regex in the test definition is a failed assertion,
      // not a crash of the whole case.
      try {
        const regex = new RegExp(assertion.regex);
        results.push({
          passed: regex.test(output),
          kind: assertion.kind,
          regex: assertion.regex
        });
      } catch (e) {
        results.push({
          passed: false,
          kind: assertion.kind,
          regex: assertion.regex,
          error: e.message
        });
      }
    } else if (assertion.kind === 'output_does_not_contain') {
      const found = (assertion.values || []).filter((val) => output.includes(val));
      results.push({
        passed: found.length === 0,
        kind: assertion.kind,
        found,
        values: assertion.values
      });
    } else if (assertion.kind === 'output_yaml_shape') {
      const requiredKeys = assertion.required_keys;
      if (!Array.isArray(requiredKeys)) {
        // Previously this produced `passed: undefined`; keep it failing but
        // say why.
        results.push({
          passed: false,
          kind: assertion.kind,
          required_keys: requiredKeys,
          error: 'output_yaml_shape requires a required_keys array'
        });
        continue;
      }
      try {
        // NOTE(review): this parses agent-produced text; confirm the YAML
        // library in use applies a safe schema by default.
        const parsed = YAML.load(output);
        const hasKeys = requiredKeys.every((k) => parsed && typeof parsed[k] !== 'undefined');
        results.push({
          passed: Boolean(hasKeys),
          kind: assertion.kind,
          required_keys: requiredKeys
        });
      } catch (e) {
        results.push({
          passed: false,
          kind: assertion.kind,
          error: e.message
        });
      }
    } else if (assertion.kind === 'is_json') {
      try {
        JSON.parse(output);
        results.push({
          passed: true,
          kind: assertion.kind
        });
      } catch (e) {
        results.push({
          passed: false,
          kind: assertion.kind,
          error: e.message
        });
      }
    } else {
      results.push({
        passed: false,
        kind: assertion.kind,
        error: `Unknown assertion kind: ${assertion.kind}`
      });
    }
  }

  return results;
}
588
+
589
/**
 * Returns a short fingerprint of a skill's SKILL.md.
 *
 * @param {string} skillName - Skill directory name under the skills dir.
 * @returns {string} First 7 hex characters of the SHA-256 of the file's
 *   UTF-8 text, or 'unknown' when SKILL.md does not exist.
 */
function getSkillSha(skillName) {
  const skillFile = path.join(findSkillsDir(), skillName, 'SKILL.md');

  if (fs.existsSync(skillFile)) {
    const hasher = crypto.createHash('sha256');
    hasher.update(fs.readFileSync(skillFile, 'utf8'));
    return hasher.digest('hex').slice(0, 7);
  }

  return 'unknown';
}
600
+
601
/**
 * Creates `dir` (including missing parents) unless something already exists
 * at that path.
 *
 * @param {string} dir - Directory path to create.
 */
function ensureDir(dir) {
  if (fs.existsSync(dir)) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
606
+
607
/**
 * Reads a rubric markdown file from `<skill>/tests/rubrics/<rubricName>.md`.
 *
 * @param {string} skillName - Skill directory name under the skills dir.
 * @param {string} rubricName - Rubric file name without the `.md` extension.
 * @returns {string} Rubric text (UTF-8).
 * @throws {Error} When the rubric file does not exist.
 */
function loadRubric(skillName, rubricName) {
  const rubricFile = path.join(findSkillsDir(), skillName, 'tests', 'rubrics', `${rubricName}.md`);

  if (fs.existsSync(rubricFile)) {
    return fs.readFileSync(rubricFile, 'utf8');
  }

  throw new Error(`Rubric not found: ${rubricFile}`);
}
614
+
615
/**
 * Discovers complete calibration sets for a skill.
 *
 * A set exists for rubric `<name>` when all three files are present:
 * `tests/rubrics/calibration/<name>-good.md`,
 * `tests/rubrics/calibration/<name>-bad.md` and `tests/rubrics/<name>.md`.
 *
 * @param {string} skillName - Skill directory name under the skills dir.
 * @returns {Object<string, {good: string, bad: string, rubric: string}>}
 *   Map from rubric name to file paths. Always an object; it is empty when
 *   the calibration directory is missing (previously an empty array was
 *   returned in that case, making the return type inconsistent).
 */
function findCalibrationFiles(skillName) {
  const rubricsRoot = path.join(findSkillsDir(), skillName, 'tests', 'rubrics');
  const calibrationDir = path.join(rubricsRoot, 'calibration');
  const calibrationMap = {};

  if (!fs.existsSync(calibrationDir)) {
    return calibrationMap;
  }

  for (const file of fs.readdirSync(calibrationDir)) {
    const match = file.match(/^(.+)-good\.md$/);
    if (!match) {
      continue;
    }

    const rubricName = match[1];
    const goodPath = path.join(calibrationDir, file);
    const badPath = path.join(calibrationDir, `${rubricName}-bad.md`);
    const rubricPath = path.join(rubricsRoot, `${rubricName}.md`);

    // Incomplete sets (no bad sample or no rubric) are ignored.
    if (fs.existsSync(badPath) && fs.existsSync(rubricPath)) {
      calibrationMap[rubricName] = {
        good: goodPath,
        bad: badPath,
        rubric: rubricPath
      };
    }
  }

  return calibrationMap;
}
644
+
645
/**
 * Extracts the passing score from rubric text.
 *
 * Looks for the first "score ≥ N" (or the ASCII spelling "score >= N"),
 * case-insensitively.
 *
 * @param {string} rubricContent - Rubric markdown text.
 * @returns {number} The declared threshold, or 4 when none is declared.
 */
function extractPassThreshold(rubricContent) {
  const match = rubricContent.match(/score\s*(?:≥|>=)\s*(\d+)/i);
  return match ? Number.parseInt(match[1], 10) : 4;
}
652
+
653
/**
 * Asks the judge agent to score one rubric's "good" and "bad" calibration
 * samples and compares both scores with the rubric's pass threshold.
 *
 * Both judge runs are started together. A judge answer without a parseable
 * (or with a zero) score counts as 3.
 *
 * @param {string} skillName - Not used by this function.
 * @param {string} rubricName - Echoed back in the result.
 * @param {{good: string, bad: string, rubric: string}} calibrationFiles -
 *   Paths of the sample and rubric files.
 * @param {object} pipelineConfig - Must expose `agents[judgeAgentId]`.
 * @param {string} judgeAgentId - Key of the judge agent in the pipeline config.
 * @returns {Promise<object>} `{ rubricName, threshold, goodScore, badScore,
 *   goodPassed, badPassed }`; `badPassed` means the bad sample scored below
 *   the threshold.
 * @throws {Error} When the judge agent is not configured.
 */
async function runCalibrationCheck(skillName, rubricName, calibrationFiles, pipelineConfig, judgeAgentId) {
  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
  if (!judgeAgentConfig) {
    throw new Error(`Judge agent not found: ${judgeAgentId}`);
  }

  const rubricContent = fs.readFileSync(calibrationFiles.rubric, 'utf8');
  const threshold = extractPassThreshold(rubricContent);
  const goodContent = fs.readFileSync(calibrationFiles.good, 'utf8');
  const badContent = fs.readFileSync(calibrationFiles.bad, 'utf8');

  // Renders the prompt sent to the judge for one sample.
  function buildJudgePrompt(agentOutput, task) {
    return `You are a judge evaluating the output of an AI agent.

## Rubric
${rubricContent}

## Target Agent Output
${agentOutput}

## Task
${task}

Please evaluate the output according to the rubric and provide a score from 1 to 5.
Output format:
---RESULT---
score: <number 1-5>
reason: <brief explanation>
---RESULT---`;
  }

  // Cuts the section from the agent-response heading up to the next line
  // that is exactly `---`; falls back to the whole sample when absent.
  function sliceAgentResponse(content) {
    const found = content.match(/## Ответ агента[\s\S]*?^---$/m);
    if (found) {
      return found[0];
    }
    return content;
  }

  const samples = [
    [sliceAgentResponse(goodContent), 'Evaluate the good response'],
    [sliceAgentResponse(badContent), 'Evaluate the bad response']
  ];

  const [goodResult, badResult] = await Promise.all(
    samples.map(([sample, task]) =>
      spawnAgent(judgeAgentConfig, buildJudgePrompt(sample, task), { timeout: 60 })
    )
  );

  const goodScore = parseJudgeResult(goodResult.output)?.score || 3;
  const badScore = parseJudgeResult(badResult.output)?.score || 3;

  return {
    rubricName,
    threshold,
    goodScore,
    badScore,
    goodPassed: goodScore >= threshold,
    badPassed: badScore < threshold
  };
}
708
+
709
/**
 * Runs the judge calibration check for every complete calibration set of a
 * skill, one rubric at a time, and stops at the first miscalibrated rubric.
 *
 * The gate is skipped (and reported as passed) when the skill's index has no
 * `execution.judge_agent` or when no calibration files exist.
 *
 * @param {string} skillName - Skill directory name.
 * @param {object} pipelineConfig - Pipeline configuration forwarded to the check.
 * @returns {Promise<object>} `{ passed, calibrations, warnings?, error? }`.
 */
async function runCalibrationGate(skillName, pipelineConfig) {
  const judgeAgent = loadIndexYaml(skillName).execution?.judge_agent;
  if (!judgeAgent) {
    console.log('[Runner] No judge_agent configured, skipping calibration gate');
    return { passed: true, calibrations: [] };
  }

  const rubricEntries = Object.entries(findCalibrationFiles(skillName));
  if (rubricEntries.length === 0) {
    console.log('[Runner] No calibration files found, skipping calibration gate');
    return { passed: true, calibrations: [], warnings: ['calibration files absent'] };
  }

  const calibrations = [];
  const warnings = [];

  for (const [rubricName, files] of rubricEntries) {
    console.log(`[Runner] Calibrating rubric: ${rubricName}`);
    const outcome = await runCalibrationCheck(skillName, rubricName, files, pipelineConfig, judgeAgent);
    calibrations.push(outcome);

    // The good sample is checked first, then the bad one.
    let detail = null;
    if (!outcome.goodPassed) {
      detail = `good score=${outcome.goodScore}, expected ≥${outcome.threshold}`;
    } else if (!outcome.badPassed) {
      detail = `bad score=${outcome.badScore}, expected <${outcome.threshold}`;
    }

    if (detail !== null) {
      const error = `judge miscalibrated — rubric '${rubricName}' requires fix (${detail})`;
      console.error(`[Runner] ABORT: ${error}`);
      return { passed: false, calibrations, error };
    }

    console.log(`[Runner] ${rubricName}: good=${outcome.goodScore} (≥${outcome.threshold}), bad=${outcome.badScore} (<${outcome.threshold}) ✓`);
  }

  return { passed: true, calibrations, warnings };
}
754
+
755
/**
 * Stores the raw output of one trial at
 * `<skill>/tests/cases/<caseId>/current/<agentId>/trial-<trialNum>.md`,
 * creating the directories when needed.
 *
 * @param {string} skillName - Skill directory name.
 * @param {string} caseId - Test case id.
 * @param {string} agentId - Target agent id (used as a sub-directory).
 * @param {number} trialNum - Trial number.
 * @param {string} output - Text to write.
 * @returns {Promise<string>} Path of the written file.
 */
async function writeTrialOutput(skillName, caseId, agentId, trialNum, output) {
  const currentDir = path.join(findSkillsDir(), skillName, 'tests', 'cases', caseId, 'current');
  ensureDir(currentDir);

  const agentDir = path.join(currentDir, agentId);
  ensureDir(agentDir);

  const trialFile = path.join(currentDir, `${agentId}/trial-${trialNum}.md`);
  fs.writeFileSync(trialFile, output, 'utf8');
  return trialFile;
}
767
+
768
/**
 * Merges the judge results of one run into
 * `<skill>/tests/cases/<caseId>/current/judge.json`.
 *
 * Entries of agents that are not part of this run are kept; entries of
 * agents present in `results.per_model` are replaced. Only `trial`, `score`
 * and `passed` are stored per trial.
 *
 * An existing judge.json that cannot be parsed, or whose fields have an
 * unexpected type, is treated as absent for those fields (with a warning for
 * parse errors) instead of being ignored silently or crashing the run.
 *
 * @param {string} skillName - Skill directory name.
 * @param {string} caseId - Test case id.
 * @param {{per_model?: object, rubric_scores?: Array<object>}} results - Run results.
 * @returns {Promise<void>}
 */
async function writeJudgeResults(skillName, caseId, results) {
  const caseDir = path.join(findSkillsDir(), skillName, 'tests', 'cases', caseId, 'current');
  ensureDir(caseDir);

  const judgePath = path.join(caseDir, 'judge.json');

  let existing = null;
  if (fs.existsSync(judgePath)) {
    try {
      existing = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
    } catch (err) {
      console.warn(`[Runner] Ignoring unreadable ${judgePath}: ${err.message}`);
    }
  }

  const previousPerModel = existing?.per_model;
  const previousScores = existing?.rubric_scores;
  const judgeData = {
    per_model: previousPerModel && typeof previousPerModel === 'object' && !Array.isArray(previousPerModel)
      ? previousPerModel
      : {},
    rubric_scores: Array.isArray(previousScores) ? previousScores : [],
    timestamp: new Date().toISOString()
  };

  // Drop stale scores of the agents that were re-run, then append the new ones.
  const newAgentIds = new Set(Object.keys(results.per_model || {}));
  judgeData.rubric_scores = judgeData.rubric_scores.filter((r) => !newAgentIds.has(r?.agentId));
  judgeData.rubric_scores.push(...(results.rubric_scores || []));

  for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
    judgeData.per_model[agentId] = {
      pass_count: modelData.pass_count,
      total: modelData.total,
      trials: (modelData.trials || []).map((t) => ({
        trial: t.trial,
        score: t.score,
        passed: t.passed
      }))
    };
  }

  fs.writeFileSync(judgePath, JSON.stringify(judgeData, null, 2), 'utf8');
}
805
+
806
/**
 * Prints the estimated number of LLM calls and their cost, then asks for
 * confirmation unless `--yes` was passed on the command line.
 *
 * Every (case × model × trial) needs one target call and one judge call, so
 * the total is the sum of both (it was previously reported as the target
 * count alone, i.e. half of the real number).
 *
 * @param {number} numCases - Number of cases that will run L2.
 * @param {number} numModels - Number of target agents.
 * @param {number} trials - Trials per case and agent.
 * @param {number} [judgeAgentCost=0.02] - Assumed cost of one judge call, USD.
 * @param {number} [targetAgentCost=0.01] - Assumed cost of one target call, USD.
 * @returns {Promise<boolean>} Resolves to `true` when approved. When the user
 *   declines, the process exits with code 0 and the promise never resolves.
 */
async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
  const targetCalls = numCases * numModels * trials;
  const judgeCalls = numCases * numModels * trials;
  const totalLlms = targetCalls + judgeCalls;
  const estimatedCost = (judgeCalls * judgeAgentCost) + (targetCalls * targetAgentCost);

  console.log(`[Runner] Estimated LLM calls: ${totalLlms} (target: ${targetCalls}, judge: ${judgeCalls})`);
  console.log(`[Runner] Estimated cost: ~$${estimatedCost.toFixed(2)}`);

  if (process.argv.includes('--yes')) {
    return true;
  }

  // Loaded lazily: only interactive runs need readline.
  const readline = await import('readline');
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

  return new Promise((resolve) => {
    rl.question(`Estimated ${totalLlms} LLM calls ($${estimatedCost.toFixed(2)}). Continue? [y/N] `, (answer) => {
      rl.close();
      const normalized = answer.toLowerCase();
      if (normalized === 'y' || normalized === 'yes') {
        resolve(true);
      } else {
        console.log('[Runner] Aborted by user');
        process.exit(0);
      }
    });
  });
}
834
+
835
/**
 * Runs the L2 (LLM-judged) evaluation of one test case: every target agent
 * is executed `trials` times, each output is scored by the judge agent
 * against the case's rubric, and the scores are collected per agent.
 *
 * A trial passes when its score reaches the pass threshold declared in the
 * rubric (see `extractPassThreshold`, default 4), the same threshold the
 * calibration gate validates the judge against. A judge answer without a
 * parseable score counts as 3. A trial that throws is recorded as errored
 * with `score: null`.
 *
 * All (agent × trial) tasks are started together.
 * NOTE(review): callers pass `options.concurrency`, but it is not read here.
 *
 * @param {string} skillName - Skill directory name.
 * @param {object} testCase - Parsed test case (scenario, assertions, ...).
 * @param {object} caseDef - Case entry from the index; reads `id` and `file`.
 * @param {string[]} targetAgents - Ids of the agents under test.
 * @param {string} judgeAgentId - Id of the judge agent.
 * @param {object} pipelineConfig - Must expose `agents[<id>]` for every agent.
 * @param {{trials?: number, timeout?: number}} [options] - Trials per agent
 *   (default 3) and target-agent timeout (default 300).
 * @returns {Promise<{per_model: object, rubric_scores: Array<object>, tokens: null}>}
 * @throws {Error} When the judge agent, a target agent or the rubric is missing.
 */
async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
  const { trials = 3, timeout = 300 } = options;

  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
  if (!judgeAgentConfig) {
    throw new Error(`Judge agent not found: ${judgeAgentId}`);
  }

  // The first rubric assertion selects the rubric file; otherwise 'default'.
  let rubricName = 'default';
  if (testCase.assertions?.rubric && testCase.assertions.rubric.length > 0) {
    const rubricPath = testCase.assertions.rubric[0].rubric_file;
    if (rubricPath) {
      rubricName = path.basename(rubricPath, '.md');
    }
  }

  const rubric = loadRubric(skillName, rubricName);
  // Same threshold source as the calibration gate, so that a judge calibrated
  // against "score ≥ N" is also applied with N here.
  const passThreshold = extractPassThreshold(rubric);
  const results = {
    per_model: {},
    rubric_scores: [],
    tokens: null
  };

  const caseId = caseDef?.id || 'unknown';

  // Assembles the target prompt from the scenario. `ticket_file` inputs are
  // additionally copied into the task's working directory.
  function buildTargetPrompt(taskWorkdir) {
    let targetPrompt = '';
    const testsDir = findSkillTestsDir(skillName);
    const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';

    if (testCase.scenario?.system_prompt_file) {
      const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
      if (fs.existsSync(systemPromptPath)) {
        targetPrompt += fs.readFileSync(systemPromptPath, 'utf8') + '\n\n';
      }
    }

    if (testCase.scenario?.extra_instructions) {
      targetPrompt += testCase.scenario.extra_instructions + '\n\n';
    }

    if (testCase.scenario?.inputs) {
      for (const input of testCase.scenario.inputs) {
        if (input.kind === 'file') {
          const fixturePath = path.join(testsDir, caseDir, input.path);
          if (fs.existsSync(fixturePath)) {
            targetPrompt += `## ${input.as || 'Input'}\n`;
            targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
          }
        } else if (input.kind === 'inline') {
          if (input.content) {
            targetPrompt += `## ${input.as || 'Input'}\n`;
            targetPrompt += input.content + '\n\n';
          }
        } else if (input.kind === 'ticket_file') {
          const fixturePath = path.join(testsDir, caseDir, input.path);
          const destDir = input.dest_dir || 'in-progress';
          const ticketId = input.ticket_id;
          if (!ticketId) {
            throw new Error(`ticket_file input requires ticket_id (case ${caseId})`);
          }
          if (!taskWorkdir) {
            throw new Error(`ticket_file input requires task workdir (case ${caseId})`);
          }
          if (!fs.existsSync(fixturePath)) {
            throw new Error(`ticket_file fixture not found: ${fixturePath}`);
          }
          const destPath = path.join(taskWorkdir, '.workflow', 'tickets', destDir, `${ticketId}.md`);
          fs.mkdirSync(path.dirname(destPath), { recursive: true });
          fs.copyFileSync(fixturePath, destPath);
          targetPrompt += `## Context\nticket_id: ${ticketId}\n\n`;
        }
      }
    }

    // Nothing came from the scenario: fall back to the plain prompt fields.
    if (!targetPrompt.trim()) {
      targetPrompt = testCase.prompt || testCase.input || '';
    }

    return targetPrompt;
  }

  const allTasks = [];
  for (const agentId of targetAgents) {
    const agentConfig = pipelineConfig.agents[agentId];
    if (!agentConfig) {
      throw new Error(`Target agent not found: ${agentId}`);
    }
    results.per_model[agentId] = {
      trials: [],
      pass_count: 0,
      total: trials
    };
    for (let trial = 1; trial <= trials; trial++) {
      allTasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
    }
  }

  const allResults = await Promise.all(
    allTasks.map(async (task) => {
      const taskSuffix = `${caseId}-${task.agentId}-t${task.trial}`;
      let taskWorkdir = null;
      try {
        taskWorkdir = createTestWorkdir(skillName, taskSuffix);
        const targetPrompt = buildTargetPrompt(taskWorkdir);
        const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
          timeout,
          stageId: `${caseId}-${task.agentId}-trial-${task.trial}`,
          projectRoot: taskWorkdir
        });

        // Snapshot ticket files after target-run (for judge to inspect actual file state).
        let ticketFilesSection = '';
        const ticketInputs = (testCase.scenario?.inputs || []).filter((i) => i.kind === 'ticket_file');
        for (const input of ticketInputs) {
          const ticketPath = path.join(
            taskWorkdir,
            '.workflow', 'tickets',
            input.dest_dir || 'in-progress',
            `${input.ticket_id}.md`
          );
          if (fs.existsSync(ticketPath)) {
            const content = fs.readFileSync(ticketPath, 'utf8');
            ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id} (${input.dest_dir || 'in-progress'}/)\n\n\`\`\`markdown\n${content}\n\`\`\`\n`;
          } else {
            ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id}\n\n(file missing at ${input.dest_dir || 'in-progress'}/${input.ticket_id}.md)\n`;
          }
        }

        const judgePrompt = `You are a judge evaluating the output of an AI agent.

## Rubric
${rubric}

## Target Agent Output
${targetOutput.output || targetOutput.status || 'No output'}
${ticketFilesSection}
## Task
${testCase.description || testCase.name || 'Evaluate the response'}

Please evaluate the output according to the rubric and provide a score from 1 to 5.
Output format:
---RESULT---
score: <number 1-5>
reason: <brief explanation>
---RESULT---`;

        const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
          timeout: 60,
          stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
        });

        // Unparseable judge answers fall back to a neutral 3.
        let score = 3;
        const parsed = parseJudgeResult(judgeResult.output);
        if (parsed && parsed.score) {
          score = parsed.score;
        }

        await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');

        return {
          trial: task.trial,
          agentId: task.agentId,
          score,
          output: targetOutput.output || '',
          judge_output: judgeResult.output || '',
          passed: score >= passThreshold,
          errored: false
        };
      } catch (err) {
        console.error(`[Runner] Trial errored: ${task.agentId} trial ${task.trial} — ${err.message}`);
        try {
          await writeTrialOutput(
            skillName,
            caseId,
            task.agentId,
            task.trial,
            `# TRIAL ERRORED\n\nagent: ${task.agentId}\ntrial: ${task.trial}\nerror: ${err.message}\n`
          );
        } catch {
          // Best effort: failing to persist the error note must not mask the
          // trial error that is reported below.
        }
        return {
          trial: task.trial,
          agentId: task.agentId,
          score: null,
          error: err.message,
          passed: false,
          errored: true
        };
      } finally {
        if (taskWorkdir) {
          cleanupTestWorkdir(taskWorkdir);
        }
      }
    })
  );

  for (const result of allResults) {
    const modelEntry = results.per_model[result.agentId];
    modelEntry.trials.push(result);
    if (result.errored) {
      modelEntry.error_count = (modelEntry.error_count || 0) + 1;
    } else if (result.passed) {
      modelEntry.pass_count++;
    }
    results.rubric_scores.push({
      agentId: result.agentId,
      trial: result.trial,
      score: result.score,
      errored: !!result.errored,
      error: result.error || undefined
    });
  }

  // Stable ordering: by trial within an agent, by agent id across agents.
  for (const agentId of Object.keys(results.per_model)) {
    results.per_model[agentId].trials.sort((a, b) => a.trial - b.trial);
  }
  results.rubric_scores.sort((a, b) =>
    a.agentId === b.agentId ? a.trial - b.trial : a.agentId.localeCompare(b.agentId)
  );

  return results;
}
1055
+
1056
/**
 * Extracts the score and reason from a judge agent's answer.
 *
 * The judge is asked to answer inside a `---RESULT--- ... ---RESULT---`
 * block. The last such block that contains a numeric score wins, so that a
 * `score:` mentioned in an echoed rubric or prompt is not mistaken for the
 * verdict. Without a usable block the whole text is searched.
 *
 * @param {string} output - Raw judge output.
 * @returns {{score: number, reason: string} | null} `null` when the output is
 *   empty or contains no `score: <digits>`.
 */
function parseJudgeResult(output) {
  if (!output) return null;

  const text = String(output);
  const resultBlocks = [...text.matchAll(/---RESULT---([\s\S]*?)---RESULT---/g)].map((m) => m[1]);
  const candidates = [...resultBlocks.reverse(), text];

  for (const candidate of candidates) {
    const scoreMatch = candidate.match(/score:\s*(\d+)/i);
    if (!scoreMatch) {
      continue;
    }
    const reasonMatch = candidate.match(/reason:\s*(.+)/i);
    return {
      score: Number.parseInt(scoreMatch[1], 10),
      reason: reasonMatch ? reasonMatch[1].trim() : ''
    };
  }

  return null;
}
1071
+
1072
/**
 * Turns per-agent trial counts into pass/fail verdicts.
 *
 * With `aggregate: 'all'`, or `aggregate: 'auto'` (the default) on a
 * `severity: 'critical'` case, every trial must pass. Otherwise an agent
 * passes when at least `ceil(total / 2)` trials pass. An agent whose trials
 * all errored is marked `errored` and failed.
 *
 * @param {{per_model: object}} results - Reads `pass_count`, `error_count`
 *   and `total` per agent.
 * @param {{aggregate?: string, severity?: string}} testCase - Aggregation settings.
 * @returns {{per_model: object, overall_passed: boolean}} `overall_passed` is
 *   true when every agent passed.
 */
function aggregateResults(results, testCase) {
  const mode = testCase.aggregate || 'auto';
  const severity = testCase.severity || 'normal';
  const requireAll = mode === 'auto' ? severity === 'critical' : mode === 'all';

  const verdicts = Object.fromEntries(
    Object.entries(results.per_model).map(([agentId, modelData]) => {
      const { pass_count: passCount, total } = modelData;
      const errorCount = modelData.error_count || 0;
      const majority = Math.ceil(total / 2);
      const nothingRan = total - errorCount === 0;

      let passed;
      if (nothingRan) {
        passed = false;
      } else if (requireAll) {
        passed = passCount === total;
      } else {
        passed = passCount >= majority;
      }

      return [agentId, {
        passed,
        errored: nothingRan,
        pass_count: passCount,
        error_count: errorCount,
        total,
        threshold: requireAll ? total : majority
      }];
    })
  );

  return {
    per_model: verdicts,
    overall_passed: Object.values(verdicts).every((verdict) => verdict.passed)
  };
}
1118
+
1119
/**
 * Writes `<skill>/tests/cases/<caseId>/current/meta.json` for one case run.
 *
 * Per-agent results and rubric scores already stored in meta.json are kept
 * for agents that were not part of this run and replaced for agents that
 * were. When per-agent results exist, they decide between 'passed' and
 * 'failed', except that an explicit incoming 'failed' status is never
 * upgraded: a case that failed before or outside L2 (secret scan, L0, L1)
 * must not be recorded as passed because of earlier L2 results.
 *
 * NOTE(review): the L2 results are aggregated with default settings here, so
 * a case's own `aggregate`/`severity` is not reflected in the stored
 * per-agent verdicts.
 *
 * @param {string} caseId - Test case id.
 * @param {string} skillName - Skill directory name.
 * @param {string} status - Status determined by the caller.
 * @param {number} durationMs - Case duration in milliseconds.
 * @param {object|null} [l2Results=null] - Result of the L2 evaluation, if any.
 * @param {boolean|null} [l1_skipped=null] - Whether L1 assertions were skipped.
 * @returns {Promise<void>}
 */
async function writeMetaJson(caseId, skillName, status, durationMs, l2Results = null, l1_skipped = null) {
  const skillsDir = findSkillsDir();
  const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
  ensureDir(caseDir);

  const metaPath = path.join(caseDir, 'meta.json');
  let existing = null;
  if (fs.existsSync(metaPath)) {
    try {
      existing = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
    } catch (err) {
      console.warn(`[Runner] Ignoring unreadable ${metaPath}: ${err.message}`);
    }
  }

  const meta = {
    date: new Date().toISOString(),
    skill_sha: getSkillSha(skillName),
    status,
    duration_ms: durationMs
  };

  if (l1_skipped) {
    meta.l1_skipped = true;
  }

  const mergedPerModel = (existing && existing.per_model) ? { ...existing.per_model } : {};
  let mergedRubricScores = (existing && existing.rubric_scores) ? [...existing.rubric_scores] : [];

  if (l2Results) {
    const aggregated = aggregateResults(l2Results, {});
    const newAgentIds = new Set(Object.keys(aggregated.per_model || {}));
    for (const [agentId, data] of Object.entries(aggregated.per_model || {})) {
      mergedPerModel[agentId] = data;
    }
    // Replace, rather than accumulate, the scores of re-run agents.
    mergedRubricScores = mergedRubricScores.filter((r) => !newAgentIds.has(r.agentId));
    for (const r of (l2Results.rubric_scores || [])) {
      mergedRubricScores.push(r);
    }
    if (l2Results.tokens) {
      meta.tokens = l2Results.tokens;
    }
  }

  const hasPerModel = Object.keys(mergedPerModel).length > 0;
  if (hasPerModel) {
    meta.per_model = mergedPerModel;
  }
  if (mergedRubricScores.length > 0) {
    meta.rubric_scores = mergedRubricScores;
  }

  // Per-agent verdicts may turn a non-failed status into 'passed'/'failed',
  // but they never override a failure reported by the caller.
  if (hasPerModel && status !== 'failed') {
    const allPassed = Object.values(mergedPerModel).every((m) => m.passed);
    meta.status = allPassed ? 'passed' : 'failed';
  }

  fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2), 'utf8');
}
1175
+
1176
+ async function runTestsForSkill(skillName, opts) {
1177
+ console.log(`[Runner] Per-task isolated workdirs will be created for each (case × agent × trial)`);
1178
+ const result = {
1179
+ skill: skillName,
1180
+ status: 'passed',
1181
+ total: 0,
1182
+ current_run: { passed: 0, failed: 0, no_coverage: 0 },
1183
+ baseline_ref: 'origin/main',
1184
+ target_agents: [],
1185
+ judge_agent: null
1186
+ };
1187
+ let cases = [];
1188
+ const currentRunStatuses = {};
1189
+
1190
+ try {
1191
+ const index = loadIndexYaml(skillName);
1192
+ const pipelineConfig = loadPipelineConfig(opts.pipeline || null);
1193
+
1194
+ const defaultTargetAgents = index.execution?.target_agents || [];
1195
+ const judgeAgent = index.execution?.judge_agent || null;
1196
+
1197
+ if (defaultTargetAgents.length > 0) {
1198
+ validateAgents(defaultTargetAgents, pipelineConfig);
1199
+ console.log(`[Runner] target_agents from index.yaml: ${defaultTargetAgents.join(', ')}`);
1200
+ }
1201
+
1202
+ if (judgeAgent) {
1203
+ validateAgents([judgeAgent], pipelineConfig);
1204
+ console.log(`[Runner] judge_agent from index.yaml: ${judgeAgent}`);
1205
+ }
1206
+
1207
+ let effectiveTargetAgents = defaultTargetAgents;
1208
+
1209
+ if (opts.agent) {
1210
+ validateAgents([opts.agent], pipelineConfig);
1211
+ effectiveTargetAgents = [opts.agent];
1212
+ console.log(`[Runner] Override target_agents via --agent: ${opts.agent}`);
1213
+ } else if (opts.primaryOnly && defaultTargetAgents.length > 0) {
1214
+ effectiveTargetAgents = [defaultTargetAgents[0]];
1215
+ console.log(`[Runner] Using only primary agent: ${effectiveTargetAgents[0]}`);
1216
+ }
1217
+
1218
+ result.target_agents = effectiveTargetAgents;
1219
+ result.judge_agent = judgeAgent;
1220
+
1221
+ if (opts.calibrate) {
1222
+ console.log(`[Runner] Running calibration gate only...`);
1223
+ const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1224
+
1225
+ if (!calibrationResult.passed) {
1226
+ console.error(`[Runner] Calibration FAILED: ${calibrationResult.error}`);
1227
+ result.status = 'calibration_failed';
1228
+ result.error = calibrationResult.error;
1229
+ result.calibration = calibrationResult;
1230
+ return result;
1231
+ }
1232
+
1233
+ console.log('[Runner] Calibration gate PASSED');
1234
+ result.calibration = calibrationResult;
1235
+ result.status = 'calibration_passed';
1236
+ return result;
1237
+ }
1238
+
1239
+ cases = index.cases || [];
1240
+
1241
+ if (opts.tag) {
1242
+ cases = filterCasesByTag(cases, opts.tag);
1243
+ }
1244
+
1245
+ if (opts.severity) {
1246
+ cases = filterCasesBySeverity(cases, opts.severity);
1247
+ }
1248
+
1249
+ if (opts.caseId) {
1250
+ const caseDef = cases.find(c => c.id === opts.caseId);
1251
+ if (caseDef) {
1252
+ const testCase = loadTestCase(skillName, caseDef.file);
1253
+ if (testCase.execution?.target_agents) {
1254
+ validateAgents(testCase.execution.target_agents, pipelineConfig);
1255
+ effectiveTargetAgents = testCase.execution.target_agents;
1256
+ console.log(`[Runner] Override target_agents in case ${opts.caseId}: ${effectiveTargetAgents.join(', ')}`);
1257
+ }
1258
+ if (testCase.execution?.judge_agent) {
1259
+ const caseJudgeAgent = testCase.execution.judge_agent;
1260
+ validateAgents([caseJudgeAgent], pipelineConfig);
1261
+ console.log(`[Runner] Override judge_agent in case ${opts.caseId}: ${caseJudgeAgent}`);
1262
+ }
1263
+ cases = [caseDef];
1264
+ } else {
1265
+ throw new Error(`Case not found: ${opts.caseId}`);
1266
+ }
1267
+ }
1268
+
1269
+ result.total = cases.length;
1270
+
1271
+ const startTime = Date.now();
1272
+
1273
+ const runL2 = !opts.layer || opts.layer === 'l2';
1274
+
1275
+ const casesWithRubric = cases.filter(cd => {
1276
+ try {
1277
+ const tc = loadTestCase(skillName, cd.file);
1278
+ return tc.assertions?.rubric && tc.assertions.rubric.length > 0;
1279
+ } catch { return false; }
1280
+ });
1281
+ const anyHasRubric = casesWithRubric.length > 0;
1282
+
1283
+ if (casesWithRubric.length < cases.length) {
1284
+ const missing = cases.length - casesWithRubric.length;
1285
+ console.log(`[Runner] ${missing}/${cases.length} cases have no rubric — L2 will be skipped for them`);
1286
+ }
1287
+
1288
+ if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric) {
1289
+ const trials = opts.fast ? 1 : 3;
1290
+ const totalModels = effectiveTargetAgents.length;
1291
+ await preFlightApproval(casesWithRubric.length, totalModels, trials);
1292
+ }
1293
+
1294
+ let secretScanFailed = false;
1295
+ let calibrationFailedResult = null;
1296
+
1297
+ const anyRunL1 = !opts.layer || opts.layer === 'deterministic';
1298
+ const anyRunL2 = !opts.layer || opts.layer === 'l2';
1299
+
1300
+ if (anyRunL1 && !opts.skipSecretScan) {
1301
+ const scanResult = await runSecretScan();
1302
+ if (!scanResult.passed) {
1303
+ secretScanFailed = true;
1304
+ result.error = 'Secret scan failed - secrets detected in fixtures';
1305
+ }
1306
+ }
1307
+
1308
+ if (anyRunL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric && !secretScanFailed) {
1309
+ const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1310
+ if (!calibrationResult.passed) {
1311
+ console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
1312
+ calibrationFailedResult = calibrationResult;
1313
+ result.status = 'calibration_failed';
1314
+ result.error = calibrationResult.error;
1315
+ result.calibration = calibrationResult;
1316
+ return { ...result, cases, currentRunStatuses };
1317
+ }
1318
+ if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
1319
+ console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
1320
+ }
1321
+ console.log('[Runner] Calibration gate PASSED');
1322
+ }
1323
+
1324
+ await Promise.all(cases.map(async (caseDef) => {
1325
+ const caseStart = Date.now();
1326
+
1327
+ try {
1328
+ const testCase = loadTestCase(skillName, caseDef.file);
1329
+
1330
+ const hasRubric = testCase.assertions?.rubric && testCase.assertions.rubric.length > 0;
1331
+
1332
+ const runL0 = !opts.layer || opts.layer === 'static' || opts.layer === 'deterministic';
1333
+ const runL1 = !opts.layer || opts.layer === 'deterministic';
1334
+ const runL2 = !opts.layer || opts.layer === 'l2';
1335
+
1336
+ // Secret scan result propagated from pre-loop
1337
+ if (runL1 && !opts.skipSecretScan && secretScanFailed) {
1338
+ result.current_run.failed++;
1339
+ result.status = 'failed';
1340
+ currentRunStatuses[caseDef.id] = 'failed';
1341
+ await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1342
+ return;
1343
+ }
1344
+
1345
+ // L0 static assertions
1346
+ if (runL0) {
1347
+ const l0Results = runL0Assertions(skillName, testCase);
1348
+ const l0Failed = l0Results.filter(r => !r.passed);
1349
+ if (l0Failed.length > 0) {
1350
+ result.current_run.failed++;
1351
+ result.status = 'failed';
1352
+ currentRunStatuses[caseDef.id] = 'failed';
1353
+ await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1354
+ return;
1355
+ }
1356
+ }
1357
+
1358
+ if (runL1) {
1359
+ const mockOutput = '';
1360
+ const l1Results = runL1Assertions(mockOutput, testCase);
1361
+ const l1Failed = l1Results.filter(r => !r.passed);
1362
+ const l1Skipped = l1Results.some(r => r.skipped);
1363
+ const l1Declared = (testCase.assertions?.deterministic || []).length;
1364
+ const l1Executed = l1Results.filter(r => !r.skipped).length;
1365
+
1366
+ const willRunL2 = runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric;
1367
+ const noCoverage = l1Declared > 0 && l1Executed === 0 && !willRunL2;
1368
+
1369
+ let caseStatus;
1370
+ if (l1Failed.length > 0) {
1371
+ caseStatus = 'failed';
1372
+ } else if (noCoverage) {
1373
+ caseStatus = 'no_coverage';
1374
+ } else {
1375
+ caseStatus = 'passed';
1376
+ }
1377
+ currentRunStatuses[caseDef.id] = caseStatus;
1378
+
1379
+ if (caseStatus === 'failed') {
1380
+ result.current_run.failed++;
1381
+ result.status = 'failed';
1382
+ } else if (caseStatus === 'no_coverage') {
1383
+ result.current_run.no_coverage = (result.current_run.no_coverage || 0) + 1;
1384
+ console.log(`[Runner] ${caseDef.id}: no_coverage — L1 assertions require agent output but L2 is not configured (no rubric or no agents)`);
1385
+ } else {
1386
+ result.current_run.passed++;
1387
+ }
1388
+
1389
+ if (l1Skipped) {
1390
+ result.l1_skipped = true;
1391
+ }
1392
+
1393
+ let l2Results = null;
1394
+ if (willRunL2) {
1395
+ const trials = opts.fast ? 1 : 3;
1396
+ const index = loadIndexYaml(skillName);
1397
+ const defaultTimeout = index.execution?.default_timeout_s || 300;
1398
+ const timeout = testCase.execution?.timeout_s || defaultTimeout;
1399
+ const caseTargetAgents = testCase.execution?.target_agents;
1400
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1401
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1402
+ : effectiveTargetAgents;
1403
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1404
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1405
+ }
1406
+ try {
1407
+ l2Results = await runL2Evaluation(
1408
+ skillName,
1409
+ testCase,
1410
+ caseDef,
1411
+ perCaseAgents,
1412
+ judgeAgent,
1413
+ pipelineConfig,
1414
+ { trials, concurrency: 2, timeout }
1415
+ );
1416
+
1417
+ const aggregated = aggregateResults(l2Results, testCase);
1418
+ console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));
1419
+
1420
+ await writeJudgeResults(skillName, caseDef.id, l2Results);
1421
+
1422
+ if (!aggregated.overall_passed) {
1423
+ result.status = 'failed';
1424
+ currentRunStatuses[caseDef.id] = 'failed';
1425
+ }
1426
+ } catch (l2Err) {
1427
+ console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
1428
+ result.status = 'failed';
1429
+ currentRunStatuses[caseDef.id] = 'failed';
1430
+ }
1431
+ }
1432
+
1433
+ await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results, result.l1_skipped);
1434
+ } else if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
1435
+ const trials = opts.fast ? 1 : 3;
1436
+ const defaultTimeout = index.execution?.default_timeout_s || 300;
1437
+ const timeout = testCase.execution?.timeout_s || defaultTimeout;
1438
+ const caseTargetAgents = testCase.execution?.target_agents;
1439
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1440
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1441
+ : effectiveTargetAgents;
1442
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1443
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1444
+ }
1445
+ let l2Results = null;
1446
+ let caseStatus = 'passed';
1447
+ try {
1448
+ l2Results = await runL2Evaluation(
1449
+ skillName,
1450
+ testCase,
1451
+ caseDef,
1452
+ perCaseAgents,
1453
+ judgeAgent,
1454
+ pipelineConfig,
1455
+ { trials, concurrency: 2, timeout }
1456
+ );
1457
+
1458
+ const aggregated = aggregateResults(l2Results, testCase);
1459
+ console.log(`[Runner] L2 Results for ${caseDef.id}:`, JSON.stringify(aggregated, null, 2));
1460
+
1461
+ await writeJudgeResults(skillName, caseDef.id, l2Results);
1462
+
1463
+ if (!aggregated.overall_passed) {
1464
+ result.status = 'failed';
1465
+ result.current_run.failed++;
1466
+ caseStatus = 'failed';
1467
+ } else {
1468
+ result.current_run.passed++;
1469
+ }
1470
+ } catch (l2Err) {
1471
+ console.error(`[Runner] L2 evaluation failed:`, l2Err.message);
1472
+ result.status = 'failed';
1473
+ result.current_run.failed++;
1474
+ caseStatus = 'failed';
1475
+ }
1476
+
1477
+ currentRunStatuses[caseDef.id] = caseStatus;
1478
+ await writeMetaJson(caseDef.id, skillName, caseStatus, Date.now() - caseStart, l2Results);
1479
+ } else {
1480
+ result.current_run.passed++;
1481
+ currentRunStatuses[caseDef.id] = 'passed';
1482
+ await writeMetaJson(caseDef.id, skillName, 'passed', Date.now() - caseStart);
1483
+ }
1484
+ } catch (e) {
1485
+ result.current_run.failed++;
1486
+ result.status = 'failed';
1487
+ currentRunStatuses[caseDef.id] = 'error';
1488
+ await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
1489
+ }
1490
+ }));
1491
+
1492
+ if (result.status === 'passed' && result.current_run.no_coverage > 0 && result.current_run.passed === 0) {
1493
+ result.status = 'no_coverage';
1494
+ }
1495
+ } catch (e) {
1496
+ result.status = 'error';
1497
+ result.error = e.message;
1498
+ }
1499
+
1500
+ return {
1501
+ ...result,
1502
+ cases,
1503
+ currentRunStatuses
1504
+ };
1505
+ }
1506
+
1507
/**
 * Run skill tests for a single skill (`opts.skill`) or for every skill that
 * has tests (`opts.all`) and build the summary object consumed by printResult.
 *
 * When both `opts.skill` and `opts.all` are set, the single-skill path wins.
 *
 * Single-skill mode copies the counters/status returned by runTestsForSkill
 * and, unless this is a calibration run or there are no cases, computes the
 * git-head comparison, verdict and outcome message. A failure in that verdict
 * step is logged and leaves the default verdict in place; it does not change
 * `status`.
 *
 * Aggregated mode sums `total`, `passed`, `failed` and `no_coverage` over all
 * skills; any skill whose status is not exactly 'passed' makes the overall
 * status 'failed'.
 *
 * @param {object} opts - Parsed CLI options (skill, all, calibrate, relevant,
 *   baselineRef, establishBaseline are read here; the whole object is also
 *   forwarded to runTestsForSkill).
 * @returns {Promise<object>} Result summary. On an unexpected exception inside
 *   the run, `status` is 'error' and `error` holds the message.
 * @throws {Error} If neither `opts.skill` nor `opts.all` is provided.
 */
async function runSkillTests(opts) {
  // Validate options
  if (!opts.all && !opts.skill) {
    throw new Error('Either --skill or --all must be specified');
  }

  const results = {
    status: 'passed',
    skill: opts.skill || 'unknown',
    mode: 'deterministic',
    total: 0,
    current_run: { passed: 0, failed: 0, no_coverage: 0 },
    baseline_ref: 'origin/main',
    git_head_comparison: null,
    verdict: 'ready_for_user_review',
    outcome_message: ''
  };

  try {
    if (opts.skill) {
      const skillResult = await runTestsForSkill(opts.skill, opts);

      // Merge skill results
      results.skill = skillResult.skill;
      results.total = skillResult.total;
      results.current_run.passed = skillResult.current_run.passed;
      results.current_run.failed = skillResult.current_run.failed;
      results.current_run.no_coverage = skillResult.current_run.no_coverage || 0;
      results.status = skillResult.status;
      results.target_agents = skillResult.target_agents;
      results.judge_agent = skillResult.judge_agent;
      if (skillResult.error) results.error = skillResult.error;
      if (skillResult.calibration) results.calibration = skillResult.calibration;

      // Prepare for git comparison (if applicable)
      const cases = skillResult.cases;
      const currentRunStatuses = skillResult.currentRunStatuses;

      // Git comparison and verdict (skip for calibration or no cases)
      if (cases && cases.length > 0 && !opts.calibrate && !skillResult.status.startsWith('calibration_')) {
        try {
          const baselineRef = getBaselineRef(opts.skill, opts.baselineRef);
          results.baseline_ref = baselineRef;

          console.log(`[Runner] Computing git head comparison for ${cases.length} cases with baselineRef=${baselineRef}`);
          const gitResult = await analyzeGitHeadComparison(opts.skill, cases, baselineRef, currentRunStatuses);
          const { comparison, mode } = gitResult;
          results.mode = mode;
          results.git_head_comparison = comparison;
          console.log(`[Runner] Git head comparison complete: mode=${mode}`);

          let relevantCaseStatus = null;
          if (opts.relevant) {
            const relevantMetaPath = path.join(findSkillTestsDir(opts.skill), 'cases', opts.relevant, 'current', 'meta.json');
            if (fs.existsSync(relevantMetaPath)) {
              try {
                const meta = JSON.parse(fs.readFileSync(relevantMetaPath, 'utf8'));
                relevantCaseStatus = meta.status;
              } catch (metaErr) {
                // Best effort: an unreadable meta.json must not abort the verdict,
                // but report it so a missing relevant_case_status is explainable.
                console.error(`[Runner] Could not read meta.json for relevant case ${opts.relevant}:`, metaErr.message);
              }
            }
          }

          if (relevantCaseStatus) {
            results.relevant_case_status = relevantCaseStatus;
          }

          results.verdict = computeVerdict(comparison, mode, relevantCaseStatus, opts.establishBaseline);
          results.outcome_message = generateOutcomeMessage({
            verdict: results.verdict,
            comparison,
            mode,
            relevantCase: opts.relevant ? { id: opts.relevant, status: relevantCaseStatus } : null
          });
        } catch (verdictErr) {
          console.error('[Runner] Verdict computation failed:', verdictErr.message);
          console.error('[Runner] Stack:', verdictErr.stack);
        }
      }
    } else if (opts.all) {
      const skillNames = getAllSkillNamesWithTests();
      let total = 0;
      let passed = 0;
      let failed = 0;
      let noCoverage = 0;
      let overallStatus = 'passed';

      // Skills run one after another; each contributes its counters to the totals.
      for (const skillName of skillNames) {
        const skillResult = await runTestsForSkill(skillName, opts);
        total += skillResult.total;
        passed += skillResult.current_run.passed;
        failed += skillResult.current_run.failed;
        // no_coverage may be absent on a skill result, so default it to 0.
        noCoverage += skillResult.current_run.no_coverage || 0;
        if (skillResult.status !== 'passed') {
          overallStatus = 'failed';
        }
      }

      results.total = total;
      results.current_run.passed = passed;
      results.current_run.failed = failed;
      results.current_run.no_coverage = noCoverage;
      results.status = overallStatus;
      results.skill = 'all';
      results.mode = 'aggregated';
      results.verdict = overallStatus === 'passed' ? 'all_passed' : 'aggregated_failed';
      results.outcome_message = overallStatus === 'passed' ? 'All skills passed' : 'Some skills failed';
      results.baseline_ref = null;
    }
  } catch (e) {
    results.status = 'error';
    results.error = e.message;
  }

  return results;
}
1619
+
1620
/**
 * Print a result summary to stdout as `key: value` lines framed by
 * `---RESULT---` markers.
 *
 * status, skill, mode, total and the passed/failed counters are always
 * printed. The no_coverage counter, baseline_ref, the git_head_comparison
 * fields, relevant_case_status, verdict and outcome_message are printed only
 * when their value is truthy.
 *
 * @param {object} result - Summary object; must carry a `current_run` object.
 */
function printResult(result) {
  const MARKER = '---RESULT---';
  const emit = (label, value) => console.log(`${label}: ${value}`);

  console.log(MARKER);

  for (const field of ['status', 'skill', 'mode', 'total']) {
    emit(field, result[field]);
  }

  for (const counter of ['passed', 'failed']) {
    emit(`current_run.${counter}`, result.current_run[counter]);
  }
  if (result.current_run.no_coverage) {
    emit('current_run.no_coverage', result.current_run.no_coverage);
  }

  if (result.baseline_ref) {
    emit('baseline_ref', result.baseline_ref);
  }

  if (result.git_head_comparison) {
    const comparison = result.git_head_comparison;
    const comparisonFields = [
      'previously_green',
      'previously_green_still_green',
      'previously_green_now_red',
      'previously_red',
      'previously_red_still_red',
      'previously_red_now_green',
      'new_cases'
    ];
    for (const field of comparisonFields) {
      emit(`git_head_comparison.${field}`, comparison[field]);
    }
  }

  // Trailing optional fields, in the order they appear in the output.
  for (const field of ['relevant_case_status', 'verdict', 'outcome_message']) {
    if (result[field]) {
      emit(field, result[field]);
    }
  }

  console.log(MARKER);
}
1661
+
1662
/**
 * Print the command-line usage text to stdout, one console.log call per line.
 */
function showHelp() {
  const helpLines = [
    'run-skill-tests.js - Runner for skill tests',
    '',
    'Usage:',
    ' node run-skill-tests.js --skill <name> Run all tests for a skill',
    ' node run-skill-tests.js --case TC-XXX-NNN Run a single test case',
    ' node run-skill-tests.js --tag <tag> Filter tests by tag',
    ' node run-skill-tests.js --severity <level> Filter tests by severity (e.g., critical, normal)',
    ' node run-skill-tests.js --layer static|deterministic|l2 Run only L0, L1 or L2',
    ' node run-skill-tests.js --relevant TC-XXX-NNN Mark relevant case for coach',
    ' node run-skill-tests.js --baseline-ref <ref> Override baseline ref (default: origin/main)',
    ' node run-skill-tests.js --establish-baseline Allow reds in no-baseline mode',
    ' node run-skill-tests.js --all Run all skills',
    ' node run-skill-tests.js --agent <id> Run only on specific model from target_agents[]',
    ' node run-skill-tests.js --primary-only Run only on first model from target_agents[]',
    ' node run-skill-tests.js --skip-secret-scan Skip secret scanning before L2',
    ' node run-skill-tests.js --fast Run with trials=1 for all cases',
    ' node run-skill-tests.js --yes Skip pre-flight approval gate',
    ' node run-skill-tests.js --calibrate Run only calibration gate (no full suite)'
  ];

  for (const line of helpLines) {
    console.log(line);
  }
}
1682
+
1683
/**
 * CLI entry point.
 *
 * With `--help` or `-h` among the arguments, prints usage and returns without
 * running anything. Otherwise parses the options, runs the tests, prints the
 * result block, and terminates the process with exit code 1 only when the
 * result status is 'error'.
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  const helpRequested = cliArgs.some((arg) => arg === '--help' || arg === '-h');

  if (helpRequested) {
    showHelp();
    return;
  }

  const outcome = await runSkillTests(parseArgs());
  printResult(outcome);

  // Only an 'error' status yields a non-zero exit code here.
  if (outcome.status !== 'error') {
    return;
  }
  process.exit(1);
}
1699
+
1700
// Script entry: start main() and turn any rejection into a logged fatal
// error with exit code 1.
main().catch((fatal) => {
  console.error('Fatal error:', fatal.message);
  process.exit(1);
});