workflow-ai 1.0.63 → 1.0.64

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (494)
  1. package/configs/config.yaml +134 -0
  2. package/configs/pipeline.yaml +884 -0
  3. package/configs/ticket-movement-rules.yaml +80 -0
  4. package/package.json +1 -1
  5. package/src/global-dir.mjs +25 -1
  6. package/src/scripts/run-skill-tests.js +348 -136
  7. package/src/skills/analyze-report/README.md +44 -0
  8. package/src/skills/analyze-report/SKILL.md +121 -0
  9. package/src/skills/analyze-report/algorithms/progress-assessment.md +108 -0
  10. package/src/skills/analyze-report/knowledge/analysis-frameworks.md +66 -0
  11. package/src/skills/analyze-report/knowledge/report-structure.md +61 -0
  12. package/src/skills/analyze-report/scripts/calc-plan-metrics.js +234 -0
  13. package/src/skills/analyze-report/templates/analysis-report.md +80 -0
  14. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-1.md +69 -0
  15. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-2.md +103 -0
  16. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/claude-sonnet/trial-3.md +99 -0
  17. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/judge.json +163 -0
  18. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-1.md +89 -0
  19. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-2.md +88 -0
  20. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-deepseek/trial-3.md +100 -0
  21. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-1.md +77 -0
  22. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-2.md +64 -0
  23. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-glm/trial-3.md +110 -0
  24. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-1.md +74 -0
  25. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-2.md +38 -0
  26. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/kilo-minimax/trial-3.md +61 -0
  27. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001/current/meta.json +115 -0
  28. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-001-evidence-from-log.yaml +60 -0
  29. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-1.md +90 -0
  30. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-2.md +89 -0
  31. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/claude-sonnet/trial-3.md +77 -0
  32. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/judge.json +163 -0
  33. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-1.md +84 -0
  34. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-2.md +77 -0
  35. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-deepseek/trial-3.md +89 -0
  36. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-1.md +103 -0
  37. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-2.md +103 -0
  38. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-glm/trial-3.md +103 -0
  39. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-1.md +93 -0
  40. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-2.md +93 -0
  41. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/kilo-minimax/trial-3.md +86 -0
  42. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002/current/meta.json +115 -0
  43. package/src/skills/analyze-report/tests/cases/TC-ANALYZE-REPORT-002-result-block-format.yaml +44 -0
  44. package/src/skills/analyze-report/tests/fixtures/REPORT-002-incorrect-attribution.md +27 -0
  45. package/src/skills/analyze-report/tests/fixtures/pipeline-2026-04-06_qa-001-skip.log +32 -0
  46. package/src/skills/analyze-report/tests/index.yaml +25 -0
  47. package/src/skills/analyze-report/tests/rubrics/evidence-from-log.md +22 -0
  48. package/src/skills/analyze-report/tests/rubrics/result-block-format.md +22 -0
  49. package/src/skills/analyze-report/workflows/progress.md +158 -0
  50. package/src/skills/analyze-report/workflows/retrospective.md +143 -0
  51. package/src/skills/coach/README.md +43 -0
  52. package/src/skills/coach/SKILL.md +166 -0
  53. package/src/skills/coach/SKILL.md.legacy +157 -0
  54. package/src/skills/coach/algorithms/gap-analysis.md +69 -0
  55. package/src/skills/coach/algorithms/improvement-prioritization.md +62 -0
  56. package/src/skills/coach/algorithms/skill-scoring.md +80 -0
  57. package/src/skills/coach/knowledge/audit-applied-changes-clean.txt +11 -0
  58. package/src/skills/coach/knowledge/backlog-management.md +67 -0
  59. package/src/skills/coach/knowledge/backlog-management.md.legacy +90 -0
  60. package/src/skills/coach/knowledge/common-antipatterns.md +76 -0
  61. package/src/skills/coach/knowledge/prompt-engineering.md +45 -0
  62. package/src/skills/coach/knowledge/shared-knowledge-guide.md +44 -0
  63. package/src/skills/coach/knowledge/skill-anatomy.md +49 -0
  64. package/src/skills/coach/knowledge/test-authorship.md +141 -0
  65. package/src/skills/coach/templates/audit-report.md +39 -0
  66. package/src/skills/coach/templates/coach-backlog-init.yaml +14 -0
  67. package/src/skills/coach/templates/coach-backlog-init.yaml.legacy +10 -0
  68. package/src/skills/coach/templates/improvement-plan.md +42 -0
  69. package/src/skills/coach/templates/new-skill.md +95 -0
  70. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-1.md +58 -0
  71. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-2.md +65 -0
  72. package/src/skills/coach/tests/cases/TC-COACH-001/current/claude-sonnet/trial-3.md +58 -0
  73. package/src/skills/coach/tests/cases/TC-COACH-001/current/judge.json +151 -0
  74. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-1.md +46 -0
  75. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-2.md +0 -0
  76. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-deepseek/trial-3.md +75 -0
  77. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-1.md +81 -0
  78. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-2.md +101 -0
  79. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-glm/trial-3.md +91 -0
  80. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-1.md +48 -0
  81. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-2.md +30 -0
  82. package/src/skills/coach/tests/cases/TC-COACH-001/current/kilo-minimax/trial-3.md +55 -0
  83. package/src/skills/coach/tests/cases/TC-COACH-001/current/meta.json +95 -0
  84. package/src/skills/coach/tests/cases/TC-COACH-001-evidence-based-temporal-diagram.yaml +53 -0
  85. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-1.md +46 -0
  86. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-2.md +50 -0
  87. package/src/skills/coach/tests/cases/TC-COACH-002/current/claude-sonnet/trial-3.md +48 -0
  88. package/src/skills/coach/tests/cases/TC-COACH-002/current/judge.json +151 -0
  89. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-1.md +0 -0
  90. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-2.md +37 -0
  91. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-deepseek/trial-3.md +30 -0
  92. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-1.md +23 -0
  93. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-2.md +29 -0
  94. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-glm/trial-3.md +35 -0
  95. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-1.md +13 -0
  96. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-2.md +19 -0
  97. package/src/skills/coach/tests/cases/TC-COACH-002/current/kilo-minimax/trial-3.md +33 -0
  98. package/src/skills/coach/tests/cases/TC-COACH-002/current/meta.json +95 -0
  99. package/src/skills/coach/tests/cases/TC-COACH-002-root-cause-first.yaml +57 -0
  100. package/src/skills/coach/tests/fixtures/pipeline-2026-04-06_id-collision.log +77 -0
  101. package/src/skills/coach/tests/index.yaml +29 -0
  102. package/src/skills/coach/tests/rubrics/calibration/evidence-based-bad.md +13 -0
  103. package/src/skills/coach/tests/rubrics/calibration/evidence-based-good.md +29 -0
  104. package/src/skills/coach/tests/rubrics/evidence-based.md +26 -0
  105. package/src/skills/coach/tests/rubrics/root-cause-first.md +21 -0
  106. package/src/skills/coach/workflows/analyze.md +79 -0
  107. package/src/skills/coach/workflows/analyze.md.legacy +64 -0
  108. package/src/skills/coach/workflows/audit.md +74 -0
  109. package/src/skills/coach/workflows/audit.md.legacy +59 -0
  110. package/src/skills/coach/workflows/create.md +80 -0
  111. package/src/skills/coach/workflows/create.md.legacy +67 -0
  112. package/src/skills/coach/workflows/improve.md +71 -0
  113. package/src/skills/coach/workflows/improve.md.legacy +60 -0
  114. package/src/skills/coach/workflows/research.md +55 -0
  115. package/src/skills/coach/workflows/review.md +52 -0
  116. package/src/skills/coach/workflows/review.md.legacy +48 -0
  117. package/src/skills/coach/workflows/test.md +97 -0
  118. package/src/skills/create-plan/README.md +39 -0
  119. package/src/skills/create-plan/SKILL.md +104 -0
  120. package/src/skills/create-plan/algorithms/risk-assessment.md +73 -0
  121. package/src/skills/create-plan/knowledge/plan-completeness.md +67 -0
  122. package/src/skills/create-plan/knowledge/plan-lifecycle.md +33 -0
  123. package/src/skills/create-plan/knowledge/task-verification-pairs.md +151 -0
  124. package/src/skills/create-plan/scripts/validate-completeness.js +182 -0
  125. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-1.md +5 -0
  126. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-2.md +39 -0
  127. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/claude-sonnet/trial-3.md +35 -0
  128. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/judge.json +167 -0
  129. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-1.md +5 -0
  130. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-2.md +10 -0
  131. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-deepseek/trial-3.md +5 -0
  132. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-1.md +26 -0
  133. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-2.md +86 -0
  134. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-glm/trial-3.md +5 -0
  135. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-1.md +11 -0
  136. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-2.md +15 -0
  137. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/kilo-minimax/trial-3.md +14 -0
  138. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001/current/meta.json +119 -0
  139. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-001-validate-completeness.yaml +41 -0
  140. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-1.md +25 -0
  141. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-2.md +30 -0
  142. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/claude-sonnet/trial-3.md +37 -0
  143. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/judge.json +164 -0
  144. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-1.md +3 -0
  145. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-2.md +11 -0
  146. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-deepseek/trial-3.md +13 -0
  147. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-1.md +44 -0
  148. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-2.md +5 -0
  149. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-glm/trial-3.md +49 -0
  150. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-1.md +6 -0
  151. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-2.md +11 -0
  152. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/kilo-minimax/trial-3.md +16 -0
  153. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002/current/meta.json +116 -0
  154. package/src/skills/create-plan/tests/cases/TC-CREATE-PLAN-002-task-granularity.yaml +39 -0
  155. package/src/skills/create-plan/tests/index.yaml +25 -0
  156. package/src/skills/create-plan/tests/rubrics/task-granularity.md +21 -0
  157. package/src/skills/create-plan/tests/rubrics/validate-completeness.md +21 -0
  158. package/src/skills/create-plan/workflows/create.md +136 -0
  159. package/src/skills/create-report/README.md +40 -0
  160. package/src/skills/create-report/SKILL.md +73 -0
  161. package/src/skills/create-report/algorithms/metric-calculation.md +93 -0
  162. package/src/skills/create-report/knowledge/report-metrics.md +82 -0
  163. package/src/skills/create-report/scripts/calc-metrics.js +383 -0
  164. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-1.md +25 -0
  165. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-2.md +26 -0
  166. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/claude-sonnet/trial-3.md +28 -0
  167. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/judge.json +163 -0
  168. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-1.md +4 -0
  169. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-2.md +3 -0
  170. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-deepseek/trial-3.md +6 -0
  171. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-1.md +8 -0
  172. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-2.md +12 -0
  173. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-glm/trial-3.md +7 -0
  174. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-1.md +12 -0
  175. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-2.md +22 -0
  176. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/kilo-minimax/trial-3.md +13 -0
  177. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001/current/meta.json +115 -0
  178. package/src/skills/create-report/tests/cases/TC-CREATE-REPORT-001-root-cause-attribution.yaml +57 -0
  179. package/src/skills/create-report/tests/index.yaml +20 -0
  180. package/src/skills/create-report/tests/rubrics/root-cause-attribution.md +21 -0
  181. package/src/skills/create-report/workflows/standard.md +175 -0
  182. package/src/skills/decompose-gaps/README.md +39 -0
  183. package/src/skills/decompose-gaps/SKILL.md +78 -0
  184. package/src/skills/decompose-gaps/algorithms/scope-check.md +110 -0
  185. package/src/skills/decompose-gaps/knowledge/scope-validation.md +65 -0
  186. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-1.md +49 -0
  187. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-2.md +56 -0
  188. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/claude-sonnet/trial-3.md +39 -0
  189. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/judge.json +164 -0
  190. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-1.md +25 -0
  191. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-2.md +11 -0
  192. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-deepseek/trial-3.md +26 -0
  193. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-1.md +19 -0
  194. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-2.md +5 -0
  195. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-glm/trial-3.md +28 -0
  196. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-1.md +23 -0
  197. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-2.md +27 -0
  198. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/kilo-minimax/trial-3.md +25 -0
  199. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001/current/meta.json +116 -0
  200. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-001-scope-exclusion.yaml +46 -0
  201. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-1.md +32 -0
  202. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-2.md +20 -0
  203. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/claude-sonnet/trial-3.md +26 -0
  204. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/judge.json +164 -0
  205. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-1.md +7 -0
  206. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-2.md +16 -0
  207. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-deepseek/trial-3.md +7 -0
  208. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-1.md +5 -0
  209. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-2.md +11 -0
  210. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-glm/trial-3.md +13 -0
  211. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-1.md +13 -0
  212. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-2.md +12 -0
  213. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/kilo-minimax/trial-3.md +5 -0
  214. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002/current/meta.json +116 -0
  215. package/src/skills/decompose-gaps/tests/cases/TC-DECOMPOSE-GAPS-002-glob-before-write.yaml +36 -0
  216. package/src/skills/decompose-gaps/tests/index.yaml +25 -0
  217. package/src/skills/decompose-gaps/tests/rubrics/glob-before-write.md +21 -0
  218. package/src/skills/decompose-gaps/tests/rubrics/scope-exclusion.md +21 -0
  219. package/src/skills/decompose-gaps/workflows/decompose.md +120 -0
  220. package/src/skills/decompose-plan/README.md +43 -0
  221. package/src/skills/decompose-plan/SKILL.md +87 -0
  222. package/src/skills/decompose-plan/algorithms/deduplication.md +101 -0
  223. package/src/skills/decompose-plan/knowledge/atomicity-checklist.md +113 -0
  224. package/src/skills/decompose-plan/knowledge/capabilities.md +44 -0
  225. package/src/skills/decompose-plan/knowledge/human-task-rules.md +67 -0
  226. package/src/skills/decompose-plan/knowledge/scope-guard-checklist.md +73 -0
  227. package/src/skills/decompose-plan/scripts/check-atomicity-limit.js +47 -0
  228. package/src/skills/decompose-plan/scripts/check-duplicates.js +323 -0
  229. package/src/skills/decompose-plan/scripts/verify-atomicity.js +408 -0
  230. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-1.md +30 -0
  231. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-2.md +36 -0
  232. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/claude-sonnet/trial-3.md +37 -0
  233. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/judge.json +163 -0
  234. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-1.md +20 -0
  235. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-2.md +17 -0
  236. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-deepseek/trial-3.md +28 -0
  237. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-1.md +114 -0
  238. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-2.md +137 -0
  239. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-glm/trial-3.md +188 -0
  240. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-1.md +0 -0
  241. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-2.md +32 -0
  242. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/kilo-minimax/trial-3.md +110 -0
  243. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001/current/meta.json +115 -0
  244. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-001-atomicity-no-1to1.yaml +56 -0
  245. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-1.md +47 -0
  246. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-2.md +54 -0
  247. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/claude-sonnet/trial-3.md +43 -0
  248. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/judge.json +163 -0
  249. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-1.md +15 -0
  250. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-2.md +5 -0
  251. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-deepseek/trial-3.md +12 -0
  252. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-1.md +34 -0
  253. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-2.md +30 -0
  254. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-glm/trial-3.md +35 -0
  255. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-1.md +0 -0
  256. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-2.md +31 -0
  257. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/kilo-minimax/trial-3.md +0 -0
  258. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002/current/meta.json +115 -0
  259. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-002-get-next-id-mandatory.yaml +44 -0
  260. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-1.md +21 -0
  261. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-2.md +38 -0
  262. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/claude-sonnet/trial-3.md +30 -0
  263. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/judge.json +163 -0
  264. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-1.md +31 -0
  265. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-2.md +35 -0
  266. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-deepseek/trial-3.md +48 -0
  267. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-1.md +167 -0
  268. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-2.md +62 -0
  269. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-glm/trial-3.md +174 -0
  270. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-1.md +0 -0
  271. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-2.md +0 -0
  272. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/kilo-minimax/trial-3.md +0 -0
  273. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003/current/meta.json +115 -0
  274. package/src/skills/decompose-plan/tests/cases/TC-DECOMPOSE-PLAN-003-verbatim-dod-transfer.yaml +42 -0
  275. package/src/skills/decompose-plan/tests/index.yaml +30 -0
  276. package/src/skills/decompose-plan/tests/rubrics/atomicity-no-1to1.md +21 -0
  277. package/src/skills/decompose-plan/tests/rubrics/get-next-id-mandatory.md +21 -0
  278. package/src/skills/decompose-plan/tests/rubrics/verbatim-dod-transfer.md +21 -0
  279. package/src/skills/decompose-plan/workflows/decompose.md +272 -0
  280. package/src/skills/deep-research/README.md +36 -0
  281. package/src/skills/deep-research/SKILL.md +106 -0
  282. package/src/skills/deep-research/algorithms/source-scoring.md +63 -0
  283. package/src/skills/deep-research/algorithms/synthesis.md +67 -0
  284. package/src/skills/deep-research/knowledge/data-validation.md +44 -0
  285. package/src/skills/deep-research/knowledge/perplexity-config.md +30 -0
  286. package/src/skills/deep-research/knowledge/research-methodology.md +54 -0
  287. package/src/skills/deep-research/knowledge/source-evaluation.md +33 -0
  288. package/src/skills/deep-research/scripts/perplexity-research.js +315 -0
  289. package/src/skills/deep-research/templates/brief-summary.md +25 -0
  290. package/src/skills/deep-research/templates/research-report.md +76 -0
  291. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-1.md +48 -0
  292. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-2.md +88 -0
  293. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/claude-haiku/trial-3.md +56 -0
  294. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/judge.json +163 -0
  295. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-1.md +58 -0
  296. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-2.md +249 -0
  297. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-free/trial-3.md +44 -0
  298. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-1.md +96 -0
  299. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-2.md +56 -0
  300. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm/trial-3.md +94 -0
  301. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-1.md +11 -0
  302. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-2.md +1 -0
  303. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/kilo-glm-air/trial-3.md +1 -0
  304. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001/current/meta.json +115 -0
  305. package/src/skills/deep-research/tests/cases/TC-DEEP-RESEARCH-001-self-check-url.yaml +58 -0
  306. package/src/skills/deep-research/tests/index.yaml +20 -0
  307. package/src/skills/deep-research/tests/rubrics/self-check-url.md +34 -0
  308. package/src/skills/deep-research/workflows/base-checklist.md +19 -0
  309. package/src/skills/deep-research/workflows/benchmark.md +38 -0
  310. package/src/skills/deep-research/workflows/competitor.md +44 -0
  311. package/src/skills/deep-research/workflows/custom.md +32 -0
  312. package/src/skills/deep-research/workflows/market.md +44 -0
  313. package/src/skills/deep-research/workflows/technology.md +40 -0
  314. package/src/skills/deep-research/workflows/trend.md +40 -0
  315. package/src/skills/execute-task/README.md +44 -0
  316. package/src/skills/execute-task/SKILL.md +292 -0
  317. package/src/skills/execute-task/algorithms/execution-strategy.md +136 -0
  318. package/src/skills/execute-task/knowledge/context-checkpoints.md +75 -0
  319. package/src/skills/execute-task/knowledge/ticket-structure.md +70 -0
  320. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-1.md +5 -0
  321. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-2.md +5 -0
  322. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/claude-haiku/trial-3.md +5 -0
  323. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/judge.json +124 -0
  324. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-1.md +4 -0
  325. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-2.md +4 -0
  326. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-free/trial-3.md +4 -0
  327. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-1.md +4 -0
  328. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-2.md +4 -0
  329. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/kilo-glm-air/trial-3.md +11 -0
  330. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001/current/meta.json +89 -0
  331. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-001-no-ticket-creation.yaml +48 -0
  332. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-1.md +5 -0
  333. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-2.md +6 -0
  334. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/claude-haiku/trial-3.md +5 -0
  335. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/judge.json +124 -0
  336. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-1.md +4 -0
  337. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-2.md +4 -0
  338. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-free/trial-3.md +8 -0
  339. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-1.md +9 -0
  340. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-2.md +26 -0
  341. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/kilo-glm-air/trial-3.md +4 -0
  342. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002/current/meta.json +89 -0
  343. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-002-no-duplicate-dod.yaml +44 -0
  344. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-1.md +5 -0
  345. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-2.md +5 -0
  346. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/claude-haiku/trial-3.md +5 -0
  347. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/judge.json +46 -0
  348. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003/current/meta.json +37 -0
  349. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-003-verification-proportionality.yaml +46 -0
  350. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-1.md +18 -0
  351. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-2.md +16 -0
  352. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/claude-haiku/trial-3.md +14 -0
  353. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/judge.json +124 -0
  354. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-1.md +5 -0
  355. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-2.md +5 -0
  356. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-free/trial-3.md +1 -0
  357. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-1.md +8 -0
  358. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-2.md +5 -0
  359. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/kilo-glm-air/trial-3.md +4 -0
  360. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004/current/meta.json +89 -0
  361. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-004-no-foreign-ticket-edit.yaml +50 -0
  362. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-1.md +5 -0
  363. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-2.md +5 -0
  364. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/claude-haiku/trial-3.md +5 -0
  365. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/judge.json +124 -0
  366. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-1.md +15 -0
  367. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-2.md +4 -0
  368. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-free/trial-3.md +5 -0
  369. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-1.md +11 -0
  370. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-2.md +11 -0
  371. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/kilo-glm-air/trial-3.md +4 -0
  372. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005/current/meta.json +89 -0
  373. package/src/skills/execute-task/tests/cases/TC-EXECUTE-TASK-005-ticket-fields-updated.yaml +39 -0
  374. package/src/skills/execute-task/tests/fixtures/IMPL-902-create-file.md +41 -0
  375. package/src/skills/execute-task/tests/fixtures/IMPL-904-current-task.md +40 -0
  376. package/src/skills/execute-task/tests/fixtures/IMPL-906-fill-ticket.md +42 -0
  377. package/src/skills/execute-task/tests/fixtures/QA-901-button-click.md +41 -0
  378. package/src/skills/execute-task/tests/fixtures/QA-903-visual-figma.md +40 -0
  379. package/src/skills/execute-task/tests/fixtures/TASK-905-done-with-typo.md +36 -0
  380. package/src/skills/execute-task/tests/index.yaml +39 -0
  381. package/src/skills/execute-task/tests/rubrics/no-duplicate-dod.md +22 -0
  382. package/src/skills/execute-task/tests/rubrics/no-foreign-ticket-edit.md +20 -0
  383. package/src/skills/execute-task/tests/rubrics/no-ticket-creation.md +21 -0
  384. package/src/skills/execute-task/tests/rubrics/ticket-fields-updated.md +23 -0
  385. package/src/skills/execute-task/tests/rubrics/verification-proportionality.md +22 -0
  386. package/src/skills/execute-task/workflows/execute.md +104 -0
  387. package/src/skills/manual-testing/README.md +63 -0
  388. package/src/skills/manual-testing/SKILL.md +174 -0
  389. package/src/skills/manual-testing/algorithms/blocked-tool-strategy.md +74 -0
  390. package/src/skills/manual-testing/algorithms/bug-severity.md +73 -0
  391. package/src/skills/manual-testing/algorithms/mcp-budget.md +97 -0
  392. package/src/skills/manual-testing/algorithms/test-prioritization.md +69 -0
  393. package/src/skills/manual-testing/knowledge/browser-extension-testing.md +102 -0
  394. package/src/skills/manual-testing/knowledge/browser-tools.md +114 -0
  395. package/src/skills/manual-testing/knowledge/desktop-tools-advanced.md +92 -0
  396. package/src/skills/manual-testing/knowledge/desktop-tools-core.md +76 -0
  397. package/src/skills/manual-testing/knowledge/sandbox-advanced.md +83 -0
  398. package/src/skills/manual-testing/knowledge/sandbox-core.md +67 -0
  399. package/src/skills/manual-testing/knowledge/stateful-edge-cases.md +69 -0
  400. package/src/skills/manual-testing/knowledge/test-case-design.md +107 -0
  401. package/src/skills/manual-testing/knowledge/testing-types.md +45 -0
  402. package/src/skills/manual-testing/templates/bug-report.md +52 -0
  403. package/src/skills/manual-testing/templates/test-case.md +34 -0
  404. package/src/skills/manual-testing/templates/test-plan.md +97 -0
  405. package/src/skills/manual-testing/templates/test-session-report.md +56 -0
  406. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-1.md +21 -0
  407. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-2.md +65 -0
  408. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/claude-sonnet/trial-3.md +35 -0
  409. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/judge.json +163 -0
  410. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-1.md +0 -0
  411. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-2.md +7 -0
  412. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-deepseek/trial-3.md +0 -0
  413. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-1.md +4 -0
  414. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-2.md +15 -0
  415. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-glm/trial-3.md +8 -0
  416. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-1.md +5 -0
  417. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-2.md +7 -0
  418. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/kilo-minimax/trial-3.md +7 -0
  419. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001/current/meta.json +114 -0
  420. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-001-sandbox-mandatory.yaml +38 -0
  421. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-1.md +47 -0
  422. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-2.md +39 -0
  423. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/claude-sonnet/trial-3.md +40 -0
  424. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/judge.json +163 -0
  425. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-1.md +19 -0
  426. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-2.md +15 -0
  427. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-deepseek/trial-3.md +24 -0
  428. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-1.md +19 -0
  429. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-2.md +13 -0
  430. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-glm/trial-3.md +18 -0
  431. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-1.md +21 -0
  432. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-2.md +15 -0
  433. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/kilo-minimax/trial-3.md +14 -0
  434. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002/current/meta.json +114 -0
  435. package/src/skills/manual-testing/tests/cases/TC-MANUAL-TESTING-002-visual-tc-screenshot.yaml +37 -0
  436. package/src/skills/manual-testing/tests/index.yaml +25 -0
  437. package/src/skills/manual-testing/tests/last-run-tc001-sonnet.log +140 -0
  438. package/src/skills/manual-testing/tests/last-run-tc002.log +1 -0
  439. package/src/skills/manual-testing/tests/last-run.log +1469 -0
  440. package/src/skills/manual-testing/tests/rubrics/sandbox-mandatory.md +20 -0
  441. package/src/skills/manual-testing/tests/rubrics/visual-tc-screenshot.md +21 -0
  442. package/src/skills/manual-testing/workflows/acceptance.md +80 -0
  443. package/src/skills/manual-testing/workflows/exploratory.md +84 -0
  444. package/src/skills/manual-testing/workflows/regression.md +76 -0
  445. package/src/skills/manual-testing/workflows/smoke.md +109 -0
  446. package/src/skills/manual-testing/workflows/test-plan.md +75 -0
  447. package/src/skills/review-result/README.md +59 -0
  448. package/src/skills/review-result/SKILL.md +138 -0
  449. package/src/skills/review-result/algorithms/verification.md +112 -0
  450. package/src/skills/review-result/knowledge/dod-patterns.md +115 -0
  451. package/src/skills/review-result/scripts/verify-artifacts.js +354 -0
  452. package/src/skills/review-result/templates/verdict.md +153 -0
  453. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-1.md +22 -0
  454. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-2.md +7 -0
  455. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-haiku/trial-3.md +21 -0
  456. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-1.md +6 -0
  457. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-2.md +6 -0
  458. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/claude-sonnet/trial-3.md +18 -0
  459. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/judge.json +164 -0
  460. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-1.md +5 -0
  461. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-2.md +7 -0
  462. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-deepseek/trial-3.md +6 -0
  463. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-1.md +49 -0
  464. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-2.md +28 -0
  465. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-glm/trial-3.md +37 -0
  466. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-1.md +22 -0
  467. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-2.md +13 -0
  468. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/kilo-minimax/trial-3.md +21 -0
  469. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001/current/meta.json +116 -0
  470. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-001-visual-tc-trigger.yaml +51 -0
  471. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-1.md +23 -0
  472. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-2.md +22 -0
  473. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-haiku/trial-3.md +28 -0
  474. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-1.md +4 -0
  475. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-2.md +36 -0
  476. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/claude-sonnet/trial-3.md +4 -0
  477. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/judge.json +163 -0
  478. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-1.md +4 -0
  479. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-2.md +0 -0
  480. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-deepseek/trial-3.md +4 -0
  481. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-1.md +39 -0
  482. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-2.md +25 -0
  483. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-glm/trial-3.md +32 -0
  484. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-1.md +34 -0
  485. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-2.md +8 -0
  486. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/kilo-minimax/trial-3.md +23 -0
  487. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002/current/meta.json +115 -0
  488. package/src/skills/review-result/tests/cases/TC-REVIEW-RESULT-002-path-line-suffix.yaml +39 -0
  489. package/src/skills/review-result/tests/fixtures/IMPL-902-path-with-line.md +43 -0
  490. package/src/skills/review-result/tests/fixtures/QA-901-visual-button.md +46 -0
  491. package/src/skills/review-result/tests/index.yaml +25 -0
  492. package/src/skills/review-result/tests/rubrics/path-line-suffix.md +19 -0
  493. package/src/skills/review-result/tests/rubrics/visual-tc-trigger.md +19 -0
  494. package/src/skills/review-result/workflows/review.md +209 -0
@@ -13,6 +13,55 @@ const __filename = fileURLToPath(import.meta.url);
13
13
  const __dirname = path.dirname(__filename);
14
14
  const projectRoot = findProjectRoot(process.cwd());
15
15
 
16
+ import os from 'os';
17
+ import { execSync } from 'child_process';
18
+
19
/**
 * Create an isolated, throw-away project root for a single test task.
 *
 * Layout produced under the new temp directory:
 *   .workflow/tickets/{backlog,ready,in-progress,review,done,archive}
 *   .workflow/plans/{current,archive}, .workflow/reports, .workflow/logs
 *   .workflow/coach-backlog.yaml   (empty backlog stub)
 *   .workflow/src/skills           (full COPY of <projectRoot>/src/skills)
 *   .workflow/src/scripts          (link to <projectRoot>/src/scripts)
 *   .workflow/config               (link to <projectRoot>/configs)
 *
 * @param {string} skillName - Skill under test; used in the temp directory name.
 * @param {string} [suffix=''] - Optional extra discriminator (e.g. case/agent/trial).
 * @returns {string} Absolute path of the created temp root.
 * @throws {Error} If the directory skeleton or the skills copy cannot be created;
 *   the partially built temp root is removed before the error is rethrown.
 */
function createTestWorkdir(skillName, suffix = '') {
  // The prefix becomes part of a directory name, so path separators and other
  // characters that are not portable in file names must not leak into it.
  const toSafeSegment = (part) => String(part).replace(/[^A-Za-z0-9._-]+/g, '_');
  const prefix = suffix
    ? `wf-test-${toSafeSegment(skillName)}-${toSafeSegment(suffix)}-`
    : `wf-test-${toSafeSegment(skillName)}-`;
  const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), prefix));

  // Linking is best-effort (as before), but a failure is now reported instead of
  // being swallowed, because a missing scripts/config link changes what the agent sees.
  // The 'junction' type creates an NTFS junction on Windows (no elevated privileges,
  // no shell) and is ignored on other platforms, where a plain symlink is created.
  const linkDir = (target, linkPath) => {
    try {
      fs.symlinkSync(target, linkPath, 'junction');
    } catch (err) {
      console.warn(`[Runner] Could not link ${linkPath} -> ${target}: ${err.message}`);
    }
  };

  try {
    const workflowDir = path.join(tmpRoot, '.workflow');
    const subDirs = [
      'tickets/backlog',
      'tickets/ready',
      'tickets/in-progress',
      'tickets/review',
      'tickets/done',
      'tickets/archive',
      'plans/current',
      'plans/archive',
      'reports',
      'logs',
    ];
    for (const sub of subDirs) {
      fs.mkdirSync(path.join(workflowDir, sub), { recursive: true });
    }
    fs.writeFileSync(
      path.join(workflowDir, 'coach-backlog.yaml'),
      'version: 1\nanalyzed_tickets: []\naudited_skills: {}\n',
      'utf8'
    );

    const srcDir = path.join(workflowDir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });

    // Skills are COPIED (not linked) so that agents cannot write to real source files.
    fs.cpSync(path.join(projectRoot, 'src', 'skills'), path.join(srcDir, 'skills'), {
      recursive: true,
      dereference: true,
    });

    // Scripts and configs are linked — read-only for agents in practice.
    linkDir(path.join(projectRoot, 'src', 'scripts'), path.join(srcDir, 'scripts'));
    linkDir(path.join(projectRoot, 'configs'), path.join(workflowDir, 'config'));
  } catch (err) {
    // Nothing above throws once a link exists, so a plain recursive remove cannot
    // reach the real scripts/configs directories here.
    try {
      fs.rmSync(tmpRoot, { recursive: true, force: true });
    } catch {
      // The original setup error is the one worth surfacing.
    }
    throw err;
  }

  return tmpRoot;
}
52
+
53
/**
 * Delete a temporary test workdir and everything inside it.
 *
 * Never throws: cleanup is best-effort, but a failed removal is reported so that
 * leaked temp directories do not accumulate unnoticed.
 *
 * @param {string|null|undefined} tmpRoot - Temp root to delete; falsy or
 *   non-existent paths are ignored.
 * @returns {void}
 */
function cleanupTestWorkdir(tmpRoot) {
  if (!tmpRoot || !fs.existsSync(tmpRoot)) return;

  // Remove junctions first so that their targets are not touched by rmSync.
  if (process.platform === 'win32') {
    for (const link of ['src/scripts', 'config']) {
      const linkPath = path.join(tmpRoot, '.workflow', link);
      try {
        execSync(`rmdir "${linkPath}"`, { stdio: 'pipe', shell: true });
      } catch {
        // Expected when the junction does not exist; nothing to detach.
      }
    }
  }

  try {
    // Retries cover transient EBUSY/EPERM/ENOTEMPTY errors, e.g. while a process
    // that ran inside the workdir is still releasing its file handles.
    fs.rmSync(tmpRoot, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
  } catch (err) {
    console.warn(`[Runner] Failed to remove test workdir ${tmpRoot}: ${err.message}`);
  }
}
64
+
16
65
  function parseArgs() {
17
66
  const args = process.argv.slice(2);
18
67
  const opts = {
@@ -720,13 +769,23 @@ async function writeJudgeResults(skillName, caseId, results) {
720
769
  const skillsDir = findSkillsDir();
721
770
  const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
722
771
  ensureDir(caseDir);
723
-
724
- const judgeData = {
725
- per_model: {},
726
- rubric_scores: results.rubric_scores || [],
727
- timestamp: new Date().toISOString()
728
- };
729
-
772
+
773
+ const judgePath = path.join(caseDir, 'judge.json');
774
+ let judgeData = { per_model: {}, rubric_scores: [], timestamp: new Date().toISOString() };
775
+ if (fs.existsSync(judgePath)) {
776
+ try {
777
+ const existing = JSON.parse(fs.readFileSync(judgePath, 'utf8'));
778
+ judgeData.per_model = existing.per_model || {};
779
+ judgeData.rubric_scores = existing.rubric_scores || [];
780
+ } catch {}
781
+ }
782
+
783
+ const newAgentIds = new Set(Object.keys(results.per_model || {}));
784
+ judgeData.rubric_scores = judgeData.rubric_scores.filter(r => !newAgentIds.has(r.agentId));
785
+ for (const r of (results.rubric_scores || [])) {
786
+ judgeData.rubric_scores.push(r);
787
+ }
788
+
730
789
  for (const [agentId, modelData] of Object.entries(results.per_model || {})) {
731
790
  judgeData.per_model[agentId] = {
732
791
  pass_count: modelData.pass_count,
@@ -738,12 +797,10 @@ async function writeJudgeResults(skillName, caseId, results) {
738
797
  }))
739
798
  };
740
799
  }
741
-
742
- fs.writeFileSync(
743
- path.join(caseDir, 'judge.json'),
744
- JSON.stringify(judgeData, null, 2),
745
- 'utf8'
746
- );
800
+
801
+ judgeData.timestamp = new Date().toISOString();
802
+
803
+ fs.writeFileSync(judgePath, JSON.stringify(judgeData, null, 2), 'utf8');
747
804
  }
748
805
 
749
806
  async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0.02, targetAgentCost = 0.01) {
@@ -776,7 +833,7 @@ async function preFlightApproval(numCases, numModels, trials, judgeAgentCost = 0
776
833
  }
777
834
 
778
835
  async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judgeAgentId, pipelineConfig, options = {}) {
779
- const { trials = 3, concurrency = 2, timeout = 300 } = options;
836
+ const { trials = 3, timeout = 300 } = options;
780
837
 
781
838
  const judgeAgentConfig = pipelineConfig.agents[judgeAgentId];
782
839
  if (!judgeAgentConfig) {
@@ -799,12 +856,12 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
799
856
  };
800
857
 
801
858
  const caseId = caseDef?.id || 'unknown';
802
-
803
- function buildTargetPrompt() {
859
+
860
+ function buildTargetPrompt(taskWorkdir) {
804
861
  let targetPrompt = '';
805
862
  const testsDir = findSkillTestsDir(skillName);
806
863
  const caseDir = caseDef?.file ? path.dirname(caseDef.file) : '';
807
-
864
+
808
865
  if (testCase.scenario?.system_prompt_file) {
809
866
  const systemPromptPath = path.join(testsDir, caseDir, testCase.scenario.system_prompt_file);
810
867
  if (fs.existsSync(systemPromptPath)) {
@@ -824,6 +881,28 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
824
881
  targetPrompt += `## ${input.as || 'Input'}\n`;
825
882
  targetPrompt += fs.readFileSync(fixturePath, 'utf8') + '\n\n';
826
883
  }
884
+ } else if (input.kind === 'inline') {
885
+ if (input.content) {
886
+ targetPrompt += `## ${input.as || 'Input'}\n`;
887
+ targetPrompt += input.content + '\n\n';
888
+ }
889
+ } else if (input.kind === 'ticket_file') {
890
+ const fixturePath = path.join(testsDir, caseDir, input.path);
891
+ const destDir = input.dest_dir || 'in-progress';
892
+ const ticketId = input.ticket_id;
893
+ if (!ticketId) {
894
+ throw new Error(`ticket_file input requires ticket_id (case ${caseId})`);
895
+ }
896
+ if (!taskWorkdir) {
897
+ throw new Error(`ticket_file input requires task workdir (case ${caseId})`);
898
+ }
899
+ if (!fs.existsSync(fixturePath)) {
900
+ throw new Error(`ticket_file fixture not found: ${fixturePath}`);
901
+ }
902
+ const destPath = path.join(taskWorkdir, '.workflow', 'tickets', destDir, `${ticketId}.md`);
903
+ fs.mkdirSync(path.dirname(destPath), { recursive: true });
904
+ fs.copyFileSync(fixturePath, destPath);
905
+ targetPrompt += `## Context\nticket_id: ${ticketId}\n\n`;
827
906
  }
828
907
  }
829
908
  }
@@ -831,46 +910,65 @@ async function runL2Evaluation(skillName, testCase, caseDef, targetAgents, judge
831
910
  if (!targetPrompt.trim()) {
832
911
  targetPrompt = testCase.prompt || testCase.input || '';
833
912
  }
834
-
913
+
835
914
  return targetPrompt;
836
915
  }
837
916
 
917
+ const allTasks = [];
838
918
  for (const agentId of targetAgents) {
839
919
  const agentConfig = pipelineConfig.agents[agentId];
840
920
  if (!agentConfig) {
841
921
  throw new Error(`Target agent not found: ${agentId}`);
842
922
  }
843
-
844
923
  results.per_model[agentId] = {
845
924
  trials: [],
846
925
  pass_count: 0,
847
926
  total: trials
848
927
  };
849
-
850
- const tasks = [];
851
928
  for (let trial = 1; trial <= trials; trial++) {
852
- tasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
929
+ allTasks.push({ agentId, trial, agentConfig, judgeAgentConfig, rubric, testCase });
853
930
  }
854
-
855
- for (let i = 0; i < tasks.length; i += concurrency) {
856
- const batch = tasks.slice(i, i + concurrency);
857
- const batchResults = await Promise.all(
858
- batch.map(async (task) => {
859
- try {
860
- const targetPrompt = buildTargetPrompt();
861
- const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
862
- timeout,
863
- stageId: `${caseId}-${task.agentId}-trial-${task.trial}`
864
- });
865
-
866
- const judgePrompt = `You are a judge evaluating the output of an AI agent.
931
+ }
932
+
933
+ const allResults = await Promise.all(
934
+ allTasks.map(async (task) => {
935
+ const taskSuffix = `${caseId}-${task.agentId}-t${task.trial}`;
936
+ let taskWorkdir = null;
937
+ try {
938
+ taskWorkdir = createTestWorkdir(skillName, taskSuffix);
939
+ const targetPrompt = buildTargetPrompt(taskWorkdir);
940
+ const targetOutput = await spawnAgent(task.agentConfig, targetPrompt, {
941
+ timeout,
942
+ stageId: `${caseId}-${task.agentId}-trial-${task.trial}`,
943
+ projectRoot: taskWorkdir
944
+ });
945
+
946
+ // Snapshot ticket files after target-run (for judge to inspect actual file state).
947
+ let ticketFilesSection = '';
948
+ const ticketInputs = (testCase.scenario?.inputs || []).filter(i => i.kind === 'ticket_file');
949
+ for (const input of ticketInputs) {
950
+ const ticketPath = path.join(
951
+ taskWorkdir,
952
+ '.workflow', 'tickets',
953
+ input.dest_dir || 'in-progress',
954
+ `${input.ticket_id}.md`
955
+ );
956
+ if (fs.existsSync(ticketPath)) {
957
+ const content = fs.readFileSync(ticketPath, 'utf8');
958
+ ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id} (${input.dest_dir || 'in-progress'}/)\n\n\`\`\`markdown\n${content}\n\`\`\`\n`;
959
+ } else {
960
+ ticketFilesSection += `\n## Ticket File After Execution — ${input.ticket_id}\n\n(file missing at ${input.dest_dir || 'in-progress'}/${input.ticket_id}.md)\n`;
961
+ }
962
+ }
963
+
964
+ const judgePrompt = `You are a judge evaluating the output of an AI agent.
867
965
 
868
966
  ## Rubric
869
967
  ${rubric}
870
968
 
871
969
  ## Target Agent Output
872
970
  ${targetOutput.output || targetOutput.status || 'No output'}
873
-
971
+ ${ticketFilesSection}
874
972
  ## Task
875
973
  ${testCase.description || testCase.name || 'Evaluate the response'}
876
974
 
@@ -881,54 +979,77 @@ score: <number 1-5>
881
979
  reason: <brief explanation>
882
980
  ---RESULT---`;
883
981
 
884
- const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
885
- timeout: 60,
886
- stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
887
- });
888
-
889
- let score = 3;
890
- const parsed = parseJudgeResult(judgeResult.output);
891
- if (parsed && parsed.score) {
892
- score = parsed.score;
893
- }
894
-
895
- await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
896
-
897
- return {
898
- trial: task.trial,
899
- agentId: task.agentId,
900
- score,
901
- output: targetOutput.output || '',
902
- judge_output: judgeResult.output || '',
903
- passed: score >= 4
904
- };
905
- } catch (err) {
906
- console.error(`[Runner] Trial failed: ${task.agentId} trial ${task.trial}`, err.message);
907
- return {
908
- trial: task.trial,
909
- agentId: task.agentId,
910
- score: 1,
911
- error: err.message,
912
- passed: false
913
- };
914
- }
915
- })
916
- );
917
-
918
- for (const result of batchResults) {
919
- results.per_model[result.agentId].trials.push(result);
920
- if (result.passed) {
921
- results.per_model[result.agentId].pass_count++;
922
- }
923
- results.rubric_scores.push({
924
- agentId: result.agentId,
925
- trial: result.trial,
926
- score: result.score
982
+ const judgeResult = await spawnAgent(task.judgeAgentConfig, judgePrompt, {
983
+ timeout: 60,
984
+ stageId: `${caseId}-judge-${task.agentId}-trial-${task.trial}`
927
985
  });
986
+
987
+ let score = 3;
988
+ const parsed = parseJudgeResult(judgeResult.output);
989
+ if (parsed && parsed.score) {
990
+ score = parsed.score;
991
+ }
992
+
993
+ await writeTrialOutput(skillName, caseId, task.agentId, task.trial, targetOutput.output || '');
994
+
995
+ return {
996
+ trial: task.trial,
997
+ agentId: task.agentId,
998
+ score,
999
+ output: targetOutput.output || '',
1000
+ judge_output: judgeResult.output || '',
1001
+ passed: score >= 4,
1002
+ errored: false
1003
+ };
1004
+ } catch (err) {
1005
+ console.error(`[Runner] Trial errored: ${task.agentId} trial ${task.trial} — ${err.message}`);
1006
+ try {
1007
+ await writeTrialOutput(
1008
+ skillName,
1009
+ caseId,
1010
+ task.agentId,
1011
+ task.trial,
1012
+ `# TRIAL ERRORED\n\nagent: ${task.agentId}\ntrial: ${task.trial}\nerror: ${err.message}\n`
1013
+ );
1014
+ } catch {}
1015
+ return {
1016
+ trial: task.trial,
1017
+ agentId: task.agentId,
1018
+ score: null,
1019
+ error: err.message,
1020
+ passed: false,
1021
+ errored: true
1022
+ };
1023
+ } finally {
1024
+ if (taskWorkdir) {
1025
+ cleanupTestWorkdir(taskWorkdir);
1026
+ }
928
1027
  }
1028
+ })
1029
+ );
1030
+
1031
+ for (const result of allResults) {
1032
+ results.per_model[result.agentId].trials.push(result);
1033
+ if (result.errored) {
1034
+ results.per_model[result.agentId].error_count = (results.per_model[result.agentId].error_count || 0) + 1;
1035
+ } else if (result.passed) {
1036
+ results.per_model[result.agentId].pass_count++;
929
1037
  }
1038
+ results.rubric_scores.push({
1039
+ agentId: result.agentId,
1040
+ trial: result.trial,
1041
+ score: result.score,
1042
+ errored: !!result.errored,
1043
+ error: result.error || undefined
1044
+ });
930
1045
  }
931
-
1046
+ for (const agentId of Object.keys(results.per_model)) {
1047
+ results.per_model[agentId].trials.sort((a, b) => a.trial - b.trial);
1048
+ }
1049
+ results.rubric_scores.sort((a, b) =>
1050
+ a.agentId === b.agentId ? a.trial - b.trial : a.agentId.localeCompare(b.agentId)
1051
+ );
1052
+
932
1053
  return results;
933
1054
  }
934
1055
 
@@ -961,19 +1082,27 @@ function aggregateResults(results, testCase) {
961
1082
 
962
1083
  for (const [agentId, modelData] of Object.entries(results.per_model)) {
963
1084
  const passCount = modelData.pass_count;
1085
+ const errorCount = modelData.error_count || 0;
964
1086
  const total = modelData.total;
1087
+ const effective = total - errorCount;
965
1088
  const threshold = Math.ceil(total / 2);
966
-
1089
+
967
1090
  let passed;
968
- if (useAll) {
1091
+ let errored = false;
1092
+ if (effective === 0) {
1093
+ passed = false;
1094
+ errored = true;
1095
+ } else if (useAll) {
969
1096
  passed = passCount === total;
970
1097
  } else {
971
1098
  passed = passCount >= threshold;
972
1099
  }
973
-
1100
+
974
1101
  perModelResults[agentId] = {
975
1102
  passed,
1103
+ errored,
976
1104
  pass_count: passCount,
1105
+ error_count: errorCount,
977
1106
  total,
978
1107
  threshold: useAll ? total : threshold
979
1108
  };
@@ -991,40 +1120,66 @@ async function writeMetaJson(caseId, skillName, status, durationMs, l2Results =
991
1120
  const skillsDir = findSkillsDir();
992
1121
  const caseDir = path.join(skillsDir, skillName, 'tests', 'cases', caseId, 'current');
993
1122
  ensureDir(caseDir);
994
-
1123
+
1124
+ const metaPath = path.join(caseDir, 'meta.json');
1125
+ let existing = null;
1126
+ if (fs.existsSync(metaPath)) {
1127
+ try {
1128
+ existing = JSON.parse(fs.readFileSync(metaPath, 'utf8'));
1129
+ } catch {}
1130
+ }
1131
+
995
1132
  const meta = {
996
1133
  date: new Date().toISOString(),
997
1134
  skill_sha: getSkillSha(skillName),
998
1135
  status,
999
1136
  duration_ms: durationMs
1000
1137
  };
1001
-
1138
+
1002
1139
  if (l1_skipped) {
1003
1140
  meta.l1_skipped = true;
1004
1141
  }
1005
-
1142
+
1143
+ const mergedPerModel = (existing && existing.per_model) ? { ...existing.per_model } : {};
1144
+ let mergedRubricScores = (existing && existing.rubric_scores) ? [...existing.rubric_scores] : [];
1145
+
1006
1146
  if (l2Results) {
1007
1147
  const aggregated = aggregateResults(l2Results, {});
1008
- meta.per_model = aggregated.per_model;
1009
- meta.rubric_scores = l2Results.rubric_scores;
1148
+ const newAgentIds = new Set(Object.keys(aggregated.per_model || {}));
1149
+ for (const [agentId, data] of Object.entries(aggregated.per_model || {})) {
1150
+ mergedPerModel[agentId] = data;
1151
+ }
1152
+ mergedRubricScores = mergedRubricScores.filter(r => !newAgentIds.has(r.agentId));
1153
+ for (const r of (l2Results.rubric_scores || [])) {
1154
+ mergedRubricScores.push(r);
1155
+ }
1010
1156
  if (l2Results.tokens) {
1011
1157
  meta.tokens = l2Results.tokens;
1012
1158
  }
1013
1159
  }
1014
-
1015
- fs.writeFileSync(
1016
- path.join(caseDir, 'meta.json'),
1017
- JSON.stringify(meta, null, 2),
1018
- 'utf8'
1019
- );
1160
+
1161
+ if (Object.keys(mergedPerModel).length > 0) {
1162
+ meta.per_model = mergedPerModel;
1163
+ }
1164
+ if (mergedRubricScores.length > 0) {
1165
+ meta.rubric_scores = mergedRubricScores;
1166
+ }
1167
+
1168
+ const allPassed = Object.values(mergedPerModel).every(m => m.passed);
1169
+ if (Object.keys(mergedPerModel).length > 0) {
1170
+ meta.status = allPassed ? 'passed' : 'failed';
1171
+ }
1172
+
1173
+ fs.writeFileSync(metaPath, JSON.stringify(meta, null, 2), 'utf8');
1020
1174
  }
1021
1175
 
1022
1176
  async function runTestsForSkill(skillName, opts) {
1177
+ console.log(`[Runner] Per-task isolated workdirs will be created for each (case × agent × trial)`);
1023
1178
  const result = {
1024
1179
  skill: skillName,
1025
1180
  status: 'passed',
1026
1181
  total: 0,
1027
- current_run: { passed: 0, failed: 0 },
1182
+ current_run: { passed: 0, failed: 0, no_coverage: 0 },
1028
1183
  baseline_ref: 'origin/main',
1029
1184
  target_agents: [],
1030
1185
  judge_agent: null
@@ -1117,14 +1272,56 @@ async function runTestsForSkill(skillName, opts) {
1117
1272
 
1118
1273
  const runL2 = !opts.layer || opts.layer === 'l2';
1119
1274
 
1120
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent) {
1275
+ const casesWithRubric = cases.filter(cd => {
1276
+ try {
1277
+ const tc = loadTestCase(skillName, cd.file);
1278
+ return tc.assertions?.rubric && tc.assertions.rubric.length > 0;
1279
+ } catch { return false; }
1280
+ });
1281
+ const anyHasRubric = casesWithRubric.length > 0;
1282
+
1283
+ if (casesWithRubric.length < cases.length) {
1284
+ const missing = cases.length - casesWithRubric.length;
1285
+ console.log(`[Runner] ${missing}/${cases.length} cases have no rubric — L2 will be skipped for them`);
1286
+ }
1287
+
1288
+ if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric) {
1121
1289
  const trials = opts.fast ? 1 : 3;
1122
1290
  const totalModels = effectiveTargetAgents.length;
1123
- const llmEstimate = cases.length * totalModels * trials * 2;
1124
- await preFlightApproval(cases.length, totalModels, trials);
1291
+ await preFlightApproval(casesWithRubric.length, totalModels, trials);
1292
+ }
1293
+
1294
+ let secretScanFailed = false;
1295
+ let calibrationFailedResult = null;
1296
+
1297
+ const anyRunL1 = !opts.layer || opts.layer === 'deterministic';
1298
+ const anyRunL2 = !opts.layer || opts.layer === 'l2';
1299
+
1300
+ if (anyRunL1 && !opts.skipSecretScan) {
1301
+ const scanResult = await runSecretScan();
1302
+ if (!scanResult.passed) {
1303
+ secretScanFailed = true;
1304
+ result.error = 'Secret scan failed - secrets detected in fixtures';
1305
+ }
1125
1306
  }
1126
1307
 
1127
- for (const caseDef of cases) {
1308
+ if (anyRunL2 && effectiveTargetAgents.length > 0 && judgeAgent && anyHasRubric && !secretScanFailed) {
1309
+ const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1310
+ if (!calibrationResult.passed) {
1311
+ console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
1312
+ calibrationFailedResult = calibrationResult;
1313
+ result.status = 'calibration_failed';
1314
+ result.error = calibrationResult.error;
1315
+ result.calibration = calibrationResult;
1316
+ return { ...result, cases, currentRunStatuses };
1317
+ }
1318
+ if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
1319
+ console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
1320
+ }
1321
+ console.log('[Runner] Calibration gate PASSED');
1322
+ }
1323
+
1324
+ await Promise.all(cases.map(async (caseDef) => {
1128
1325
  const caseStart = Date.now();
1129
1326
 
1130
1327
  try {
@@ -1136,17 +1333,13 @@ async function runTestsForSkill(skillName, opts) {
1136
1333
  const runL1 = !opts.layer || opts.layer === 'deterministic';
1137
1334
  const runL2 = !opts.layer || opts.layer === 'l2';
1138
1335
 
1139
- // Secret scan (only for deterministic layer)
1140
- if (runL1 && !opts.skipSecretScan) {
1141
- const scanResult = await runSecretScan();
1142
- if (!scanResult.passed) {
1143
- result.current_run.failed++;
1144
- result.status = 'failed';
1145
- result.error = 'Secret scan failed - secrets detected in fixtures';
1146
- currentRunStatuses[caseDef.id] = 'failed';
1147
- await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1148
- continue;
1149
- }
1336
+ // Secret scan result propagated from pre-loop
1337
+ if (runL1 && !opts.skipSecretScan && secretScanFailed) {
1338
+ result.current_run.failed++;
1339
+ result.status = 'failed';
1340
+ currentRunStatuses[caseDef.id] = 'failed';
1341
+ await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1342
+ return;
1150
1343
  }
1151
1344
 
1152
1345
  // L0 static assertions
@@ -1158,7 +1351,7 @@ async function runTestsForSkill(skillName, opts) {
1158
1351
  result.status = 'failed';
1159
1352
  currentRunStatuses[caseDef.id] = 'failed';
1160
1353
  await writeMetaJson(caseDef.id, skillName, 'failed', Date.now() - caseStart);
1161
- continue;
1354
+ return;
1162
1355
  }
1163
1356
  }
1164
1357
 
@@ -1167,13 +1360,28 @@ async function runTestsForSkill(skillName, opts) {
1167
1360
  const l1Results = runL1Assertions(mockOutput, testCase);
1168
1361
  const l1Failed = l1Results.filter(r => !r.passed);
1169
1362
  const l1Skipped = l1Results.some(r => r.skipped);
1363
+ const l1Declared = (testCase.assertions?.deterministic || []).length;
1364
+ const l1Executed = l1Results.filter(r => !r.skipped).length;
1170
1365
 
1171
- const caseStatus = l1Failed.length === 0 ? 'passed' : 'failed';
1172
- currentRunStatuses[caseDef.id] = caseStatus;
1366
+ const willRunL2 = runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric;
1367
+ const noCoverage = l1Declared > 0 && l1Executed === 0 && !willRunL2;
1173
1368
 
1369
+ let caseStatus;
1174
1370
  if (l1Failed.length > 0) {
1371
+ caseStatus = 'failed';
1372
+ } else if (noCoverage) {
1373
+ caseStatus = 'no_coverage';
1374
+ } else {
1375
+ caseStatus = 'passed';
1376
+ }
1377
+ currentRunStatuses[caseDef.id] = caseStatus;
1378
+
1379
+ if (caseStatus === 'failed') {
1175
1380
  result.current_run.failed++;
1176
1381
  result.status = 'failed';
1382
+ } else if (caseStatus === 'no_coverage') {
1383
+ result.current_run.no_coverage = (result.current_run.no_coverage || 0) + 1;
1384
+ console.log(`[Runner] ${caseDef.id}: no_coverage — L1 assertions require agent output but L2 is not configured (no rubric or no agents)`);
1177
1385
  } else {
1178
1386
  result.current_run.passed++;
1179
1387
  }
@@ -1182,36 +1390,25 @@ async function runTestsForSkill(skillName, opts) {
1182
1390
  result.l1_skipped = true;
1183
1391
  }
1184
1392
 
1185
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
1186
- const calibrationResult = await runCalibrationGate(skillName, pipelineConfig);
1187
-
1188
- if (!calibrationResult.passed) {
1189
- console.error(`[Runner] Calibration gate FAILED: ${calibrationResult.error}`);
1190
- result.status = 'calibration_failed';
1191
- result.error = calibrationResult.error;
1192
- result.calibration = calibrationResult;
1193
- return result;
1194
- }
1195
-
1196
- if (calibrationResult.warnings && calibrationResult.warnings.length > 0) {
1197
- console.log(`[Runner] Calibration warnings: ${calibrationResult.warnings.join(', ')}`);
1198
- }
1199
-
1200
- console.log('[Runner] Calibration gate PASSED');
1201
- }
1202
-
1203
1393
  let l2Results = null;
1204
- if (runL2 && effectiveTargetAgents.length > 0 && judgeAgent && hasRubric) {
1394
+ if (willRunL2) {
1205
1395
  const trials = opts.fast ? 1 : 3;
1206
1396
  const index = loadIndexYaml(skillName);
1207
1397
  const defaultTimeout = index.execution?.default_timeout_s || 300;
1208
1398
  const timeout = testCase.execution?.timeout_s || defaultTimeout;
1399
+ const caseTargetAgents = testCase.execution?.target_agents;
1400
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1401
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1402
+ : effectiveTargetAgents;
1403
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1404
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1405
+ }
1209
1406
  try {
1210
1407
  l2Results = await runL2Evaluation(
1211
1408
  skillName,
1212
1409
  testCase,
1213
1410
  caseDef,
1214
- effectiveTargetAgents,
1411
+ perCaseAgents,
1215
1412
  judgeAgent,
1216
1413
  pipelineConfig,
1217
1414
  { trials, concurrency: 2, timeout }
@@ -1238,6 +1435,13 @@ async function runTestsForSkill(skillName, opts) {
1238
1435
  const trials = opts.fast ? 1 : 3;
1239
1436
  const defaultTimeout = index.execution?.default_timeout_s || 300;
1240
1437
  const timeout = testCase.execution?.timeout_s || defaultTimeout;
1438
+ const caseTargetAgents = testCase.execution?.target_agents;
1439
+ const perCaseAgents = caseTargetAgents && caseTargetAgents.length > 0
1440
+ ? (validateAgents(caseTargetAgents, pipelineConfig), caseTargetAgents)
1441
+ : effectiveTargetAgents;
1442
+ if (caseTargetAgents && caseTargetAgents.length > 0) {
1443
+ console.log(`[Runner] ${caseDef.id}: per-case target_agents override → ${perCaseAgents.join(', ')}`);
1444
+ }
1241
1445
  let l2Results = null;
1242
1446
  let caseStatus = 'passed';
1243
1447
  try {
@@ -1245,7 +1449,7 @@ async function runTestsForSkill(skillName, opts) {
1245
1449
  skillName,
1246
1450
  testCase,
1247
1451
  caseDef,
1248
- effectiveTargetAgents,
1452
+ perCaseAgents,
1249
1453
  judgeAgent,
1250
1454
  pipelineConfig,
1251
1455
  { trials, concurrency: 2, timeout }
@@ -1283,6 +1487,10 @@ async function runTestsForSkill(skillName, opts) {
1283
1487
  currentRunStatuses[caseDef.id] = 'error';
1284
1488
  await writeMetaJson(caseDef.id, skillName, 'error', Date.now() - caseStart);
1285
1489
  }
1490
+ }));
1491
+
1492
+ if (result.status === 'passed' && result.current_run.no_coverage > 0 && result.current_run.passed === 0) {
1493
+ result.status = 'no_coverage';
1286
1494
  }
1287
1495
  } catch (e) {
1288
1496
  result.status = 'error';
@@ -1307,7 +1515,7 @@ async function runSkillTests(opts) {
1307
1515
  skill: opts.skill || 'unknown',
1308
1516
  mode: 'deterministic',
1309
1517
  total: 0,
1310
- current_run: { passed: 0, failed: 0 },
1518
+ current_run: { passed: 0, failed: 0, no_coverage: 0 },
1311
1519
  baseline_ref: 'origin/main',
1312
1520
  git_head_comparison: null,
1313
1521
  verdict: 'ready_for_user_review',
@@ -1323,6 +1531,7 @@ async function runSkillTests(opts) {
1323
1531
  results.total = skillResult.total;
1324
1532
  results.current_run.passed = skillResult.current_run.passed;
1325
1533
  results.current_run.failed = skillResult.current_run.failed;
1534
+ results.current_run.no_coverage = skillResult.current_run.no_coverage || 0;
1326
1535
  results.status = skillResult.status;
1327
1536
  results.target_agents = skillResult.target_agents;
1328
1537
  results.judge_agent = skillResult.judge_agent;
@@ -1406,7 +1615,7 @@ async function runSkillTests(opts) {
1406
1615
  }
1407
1616
 
1408
1617
  return results;
1409
- }
1618
+ }
1410
1619
 
1411
1620
  function printResult(result) {
1412
1621
  console.log('---RESULT---');
@@ -1416,6 +1625,9 @@ function printResult(result) {
1416
1625
  console.log(`total: ${result.total}`);
1417
1626
  console.log(`current_run.passed: ${result.current_run.passed}`);
1418
1627
  console.log(`current_run.failed: ${result.current_run.failed}`);
1628
+ if (result.current_run.no_coverage) {
1629
+ console.log(`current_run.no_coverage: ${result.current_run.no_coverage}`);
1630
+ }
1419
1631
 
1420
1632
  if (result.baseline_ref) {
1421
1633
  console.log(`baseline_ref: ${result.baseline_ref}`);