agentic-qe 3.4.1 → 3.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (496) hide show
  1. package/.claude/skills/.validation/README.md +111 -0
  2. package/.claude/skills/.validation/examples/chaos-engineering-output.example.json +530 -0
  3. package/.claude/skills/.validation/examples/performance-testing-output.example.json +252 -0
  4. package/.claude/skills/.validation/examples/security-testing-output.example.json +413 -0
  5. package/.claude/skills/.validation/examples/testability-scoring-output.example.json +350 -0
  6. package/.claude/skills/.validation/schemas/skill-eval.schema.json +452 -0
  7. package/.claude/skills/.validation/schemas/skill-frontmatter.schema.json +341 -0
  8. package/.claude/skills/.validation/schemas/skill-output-meta.schema.json +199 -0
  9. package/.claude/skills/.validation/schemas/skill-output.template.json +610 -0
  10. package/.claude/skills/.validation/skill-validation-mcp-integration.md +250 -0
  11. package/.claude/skills/.validation/templates/eval.template.yaml +366 -0
  12. package/.claude/skills/.validation/templates/schemas/output.json +145 -0
  13. package/.claude/skills/.validation/templates/security-testing-eval.template.yaml +725 -0
  14. package/.claude/skills/.validation/templates/skill-frontmatter.example.yaml +225 -0
  15. package/.claude/skills/.validation/templates/validate.template.sh +1060 -0
  16. package/.claude/skills/.validation/templates/validator-lib.sh +1031 -0
  17. package/.claude/skills/.validation/test-data/invalid-output.json +5 -0
  18. package/.claude/skills/.validation/test-data/minimal-output.json +9 -0
  19. package/.claude/skills/.validation/test-data/sample-output.json +73 -0
  20. package/.claude/skills/TRUST-TIERS.md +158 -0
  21. package/.claude/skills/a11y-ally/SKILL.md +1664 -1658
  22. package/.claude/skills/a11y-ally/evals/a11y-ally.yaml +376 -0
  23. package/.claude/skills/a11y-ally/schemas/output.json +549 -0
  24. package/.claude/skills/a11y-ally/scripts/validate.sh +690 -0
  25. package/.claude/skills/accessibility-testing/SKILL.md +8 -1
  26. package/.claude/skills/accessibility-testing/evals/accessibility-testing.yaml +719 -0
  27. package/.claude/skills/accessibility-testing/schemas/output.json +776 -0
  28. package/.claude/skills/accessibility-testing/scripts/validate.sh +943 -0
  29. package/.claude/skills/accessibility-testing/test-data/sample-output.json +191 -0
  30. package/.claude/skills/agentic-quality-engineering/SKILL.md +4 -0
  31. package/.claude/skills/agentic-quality-engineering/schemas/output.json +577 -0
  32. package/.claude/skills/api-testing-patterns/SKILL.md +5 -0
  33. package/.claude/skills/api-testing-patterns/evals/api-testing-patterns.yaml +696 -0
  34. package/.claude/skills/api-testing-patterns/schemas/output.json +845 -0
  35. package/.claude/skills/api-testing-patterns/scripts/validate.sh +931 -0
  36. package/.claude/skills/aqe-v2-v3-migration/schemas/output.json +513 -0
  37. package/.claude/skills/aqe-v2-v3-migration/skill.md +4 -0
  38. package/.claude/skills/brutal-honesty-review/SKILL.md +5 -0
  39. package/.claude/skills/brutal-honesty-review/schemas/output.json +291 -0
  40. package/.claude/skills/brutal-honesty-review/scripts/validate.sh +130 -0
  41. package/.claude/skills/bug-reporting-excellence/SKILL.md +5 -0
  42. package/.claude/skills/bug-reporting-excellence/schemas/output.json +288 -0
  43. package/.claude/skills/bug-reporting-excellence/scripts/validate.sh +108 -0
  44. package/.claude/skills/chaos-engineering-resilience/SKILL.md +5 -0
  45. package/.claude/skills/chaos-engineering-resilience/evals/chaos-engineering-resilience.yaml +761 -0
  46. package/.claude/skills/chaos-engineering-resilience/schemas/output.json +1205 -0
  47. package/.claude/skills/chaos-engineering-resilience/scripts/validate.sh +1107 -0
  48. package/.claude/skills/cicd-pipeline-qe-orchestrator/SKILL.md +6 -0
  49. package/.claude/skills/cicd-pipeline-qe-orchestrator/evals/cicd-pipeline-qe-orchestrator.yaml +157 -0
  50. package/.claude/skills/cicd-pipeline-qe-orchestrator/schemas/output.json +542 -0
  51. package/.claude/skills/cicd-pipeline-qe-orchestrator/scripts/validate.sh +310 -0
  52. package/.claude/skills/code-review-quality/SKILL.md +5 -0
  53. package/.claude/skills/code-review-quality/schemas/output.json +264 -0
  54. package/.claude/skills/code-review-quality/scripts/validate.sh +113 -0
  55. package/.claude/skills/compatibility-testing/evals/compatibility-testing.yaml +410 -0
  56. package/.claude/skills/compatibility-testing/schemas/output.json +551 -0
  57. package/.claude/skills/compatibility-testing/scripts/validate.sh +605 -0
  58. package/.claude/skills/compliance-testing/SKILL.md +7 -0
  59. package/.claude/skills/compliance-testing/evals/compliance-testing.yaml +1107 -0
  60. package/.claude/skills/compliance-testing/schemas/output.json +845 -0
  61. package/.claude/skills/compliance-testing/scripts/validate.sh +888 -0
  62. package/.claude/skills/consultancy-practices/SKILL.md +4 -0
  63. package/.claude/skills/consultancy-practices/schemas/output.json +282 -0
  64. package/.claude/skills/context-driven-testing/SKILL.md +3 -0
  65. package/.claude/skills/contract-testing/SKILL.md +5 -0
  66. package/.claude/skills/contract-testing/evals/contract-testing.yaml +748 -0
  67. package/.claude/skills/contract-testing/schemas/output.json +638 -0
  68. package/.claude/skills/contract-testing/scripts/validate.sh +1033 -0
  69. package/.claude/skills/database-testing/SKILL.md +5 -0
  70. package/.claude/skills/database-testing/evals/database-testing.yaml +968 -0
  71. package/.claude/skills/database-testing/schemas/output.json +1446 -0
  72. package/.claude/skills/database-testing/scripts/validate.sh +1099 -0
  73. package/.claude/skills/exploratory-testing-advanced/SKILL.md +3 -0
  74. package/.claude/skills/holistic-testing-pact/SKILL.md +3 -0
  75. package/.claude/skills/localization-testing/SKILL.md +6 -0
  76. package/.claude/skills/localization-testing/evals/localization-testing.yaml +544 -0
  77. package/.claude/skills/localization-testing/schemas/output.json +325 -0
  78. package/.claude/skills/localization-testing/scripts/validate.sh +131 -0
  79. package/.claude/skills/mobile-testing/SKILL.md +6 -0
  80. package/.claude/skills/mobile-testing/evals/mobile-testing.yaml +537 -0
  81. package/.claude/skills/mobile-testing/schemas/output.json +318 -0
  82. package/.claude/skills/mobile-testing/scripts/validate.sh +127 -0
  83. package/.claude/skills/mutation-testing/SKILL.md +8 -1
  84. package/.claude/skills/mutation-testing/evals/mutation-testing.yaml +652 -0
  85. package/.claude/skills/mutation-testing/schemas/output.json +707 -0
  86. package/.claude/skills/mutation-testing/scripts/validate.sh +992 -0
  87. package/.claude/skills/mutation-testing/test-data/sample-output.json +295 -0
  88. package/.claude/skills/n8n-expression-testing/SKILL.md +6 -0
  89. package/.claude/skills/n8n-expression-testing/evals/n8n-expression-testing.yaml +450 -0
  90. package/.claude/skills/n8n-expression-testing/schemas/output.json +369 -0
  91. package/.claude/skills/n8n-expression-testing/scripts/validate.sh +162 -0
  92. package/.claude/skills/n8n-integration-testing-patterns/SKILL.md +6 -0
  93. package/.claude/skills/n8n-integration-testing-patterns/evals/n8n-integration-testing-patterns.yaml +522 -0
  94. package/.claude/skills/n8n-integration-testing-patterns/schemas/output.json +291 -0
  95. package/.claude/skills/n8n-integration-testing-patterns/scripts/validate.sh +104 -0
  96. package/.claude/skills/n8n-security-testing/SKILL.md +6 -0
  97. package/.claude/skills/n8n-security-testing/evals/n8n-security-testing.yaml +493 -0
  98. package/.claude/skills/n8n-security-testing/schemas/output.json +293 -0
  99. package/.claude/skills/n8n-security-testing/scripts/validate.sh +120 -0
  100. package/.claude/skills/n8n-trigger-testing-strategies/SKILL.md +6 -0
  101. package/.claude/skills/n8n-trigger-testing-strategies/evals/n8n-trigger-testing-strategies.yaml +500 -0
  102. package/.claude/skills/n8n-trigger-testing-strategies/schemas/output.json +295 -0
  103. package/.claude/skills/n8n-trigger-testing-strategies/scripts/validate.sh +121 -0
  104. package/.claude/skills/n8n-workflow-testing-fundamentals/SKILL.md +6 -0
  105. package/.claude/skills/n8n-workflow-testing-fundamentals/evals/n8n-workflow-testing-fundamentals.yaml +497 -0
  106. package/.claude/skills/n8n-workflow-testing-fundamentals/schemas/output.json +254 -0
  107. package/.claude/skills/n8n-workflow-testing-fundamentals/scripts/validate.sh +126 -0
  108. package/.claude/skills/performance-analysis/SKILL.md +6 -0
  109. package/.claude/skills/performance-analysis/evals/performance-analysis.yaml +144 -0
  110. package/.claude/skills/performance-analysis/schemas/output.json +588 -0
  111. package/.claude/skills/performance-analysis/scripts/validate.sh +63 -0
  112. package/.claude/skills/performance-testing/SKILL.md +5 -0
  113. package/.claude/skills/performance-testing/evals/performance-testing.yaml +772 -0
  114. package/.claude/skills/performance-testing/schemas/output.json +1184 -0
  115. package/.claude/skills/performance-testing/scripts/validate.sh +1090 -0
  116. package/.claude/skills/qcsd-ideation-swarm/SKILL.md +1759 -1753
  117. package/.claude/skills/qcsd-ideation-swarm/evals/qcsd-ideation-swarm.yaml +138 -0
  118. package/.claude/skills/qcsd-ideation-swarm/schemas/output.json +568 -0
  119. package/.claude/skills/qcsd-ideation-swarm/scripts/validate.sh +56 -0
  120. package/.claude/skills/qe-chaos-resilience/SKILL.md +6 -0
  121. package/.claude/skills/qe-chaos-resilience/evals/qe-chaos-resilience.yaml +443 -0
  122. package/.claude/skills/qe-chaos-resilience/schemas/output.json +314 -0
  123. package/.claude/skills/qe-chaos-resilience/scripts/validate.sh +401 -0
  124. package/.claude/skills/qe-code-intelligence/SKILL.md +6 -0
  125. package/.claude/skills/qe-code-intelligence/evals/qe-code-intelligence.yaml +459 -0
  126. package/.claude/skills/qe-code-intelligence/schemas/output.json +315 -0
  127. package/.claude/skills/qe-code-intelligence/scripts/validate.sh +408 -0
  128. package/.claude/skills/qe-contract-testing/SKILL.md +6 -0
  129. package/.claude/skills/qe-contract-testing/evals/qe-contract-testing.yaml +513 -0
  130. package/.claude/skills/qe-contract-testing/schemas/output.json +295 -0
  131. package/.claude/skills/qe-contract-testing/scripts/validate.sh +421 -0
  132. package/.claude/skills/qe-coverage-analysis/SKILL.md +6 -0
  133. package/.claude/skills/qe-coverage-analysis/evals/qe-coverage-analysis.yaml +494 -0
  134. package/.claude/skills/qe-coverage-analysis/schemas/output.json +286 -0
  135. package/.claude/skills/qe-coverage-analysis/scripts/validate.sh +453 -0
  136. package/.claude/skills/qe-defect-intelligence/SKILL.md +6 -0
  137. package/.claude/skills/qe-defect-intelligence/evals/qe-defect-intelligence.yaml +511 -0
  138. package/.claude/skills/qe-defect-intelligence/schemas/output.json +283 -0
  139. package/.claude/skills/qe-defect-intelligence/scripts/validate.sh +466 -0
  140. package/.claude/skills/qe-iterative-loop/SKILL.md +445 -440
  141. package/.claude/skills/qe-iterative-loop/schemas/output.json +264 -0
  142. package/.claude/skills/qe-iterative-loop/scripts/validate.sh +474 -0
  143. package/.claude/skills/qe-learning-optimization/SKILL.md +6 -0
  144. package/.claude/skills/qe-learning-optimization/evals/qe-learning-optimization.yaml +144 -0
  145. package/.claude/skills/qe-learning-optimization/schemas/output.json +288 -0
  146. package/.claude/skills/qe-learning-optimization/scripts/validate.sh +56 -0
  147. package/.claude/skills/qe-quality-assessment/SKILL.md +6 -0
  148. package/.claude/skills/qe-quality-assessment/evals/qe-quality-assessment.yaml +506 -0
  149. package/.claude/skills/qe-quality-assessment/schemas/output.json +550 -0
  150. package/.claude/skills/qe-quality-assessment/scripts/validate.sh +352 -0
  151. package/.claude/skills/qe-requirements-validation/SKILL.md +6 -0
  152. package/.claude/skills/qe-requirements-validation/evals/qe-requirements-validation.yaml +598 -0
  153. package/.claude/skills/qe-requirements-validation/schemas/output.json +587 -0
  154. package/.claude/skills/qe-requirements-validation/scripts/validate.sh +275 -0
  155. package/.claude/skills/qe-security-compliance/SKILL.md +6 -0
  156. package/.claude/skills/qe-security-compliance/evals/qe-security-compliance.yaml +595 -0
  157. package/.claude/skills/qe-security-compliance/schemas/output.json +498 -0
  158. package/.claude/skills/qe-security-compliance/scripts/validate.sh +305 -0
  159. package/.claude/skills/qe-test-execution/SKILL.md +6 -0
  160. package/.claude/skills/qe-test-execution/evals/qe-test-execution.yaml +607 -0
  161. package/.claude/skills/qe-test-execution/schemas/output.json +529 -0
  162. package/.claude/skills/qe-test-execution/scripts/validate.sh +323 -0
  163. package/.claude/skills/qe-test-generation/SKILL.md +6 -0
  164. package/.claude/skills/qe-test-generation/evals/qe-test-generation.yaml +148 -0
  165. package/.claude/skills/qe-test-generation/schemas/output.json +439 -0
  166. package/.claude/skills/qe-test-generation/scripts/validate.sh +56 -0
  167. package/.claude/skills/qe-visual-accessibility/SKILL.md +6 -0
  168. package/.claude/skills/qe-visual-accessibility/evals/qe-visual-accessibility.yaml +142 -0
  169. package/.claude/skills/qe-visual-accessibility/schemas/output.json +491 -0
  170. package/.claude/skills/qe-visual-accessibility/scripts/validate.sh +56 -0
  171. package/.claude/skills/quality-metrics/SKILL.md +6 -0
  172. package/.claude/skills/quality-metrics/evals/quality-metrics.yaml +494 -0
  173. package/.claude/skills/quality-metrics/schemas/output.json +403 -0
  174. package/.claude/skills/quality-metrics/scripts/validate.sh +434 -0
  175. package/.claude/skills/refactoring-patterns/SKILL.md +5 -0
  176. package/.claude/skills/refactoring-patterns/schemas/output.json +475 -0
  177. package/.claude/skills/refactoring-patterns/scripts/validate.sh +241 -0
  178. package/.claude/skills/regression-testing/SKILL.md +6 -0
  179. package/.claude/skills/regression-testing/evals/regression-testing.yaml +504 -0
  180. package/.claude/skills/regression-testing/schemas/output.json +311 -0
  181. package/.claude/skills/regression-testing/scripts/validate.sh +130 -0
  182. package/.claude/skills/risk-based-testing/SKILL.md +6 -0
  183. package/.claude/skills/risk-based-testing/evals/risk-based-testing.yaml +141 -0
  184. package/.claude/skills/risk-based-testing/schemas/output.json +480 -0
  185. package/.claude/skills/risk-based-testing/scripts/validate.sh +56 -0
  186. package/.claude/skills/security-testing/SKILL.md +5 -0
  187. package/.claude/skills/security-testing/evals/security-testing.yaml +789 -0
  188. package/.claude/skills/security-testing/schemas/output.json +879 -0
  189. package/.claude/skills/security-testing/scripts/validate.sh +967 -0
  190. package/.claude/skills/security-visual-testing/SKILL.md +223 -217
  191. package/.claude/skills/security-visual-testing/evals/security-visual-testing.yaml +163 -0
  192. package/.claude/skills/security-visual-testing/schemas/output.json +486 -0
  193. package/.claude/skills/security-visual-testing/scripts/validate.sh +748 -0
  194. package/.claude/skills/sherlock-review/SKILL.md +5 -0
  195. package/.claude/skills/sherlock-review/schemas/output.json +297 -0
  196. package/.claude/skills/sherlock-review/scripts/validate.sh +115 -0
  197. package/.claude/skills/shift-left-testing/SKILL.md +6 -0
  198. package/.claude/skills/shift-left-testing/evals/shift-left-testing.yaml +145 -0
  199. package/.claude/skills/shift-left-testing/schemas/output.json +459 -0
  200. package/.claude/skills/shift-left-testing/scripts/validate.sh +56 -0
  201. package/.claude/skills/shift-right-testing/SKILL.md +6 -0
  202. package/.claude/skills/shift-right-testing/evals/shift-right-testing.yaml +147 -0
  203. package/.claude/skills/shift-right-testing/schemas/output.json +418 -0
  204. package/.claude/skills/shift-right-testing/scripts/validate.sh +56 -0
  205. package/.claude/skills/six-thinking-hats/SKILL.md +3 -0
  206. package/.claude/skills/tdd-london-chicago/SKILL.md +5 -0
  207. package/.claude/skills/tdd-london-chicago/schemas/output.json +444 -0
  208. package/.claude/skills/tdd-london-chicago/scripts/validate.sh +214 -0
  209. package/.claude/skills/technical-writing/SKILL.md +4 -0
  210. package/.claude/skills/technical-writing/schemas/output.json +268 -0
  211. package/.claude/skills/test-automation-strategy/SKILL.md +6 -0
  212. package/.claude/skills/test-automation-strategy/evals/test-automation-strategy.yaml +148 -0
  213. package/.claude/skills/test-automation-strategy/schemas/output.json +444 -0
  214. package/.claude/skills/test-automation-strategy/scripts/validate.sh +195 -0
  215. package/.claude/skills/test-data-management/SKILL.md +6 -0
  216. package/.claude/skills/test-data-management/evals/test-data-management.yaml +504 -0
  217. package/.claude/skills/test-data-management/schemas/output.json +284 -0
  218. package/.claude/skills/test-data-management/scripts/validate.sh +137 -0
  219. package/.claude/skills/test-design-techniques/SKILL.md +6 -0
  220. package/.claude/skills/test-design-techniques/evals/test-design-techniques.yaml +142 -0
  221. package/.claude/skills/test-design-techniques/schemas/output.json +295 -0
  222. package/.claude/skills/test-design-techniques/scripts/validate.sh +91 -0
  223. package/.claude/skills/test-environment-management/SKILL.md +4 -0
  224. package/.claude/skills/test-environment-management/schemas/output.json +310 -0
  225. package/.claude/skills/test-reporting-analytics/SKILL.md +6 -0
  226. package/.claude/skills/test-reporting-analytics/evals/test-reporting-analytics.yaml +155 -0
  227. package/.claude/skills/test-reporting-analytics/schemas/output.json +329 -0
  228. package/.claude/skills/test-reporting-analytics/scripts/validate.sh +77 -0
  229. package/.claude/skills/testability-scoring/SKILL.md +5 -0
  230. package/.claude/skills/testability-scoring/evals/testability-scoring.yaml +814 -0
  231. package/.claude/skills/testability-scoring/schemas/output.json +606 -0
  232. package/.claude/skills/testability-scoring/scripts/validate.sh +891 -0
  233. package/.claude/skills/trust-tier-manifest.json +2333 -0
  234. package/.claude/skills/verification-quality/SKILL.md +6 -0
  235. package/.claude/skills/verification-quality/evals/verification-quality.yaml +150 -0
  236. package/.claude/skills/verification-quality/schemas/output.json +432 -0
  237. package/.claude/skills/verification-quality/scripts/validate.sh +77 -0
  238. package/.claude/skills/visual-testing-advanced/SKILL.md +6 -0
  239. package/.claude/skills/visual-testing-advanced/evals/visual-testing-advanced.yaml +154 -0
  240. package/.claude/skills/visual-testing-advanced/schemas/output.json +294 -0
  241. package/.claude/skills/visual-testing-advanced/scripts/validate.sh +77 -0
  242. package/.claude/skills/xp-practices/SKILL.md +3 -0
  243. package/README.md +39 -2
  244. package/package.json +4 -1
  245. package/scripts/run-skill-eval.ts +1097 -0
  246. package/scripts/test-schema-validation.js +301 -0
  247. package/scripts/update-skill-manifest.ts +751 -0
  248. package/scripts/validate-skill-frontmatter.js +141 -0
  249. package/v3/CHANGELOG.md +21 -0
  250. package/v3/README.md +43 -3
  251. package/v3/assets/agents/v3/qe-test-idea-rewriter.md +375 -375
  252. package/v3/assets/skills/.validation/README.md +111 -0
  253. package/v3/assets/skills/.validation/examples/chaos-engineering-output.example.json +530 -0
  254. package/v3/assets/skills/.validation/examples/performance-testing-output.example.json +252 -0
  255. package/v3/assets/skills/.validation/examples/security-testing-output.example.json +413 -0
  256. package/v3/assets/skills/.validation/examples/testability-scoring-output.example.json +350 -0
  257. package/v3/assets/skills/.validation/schemas/skill-eval.schema.json +452 -0
  258. package/v3/assets/skills/.validation/schemas/skill-frontmatter.schema.json +341 -0
  259. package/v3/assets/skills/.validation/schemas/skill-output-meta.schema.json +199 -0
  260. package/v3/assets/skills/.validation/schemas/skill-output.template.json +610 -0
  261. package/v3/assets/skills/.validation/skill-validation-mcp-integration.md +250 -0
  262. package/v3/assets/skills/.validation/templates/eval.template.yaml +366 -0
  263. package/v3/assets/skills/.validation/templates/schemas/output.json +145 -0
  264. package/v3/assets/skills/.validation/templates/security-testing-eval.template.yaml +725 -0
  265. package/v3/assets/skills/.validation/templates/skill-frontmatter.example.yaml +225 -0
  266. package/v3/assets/skills/.validation/templates/validate.template.sh +1060 -0
  267. package/v3/assets/skills/.validation/templates/validator-lib.sh +1031 -0
  268. package/v3/assets/skills/.validation/test-data/invalid-output.json +5 -0
  269. package/v3/assets/skills/.validation/test-data/minimal-output.json +9 -0
  270. package/v3/assets/skills/.validation/test-data/sample-output.json +73 -0
  271. package/v3/assets/skills/accessibility-testing/SKILL.md +8 -1
  272. package/v3/assets/skills/accessibility-testing/evals/accessibility-testing.yaml +719 -0
  273. package/v3/assets/skills/accessibility-testing/schemas/output.json +776 -0
  274. package/v3/assets/skills/accessibility-testing/scripts/validate.sh +943 -0
  275. package/v3/assets/skills/accessibility-testing/test-data/sample-output.json +191 -0
  276. package/v3/assets/skills/agentic-quality-engineering/SKILL.md +4 -0
  277. package/v3/assets/skills/agentic-quality-engineering/schemas/output.json +577 -0
  278. package/v3/assets/skills/api-testing-patterns/SKILL.md +5 -0
  279. package/v3/assets/skills/api-testing-patterns/evals/api-testing-patterns.yaml +696 -0
  280. package/v3/assets/skills/api-testing-patterns/schemas/output.json +845 -0
  281. package/v3/assets/skills/api-testing-patterns/scripts/validate.sh +931 -0
  282. package/v3/assets/skills/aqe-v2-v3-migration/schemas/output.json +513 -0
  283. package/v3/assets/skills/aqe-v2-v3-migration/skill.md +20 -16
  284. package/v3/assets/skills/bug-reporting-excellence/SKILL.md +5 -0
  285. package/v3/assets/skills/bug-reporting-excellence/schemas/output.json +288 -0
  286. package/v3/assets/skills/bug-reporting-excellence/scripts/validate.sh +108 -0
  287. package/v3/assets/skills/chaos-engineering-resilience/SKILL.md +5 -0
  288. package/v3/assets/skills/chaos-engineering-resilience/evals/chaos-engineering-resilience.yaml +761 -0
  289. package/v3/assets/skills/chaos-engineering-resilience/schemas/output.json +1205 -0
  290. package/v3/assets/skills/chaos-engineering-resilience/scripts/validate.sh +1107 -0
  291. package/v3/assets/skills/code-review-quality/SKILL.md +5 -0
  292. package/v3/assets/skills/code-review-quality/schemas/output.json +264 -0
  293. package/v3/assets/skills/code-review-quality/scripts/validate.sh +113 -0
  294. package/v3/assets/skills/compatibility-testing/evals/compatibility-testing.yaml +410 -0
  295. package/v3/assets/skills/compatibility-testing/schemas/output.json +551 -0
  296. package/v3/assets/skills/compatibility-testing/scripts/validate.sh +605 -0
  297. package/v3/assets/skills/compliance-testing/SKILL.md +7 -0
  298. package/v3/assets/skills/compliance-testing/evals/compliance-testing.yaml +1107 -0
  299. package/v3/assets/skills/compliance-testing/schemas/output.json +845 -0
  300. package/v3/assets/skills/compliance-testing/scripts/validate.sh +888 -0
  301. package/v3/assets/skills/consultancy-practices/SKILL.md +4 -0
  302. package/v3/assets/skills/consultancy-practices/schemas/output.json +282 -0
  303. package/v3/assets/skills/context-driven-testing/SKILL.md +3 -0
  304. package/v3/assets/skills/contract-testing/SKILL.md +5 -0
  305. package/v3/assets/skills/contract-testing/evals/contract-testing.yaml +748 -0
  306. package/v3/assets/skills/contract-testing/schemas/output.json +638 -0
  307. package/v3/assets/skills/contract-testing/scripts/validate.sh +1033 -0
  308. package/v3/assets/skills/database-testing/SKILL.md +5 -0
  309. package/v3/assets/skills/database-testing/evals/database-testing.yaml +968 -0
  310. package/v3/assets/skills/database-testing/schemas/output.json +1446 -0
  311. package/v3/assets/skills/database-testing/scripts/validate.sh +1099 -0
  312. package/v3/assets/skills/exploratory-testing-advanced/SKILL.md +3 -0
  313. package/v3/assets/skills/holistic-testing-pact/SKILL.md +3 -0
  314. package/v3/assets/skills/localization-testing/SKILL.md +6 -0
  315. package/v3/assets/skills/localization-testing/evals/localization-testing.yaml +544 -0
  316. package/v3/assets/skills/localization-testing/schemas/output.json +325 -0
  317. package/v3/assets/skills/localization-testing/scripts/validate.sh +131 -0
  318. package/v3/assets/skills/mobile-testing/SKILL.md +6 -0
  319. package/v3/assets/skills/mobile-testing/evals/mobile-testing.yaml +537 -0
  320. package/v3/assets/skills/mobile-testing/schemas/output.json +318 -0
  321. package/v3/assets/skills/mobile-testing/scripts/validate.sh +127 -0
  322. package/v3/assets/skills/mutation-testing/SKILL.md +8 -1
  323. package/v3/assets/skills/mutation-testing/evals/mutation-testing.yaml +652 -0
  324. package/v3/assets/skills/mutation-testing/schemas/output.json +707 -0
  325. package/v3/assets/skills/mutation-testing/scripts/validate.sh +992 -0
  326. package/v3/assets/skills/mutation-testing/test-data/sample-output.json +295 -0
  327. package/v3/assets/skills/n8n-expression-testing/scripts/validate.sh +162 -0
  328. package/v3/assets/skills/n8n-integration-testing-patterns/scripts/validate.sh +104 -0
  329. package/v3/assets/skills/n8n-security-testing/scripts/validate.sh +120 -0
  330. package/v3/assets/skills/n8n-trigger-testing-strategies/scripts/validate.sh +121 -0
  331. package/v3/assets/skills/n8n-workflow-testing-fundamentals/scripts/validate.sh +126 -0
  332. package/v3/assets/skills/performance-testing/SKILL.md +5 -0
  333. package/v3/assets/skills/performance-testing/evals/performance-testing.yaml +772 -0
  334. package/v3/assets/skills/performance-testing/schemas/output.json +1184 -0
  335. package/v3/assets/skills/performance-testing/scripts/validate.sh +1090 -0
  336. package/v3/assets/skills/qe-chaos-resilience/SKILL.md +6 -0
  337. package/v3/assets/skills/qe-chaos-resilience/evals/qe-chaos-resilience.yaml +443 -0
  338. package/v3/assets/skills/qe-chaos-resilience/schemas/output.json +314 -0
  339. package/v3/assets/skills/qe-chaos-resilience/scripts/validate.sh +401 -0
  340. package/v3/assets/skills/qe-code-intelligence/SKILL.md +6 -0
  341. package/v3/assets/skills/qe-code-intelligence/evals/qe-code-intelligence.yaml +459 -0
  342. package/v3/assets/skills/qe-code-intelligence/schemas/output.json +315 -0
  343. package/v3/assets/skills/qe-code-intelligence/scripts/validate.sh +408 -0
  344. package/v3/assets/skills/qe-contract-testing/SKILL.md +6 -0
  345. package/v3/assets/skills/qe-contract-testing/evals/qe-contract-testing.yaml +513 -0
  346. package/v3/assets/skills/qe-contract-testing/schemas/output.json +295 -0
  347. package/v3/assets/skills/qe-contract-testing/scripts/validate.sh +421 -0
  348. package/v3/assets/skills/qe-coverage-analysis/SKILL.md +6 -0
  349. package/v3/assets/skills/qe-coverage-analysis/evals/qe-coverage-analysis.yaml +494 -0
  350. package/v3/assets/skills/qe-coverage-analysis/schemas/output.json +286 -0
  351. package/v3/assets/skills/qe-coverage-analysis/scripts/validate.sh +453 -0
  352. package/v3/assets/skills/qe-defect-intelligence/SKILL.md +6 -0
  353. package/v3/assets/skills/qe-defect-intelligence/evals/qe-defect-intelligence.yaml +511 -0
  354. package/v3/assets/skills/qe-defect-intelligence/schemas/output.json +283 -0
  355. package/v3/assets/skills/qe-defect-intelligence/scripts/validate.sh +466 -0
  356. package/v3/assets/skills/qe-iterative-loop/SKILL.md +445 -443
  357. package/v3/assets/skills/qe-iterative-loop/schemas/output.json +264 -0
  358. package/v3/assets/skills/qe-iterative-loop/scripts/validate.sh +474 -0
  359. package/v3/assets/skills/qe-learning-optimization/SKILL.md +6 -0
  360. package/v3/assets/skills/qe-learning-optimization/evals/qe-learning-optimization.yaml +144 -0
  361. package/v3/assets/skills/qe-learning-optimization/schemas/output.json +288 -0
  362. package/v3/assets/skills/qe-learning-optimization/scripts/validate.sh +56 -0
  363. package/v3/assets/skills/qe-quality-assessment/SKILL.md +6 -0
  364. package/v3/assets/skills/qe-quality-assessment/evals/qe-quality-assessment.yaml +506 -0
  365. package/v3/assets/skills/qe-quality-assessment/schemas/output.json +550 -0
  366. package/v3/assets/skills/qe-quality-assessment/scripts/validate.sh +352 -0
  367. package/v3/assets/skills/qe-requirements-validation/SKILL.md +6 -0
  368. package/v3/assets/skills/qe-requirements-validation/evals/qe-requirements-validation.yaml +598 -0
  369. package/v3/assets/skills/qe-requirements-validation/schemas/output.json +587 -0
  370. package/v3/assets/skills/qe-requirements-validation/scripts/validate.sh +275 -0
  371. package/v3/assets/skills/qe-security-compliance/SKILL.md +6 -0
  372. package/v3/assets/skills/qe-security-compliance/evals/qe-security-compliance.yaml +595 -0
  373. package/v3/assets/skills/qe-security-compliance/schemas/output.json +498 -0
  374. package/v3/assets/skills/qe-security-compliance/scripts/validate.sh +305 -0
  375. package/v3/assets/skills/qe-test-execution/SKILL.md +6 -0
  376. package/v3/assets/skills/qe-test-execution/evals/qe-test-execution.yaml +607 -0
  377. package/v3/assets/skills/qe-test-execution/schemas/output.json +529 -0
  378. package/v3/assets/skills/qe-test-execution/scripts/validate.sh +323 -0
  379. package/v3/assets/skills/qe-test-generation/SKILL.md +6 -0
  380. package/v3/assets/skills/qe-test-generation/evals/qe-test-generation.yaml +148 -0
  381. package/v3/assets/skills/qe-test-generation/schemas/output.json +439 -0
  382. package/v3/assets/skills/qe-test-generation/scripts/validate.sh +56 -0
  383. package/v3/assets/skills/qe-visual-accessibility/SKILL.md +6 -0
  384. package/v3/assets/skills/qe-visual-accessibility/evals/qe-visual-accessibility.yaml +142 -0
  385. package/v3/assets/skills/qe-visual-accessibility/schemas/output.json +491 -0
  386. package/v3/assets/skills/qe-visual-accessibility/scripts/validate.sh +56 -0
  387. package/v3/assets/skills/quality-metrics/SKILL.md +6 -0
  388. package/v3/assets/skills/quality-metrics/evals/quality-metrics.yaml +494 -0
  389. package/v3/assets/skills/quality-metrics/schemas/output.json +403 -0
  390. package/v3/assets/skills/quality-metrics/scripts/validate.sh +434 -0
  391. package/v3/assets/skills/refactoring-patterns/SKILL.md +5 -0
  392. package/v3/assets/skills/refactoring-patterns/schemas/output.json +475 -0
  393. package/v3/assets/skills/refactoring-patterns/scripts/validate.sh +241 -0
  394. package/v3/assets/skills/regression-testing/SKILL.md +6 -0
  395. package/v3/assets/skills/regression-testing/evals/regression-testing.yaml +504 -0
  396. package/v3/assets/skills/regression-testing/schemas/output.json +311 -0
  397. package/v3/assets/skills/regression-testing/scripts/validate.sh +130 -0
  398. package/v3/assets/skills/risk-based-testing/SKILL.md +6 -0
  399. package/v3/assets/skills/risk-based-testing/evals/risk-based-testing.yaml +141 -0
  400. package/v3/assets/skills/risk-based-testing/schemas/output.json +480 -0
  401. package/v3/assets/skills/risk-based-testing/scripts/validate.sh +56 -0
  402. package/v3/assets/skills/security-testing/SKILL.md +5 -0
  403. package/v3/assets/skills/security-testing/evals/security-testing.yaml +789 -0
  404. package/v3/assets/skills/security-testing/schemas/output.json +879 -0
  405. package/v3/assets/skills/security-testing/scripts/validate.sh +967 -0
  406. package/v3/assets/skills/shift-left-testing/SKILL.md +6 -0
  407. package/v3/assets/skills/shift-left-testing/evals/shift-left-testing.yaml +145 -0
  408. package/v3/assets/skills/shift-left-testing/schemas/output.json +459 -0
  409. package/v3/assets/skills/shift-left-testing/scripts/validate.sh +56 -0
  410. package/v3/assets/skills/shift-right-testing/SKILL.md +6 -0
  411. package/v3/assets/skills/shift-right-testing/evals/shift-right-testing.yaml +147 -0
  412. package/v3/assets/skills/shift-right-testing/schemas/output.json +418 -0
  413. package/v3/assets/skills/shift-right-testing/scripts/validate.sh +56 -0
  414. package/v3/assets/skills/six-thinking-hats/SKILL.md +3 -0
  415. package/v3/assets/skills/tdd-london-chicago/SKILL.md +5 -0
  416. package/v3/assets/skills/tdd-london-chicago/schemas/output.json +444 -0
  417. package/v3/assets/skills/tdd-london-chicago/scripts/validate.sh +214 -0
  418. package/v3/assets/skills/technical-writing/SKILL.md +4 -0
  419. package/v3/assets/skills/technical-writing/schemas/output.json +268 -0
  420. package/v3/assets/skills/test-automation-strategy/SKILL.md +6 -0
  421. package/v3/assets/skills/test-automation-strategy/evals/test-automation-strategy.yaml +148 -0
  422. package/v3/assets/skills/test-automation-strategy/schemas/output.json +444 -0
  423. package/v3/assets/skills/test-automation-strategy/scripts/validate.sh +195 -0
  424. package/v3/assets/skills/test-data-management/SKILL.md +6 -0
  425. package/v3/assets/skills/test-data-management/evals/test-data-management.yaml +504 -0
  426. package/v3/assets/skills/test-data-management/schemas/output.json +284 -0
  427. package/v3/assets/skills/test-data-management/scripts/validate.sh +137 -0
  428. package/v3/assets/skills/test-design-techniques/SKILL.md +6 -0
  429. package/v3/assets/skills/test-design-techniques/evals/test-design-techniques.yaml +142 -0
  430. package/v3/assets/skills/test-design-techniques/schemas/output.json +295 -0
  431. package/v3/assets/skills/test-design-techniques/scripts/validate.sh +91 -0
  432. package/v3/assets/skills/test-environment-management/SKILL.md +4 -0
  433. package/v3/assets/skills/test-environment-management/schemas/output.json +310 -0
  434. package/v3/assets/skills/test-reporting-analytics/SKILL.md +6 -0
  435. package/v3/assets/skills/test-reporting-analytics/evals/test-reporting-analytics.yaml +155 -0
  436. package/v3/assets/skills/test-reporting-analytics/schemas/output.json +329 -0
  437. package/v3/assets/skills/test-reporting-analytics/scripts/validate.sh +77 -0
  438. package/v3/assets/skills/verification-quality/SKILL.md +6 -0
  439. package/v3/assets/skills/verification-quality/evals/verification-quality.yaml +150 -0
  440. package/v3/assets/skills/verification-quality/schemas/output.json +432 -0
  441. package/v3/assets/skills/verification-quality/scripts/validate.sh +77 -0
  442. package/v3/assets/skills/visual-testing-advanced/SKILL.md +6 -0
  443. package/v3/assets/skills/visual-testing-advanced/evals/visual-testing-advanced.yaml +154 -0
  444. package/v3/assets/skills/visual-testing-advanced/schemas/output.json +294 -0
  445. package/v3/assets/skills/visual-testing-advanced/scripts/validate.sh +77 -0
  446. package/v3/assets/skills/xp-practices/SKILL.md +3 -0
  447. package/v3/assets/templates/validation-report.md.hbs +139 -0
  448. package/v3/assets/templates/validation-summary.json +56 -0
  449. package/v3/dist/cli/bundle.js +7368 -3418
  450. package/v3/dist/cli/commands/eval.d.ts +18 -0
  451. package/v3/dist/cli/commands/eval.d.ts.map +1 -0
  452. package/v3/dist/cli/commands/eval.js +505 -0
  453. package/v3/dist/cli/commands/eval.js.map +1 -0
  454. package/v3/dist/cli/commands/validate-swarm.d.ts +16 -0
  455. package/v3/dist/cli/commands/validate-swarm.d.ts.map +1 -0
  456. package/v3/dist/cli/commands/validate-swarm.js +251 -0
  457. package/v3/dist/cli/commands/validate-swarm.js.map +1 -0
  458. package/v3/dist/cli/commands/validate.d.ts +14 -0
  459. package/v3/dist/cli/commands/validate.d.ts.map +1 -0
  460. package/v3/dist/cli/commands/validate.js +408 -0
  461. package/v3/dist/cli/commands/validate.js.map +1 -0
  462. package/v3/dist/cli/index.js +6 -0
  463. package/v3/dist/cli/index.js.map +1 -1
  464. package/v3/dist/index.d.ts +2 -0
  465. package/v3/dist/index.d.ts.map +1 -1
  466. package/v3/dist/index.js +10 -0
  467. package/v3/dist/index.js.map +1 -1
  468. package/v3/dist/init/skills-installer.d.ts +6 -0
  469. package/v3/dist/init/skills-installer.d.ts.map +1 -1
  470. package/v3/dist/init/skills-installer.js +48 -0
  471. package/v3/dist/init/skills-installer.js.map +1 -1
  472. package/v3/dist/learning/index.d.ts +2 -0
  473. package/v3/dist/learning/index.d.ts.map +1 -1
  474. package/v3/dist/learning/index.js +4 -0
  475. package/v3/dist/learning/index.js.map +1 -1
  476. package/v3/dist/learning/skill-validation-learner.d.ts +264 -0
  477. package/v3/dist/learning/skill-validation-learner.d.ts.map +1 -0
  478. package/v3/dist/learning/skill-validation-learner.js +515 -0
  479. package/v3/dist/learning/skill-validation-learner.js.map +1 -0
  480. package/v3/dist/validation/index.d.ts +19 -0
  481. package/v3/dist/validation/index.d.ts.map +1 -0
  482. package/v3/dist/validation/index.js +31 -0
  483. package/v3/dist/validation/index.js.map +1 -0
  484. package/v3/dist/validation/parallel-eval-runner.d.ts +307 -0
  485. package/v3/dist/validation/parallel-eval-runner.d.ts.map +1 -0
  486. package/v3/dist/validation/parallel-eval-runner.js +566 -0
  487. package/v3/dist/validation/parallel-eval-runner.js.map +1 -0
  488. package/v3/dist/validation/swarm-skill-validator.d.ts +282 -0
  489. package/v3/dist/validation/swarm-skill-validator.d.ts.map +1 -0
  490. package/v3/dist/validation/swarm-skill-validator.js +460 -0
  491. package/v3/dist/validation/swarm-skill-validator.js.map +1 -0
  492. package/v3/dist/validation/validation-result-aggregator.d.ts +232 -0
  493. package/v3/dist/validation/validation-result-aggregator.d.ts.map +1 -0
  494. package/v3/dist/validation/validation-result-aggregator.js +630 -0
  495. package/v3/dist/validation/validation-result-aggregator.js.map +1 -0
  496. package/v3/package.json +1 -1
@@ -0,0 +1,1097 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * AQE Skill Evaluation Runner
4
+ *
5
+ * Runs skill evaluation test suites defined in YAML format.
6
+ * Integrates with AQE MCP tools for shared learning and QualityFeedbackLoop.
7
+ *
8
+ * Usage:
9
+ * npx tsx scripts/run-skill-eval.ts --skill security-testing --model claude-3.5-sonnet
10
+ * npx tsx scripts/run-skill-eval.ts --eval-file .claude/skills/security-testing/evals/security-testing.yaml
11
+ * npx tsx scripts/run-skill-eval.ts --skill security-testing --all-models
12
+ *
13
+ * MCP Integration (per docs/specs/skill-validation-mcp-integration.md):
14
+ * - Queries patterns before running: mcp__agentic-qe__memory_query
15
+ * - Tracks test outcomes: mcp__agentic-qe__test_outcome_track
16
+ * - Stores patterns after: mcp__agentic-qe__memory_store
17
+ * - Shares learning: mcp__agentic-qe__memory_share
18
+ * - Updates quality gate: mcp__agentic-qe__quality_assess
19
+ *
20
+ * @module scripts/run-skill-eval
21
+ * @version 1.0.0
22
+ */
23
+
24
+ import { readFileSync, writeFileSync, existsSync } from 'fs';
25
+ import { join, dirname, basename } from 'path';
26
+ import { fileURLToPath } from 'url';
27
+
28
+ // For YAML parsing - using dynamic import for ESM compatibility
29
+ let yaml: typeof import('yaml');
30
+
31
+ // =============================================================================
32
+ // Types
33
+ // =============================================================================
34
+
35
+ /**
36
+ * One test case as declared in a skill's eval YAML file.
37
+ */
38
+ interface TestCase {
39
+ id: string; // copied into the result and used as the test id for MCP outcome tracking
40
+ description: string;
41
+ category?: string; // the simulated execution returns an empty-findings payload when this is 'negative'
42
+ priority: 'critical' | 'high' | 'medium' | 'low'; // 'critical' tests feed the critical pass rate
43
+ skip?: boolean; // when truthy the test is not executed and is reported as skipped
44
+ skip_reason?: string;
45
+ input: TestInput;
46
+ expected_output: ExpectedOutput;
47
+ validation?: ValidationConfig; // per-test overrides for output validation
48
+ timeout_ms?: number; // NOTE(review): not referenced in the part of the file reviewed here - confirm it is enforced elsewhere
49
+ }
50
+
51
+ interface TestInput { // Input of a test case; the simulated skill execution does not read any of these fields
52
+ code?: string;
53
+ file_path?: string;
54
+ url?: string;
55
+ prompt?: string;
56
+ context?: {
57
+ language?: string;
58
+ framework?: string;
59
+ environment?: 'development' | 'staging' | 'production';
60
+ };
61
+ options?: Record<string, unknown>;
62
+ }
63
+
64
+ interface ExpectedOutput { // Expectations the skill output is validated against
65
+ must_contain?: string[]; // keywords looked up as case-insensitive substrings of the output
66
+ must_not_contain?: string[]; // case-insensitive substrings; any hit fails the test
67
+ must_match_regex?: string[]; // compiled with the 'i' flag; a pattern that fails to compile counts as a miss
68
+ severity_classification?: 'critical' | 'high' | 'medium' | 'low' | 'info'; // checked as a case-insensitive substring of the output
69
+ finding_count?: { min?: number; max?: number }; // expected bounds on the number of findings
70
+ recommendation_count?: { min?: number; max?: number }; // not read by the validation code reviewed here
71
+ schema_path?: string; // not read by the validation code reviewed here
72
+ custom_assertions?: CustomAssertion[]; // not read by the validation code reviewed here
73
+ }
74
+
75
+ interface CustomAssertion { // Extra assertion attached to an expected output; not evaluated by the code reviewed here
76
+ type: 'jsonpath' | 'semantic' | 'function';
77
+ expression: string;
78
+ expected?: unknown;
79
+ }
80
+
81
+ interface ValidationConfig { // Per-test tuning of how the output is validated
82
+ schema_check?: boolean; // not read by the validation code reviewed here
83
+ keyword_match_threshold?: number; // minimum keyword match score needed to pass; the validator defaults it to 0.8
84
+ reasoning_quality_min?: number; // when set, the reasoning quality score must reach this value to pass
85
+ semantic_similarity_min?: number; // not read by the validation code reviewed here
86
+ allow_partial?: boolean; // when truthy, missing must_contain keywords do not fail the test by themselves
87
+ grading_rubric?: { // not read by the validation code reviewed here
88
+ completeness?: number;
89
+ accuracy?: number;
90
+ actionability?: number;
91
+ };
92
+ }
93
+
94
+ interface MCPIntegrationConfig { // MCP settings of an eval suite; handed to the MCPClient constructor
95
+ enabled: boolean; // when false every MCPClient method returns without doing anything
96
+ namespace: string; // MCPClient falls back to 'skill-validation' when no config is given
97
+ store_patterns: boolean; // gates storing learned patterns after each model run
98
+ query_patterns: boolean;
99
+ track_outcomes: boolean;
100
+ share_learning: boolean; // gates sharing results with target_agents after a full run
101
+ update_quality_gate: boolean; // gates the quality gate update after a full run
102
+ target_agents: string[]; // agent ids passed to shareLearning
103
+ }
104
+
105
+ interface LearningConfig { // Learning settings of an eval suite; not read by the code reviewed here
106
+ store_success_patterns: boolean;
107
+ store_failure_patterns: boolean;
108
+ pattern_ttl_days: number;
109
+ min_confidence_to_store: number;
110
+ cross_model_comparison: boolean;
111
+ }
112
+
113
+ interface SuccessCriteria { // Thresholds a model run (and the whole run) must meet
114
+ pass_rate: number; // fraction (not percent): compared against passed / executed tests
115
+ critical_pass_rate?: number; // fraction compared against the pass rate of executed 'critical' tests
116
+ avg_reasoning_quality?: number; // minimum average reasoning quality score over executed tests
117
+ max_execution_time_ms?: number; // upper bound on the wall-clock time of one model run
118
+ cross_model_variance?: number; // compared against (highest - lowest) pass rate across models
119
+ }
120
+
121
+ /**
122
+ * Full eval suite configuration for one skill (the shape the runner is constructed with).
123
+ */
124
+ interface EvalSuite {
125
+ skill: string; // skill name; used in log output, the run id and MCP calls
126
+ version: string;
127
+ description?: string;
128
+ models_to_test: string[]; // a full run evaluates these models one after another
129
+ mcp_integration?: MCPIntegrationConfig; // passed to the MCPClient constructor
130
+ learning?: LearningConfig;
131
+ setup?: { // not read by the runner code reviewed here
132
+ required_tools?: string[];
133
+ environment_variables?: Record<string, string>;
134
+ fixtures?: Array<{ name: string; path: string; content: string }>;
135
+ };
136
+ test_cases: TestCase[];
137
+ success_criteria: SuccessCriteria;
138
+ metadata?: {
139
+ author?: string;
140
+ created?: string;
141
+ last_updated?: string;
142
+ coverage_target?: string;
143
+ };
144
+ }
145
+
146
+ /**
147
+ * Result of executing (or skipping) a single test case against one model.
148
+ */
149
+ interface TestCaseResult {
150
+ id: string;
151
+ description: string;
152
+ category?: string;
153
+ priority: string; // copied from the test case's priority
154
+ passed: boolean; // false for skipped tests and for tests whose execution threw
155
+ skipped: boolean;
156
+ skip_reason?: string;
157
+ execution_time_ms: number; // 0 for skipped tests
158
+ keyword_match_score: number; // matched / total over must_contain plus must_match_regex entries; 1 when there are none
159
+ reasoning_quality_score: number; // keyword_match_score * 0.8, plus 0.2 when no must_not_contain keyword was found
160
+ validation_details: {
161
+ must_contain_matches: string[];
162
+ must_contain_misses: string[];
163
+ must_not_contain_violations: string[];
164
+ regex_matches: string[];
165
+ regex_misses: string[];
166
+ severity_matched: boolean;
167
+ finding_count_matched: boolean;
168
+ };
169
+ raw_output?: string; // only set when the runner was created with verbose = true
170
+ error?: string; // message of the error thrown while executing the test
171
+ }
172
+
173
+ /**
174
+ * Result of running the whole eval suite against a single model.
175
+ */
176
+ interface ModelEvalResult {
177
+ model: string;
178
+ skill: string;
179
+ version: string;
180
+ timestamp: string; // ISO 8601 string taken when the result object is built
181
+ total_tests: number; // number of test results, skipped tests included
182
+ passed: number;
183
+ failed: number;
184
+ skipped: number;
185
+ pass_rate: number; // passed / executed (non-skipped) tests; 0 when nothing was executed
186
+ critical_pass_rate: number; // same ratio over 'critical' tests; 1 when no critical test was executed
187
+ avg_reasoning_quality: number; // mean reasoning quality score of executed tests; 0 when nothing was executed
188
+ total_execution_time_ms: number; // wall-clock time of the whole model run
189
+ test_results: TestCaseResult[];
190
+ success_criteria_met: boolean; // true when criteria_failures is empty
191
+ criteria_failures: string[]; // one human-readable message per unmet success criterion
192
+ }
193
+
194
+ /**
195
+ * Aggregated result of one evaluation run across all tested models.
196
+ */
197
+ interface EvalRunResult {
198
+ skill: string;
199
+ version: string;
200
+ run_id: string; // built as `eval-<skill>-<Date.now()>`
201
+ timestamp: string;
202
+ models_tested: string[];
203
+ model_results: ModelEvalResult[];
204
+ cross_model_variance: number; // highest minus lowest pass rate among the tested models
205
+ overall_passed: boolean;
206
+ summary: {
207
+ best_model: string; // first model whose pass rate equals the highest pass rate
208
+ worst_model: string; // first model whose pass rate equals the lowest pass rate
209
+ avg_pass_rate: number; // arithmetic mean of the per-model pass rates
210
+ recommendations: string[];
211
+ };
212
+ mcp_integration_log: MCPIntegrationLog;
213
+ }
214
+
215
+ /**
216
+ * Counters and flags recording which MCP operations the client performed during an evaluation.
217
+ */
218
+ interface MCPIntegrationLog {
219
+ patterns_queried: number; // incremented by each enabled queryPatterns call
220
+ outcomes_tracked: number; // incremented by each enabled trackOutcome call
221
+ patterns_stored: number; // incremented by each enabled storePattern call
222
+ learning_shared: boolean; // set once shareLearning has run while enabled
223
+ quality_gate_updated: boolean; // set once updateQualityGate has run while enabled
224
+ errors: string[];
225
+ }
226
+
227
+ // =============================================================================
228
+ // MCP Integration Layer
229
+ // =============================================================================
230
+
231
/**
 * Stand-in MCP client used by the eval runner.
 *
 * No MCP tool is invoked from this class: each method only writes a console
 * line and updates an in-memory MCPIntegrationLog. The commented-out snippets
 * show the calls a production client is meant to issue against the AQE MCP
 * server. When the client is disabled (no config, or `enabled: false`) every
 * method returns immediately without logging or counting anything.
 */
class MCPClient {
  private enabled: boolean;
  private namespace: string;
  private log: MCPIntegrationLog;

  /**
   * @param config MCP section of the eval suite. When omitted the client is
   *   disabled and the namespace defaults to 'skill-validation'.
   */
  constructor(config?: MCPIntegrationConfig) {
    this.enabled = config?.enabled ?? false;
    this.namespace = config?.namespace ?? 'skill-validation';
    this.log = {
      patterns_queried: 0,
      outcomes_tracked: 0,
      patterns_stored: 0,
      learning_shared: false,
      quality_gate_updated: false,
      errors: [],
    };
  }

  /**
   * Snapshot of the integration log.
   *
   * The `errors` array is copied as well: a plain object spread would hand the
   * caller the live internal array, letting outside code mutate the log.
   */
  getLog(): MCPIntegrationLog {
    return { ...this.log, errors: [...this.log.errors] };
  }

  /**
   * Query existing patterns before running evals.
   * MCP Tool: mcp__agentic-qe__memory_query
   *
   * @param skill Skill name the patterns belong to.
   * @returns An empty array when disabled, otherwise a single mock baseline pattern.
   */
  async queryPatterns(skill: string): Promise<unknown[]> {
    if (!this.enabled) return [];

    console.log(`[MCP] Querying patterns for skill: ${skill}`);

    // In production, this would call:
    // await mcp__agentic-qe__memory_query({
    //   pattern: `skill-validation-${skill}-*`,
    //   namespace: this.namespace,
    //   limit: 10
    // });

    this.log.patterns_queried++;

    // Return mock patterns for demonstration
    return [
      {
        patternId: `${skill}-baseline-pattern`,
        successRate: 0.92,
        lastValidated: new Date().toISOString(),
      },
    ];
  }

  /**
   * Track an individual test outcome.
   * MCP Tool: mcp__agentic-qe__test_outcome_track
   *
   * `patternId`, `coverage` and `executionTime` are accepted for the
   * production call sketched below; the mock does not use them.
   */
  async trackOutcome(
    testId: string,
    passed: boolean,
    patternId?: string,
    coverage?: { lines: number; branches: number; functions: number },
    executionTime?: number
  ): Promise<void> {
    if (!this.enabled) return;

    console.log(`[MCP] Tracking outcome: ${testId} - ${passed ? 'PASSED' : 'FAILED'}`);

    // In production, this would call:
    // await mcp__agentic-qe__test_outcome_track({
    //   testId: `skill-${skill}-${testId}`,
    //   generatedBy: 'eval-runner',
    //   patternId,
    //   passed,
    //   coverage,
    //   executionTime,
    //   flaky: false
    // });

    this.log.outcomes_tracked++;
  }

  /**
   * Store the patterns learned from one model run.
   * MCP Tool: mcp__agentic-qe__memory_store
   *
   * @param skill Skill name.
   * @param results Aggregated result of the model run the patterns came from.
   * @param patterns Learned patterns to persist.
   */
  async storePattern(
    skill: string,
    results: ModelEvalResult,
    patterns: unknown[]
  ): Promise<void> {
    if (!this.enabled) return;

    console.log(`[MCP] Storing patterns for skill: ${skill}`);

    // The key/value pair is only consumed by the production call below; it is
    // kept here so the payload shape stays next to the code that builds it.
    const key = `skill-validation-${skill}-${Date.now()}`;
    const value = {
      skillName: skill,
      trustTier: 3, // Level 3 = has eval suite
      validationResult: {
        passRate: results.pass_rate,
        criticalPassRate: results.critical_pass_rate,
        avgReasoningQuality: results.avg_reasoning_quality,
      },
      model: results.model,
      passRate: results.pass_rate,
      patterns,
      timestamp: new Date().toISOString(),
    };

    // In production, this would call:
    // await mcp__agentic-qe__memory_store({
    //   key,
    //   value,
    //   namespace: this.namespace
    // });

    this.log.patterns_stored++;
  }

  /**
   * Share learning with fleet agents.
   * MCP Tool: mcp__agentic-qe__memory_share
   *
   * @param skill Skill name.
   * @param targetAgents Ids of the agents that should receive the insights.
   * @param insights Payload to share.
   */
  async shareLearning(
    skill: string,
    targetAgents: string[],
    insights: unknown
  ): Promise<void> {
    if (!this.enabled) return;

    console.log(`[MCP] Sharing learning with agents: ${targetAgents.join(', ')}`);

    // In production, this would call:
    // await mcp__agentic-qe__memory_share({
    //   sourceAgentId: 'eval-runner',
    //   targetAgentIds: targetAgents,
    //   knowledgeDomain: 'skill-validation',
    //   data: {
    //     skillName: skill,
    //     insights,
    //     recommendations: []
    //   }
    // });

    this.log.learning_shared = true;
  }

  /**
   * Update the quality gate with validation metrics.
   * MCP Tool: mcp__agentic-qe__quality_assess
   */
  async updateQualityGate(skill: string, metrics: unknown): Promise<void> {
    if (!this.enabled) return;

    console.log(`[MCP] Updating quality gate for skill: ${skill}`);

    // In production, this would call:
    // await mcp__agentic-qe__quality_assess({
    //   target: `skill:${skill}`,
    //   metrics,
    //   updateQualityScore: true
    // });

    this.log.quality_gate_updated = true;
  }
}
399
+
400
+ // =============================================================================
401
+ // Evaluation Engine
402
+ // =============================================================================
403
+
404
/**
 * Runs an eval suite against one or more models and aggregates the results.
 *
 * Skill execution is currently simulated (see simulateSkillExecution); MCP
 * calls go through the MCPClient built from the suite's `mcp_integration`.
 */
class SkillEvaluationRunner {
  private suite: EvalSuite;
  private mcpClient: MCPClient;
  private verbose: boolean;

  /**
   * @param suite Parsed eval suite.
   * @param verbose When true, each test result keeps the raw skill output.
   */
  constructor(suite: EvalSuite, verbose = false) {
    this.suite = suite;
    this.verbose = verbose;
    this.mcpClient = new MCPClient(suite.mcp_integration);
  }

  /**
   * Run every test case of the suite against a single model.
   *
   * Test cases run sequentially. Skipped tests are excluded from the pass
   * rates, from the reasoning quality average and from MCP outcome tracking.
   *
   * @param model Model identifier the (simulated) skill is executed with.
   * @returns Aggregated metrics plus the per-test results.
   */
  async runForModel(model: string): Promise<ModelEvalResult> {
    const startTime = Date.now();
    const results: TestCaseResult[] = [];
    const mcp = this.suite.mcp_integration;
    const criteria = this.suite.success_criteria;
    const banner = '='.repeat(60);

    console.log(`\n${banner}`);
    console.log(`Running ${this.suite.skill} evals against ${model}`);
    console.log(`${banner}\n`);

    // Query existing patterns (MCP integration). Only an explicit
    // `query_patterns: false` opts out, so suites that omit the flag keep
    // querying as before.
    if (mcp?.query_patterns !== false) {
      await this.mcpClient.queryPatterns(this.suite.skill);
    }

    for (const testCase of this.suite.test_cases) {
      const result = await this.runTestCase(testCase, model);
      results.push(result);

      // Track outcome (MCP integration). A skipped test never ran, so
      // reporting it would record a failure that did not happen.
      if (!result.skipped && mcp?.track_outcomes !== false) {
        await this.mcpClient.trackOutcome(
          testCase.id,
          result.passed,
          undefined,
          undefined,
          result.execution_time_ms
        );
      }

      // Log progress
      const status = result.skipped ? 'SKIP' : result.passed ? 'PASS' : 'FAIL';
      const icon = result.skipped ? '-' : result.passed ? '+' : 'x';
      console.log(` [${icon}] ${testCase.id}: ${status} (${result.execution_time_ms}ms)`);
    }

    const totalTime = Date.now() - startTime;

    // Calculate metrics over the tests that actually ran
    const executed = results.filter((r) => !r.skipped);
    const total = executed.length;
    const skipped = results.length - total;
    const passed = executed.filter((r) => r.passed).length;
    const failed = total - passed;

    // Each result carries its test case's priority, so no lookup by id is needed.
    const criticalExecuted = executed.filter((r) => r.priority === 'critical');
    const criticalTotal = criticalExecuted.length;
    const criticalPassed = criticalExecuted.filter((r) => r.passed).length;

    const reasoningSum = executed.reduce((sum, r) => sum + r.reasoning_quality_score, 0);
    const avgReasoningQuality = total > 0 ? reasoningSum / total : 0;

    const passRate = total > 0 ? passed / total : 0;
    // With no critical test executed there is nothing to fail, hence 1.
    const criticalPassRate = criticalTotal > 0 ? criticalPassed / criticalTotal : 1;

    // Check success criteria
    const criteriaFailures: string[] = [];

    if (passRate < criteria.pass_rate) {
      criteriaFailures.push(
        `Pass rate ${(passRate * 100).toFixed(1)}% < required ${(criteria.pass_rate * 100).toFixed(1)}%`
      );
    }

    if (criteria.critical_pass_rate && criticalPassRate < criteria.critical_pass_rate) {
      criteriaFailures.push(
        `Critical pass rate ${(criticalPassRate * 100).toFixed(1)}% < required ${(criteria.critical_pass_rate * 100).toFixed(1)}%`
      );
    }

    if (criteria.avg_reasoning_quality && avgReasoningQuality < criteria.avg_reasoning_quality) {
      criteriaFailures.push(
        `Avg reasoning quality ${avgReasoningQuality.toFixed(2)} < required ${criteria.avg_reasoning_quality}`
      );
    }

    if (criteria.max_execution_time_ms && totalTime > criteria.max_execution_time_ms) {
      criteriaFailures.push(
        `Execution time ${totalTime}ms > max ${criteria.max_execution_time_ms}ms`
      );
    }

    const modelResult: ModelEvalResult = {
      model,
      skill: this.suite.skill,
      version: this.suite.version,
      timestamp: new Date().toISOString(),
      total_tests: results.length,
      passed,
      failed,
      skipped,
      pass_rate: passRate,
      critical_pass_rate: criticalPassRate,
      avg_reasoning_quality: avgReasoningQuality,
      total_execution_time_ms: totalTime,
      test_results: results,
      success_criteria_met: criteriaFailures.length === 0,
      criteria_failures: criteriaFailures,
    };

    // Store patterns (MCP integration)
    if (mcp?.store_patterns) {
      const learnedPatterns = this.extractPatterns(results);
      await this.mcpClient.storePattern(this.suite.skill, modelResult, learnedPatterns);
    }

    return modelResult;
  }

  /**
   * Validation details used for results that carry no validation data
   * (skipped tests and tests whose execution threw).
   */
  private emptyValidationDetails(): TestCaseResult['validation_details'] {
    return {
      must_contain_matches: [],
      must_contain_misses: [],
      must_not_contain_violations: [],
      regex_matches: [],
      regex_misses: [],
      severity_matched: false,
      finding_count_matched: false,
    };
  }

  /**
   * Run a single test case.
   *
   * Never rejects: an error thrown during execution or validation is turned
   * into a failed result whose `error` field holds the message.
   */
  private async runTestCase(testCase: TestCase, model: string): Promise<TestCaseResult> {
    const startTime = Date.now();
    const identity = {
      id: testCase.id,
      description: testCase.description,
      category: testCase.category,
      priority: testCase.priority,
    };

    // Handle skipped tests
    if (testCase.skip) {
      return {
        ...identity,
        passed: false,
        skipped: true,
        skip_reason: testCase.skip_reason,
        execution_time_ms: 0,
        keyword_match_score: 0,
        reasoning_quality_score: 0,
        validation_details: this.emptyValidationDetails(),
      };
    }

    try {
      // In production, this would invoke the actual skill via Claude API
      // For now, we simulate the output based on test case expectations
      const output = await this.simulateSkillExecution(testCase, model);
      const executionTime = Date.now() - startTime;

      // Validate output against expectations
      const validation = this.validateOutput(output, testCase.expected_output, testCase.validation);

      return {
        ...identity,
        passed: validation.passed,
        skipped: false,
        execution_time_ms: executionTime,
        keyword_match_score: validation.keywordMatchScore,
        reasoning_quality_score: validation.reasoningQualityScore,
        validation_details: validation.details,
        raw_output: this.verbose ? output : undefined,
      };
    } catch (error) {
      return {
        ...identity,
        passed: false,
        skipped: false,
        execution_time_ms: Date.now() - startTime,
        keyword_match_score: 0,
        reasoning_quality_score: 0,
        validation_details: this.emptyValidationDetails(),
        error: error instanceof Error ? error.message : String(error),
      };
    }
  }

  /**
   * Simulate skill execution (placeholder for actual LLM call)
   * In production, this would:
   * 1. Load the skill from .claude/skills/{skill}/SKILL.md
   * 2. Call the LLM API with the skill prompt and test input
   * 3. Return the actual LLM output
   *
   * The mock derives its JSON payload from the test case's own expectations
   * and always waits about 100 ms first; `model` is not used yet.
   */
  private async simulateSkillExecution(testCase: TestCase, model: string): Promise<string> {
    // Simulate some processing time
    await new Promise((resolve) => setTimeout(resolve, 100));

    // For demonstration, generate mock output based on test case category
    const category = testCase.category || 'general';
    const expected = testCase.expected_output;

    // This is a simulation - in production this would be the actual LLM response
    if (category === 'negative') {
      return JSON.stringify({
        findings: [],
        summary: 'The code appears to follow security best practices. No critical vulnerabilities detected.',
        recommendations: ['Continue following current security patterns'],
        severity: 'info',
      });
    }

    // For injection/auth/crypto tests, return findings
    const firstKeyword = expected.must_contain?.[0];
    return JSON.stringify({
      findings: [
        {
          type: firstKeyword?.toLowerCase() || 'vulnerability',
          severity: expected.severity_classification || 'high',
          description: `Detected potential ${firstKeyword || 'security issue'}`,
          cwe: 'CWE-89',
          owasp: 'A03:2021',
          remediation: 'Use parameterized queries and proper input validation',
        },
      ],
      summary: `Security analysis complete. Found ${expected.finding_count?.min || 1} issue(s).`,
      recommendations: ['Implement secure coding practices', 'Use prepared statements'],
    });
  }

  /**
   * Check the number of findings in the output against the expected bounds.
   *
   * Assumes the output is a JSON object with a `findings` array, which is the
   * shape the simulated execution emits - TODO confirm against real skill
   * output. Without an expectation the check passes; with an expectation but
   * unparseable output (or no `findings` array) it fails.
   */
  private matchesFindingCount(output: string, range?: { min?: number; max?: number }): boolean {
    if (!range) return true;

    let parsed: unknown;
    try {
      parsed = JSON.parse(output);
    } catch {
      return false;
    }

    const findings = (parsed as { findings?: unknown } | null)?.findings;
    if (!Array.isArray(findings)) return false;

    const count = findings.length;
    // `== null` also tolerates explicit nulls coming from YAML.
    return (range.min == null || count >= range.min) && (range.max == null || count <= range.max);
  }

  /**
   * Validate LLM output against expected output criteria
   *
   * A test passes when the keyword match score reaches the threshold
   * (default 0.8), no must_not_contain keyword is present, the optional
   * reasoning quality minimum is met, and - unless `allow_partial` is set -
   * every must_contain keyword was found.
   *
   * NOTE(review): severity and finding-count matches are reported in the
   * details but do not influence `passed`; confirm that is intended.
   */
  private validateOutput(
    output: string,
    expected: ExpectedOutput,
    config?: ValidationConfig
  ): {
    passed: boolean;
    keywordMatchScore: number;
    reasoningQualityScore: number;
    details: TestCaseResult['validation_details'];
  } {
    const outputLower = output.toLowerCase();
    const threshold = config?.keyword_match_threshold ?? 0.8;
    const appearsInOutput = (keyword: string): boolean => outputLower.includes(keyword.toLowerCase());

    // Check must_contain
    const requiredKeywords = expected.must_contain || [];
    const mustContainMatches = requiredKeywords.filter((keyword) => appearsInOutput(keyword));
    const mustContainMisses = requiredKeywords.filter((keyword) => !appearsInOutput(keyword));

    // Check must_not_contain
    const violations = (expected.must_not_contain || []).filter((keyword) => appearsInOutput(keyword));

    // Check regex patterns; a pattern that cannot be compiled counts as a miss
    const regexMatches: string[] = [];
    const regexMisses: string[] = [];
    for (const pattern of expected.must_match_regex || []) {
      let matched = false;
      try {
        matched = new RegExp(pattern, 'i').test(output);
      } catch {
        matched = false;
      }
      (matched ? regexMatches : regexMisses).push(pattern);
    }

    // Calculate keyword match score (1 when nothing was required)
    const totalKeywords = requiredKeywords.length + (expected.must_match_regex?.length || 0);
    const matchedKeywords = mustContainMatches.length + regexMatches.length;
    const keywordMatchScore = totalKeywords > 0 ? matchedKeywords / totalKeywords : 1;

    // Check severity classification
    const severityMatched =
      !expected.severity_classification || appearsInOutput(expected.severity_classification);

    // Check finding count against the parsed output
    const findingCountMatched = this.matchesFindingCount(output, expected.finding_count);

    // Calculate reasoning quality (simplified - would use embeddings in production)
    const reasoningQualityScore = keywordMatchScore * 0.8 + (violations.length === 0 ? 0.2 : 0);

    // Determine if test passed
    const passed =
      keywordMatchScore >= threshold &&
      violations.length === 0 &&
      (config?.reasoning_quality_min === undefined ||
        reasoningQualityScore >= config.reasoning_quality_min) &&
      (config?.allow_partial || mustContainMisses.length === 0);

    return {
      passed,
      keywordMatchScore,
      reasoningQualityScore,
      details: {
        must_contain_matches: mustContainMatches,
        must_contain_misses: mustContainMisses,
        must_not_contain_violations: violations,
        regex_matches: regexMatches,
        regex_misses: regexMisses,
        severity_matched: severityMatched,
        finding_count_matched: findingCountMatched,
      },
    };
  }

  /**
   * Extract patterns from results for learning
   *
   * Only tests that ran and passed contribute a pattern.
   */
  private extractPatterns(results: TestCaseResult[]): unknown[] {
    return results
      .filter((r) => r.passed && !r.skipped)
      .map((r) => ({
        testId: r.id,
        category: r.category,
        keywordMatchScore: r.keyword_match_score,
        reasoningQualityScore: r.reasoning_quality_score,
      }));
  }

  /**
   * Run full evaluation across all configured models
   *
   * Models are evaluated one after another. The run passes overall only when
   * at least one model was evaluated, every model met its success criteria
   * and the pass-rate spread stays within `cross_model_variance` (if set).
   */
  async runFull(): Promise<EvalRunResult> {
    const runId = `eval-${this.suite.skill}-${Date.now()}`;
    const modelResults: ModelEvalResult[] = [];
    const mcp = this.suite.mcp_integration;

    console.log(`\nStarting evaluation run: ${runId}`);
    console.log(`Skill: ${this.suite.skill} v${this.suite.version}`);
    console.log(`Models: ${this.suite.models_to_test.join(', ')}`);
    console.log(`Test cases: ${this.suite.test_cases.length}`);

    // Run for each model
    for (const model of this.suite.models_to_test) {
      modelResults.push(await this.runForModel(model));
    }

    // Calculate cross-model variance. With no models, Math.max()/Math.min()
    // would yield -Infinity/Infinity and the average would be NaN, so the
    // empty case is pinned to zeros explicitly.
    const passRates = modelResults.map((r) => r.pass_rate);
    const hasResults = passRates.length > 0;
    const maxRate = hasResults ? Math.max(...passRates) : 0;
    const minRate = hasResults ? Math.min(...passRates) : 0;
    const variance = maxRate - minRate;
    const avgPassRate = hasResults
      ? passRates.reduce((a, b) => a + b, 0) / passRates.length
      : 0;
    const bestModel = modelResults.find((r) => r.pass_rate === maxRate)?.model;
    const worstModel = modelResults.find((r) => r.pass_rate === minRate)?.model;

    // Share learning with fleet (MCP integration)
    if (mcp?.share_learning) {
      await this.mcpClient.shareLearning(this.suite.skill, mcp.target_agents || [], {
        passRates,
        variance,
        bestModel,
      });
    }

    // Update quality gate (MCP integration)
    if (mcp?.update_quality_gate) {
      await this.mcpClient.updateQualityGate(this.suite.skill, {
        passRate: avgPassRate,
        schemaCompliance: true,
        validatorPassed: true,
        evalSuiteScore: avgPassRate * 100,
      });
    }

    // Determine overall success. The variance limit is compared whenever it is
    // defined: a truthiness test would silently drop a limit of 0.
    const varianceLimit = this.suite.success_criteria.cross_model_variance;
    const overallPassed =
      hasResults &&
      modelResults.every((r) => r.success_criteria_met) &&
      (varianceLimit === undefined || varianceLimit === null || variance <= varianceLimit);

    // Generate recommendations
    const recommendations: string[] = [];
    if (!hasResults) {
      recommendations.push('No models were configured for evaluation.');
    }
    if (variance > 0.1) {
      recommendations.push(
        `High cross-model variance (${(variance * 100).toFixed(1)}%). Consider model-specific tuning.`
      );
    }
    const failingModels = modelResults.filter((r) => !r.success_criteria_met);
    if (failingModels.length > 0) {
      recommendations.push(
        `Models failing criteria: ${failingModels.map((r) => r.model).join(', ')}`
      );
    }

    return {
      skill: this.suite.skill,
      version: this.suite.version,
      run_id: runId,
      timestamp: new Date().toISOString(),
      models_tested: this.suite.models_to_test,
      model_results: modelResults,
      cross_model_variance: variance,
      overall_passed: overallPassed,
      summary: {
        best_model: bestModel || '',
        worst_model: worstModel || '',
        avg_pass_rate: avgPassRate,
        recommendations,
      },
      mcp_integration_log: this.mcpClient.getLog(),
    };
  }
}
845
+
846
+ // =============================================================================
847
+ // CLI Interface
848
+ // =============================================================================
849
+
850
/**
 * Options collected from the command line by parseArgs().
 * Every member is optional; a member is only set when its flag was given.
 */
interface CLIOptions {
  /** Skill name from `-s` / `--skill`; used to locate the eval file. */
  skill?: string;
  /** Explicit eval file path from `-e` / `--eval-file`. */
  evalFile?: string;
  /** Single model to run against, from `-m` / `--model`. */
  model?: string;
  /** Set by `-a` / `--all-models`; keeps the suite's own model list. */
  allModels?: boolean;
  /** File path for the JSON results, from `-o` / `--output`. */
  output?: string;
  /** Set by `-v` / `--verbose`. */
  verbose?: boolean;
  /** Set by `--dry-run`; load the eval file and stop before running. */
  dryRun?: boolean;
  /** Set by `--use-mcp`; turns on the MCP integration settings. */
  useMcp?: boolean;
}
860
+
861
/**
 * Parse command-line arguments into a CLIOptions object.
 *
 * Flags that take a value (`--skill`, `--eval-file`, `--model`, `--output`
 * and their short forms) consume the following argument. If that argument is
 * missing or looks like another flag (starts with `-`), an error is printed
 * and the process exits with code 1 instead of silently storing `undefined`
 * or swallowing the next flag. Unrecognised arguments are still ignored, but
 * a warning is printed so that typos such as `--dryrun` do not go unnoticed.
 *
 * `--help` / `-h` prints the usage text and exits with code 0.
 *
 * @param args - Raw arguments, typically `process.argv.slice(2)`.
 * @returns The options that were present on the command line.
 */
function parseArgs(args: string[]): CLIOptions {
  const options: CLIOptions = {};

  // Fetch the value for a value-taking flag, or terminate with a clear
  // message. Exiting here (rather than throwing) keeps the exit code
  // non-zero regardless of how the caller handles exceptions.
  const takeValue = (flag: string, index: number): string => {
    const value = args[index];
    if (value === undefined || value.startsWith('-')) {
      console.error(`Error: option '${flag}' requires a value`);
      process.exit(1);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    switch (arg) {
      case '--skill':
      case '-s':
        options.skill = takeValue(arg, ++i);
        break;
      case '--eval-file':
      case '-e':
        options.evalFile = takeValue(arg, ++i);
        break;
      case '--model':
      case '-m':
        options.model = takeValue(arg, ++i);
        break;
      case '--all-models':
      case '-a':
        options.allModels = true;
        break;
      case '--output':
      case '-o':
        options.output = takeValue(arg, ++i);
        break;
      case '--verbose':
      case '-v':
        options.verbose = true;
        break;
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--use-mcp':
        options.useMcp = true;
        break;
      default:
        console.warn(`Warning: ignoring unknown argument '${arg}'`);
        break;
      case '--help':
      case '-h':
        // process.exit() does not return, so no break is needed here.
        printUsage();
        process.exit(0);
    }
  }

  return options;
}
907
+
908
/**
 * Print the command-line help text (usage, options, examples and a summary
 * of what `--use-mcp` does) to standard output in a single console.log call.
 */
function printUsage(): void {
  const usageText = `
AQE Skill Evaluation Runner

Usage:
npx tsx scripts/run-skill-eval.ts [options]

Options:
-s, --skill <name> Skill name (looks for .claude/skills/<name>/evals/<name>.yaml)
-e, --eval-file <path> Path to eval YAML file (alternative to --skill)
-m, --model <model> Run against specific model only
-a, --all-models Run against all configured models (default)
-o, --output <path> Output JSON results to file
-v, --verbose Include raw LLM output in results
--dry-run Parse and validate eval file without running
--use-mcp Enable MCP integration for shared learning
-h, --help Show this help message

Examples:
# Run security-testing evals against default model
npx tsx scripts/run-skill-eval.ts --skill security-testing

# Run against specific model
npx tsx scripts/run-skill-eval.ts --skill security-testing --model claude-3-haiku

# Run with MCP integration for learning
npx tsx scripts/run-skill-eval.ts --skill security-testing --use-mcp

# Output results to file
npx tsx scripts/run-skill-eval.ts --skill security-testing --output results.json

MCP Integration:
When --use-mcp is enabled, the runner will:
1. Query existing patterns before running (mcp__agentic-qe__memory_query)
2. Track each test outcome (mcp__agentic-qe__test_outcome_track)
3. Store successful patterns after (mcp__agentic-qe__memory_store)
4. Share learning with fleet (mcp__agentic-qe__memory_share)
5. Update quality gate metrics (mcp__agentic-qe__quality_assess)
`;

  console.log(usageText);
}
948
+
949
/**
 * Locate, read and parse the eval suite YAML, then apply CLI overrides.
 *
 * Resolution order: an explicit `--eval-file` path wins; otherwise the skill
 * name is tried against a fixed list of conventional locations.
 *
 * After parsing, the document is checked for the minimum structure the rest
 * of the runner dereferences (a mapping with `test_cases` and
 * `models_to_test` arrays) so that an empty or malformed file fails with a
 * clear message instead of a TypeError further down.
 *
 * CLI overrides:
 * - `--model` (without `--all-models`) replaces the suite's model list.
 * - `--use-mcp` fills in MCP integration defaults. Settings already present
 *   in the suite take precedence over those defaults, except `enabled`,
 *   which the explicit flag always forces to `true`.
 *
 * @param options - Parsed command-line options.
 * @returns The parsed eval suite with overrides applied.
 * @throws Error when neither a skill nor an eval file is given, when no eval
 *   file can be found for the skill, or when the parsed document is not a
 *   usable eval suite. File-system and YAML parser errors propagate as-is.
 */
async function loadEvalSuite(options: CLIOptions): Promise<EvalSuite> {
  let evalPath: string;

  if (options.evalFile) {
    evalPath = options.evalFile;
  } else if (options.skill) {
    // Look for eval file in standard location
    const possiblePaths = [
      `.claude/skills/${options.skill}/evals/${options.skill}.yaml`,
      `.claude/skills/${options.skill}/evals/${options.skill}.yml`,
      `docs/templates/${options.skill}-eval.template.yaml`,
    ];

    const foundPath = possiblePaths.find((p) => existsSync(p));
    if (!foundPath) {
      throw new Error(
        `Eval file not found for skill '${options.skill}'. Searched:\n ${possiblePaths.join('\n ')}`
      );
    }
    evalPath = foundPath;
  } else {
    throw new Error('Either --skill or --eval-file must be specified');
  }

  console.log(`Loading eval suite from: ${evalPath}`);

  const content = readFileSync(evalPath, 'utf-8');

  // Dynamic import of yaml parser
  if (!yaml) {
    yaml = await import('yaml');
  }

  const suite = yaml.parse(content) as EvalSuite;

  // An empty file parses to null and a scalar/list document is not a suite;
  // reject both before any property access.
  if (suite === null || typeof suite !== 'object' || Array.isArray(suite)) {
    throw new Error(`Eval file '${evalPath}' does not contain a YAML mapping`);
  }

  // Apply CLI overrides
  if (options.model && !options.allModels) {
    suite.models_to_test = [options.model];
  }

  if (!Array.isArray(suite.test_cases)) {
    throw new Error(`Eval file '${evalPath}' is missing a 'test_cases' list`);
  }
  if (!Array.isArray(suite.models_to_test)) {
    throw new Error(
      `Eval file '${evalPath}' is missing a 'models_to_test' list (or pass --model)`
    );
  }

  if (options.useMcp) {
    suite.mcp_integration = {
      namespace: 'skill-validation',
      store_patterns: true,
      query_patterns: true,
      track_outcomes: true,
      share_learning: true,
      update_quality_gate: true,
      target_agents: ['qe-learning-coordinator', 'qe-queen-coordinator'],
      ...suite.mcp_integration,
      // Placed after the spread: an explicit --use-mcp must not be undone by
      // `enabled: false` in the suite file.
      enabled: true,
    };
  }

  return suite;
}
1005
+
1006
/**
 * Write a human-readable report of an evaluation run to standard output.
 *
 * Sections, in order: header and overall verdict, cross-model summary, one
 * section per model (including any criteria failures), recommendations
 * (only when there are any), and the MCP integration counters.
 *
 * @param result - The aggregated run result to report.
 */
function printResults(result: EvalRunResult): void {
  const { summary, mcp_integration_log: mcpLog } = result;
  const divider = '='.repeat(60);
  // Ratios are shown as percentages with one decimal place.
  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(1)}%`;
  const verdict = result.overall_passed ? 'PASSED' : 'FAILED';

  console.log(`\n${divider}`);
  console.log('EVALUATION RESULTS');
  console.log(`${divider}`);

  console.log(`\nSkill: ${result.skill} v${result.version}`);
  console.log(`Run ID: ${result.run_id}`);
  console.log(`Timestamp: ${result.timestamp}`);
  console.log(`Overall: ${verdict}`);

  console.log(`\nCross-Model Summary:`);
  console.log(` Models tested: ${result.models_tested.join(', ')}`);
  console.log(` Best model: ${summary.best_model}`);
  console.log(` Worst model: ${summary.worst_model}`);
  console.log(` Avg pass rate: ${asPercent(summary.avg_pass_rate)}`);
  console.log(` Cross-model variance: ${asPercent(result.cross_model_variance)}`);

  result.model_results.forEach((entry) => {
    const criteriaLabel = entry.success_criteria_met ? 'YES' : 'NO';

    console.log(`\n--- ${entry.model} ---`);
    console.log(` Pass rate: ${asPercent(entry.pass_rate)}`);
    console.log(` Critical pass rate: ${asPercent(entry.critical_pass_rate)}`);
    console.log(` Avg reasoning quality: ${entry.avg_reasoning_quality.toFixed(2)}`);
    console.log(` Tests: ${entry.passed}/${entry.total_tests} passed, ${entry.skipped} skipped`);
    console.log(` Execution time: ${entry.total_execution_time_ms}ms`);
    console.log(` Criteria met: ${criteriaLabel}`);

    if (entry.criteria_failures.length === 0) {
      return;
    }
    console.log(` Failures:`);
    entry.criteria_failures.forEach((failure) => {
      console.log(` - ${failure}`);
    });
  });

  if (summary.recommendations.length > 0) {
    console.log(`\nRecommendations:`);
    summary.recommendations.forEach((rec) => {
      console.log(` - ${rec}`);
    });
  }

  console.log(`\nMCP Integration Log:`);
  console.log(` Patterns queried: ${mcpLog.patterns_queried}`);
  console.log(` Outcomes tracked: ${mcpLog.outcomes_tracked}`);
  console.log(` Patterns stored: ${mcpLog.patterns_stored}`);
  console.log(` Learning shared: ${mcpLog.learning_shared}`);
  console.log(` Quality gate updated: ${mcpLog.quality_gate_updated}`);
}
1054
+
1055
/**
 * CLI entry point.
 *
 * With no arguments the usage text is printed and the process exits with
 * code 1. Otherwise the arguments are parsed, the eval suite is loaded and,
 * unless `--dry-run` was given, the full evaluation is run, reported and
 * optionally written to the `--output` file as JSON.
 *
 * Exit codes: 0 when the run passed (or the dry run loaded the suite),
 * 1 when the run failed or an error was caught.
 */
async function main(): Promise<void> {
  const cliArgs = process.argv.slice(2);

  if (cliArgs.length === 0) {
    printUsage();
    process.exit(1);
  }

  const options = parseArgs(cliArgs);

  try {
    const suite = await loadEvalSuite(options);

    console.log(`\nLoaded eval suite: ${suite.skill} v${suite.version}`);
    console.log(`Test cases: ${suite.test_cases.length}`);
    console.log(`Models: ${suite.models_to_test.join(', ')}`);

    // A dry run stops once the suite has been loaded and summarised.
    if (options.dryRun) {
      console.log('\n[Dry run] Eval file parsed successfully. Exiting.');
      process.exit(0);
    }

    const runner = new SkillEvaluationRunner(suite, options.verbose);
    const result = await runner.runFull();

    printResults(result);

    // Output to file if specified
    const outputPath = options.output;
    if (outputPath) {
      const serialized = JSON.stringify(result, null, 2);
      writeFileSync(outputPath, serialized);
      console.log(`\nResults written to: ${outputPath}`);
    }

    // The exit code mirrors the overall verdict.
    const exitCode = result.overall_passed ? 0 : 1;
    process.exit(exitCode);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('\nError:', message);
    process.exit(1);
  }
}
1095
+
1096
// Entry point: main() is invoked unconditionally when this module is loaded
// (there is no "executed directly" guard). main() reports the errors it
// catches itself; anything that still escapes is logged here and turned into
// a non-zero exit code so callers and CI do not see a false success.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});