@raishin/vanguard-frontier-agentic 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (342) hide show
  1. package/.claude-plugin/plugin.json +25 -1
  2. package/.cursor-plugin/plugin.json +25 -1
  3. package/.github/plugin/marketplace.json +1 -1
  4. package/README.md +26 -7
  5. package/agents/marketing/README.md +44 -0
  6. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/AGENT.md +53 -0
  7. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/claude-code.agent.md +36 -0
  8. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/codex.toml +33 -0
  9. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/copilot.agent.md +36 -0
  10. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/cursor.agent.md +36 -0
  11. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/gemini.agent.md +36 -0
  12. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/kiro-cli.agent.json +5 -0
  13. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/harnesses/kiro-ide.agent.md +36 -0
  14. package/agents/marketing/ai-advertising-targeting-fairness-review-agent/metadata.json +31 -0
  15. package/agents/marketing/analytics-data-minimization-review-agent/AGENT.md +51 -0
  16. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/claude-code.agent.md +34 -0
  17. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/codex.toml +33 -0
  18. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/copilot.agent.md +34 -0
  19. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/cursor.agent.md +34 -0
  20. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/gemini.agent.md +34 -0
  21. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/kiro-cli.agent.json +5 -0
  22. package/agents/marketing/analytics-data-minimization-review-agent/harnesses/kiro-ide.agent.md +34 -0
  23. package/agents/marketing/analytics-data-minimization-review-agent/metadata.json +31 -0
  24. package/agents/marketing/email-sender-authentication-review-agent/AGENT.md +50 -0
  25. package/agents/marketing/email-sender-authentication-review-agent/harnesses/claude-code.agent.md +33 -0
  26. package/agents/marketing/email-sender-authentication-review-agent/harnesses/codex.toml +32 -0
  27. package/agents/marketing/email-sender-authentication-review-agent/harnesses/copilot.agent.md +33 -0
  28. package/agents/marketing/email-sender-authentication-review-agent/harnesses/cursor.agent.md +33 -0
  29. package/agents/marketing/email-sender-authentication-review-agent/harnesses/gemini.agent.md +33 -0
  30. package/agents/marketing/email-sender-authentication-review-agent/harnesses/kiro-cli.agent.json +5 -0
  31. package/agents/marketing/email-sender-authentication-review-agent/harnesses/kiro-ide.agent.md +33 -0
  32. package/agents/marketing/email-sender-authentication-review-agent/metadata.json +31 -0
  33. package/agents/marketing/eu-ai-act-marketing-system-review-agent/AGENT.md +54 -0
  34. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/claude-code.agent.md +37 -0
  35. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/codex.toml +33 -0
  36. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/copilot.agent.md +37 -0
  37. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/cursor.agent.md +37 -0
  38. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/gemini.agent.md +37 -0
  39. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/kiro-cli.agent.json +5 -0
  40. package/agents/marketing/eu-ai-act-marketing-system-review-agent/harnesses/kiro-ide.agent.md +37 -0
  41. package/agents/marketing/eu-ai-act-marketing-system-review-agent/metadata.json +31 -0
  42. package/agents/marketing/influencer-disclosure-compliance-review-agent/AGENT.md +52 -0
  43. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/claude-code.agent.md +35 -0
  44. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/codex.toml +33 -0
  45. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/copilot.agent.md +35 -0
  46. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/cursor.agent.md +35 -0
  47. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/gemini.agent.md +35 -0
  48. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/kiro-cli.agent.json +5 -0
  49. package/agents/marketing/influencer-disclosure-compliance-review-agent/harnesses/kiro-ide.agent.md +35 -0
  50. package/agents/marketing/influencer-disclosure-compliance-review-agent/metadata.json +31 -0
  51. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/AGENT.md +54 -0
  52. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/claude-code.agent.md +37 -0
  53. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/codex.toml +34 -0
  54. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/copilot.agent.md +37 -0
  55. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/cursor.agent.md +37 -0
  56. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/gemini.agent.md +37 -0
  57. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/kiro-cli.agent.json +5 -0
  58. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/harnesses/kiro-ide.agent.md +37 -0
  59. package/agents/marketing/lookalike-audience-upload-compliance-review-agent/metadata.json +31 -0
  60. package/agents/marketing/marketing-consent-data-collection-review-agent/AGENT.md +51 -0
  61. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/claude-code.agent.md +34 -0
  62. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/codex.toml +33 -0
  63. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/copilot.agent.md +34 -0
  64. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/cursor.agent.md +34 -0
  65. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/gemini.agent.md +34 -0
  66. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/kiro-cli.agent.json +5 -0
  67. package/agents/marketing/marketing-consent-data-collection-review-agent/harnesses/kiro-ide.agent.md +34 -0
  68. package/agents/marketing/marketing-consent-data-collection-review-agent/metadata.json +31 -0
  69. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/AGENT.md +51 -0
  70. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/claude-code.agent.md +34 -0
  71. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/codex.toml +33 -0
  72. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/copilot.agent.md +34 -0
  73. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/cursor.agent.md +34 -0
  74. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/gemini.agent.md +34 -0
  75. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/kiro-cli.agent.json +5 -0
  76. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/harnesses/kiro-ide.agent.md +34 -0
  77. package/agents/marketing/marketing-conversion-flow-dark-pattern-review-agent/metadata.json +31 -0
  78. package/agents/marketing/marketing-email-list-retention-review-agent/AGENT.md +50 -0
  79. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/claude-code.agent.md +33 -0
  80. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/codex.toml +32 -0
  81. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/copilot.agent.md +33 -0
  82. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/cursor.agent.md +33 -0
  83. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/gemini.agent.md +33 -0
  84. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/kiro-cli.agent.json +5 -0
  85. package/agents/marketing/marketing-email-list-retention-review-agent/harnesses/kiro-ide.agent.md +33 -0
  86. package/agents/marketing/marketing-email-list-retention-review-agent/metadata.json +31 -0
  87. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/AGENT.md +50 -0
  88. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/claude-code.agent.md +33 -0
  89. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/codex.toml +32 -0
  90. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/copilot.agent.md +33 -0
  91. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/cursor.agent.md +33 -0
  92. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/gemini.agent.md +33 -0
  93. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/kiro-cli.agent.json +5 -0
  94. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/harnesses/kiro-ide.agent.md +33 -0
  95. package/agents/marketing/marketing-gpc-signal-honoring-review-agent/metadata.json +31 -0
  96. package/agents/marketing/marketing-maestro-agent/AGENT.md +62 -0
  97. package/agents/marketing/marketing-maestro-agent/PERMISSIONS.md +75 -0
  98. package/agents/marketing/marketing-maestro-agent/README.md +62 -0
  99. package/agents/marketing/marketing-maestro-agent/harnesses/claude-code.agent.md +43 -0
  100. package/agents/marketing/marketing-maestro-agent/harnesses/codex.toml +35 -0
  101. package/agents/marketing/marketing-maestro-agent/harnesses/copilot.agent.md +43 -0
  102. package/agents/marketing/marketing-maestro-agent/harnesses/cursor.agent.md +43 -0
  103. package/agents/marketing/marketing-maestro-agent/harnesses/gemini.agent.md +43 -0
  104. package/agents/marketing/marketing-maestro-agent/harnesses/kiro-cli.agent.json +5 -0
  105. package/agents/marketing/marketing-maestro-agent/harnesses/kiro-ide.agent.md +43 -0
  106. package/agents/marketing/marketing-maestro-agent/metadata.json +38 -0
  107. package/agents/marketing/marketing-pixel-data-leakage-review-agent/AGENT.md +50 -0
  108. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/claude-code.agent.md +33 -0
  109. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/codex.toml +32 -0
  110. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/copilot.agent.md +33 -0
  111. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/cursor.agent.md +33 -0
  112. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/gemini.agent.md +33 -0
  113. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/kiro-cli.agent.json +5 -0
  114. package/agents/marketing/marketing-pixel-data-leakage-review-agent/harnesses/kiro-ide.agent.md +33 -0
  115. package/agents/marketing/marketing-pixel-data-leakage-review-agent/metadata.json +31 -0
  116. package/agents/marketing/martech-access-governance-review-agent/AGENT.md +51 -0
  117. package/agents/marketing/martech-access-governance-review-agent/harnesses/claude-code.agent.md +34 -0
  118. package/agents/marketing/martech-access-governance-review-agent/harnesses/codex.toml +33 -0
  119. package/agents/marketing/martech-access-governance-review-agent/harnesses/copilot.agent.md +34 -0
  120. package/agents/marketing/martech-access-governance-review-agent/harnesses/cursor.agent.md +34 -0
  121. package/agents/marketing/martech-access-governance-review-agent/harnesses/gemini.agent.md +34 -0
  122. package/agents/marketing/martech-access-governance-review-agent/harnesses/kiro-cli.agent.json +5 -0
  123. package/agents/marketing/martech-access-governance-review-agent/harnesses/kiro-ide.agent.md +34 -0
  124. package/agents/marketing/martech-access-governance-review-agent/metadata.json +31 -0
  125. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/AGENT.md +50 -0
  126. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/claude-code.agent.md +33 -0
  127. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/codex.toml +32 -0
  128. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/copilot.agent.md +33 -0
  129. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/cursor.agent.md +33 -0
  130. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/gemini.agent.md +33 -0
  131. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/kiro-cli.agent.json +5 -0
  132. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/harnesses/kiro-ide.agent.md +33 -0
  133. package/agents/marketing/programmatic-supply-chain-integrity-review-agent/metadata.json +31 -0
  134. package/agents/qa/README.md +51 -0
  135. package/agents/qa/ci-test-pipeline-review-agent/AGENT.md +51 -0
  136. package/agents/qa/ci-test-pipeline-review-agent/harnesses/claude-code.agent.md +35 -0
  137. package/agents/qa/ci-test-pipeline-review-agent/harnesses/codex.toml +34 -0
  138. package/agents/qa/ci-test-pipeline-review-agent/harnesses/copilot.agent.md +35 -0
  139. package/agents/qa/ci-test-pipeline-review-agent/harnesses/cursor.agent.md +35 -0
  140. package/agents/qa/ci-test-pipeline-review-agent/harnesses/gemini.agent.md +35 -0
  141. package/agents/qa/ci-test-pipeline-review-agent/harnesses/kiro-cli.agent.json +5 -0
  142. package/agents/qa/ci-test-pipeline-review-agent/harnesses/kiro-ide.agent.md +35 -0
  143. package/agents/qa/ci-test-pipeline-review-agent/metadata.json +33 -0
  144. package/agents/qa/helm-chart-quality-review-agent/AGENT.md +56 -0
  145. package/agents/qa/helm-chart-quality-review-agent/harnesses/claude-code.agent.md +40 -0
  146. package/agents/qa/helm-chart-quality-review-agent/harnesses/codex.toml +39 -0
  147. package/agents/qa/helm-chart-quality-review-agent/harnesses/copilot.agent.md +40 -0
  148. package/agents/qa/helm-chart-quality-review-agent/harnesses/cursor.agent.md +40 -0
  149. package/agents/qa/helm-chart-quality-review-agent/harnesses/gemini.agent.md +40 -0
  150. package/agents/qa/helm-chart-quality-review-agent/harnesses/kiro-cli.agent.json +5 -0
  151. package/agents/qa/helm-chart-quality-review-agent/harnesses/kiro-ide.agent.md +40 -0
  152. package/agents/qa/helm-chart-quality-review-agent/metadata.json +35 -0
  153. package/agents/qa/kubernetes-manifest-quality-review-agent/AGENT.md +55 -0
  154. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/claude-code.agent.md +32 -0
  155. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/codex.toml +38 -0
  156. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/copilot.agent.md +32 -0
  157. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/cursor.agent.md +32 -0
  158. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/gemini.agent.md +32 -0
  159. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/kiro-cli.agent.json +5 -0
  160. package/agents/qa/kubernetes-manifest-quality-review-agent/harnesses/kiro-ide.agent.md +32 -0
  161. package/agents/qa/kubernetes-manifest-quality-review-agent/metadata.json +35 -0
  162. package/agents/qa/llm-ai-pipeline-test-review-agent/AGENT.md +52 -0
  163. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/claude-code.agent.md +36 -0
  164. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/codex.toml +36 -0
  165. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/copilot.agent.md +36 -0
  166. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/cursor.agent.md +36 -0
  167. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/gemini.agent.md +36 -0
  168. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/kiro-cli.agent.json +5 -0
  169. package/agents/qa/llm-ai-pipeline-test-review-agent/harnesses/kiro-ide.agent.md +36 -0
  170. package/agents/qa/llm-ai-pipeline-test-review-agent/metadata.json +35 -0
  171. package/agents/qa/playwright-e2e-execution-run-agent/AGENT.md +50 -0
  172. package/agents/qa/playwright-e2e-execution-run-agent/harnesses/claude-code.agent.md +39 -0
  173. package/agents/qa/playwright-e2e-execution-run-agent/harnesses/cursor.agent.md +39 -0
  174. package/agents/qa/playwright-e2e-execution-run-agent/metadata.json +28 -0
  175. package/agents/qa/playwright-e2e-suite-review-agent/AGENT.md +51 -0
  176. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/claude-code.agent.md +35 -0
  177. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/codex.toml +34 -0
  178. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/copilot.agent.md +35 -0
  179. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/cursor.agent.md +35 -0
  180. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/gemini.agent.md +35 -0
  181. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/kiro-cli.agent.json +5 -0
  182. package/agents/qa/playwright-e2e-suite-review-agent/harnesses/kiro-ide.agent.md +35 -0
  183. package/agents/qa/playwright-e2e-suite-review-agent/metadata.json +35 -0
  184. package/agents/qa/plc-control-logic-safety-review-agent/AGENT.md +53 -0
  185. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/claude-code.agent.md +37 -0
  186. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/codex.toml +36 -0
  187. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/copilot.agent.md +37 -0
  188. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/cursor.agent.md +37 -0
  189. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/gemini.agent.md +37 -0
  190. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/kiro-cli.agent.json +5 -0
  191. package/agents/qa/plc-control-logic-safety-review-agent/harnesses/kiro-ide.agent.md +37 -0
  192. package/agents/qa/plc-control-logic-safety-review-agent/metadata.json +33 -0
  193. package/agents/qa/rpa-workflow-resilience-review-agent/AGENT.md +52 -0
  194. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/claude-code.agent.md +36 -0
  195. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/codex.toml +35 -0
  196. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/copilot.agent.md +36 -0
  197. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/cursor.agent.md +36 -0
  198. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/gemini.agent.md +36 -0
  199. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/kiro-cli.agent.json +5 -0
  200. package/agents/qa/rpa-workflow-resilience-review-agent/harnesses/kiro-ide.agent.md +36 -0
  201. package/agents/qa/rpa-workflow-resilience-review-agent/metadata.json +34 -0
  202. package/agents/qa/test-coverage-quality-review-agent/AGENT.md +50 -0
  203. package/agents/qa/test-coverage-quality-review-agent/harnesses/claude-code.agent.md +34 -0
  204. package/agents/qa/test-coverage-quality-review-agent/harnesses/codex.toml +33 -0
  205. package/agents/qa/test-coverage-quality-review-agent/harnesses/copilot.agent.md +34 -0
  206. package/agents/qa/test-coverage-quality-review-agent/harnesses/cursor.agent.md +34 -0
  207. package/agents/qa/test-coverage-quality-review-agent/harnesses/gemini.agent.md +34 -0
  208. package/agents/qa/test-coverage-quality-review-agent/harnesses/kiro-cli.agent.json +5 -0
  209. package/agents/qa/test-coverage-quality-review-agent/harnesses/kiro-ide.agent.md +34 -0
  210. package/agents/qa/test-coverage-quality-review-agent/metadata.json +33 -0
  211. package/agents/qa/test-flakiness-triage-agent/AGENT.md +52 -0
  212. package/agents/qa/test-flakiness-triage-agent/harnesses/claude-code.agent.md +36 -0
  213. package/agents/qa/test-flakiness-triage-agent/harnesses/codex.toml +33 -0
  214. package/agents/qa/test-flakiness-triage-agent/harnesses/copilot.agent.md +36 -0
  215. package/agents/qa/test-flakiness-triage-agent/harnesses/cursor.agent.md +36 -0
  216. package/agents/qa/test-flakiness-triage-agent/harnesses/gemini.agent.md +36 -0
  217. package/agents/qa/test-flakiness-triage-agent/harnesses/kiro-cli.agent.json +5 -0
  218. package/agents/qa/test-flakiness-triage-agent/harnesses/kiro-ide.agent.md +36 -0
  219. package/agents/qa/test-flakiness-triage-agent/metadata.json +33 -0
  220. package/catalog/agents.json +1047 -197
  221. package/catalog/asset-integrity.json +2950 -1675
  222. package/catalog/install-roles.json +65 -1
  223. package/catalog/skill-manifest.json +538 -0
  224. package/catalog/skills.json +685 -0
  225. package/package.json +5 -2
  226. package/plugins/vanguard-frontier-agentic/.codex-plugin/plugin.json +1 -1
  227. package/scripts/generate-readme-counts.mjs +162 -0
  228. package/skills/marketing/ai-advertising-targeting-fairness-review/SKILL.md +43 -0
  229. package/skills/marketing/ai-advertising-targeting-fairness-review/metadata.json +21 -0
  230. package/skills/marketing/ai-advertising-targeting-fairness-review/references/workflow-and-output.md +150 -0
  231. package/skills/marketing/analytics-data-minimization-review/SKILL.md +44 -0
  232. package/skills/marketing/analytics-data-minimization-review/metadata.json +22 -0
  233. package/skills/marketing/analytics-data-minimization-review/references/workflow-and-output.md +187 -0
  234. package/skills/marketing/email-sender-authentication-review/SKILL.md +43 -0
  235. package/skills/marketing/email-sender-authentication-review/metadata.json +22 -0
  236. package/skills/marketing/email-sender-authentication-review/references/workflow-and-output.md +152 -0
  237. package/skills/marketing/eu-ai-act-marketing-system-review/SKILL.md +43 -0
  238. package/skills/marketing/eu-ai-act-marketing-system-review/metadata.json +21 -0
  239. package/skills/marketing/eu-ai-act-marketing-system-review/references/workflow-and-output.md +176 -0
  240. package/skills/marketing/influencer-disclosure-compliance-review/SKILL.md +43 -0
  241. package/skills/marketing/influencer-disclosure-compliance-review/metadata.json +22 -0
  242. package/skills/marketing/influencer-disclosure-compliance-review/references/workflow-and-output.md +156 -0
  243. package/skills/marketing/lookalike-audience-upload-compliance-review/SKILL.md +44 -0
  244. package/skills/marketing/lookalike-audience-upload-compliance-review/metadata.json +21 -0
  245. package/skills/marketing/lookalike-audience-upload-compliance-review/references/workflow-and-output.md +203 -0
  246. package/skills/marketing/marketing-consent-data-collection-review/SKILL.md +44 -0
  247. package/skills/marketing/marketing-consent-data-collection-review/metadata.json +21 -0
  248. package/skills/marketing/marketing-consent-data-collection-review/references/workflow-and-output.md +139 -0
  249. package/skills/marketing/marketing-conversion-flow-dark-pattern-review/SKILL.md +45 -0
  250. package/skills/marketing/marketing-conversion-flow-dark-pattern-review/metadata.json +22 -0
  251. package/skills/marketing/marketing-conversion-flow-dark-pattern-review/references/workflow-and-output.md +160 -0
  252. package/skills/marketing/marketing-email-list-retention-review/SKILL.md +43 -0
  253. package/skills/marketing/marketing-email-list-retention-review/metadata.json +22 -0
  254. package/skills/marketing/marketing-email-list-retention-review/references/workflow-and-output.md +144 -0
  255. package/skills/marketing/marketing-gpc-signal-honoring-review/SKILL.md +42 -0
  256. package/skills/marketing/marketing-gpc-signal-honoring-review/metadata.json +22 -0
  257. package/skills/marketing/marketing-gpc-signal-honoring-review/references/workflow-and-output.md +145 -0
  258. package/skills/marketing/marketing-maestro/README.md +37 -0
  259. package/skills/marketing/marketing-maestro/SKILL.md +49 -0
  260. package/skills/marketing/marketing-maestro/metadata.json +26 -0
  261. package/skills/marketing/marketing-maestro/references/safety-checklist.md +67 -0
  262. package/skills/marketing/marketing-maestro/references/workflow-and-output.md +110 -0
  263. package/skills/marketing/marketing-pixel-data-leakage-review/SKILL.md +43 -0
  264. package/skills/marketing/marketing-pixel-data-leakage-review/metadata.json +21 -0
  265. package/skills/marketing/marketing-pixel-data-leakage-review/references/workflow-and-output.md +129 -0
  266. package/skills/marketing/martech-access-governance-review/SKILL.md +45 -0
  267. package/skills/marketing/martech-access-governance-review/metadata.json +21 -0
  268. package/skills/marketing/martech-access-governance-review/references/workflow-and-output.md +116 -0
  269. package/skills/marketing/programmatic-supply-chain-integrity-review/SKILL.md +43 -0
  270. package/skills/marketing/programmatic-supply-chain-integrity-review/metadata.json +22 -0
  271. package/skills/marketing/programmatic-supply-chain-integrity-review/references/workflow-and-output.md +164 -0
  272. package/skills/qa/ci-test-pipeline-review/SKILL.md +45 -0
  273. package/skills/qa/ci-test-pipeline-review/metadata.json +21 -0
  274. package/skills/qa/ci-test-pipeline-review/references/workflow-and-output.md +124 -0
  275. package/skills/qa/helm-chart-quality-review/SKILL.md +61 -0
  276. package/skills/qa/helm-chart-quality-review/metadata.json +23 -0
  277. package/skills/qa/helm-chart-quality-review/references/workflow-and-output.md +174 -0
  278. package/skills/qa/kubernetes-manifest-quality-review/SKILL.md +92 -0
  279. package/skills/qa/kubernetes-manifest-quality-review/metadata.json +23 -0
  280. package/skills/qa/kubernetes-manifest-quality-review/references/workflow-and-output.md +246 -0
  281. package/skills/qa/llm-ai-pipeline-test-review/SKILL.md +52 -0
  282. package/skills/qa/llm-ai-pipeline-test-review/metadata.json +23 -0
  283. package/skills/qa/llm-ai-pipeline-test-review/references/workflow-and-output.md +221 -0
  284. package/skills/qa/playwright-e2e-execution-run/SKILL.md +54 -0
  285. package/skills/qa/playwright-e2e-execution-run/metadata.json +24 -0
  286. package/skills/qa/playwright-e2e-execution-run/references/workflow-and-output.md +133 -0
  287. package/skills/qa/playwright-e2e-suite-review/SKILL.md +44 -0
  288. package/skills/qa/playwright-e2e-suite-review/metadata.json +23 -0
  289. package/skills/qa/playwright-e2e-suite-review/references/workflow-and-output.md +176 -0
  290. package/skills/qa/plc-control-logic-safety-review/SKILL.md +47 -0
  291. package/skills/qa/plc-control-logic-safety-review/metadata.json +21 -0
  292. package/skills/qa/plc-control-logic-safety-review/references/workflow-and-output.md +231 -0
  293. package/skills/qa/rpa-workflow-resilience-review/SKILL.md +47 -0
  294. package/skills/qa/rpa-workflow-resilience-review/metadata.json +22 -0
  295. package/skills/qa/rpa-workflow-resilience-review/references/workflow-and-output.md +210 -0
  296. package/skills/qa/test-coverage-quality-review/SKILL.md +44 -0
  297. package/skills/qa/test-coverage-quality-review/metadata.json +21 -0
  298. package/skills/qa/test-coverage-quality-review/references/workflow-and-output.md +139 -0
  299. package/skills/qa/test-flakiness-triage/SKILL.md +43 -0
  300. package/skills/qa/test-flakiness-triage/metadata.json +21 -0
  301. package/skills/qa/test-flakiness-triage/references/workflow-and-output.md +114 -0
  302. package/tests/eval-qa-cluster.mjs +111 -0
  303. package/tests/fixtures/marketing-maestro-routing/expected/001-happy-ai-advertising-targeting-fairness-review.json +6 -0
  304. package/tests/fixtures/marketing-maestro-routing/expected/002-happy-analytics-data-minimization-review.json +6 -0
  305. package/tests/fixtures/marketing-maestro-routing/expected/003-happy-consent-data-collection-review.json +6 -0
  306. package/tests/fixtures/marketing-maestro-routing/expected/004-happy-conversion-flow-dark-pattern-review.json +6 -0
  307. package/tests/fixtures/marketing-maestro-routing/expected/005-happy-email-list-retention-review.json +6 -0
  308. package/tests/fixtures/marketing-maestro-routing/expected/006-happy-email-sender-authentication-review.json +6 -0
  309. package/tests/fixtures/marketing-maestro-routing/expected/007-happy-eu-ai-act-marketing-system-review.json +6 -0
  310. package/tests/fixtures/marketing-maestro-routing/expected/008-happy-gpc-signal-honoring-review.json +6 -0
  311. package/tests/fixtures/marketing-maestro-routing/expected/009-happy-influencer-disclosure-compliance-review.json +6 -0
  312. package/tests/fixtures/marketing-maestro-routing/expected/010-happy-lookalike-audience-upload-compliance-review.json +6 -0
  313. package/tests/fixtures/marketing-maestro-routing/expected/011-happy-martech-access-governance-review.json +6 -0
  314. package/tests/fixtures/marketing-maestro-routing/expected/012-happy-pixel-data-leakage-review.json +6 -0
  315. package/tests/fixtures/marketing-maestro-routing/expected/013-happy-programmatic-supply-chain-integrity-review.json +6 -0
  316. package/tests/fixtures/marketing-maestro-routing/expected/adv-ambiguous.json +4 -0
  317. package/tests/fixtures/marketing-maestro-routing/expected/adv-instruction-injection.json +7 -0
  318. package/tests/fixtures/marketing-maestro-routing/expected/adv-live-guard-gate.json +4 -0
  319. package/tests/fixtures/marketing-maestro-routing/expected/adv-persona-replacement.json +6 -0
  320. package/tests/fixtures/marketing-maestro-routing/expected/adv-secrets-bait.json +7 -0
  321. package/tests/fixtures/marketing-maestro-routing/inputs/001-happy-ai-advertising-targeting-fairness-review.json +7 -0
  322. package/tests/fixtures/marketing-maestro-routing/inputs/002-happy-analytics-data-minimization-review.json +7 -0
  323. package/tests/fixtures/marketing-maestro-routing/inputs/003-happy-consent-data-collection-review.json +7 -0
  324. package/tests/fixtures/marketing-maestro-routing/inputs/004-happy-conversion-flow-dark-pattern-review.json +7 -0
  325. package/tests/fixtures/marketing-maestro-routing/inputs/005-happy-email-list-retention-review.json +7 -0
  326. package/tests/fixtures/marketing-maestro-routing/inputs/006-happy-email-sender-authentication-review.json +7 -0
  327. package/tests/fixtures/marketing-maestro-routing/inputs/007-happy-eu-ai-act-marketing-system-review.json +7 -0
  328. package/tests/fixtures/marketing-maestro-routing/inputs/008-happy-gpc-signal-honoring-review.json +7 -0
  329. package/tests/fixtures/marketing-maestro-routing/inputs/009-happy-influencer-disclosure-compliance-review.json +7 -0
  330. package/tests/fixtures/marketing-maestro-routing/inputs/010-happy-lookalike-audience-upload-compliance-review.json +7 -0
  331. package/tests/fixtures/marketing-maestro-routing/inputs/011-happy-martech-access-governance-review.json +7 -0
  332. package/tests/fixtures/marketing-maestro-routing/inputs/012-happy-pixel-data-leakage-review.json +7 -0
  333. package/tests/fixtures/marketing-maestro-routing/inputs/013-happy-programmatic-supply-chain-integrity-review.json +7 -0
  334. package/tests/fixtures/marketing-maestro-routing/inputs/adv-ambiguous.json +7 -0
  335. package/tests/fixtures/marketing-maestro-routing/inputs/adv-instruction-injection.json +7 -0
  336. package/tests/fixtures/marketing-maestro-routing/inputs/adv-live-guard-gate.json +7 -0
  337. package/tests/fixtures/marketing-maestro-routing/inputs/adv-persona-replacement.json +7 -0
  338. package/tests/fixtures/marketing-maestro-routing/inputs/adv-secrets-bait.json +7 -0
  339. package/tests/fixtures/marketing-maestro-routing/taxonomy.json +183 -0
  340. package/tests/validate-catalog.py +1 -0
  341. package/tests/validate-maestro-routing.py +4 -0
  342. package/tests/validate-readme-counts.mjs +179 -0
@@ -0,0 +1,246 @@
1
+ # Workflow and Output Contract
2
+
3
+ ## Workflow
4
+
5
+ ### Step 1 — Collect inputs
6
+
7
+ Ask the user to provide one or more of the following as sanitized files (no real secret values, no kubeconfig, no service account tokens, no cloud credentials — replace sensitive values with placeholders):
8
+ - Workload manifests: Deployment, StatefulSet, DaemonSet YAML
9
+ - Service and Ingress YAML
10
+ - NetworkPolicy YAML
11
+ - RBAC resources: Role, ClusterRole, RoleBinding, ClusterRoleBinding YAML
12
+ - CRD definitions if relevant
13
+ - Any Kustomize base and overlay files
14
+
15
+ If NetworkPolicy resources are not provided, the egress/ingress audit findings are stated as `inference` — say so and ask for them.
16
+
17
+ ### Step 2 — Schema and API version audit
18
+
19
+ Validate that every manifest has `apiVersion` and `kind` present. Check for deprecated or removed API versions:
20
+
21
+ ```yaml
22
+ # HIGH — removed in Kubernetes 1.22
23
+ apiVersion: extensions/v1beta1
24
+ kind: Ingress
25
+
26
+ # HIGH — networking.k8s.io/v1beta1 Ingress removed in 1.22
27
+ apiVersion: networking.k8s.io/v1beta1
28
+ kind: Ingress
29
+
30
+ # HIGH — policy/v1beta1 PodSecurityPolicy removed in 1.25
31
+ apiVersion: policy/v1beta1
32
+ kind: PodSecurityPolicy
33
+ ```
34
+
35
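+ Check that required labels are present on Pod templates and workload controllers: `app`, `app.kubernetes.io/name`, `app.kubernetes.io/version`. Flag missing `namespace` on all namespaced resources; cluster-scoped kinds (ClusterRole, ClusterRoleBinding, CustomResourceDefinition) have no namespace and must not be flagged.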
36
+
37
+ ### Step 3 — Pod security audit (PSS Restricted/Baseline comparison)
38
+
39
+ Evaluate each Pod spec against the Pod Security Standards Restricted profile:
40
+
41
+ ```yaml
42
+ # CRITICAL — privileged container
43
+ securityContext:
44
+ privileged: true
45
+
46
+ # CRITICAL — host namespaces
47
+ hostNetwork: true
48
+ hostPID: true
49
+ hostIPC: true
50
+
51
+ # HIGH — runAsRoot or missing runAsNonRoot
52
+ securityContext:
53
+ runAsUser: 0
54
+ # or: runAsNonRoot absent
55
+
56
+ # HIGH — allowPrivilegeEscalation unset or true
57
+ securityContext:
58
+ allowPrivilegeEscalation: true
59
+
60
+ # CRITICAL — dangerous capabilities
61
+ securityContext:
62
+ capabilities:
63
+ add: ["SYS_ADMIN"]
64
+
65
+ # MEDIUM — writable root filesystem
66
+ securityContext:
67
+ readOnlyRootFilesystem: false
68
+ # or: field absent
69
+
70
+ # MEDIUM — no seccomp profile
71
+ securityContext:
72
+ # seccompProfile absent
73
+ ```
74
+
75
+ For each container in the pod, note whether the field is set at the pod level, the container level, or both. Container-level settings override pod-level settings.
76
+
77
+ ### Step 4 — Image hygiene audit
78
+
79
+ Check every container and init container image reference:
80
+
81
+ ```yaml
82
+ # HIGH — mutable tag, non-reproducible
83
+ image: nginx:latest
84
+ image: myapp # tag absent
85
+
86
+ # MEDIUM — no digest pinning
87
+ image: nginx:1.25.3 # tag present but no @sha256 digest
88
+
89
+ # MEDIUM — unverified public registry, no digest
90
+ image: docker.io/library/nginx:1.25.3
91
+ ```
92
+
93
+ For production-grade manifests, recommend digest-pinned images:
94
+ ```yaml
95
+ image: nginx:1.25.3@sha256:<digest>
96
+ ```
97
+
98
+ ### Step 5 — Resource governance audit
99
+
100
+ Check every container for `resources.requests` and `resources.limits`:
101
+
102
+ ```yaml
103
+ # HIGH — no requests or limits
104
+ containers:
105
+ - name: app
106
+ image: myapp:1.0.0
107
+ # resources absent
108
+
109
+ # MEDIUM — memory limit set without CPU limit
110
+ resources:
111
+ limits:
112
+ memory: 512Mi
113
+ requests:
114
+ cpu: 100m
115
+ memory: 256Mi
116
+ # limits.cpu absent
117
+ ```
118
+
119
+ Check for ephemeral storage limits on containers known to produce log output or temporary files.
120
+
121
+ ### Step 6 — Health probe audit
122
+
123
+ Check every container for `livenessProbe` and `readinessProbe`:
124
+
125
+ ```yaml
126
+ # HIGH — missing livenessProbe
127
+ containers:
128
+ - name: app
129
+ # livenessProbe absent
130
+
131
+ # HIGH — missing readinessProbe
132
+ containers:
133
+ - name: app
134
+ # readinessProbe absent
135
+
136
+ # MEDIUM — exec probe with no timeoutSeconds
137
+ livenessProbe:
138
+ exec:
139
+ command: ["/bin/check"]
140
+ # timeoutSeconds absent, defaults to 1 second
141
+ ```
142
+
143
+ ### Step 7 — Networking and exposure audit
144
+
145
+ Review Service types, Ingress TLS, NetworkPolicy coverage, and Ingress annotations:
146
+
147
+ ```yaml
148
+ # MEDIUM — external exposure without documented justification
149
+ kind: Service
150
+ spec:
151
+ type: LoadBalancer # or NodePort
152
+
153
+ # HIGH — Ingress without TLS
154
+ kind: Ingress
155
+ spec:
156
+ # tls block absent
157
+
158
+ # MEDIUM — no NetworkPolicy found in namespace (default allow-all)
159
+
160
+ # CRITICAL — config-injection / SSRF-enabling Ingress annotation (CVE-2021-25742)
161
+ metadata:
162
+ annotations:
163
+ nginx.ingress.kubernetes.io/server-snippet: "<arbitrary NGINX directives>"
164
+ ```
165
+
166
+ If no NetworkPolicy resources are provided for the namespace, state that the default-allow posture is inferred and ask for NetworkPolicy files.
167
+
168
+ ### Step 8 — RBAC and secrets audit
169
+
170
+ Review ClusterRole, Role, RoleBinding, ClusterRoleBinding, and Secret resources:
171
+
172
+ ```yaml
173
+ # CRITICAL — wildcard verbs on wildcard resources
174
+ rules:
175
+ - apiGroups: ["*"]
176
+ resources: ["*"]
177
+ verbs: ["*"]
178
+
179
+ # CRITICAL — unauthenticated subject
180
+ subjects:
181
+ - kind: Group
182
+ name: system:unauthenticated
183
+
184
+ # HIGH — automount enabled on pods that do not need API access
185
+ automountServiceAccountToken: true # or field absent
186
+
187
+ # HIGH — broad secret access
188
+ rules:
189
+ - resources: ["secrets"]
190
+ verbs: ["get", "list"]
191
+
192
+ # CRITICAL — plaintext credentials in env
193
+ env:
194
+ - name: DB_PASSWORD
195
+ value: "mysecretpassword"
196
+
197
+ # MEDIUM — empty-string secret value
198
+ data:
199
+ password: "" # decodes to empty
200
+ ```
201
+
202
+ ---
203
+
204
+ ## Output
205
+
206
+ Return findings in this structure:
207
+
208
+ ```
209
+ ## Verdict
210
+ <one sentence: manifests pass baseline / manifests have blocking security defects / manifests need remediation before production>
211
+
212
+ ## Evidence level
213
+ <manifest files provided | partial manifests only | inference for missing resources>
214
+
215
+ ## Findings
216
+
217
+ ### CRITICAL
218
+ - [C1] <resource name> — <finding>: <description> — <remediation>
219
+
220
+ ### HIGH
221
+ - [H1] <resource name> — <finding>: <description> — <remediation>
222
+
223
+ ### MEDIUM
224
+ - [M1] <resource name> — <finding>: <description> — <remediation>
225
+
226
+ ### LOW
227
+ - [L1] <resource name> — <finding>: <description> — <remediation>
228
+
229
+ ## Safe next actions
230
+ 1. <action>
231
+ 2. <action>
232
+
233
+ ## Open questions
234
+ - <question requiring user clarification>
235
+ ```
236
+
237
+ ---
238
+
239
+ ## Security notes
240
+
241
+ - Never request or accept kubeconfig, service account tokens, cloud credentials, or actual secret values. Ask for sanitized manifests with placeholder values in Secret resources.
242
+ - This is a static review: do not apply manifests, run `kubectl`, or contact any cluster.
243
+ - A `privileged: true` container, `hostNetwork/hostPID/hostIPC: true`, or a ClusterRole with `*` verbs on `*` resources is the highest-impact finding class. Lead with it.
244
+ - A `RoleBinding` or `ClusterRoleBinding` whose subject is the `system:unauthenticated` group or the `system:anonymous` user is a critical exposure; tell the user to remove it immediately.
245
+ - Plaintext credentials in `env.value` or `ConfigMap.data` should be replaced with `secretKeyRef` references; never recommend committing real credentials even in base64.
246
+ - Do not recommend disabling probes or relaxing securityContext fields to pass short-term validation — recommend the correct secure configuration and explain the rationale.
@@ -0,0 +1,52 @@
1
+ ---
2
+ name: llm-ai-pipeline-test-review
3
+ description: Use this skill when reviewing how an LLM or AI pipeline is evaluated — metric selection, golden datasets, threshold governance, adversarial coverage, and regression gating — to determine whether low-quality or unsafe model outputs can ship undetected. Trigger when a user provides evaluation configuration files, DeepEval or RAGAS test scripts, eval CI steps, or asks whether their AI pipeline actually prevents a bad model from reaching production. This skill reviews evaluation setup statically; it does not call LLM APIs, run evaluations, or contact inference endpoints.
4
+ allowed-tools: Read Grep Glob
5
+ metadata:
6
+ author: "github: Raishin"
7
+ version: "0.1.0"
8
+ updated: "2026-05-17"
9
+ category: ai
10
+ lifecycle: experimental
11
+ ---
12
+
13
+ # LLM AI Pipeline Test Review
14
+
15
+ ## Purpose
16
+ This skill reviews how an LLM or AI pipeline is evaluated — not the model itself, but the evaluation setup that decides whether a model change is safe to ship. An evaluation suite only protects users if it measures the right things, gates on meaningful thresholds, covers adversarial inputs, and detects drift across model versions. The review catches missing hallucination and factuality metrics, absent answer-relevancy and faithfulness checks for RAG pipelines, unguarded bias and toxicity, no adversarial or red-team coverage, agent evals that ignore tool correctness and task completion, thresholds that are undefined or set to zero, single-shot evals on non-deterministic outputs, and no regression baseline to detect metric drift.
17
+
18
+ ## Lean operating rules
19
+
20
+ - Treat a RAG or summarisation pipeline with no `HallucinationMetric` or no GEval with factuality criteria against source documents as HIGH — the pipeline can fabricate facts and ship them.
21
+ - Treat a pipeline with no golden dataset (fixed reference set for regression) as HIGH — metric drift across model versions is undetectable.
22
+ - Treat the absence of `AnswerRelevancyMetric` as MEDIUM — responses may be fluent but off-topic, and no eval catches it.
23
+ - Treat a RAG pipeline with no `FaithfulnessMetric` as HIGH — the model can ignore retrieved context and hallucinate; faithfulness is the primary RAG correctness signal.
24
+ - Treat missing `ContextualPrecisionMetric` or `ContextualRecallMetric` in a RAG pipeline as MEDIUM — retrieval quality is unmeasured; noisy or incomplete retrieval is invisible to the eval.
25
+ - Treat the absence of `BiasMetric` or `ToxicityMetric` as HIGH if the system is user-facing — unsafe outputs can reach users without detection; treat as CRITICAL if the audience is vulnerable (children, medical patients, crisis users).
26
+ - Treat no adversarial test cases and no red-team dataset as CRITICAL for agentic systems; HIGH for all other user-facing LLM products — prompt-injection and jailbreak paths are untested.
27
+ - Treat agent evals with no `ToolCorrectnessMetric` as HIGH — the agent can call wrong tools silently and the eval still passes.
28
+ - Treat multi-step agent evals with no `TaskCompletionMetric` as HIGH — end-to-end success is unmeasured even if individual steps look fine.
29
+ - Treat metric thresholds that are undefined, set to 0, or not reviewed by a domain expert as HIGH — a threshold of 0 means every output passes; an unreviewed threshold is a guess.
30
+ - Treat evals that run only once per input on non-deterministic outputs (no pass@k or mean-score aggregation across multiple runs) as MEDIUM — a single lucky sample masks systematic failure.
31
+ - Treat the absence of a golden dataset or scoring baseline that would detect metric regression across model versions as HIGH — a model update can silently degrade quality.
32
+ - Treat static golden datasets that have never been rotated or supplemented with synthetic adversarial data as MEDIUM — a suite that tests the same inputs repeatedly stops finding new defects (the pesticide paradox).
33
+ - Apply thresholds contextually: a faithfulness score of 0.7 may be acceptable for a joke generator and unacceptable for a medical chatbot — flag any threshold that appears copied from a tutorial without domain justification.
34
+ - Define eval metrics early in the model selection process, not after a model is chosen — catching defects before model selection is always cheaper than retrofitting evals.
35
+ - Label every finding with evidence basis: eval config provided, test script provided, documentation-based, or inference.
36
+ - Static review only — read eval configs and test source; never call LLM APIs, never run evaluations, never request model API keys or inference endpoints.
37
+
38
+ ## References
39
+ Load these only when needed:
40
+ - [Workflow and output contract](references/workflow-and-output.md) — use when executing the full review or formatting the final answer.
41
+
42
+ ## Response minimum
43
+ Return, at minimum:
44
+ - Hallucination and factual correctness findings
45
+ - Answer relevancy and faithfulness findings (especially for RAG pipelines)
46
+ - Safety metric findings (bias, toxicity)
47
+ - Adversarial and red-team coverage findings
48
+ - Agent-specific metric findings (tool correctness, task completion)
49
+ - Threshold governance and non-determinism findings
50
+ - Regression gating findings (golden dataset, baseline)
51
+ - Severity-labelled finding list (critical / high / medium / low)
52
+ - Safe next actions
@@ -0,0 +1,23 @@
1
+ {
2
+ "id": "llm-ai-pipeline-test-review",
3
+ "name": "LLM AI Pipeline Test Review",
4
+ "type": "skill",
5
+ "provider": "generic",
6
+ "harnesses": ["codex", "claude-code", "cursor", "gemini", "kiro", "other"],
7
+ "summary": "Review an LLM or AI pipeline's evaluation setup for test-quality defects — missing hallucination, relevancy, faithfulness, bias, toxicity, and tool-correctness metrics; absent golden datasets; unthresholded or single-shot evals; and no regression gate across model versions. Static review only.",
8
+ "source_type": "original",
9
+ "official_docs": [
10
+ "https://docs.confident-ai.com/",
11
+ "https://docs.confident-ai.com/docs/metrics-hallucination",
12
+ "https://docs.confident-ai.com/docs/metrics-answer-relevancy",
13
+ "https://docs.confident-ai.com/docs/metrics-faithfulness",
14
+ "https://docs.confident-ai.com/docs/metrics-bias",
15
+ "https://docs.confident-ai.com/docs/metrics-tool-correctness",
16
+ "https://www.istqb.org/certifications/certified-tester-foundation-level"
17
+ ],
18
+ "security_notes": "Static review only — reads eval configuration and test source; never calls LLM APIs, never runs evaluations, never requests model API keys or inference endpoints. Do not accept eval fixtures containing real user PII, private prompt chains, or model weights; ask for sanitized configurations.",
19
+ "last_verified": "2026-05-17",
20
+ "path": "skills/qa/llm-ai-pipeline-test-review",
21
+ "author": "github: Raishin",
22
+ "version": "0.1.0"
23
+ }
@@ -0,0 +1,221 @@
1
+ # Workflow and Output Contract
2
+
3
+ ## Workflow
4
+
5
+ ### Step 1 — Collect inputs
6
+
7
+ Ask the user to provide one or more of the following as sanitized files (no API keys, no model weights, no real user PII — replace with placeholders):
8
+ - Evaluation configuration files (DeepEval `test_*.py`, RAGAS config, custom eval scripts)
9
+ - Golden dataset samples or references to a golden dataset (path, size, last-updated date)
10
+ - CI step that runs evals (workflow YAML, script, or description of the gate)
11
+ - The metric list and threshold values in use (even if embedded in test files)
12
+ - For RAG pipelines: retrieval configuration (vector store, top-k, similarity threshold)
13
+ - Optional: recent eval run report or score history showing metric trends
14
+
15
+ If CI gating configuration is not provided, regression-gate findings are stated as `inference` — say so and ask for it.
16
+ If threshold values are not provided, threshold-governance findings are stated as `inference`.
17
+
18
+ ### Step 2 — Hallucination and factual correctness audit
19
+
20
+ Confirm the eval measures whether the model's claims are factually grounded.
21
+
22
+ ```python
23
+ # HIGH — no hallucination check; fabrications pass the suite undetected
24
+ test_cases = [LLMTestCase(input=q, actual_output=answer)]
25
+ # no HallucinationMetric or GEval with factuality criteria
26
+
27
+ # Correct — hallucination measured against source
28
+ hallucination_metric = HallucinationMetric(threshold=0.2)
29
+ dataset = EvaluationDataset(test_cases=[
30
+ LLMTestCase(input=q, actual_output=answer, context=[source_doc])
31
+ ])
32
+ assert_test(dataset, [hallucination_metric])
33
+ ```
34
+
35
+ Check for:
36
+ - Presence of `HallucinationMetric` or a GEval with `"factual consistency"` / `"faithfulness to source"` criteria
37
+ - Whether `context` (source documents) is provided to the metric — without it, the metric cannot detect contradiction
38
+ - Whether a golden dataset with expected answers exists for regression comparisons
39
+
40
+ ### Step 3 — Answer relevancy and faithfulness audit (RAG focus)
41
+
42
+ For all pipelines, confirm responses address the input. For RAG pipelines, confirm outputs are grounded in retrieved context.
43
+
44
+ ```python
45
+ # MEDIUM — relevancy not measured; off-topic responses pass
46
+ # missing AnswerRelevancyMetric
47
+
48
+ # HIGH — RAG pipeline without faithfulness check; model can ignore retrieved docs
49
+ # missing FaithfulnessMetric with retrieval_context
50
+
51
+ # Correct — both relevancy and faithfulness measured
52
+ relevancy = AnswerRelevancyMetric(threshold=0.7)
53
+ faithfulness = FaithfulnessMetric(threshold=0.7)
54
+ test_case = LLMTestCase(
55
+ input=query,
56
+ actual_output=answer,
57
+ retrieval_context=retrieved_docs
58
+ )
59
+ ```
60
+
61
+ Check for:
62
+ - `AnswerRelevancyMetric` present for any conversational or Q&A pipeline
63
+ - `FaithfulnessMetric` present for any RAG pipeline — this is the primary RAG correctness signal
64
+ - `ContextualPrecisionMetric` and `ContextualRecallMetric` for RAG pipelines measuring retrieval quality
65
+ - Whether `retrieval_context` is populated in test cases — an empty context silently disables the metric
66
+
67
+ ### Step 4 — Safety metrics audit (bias, toxicity)
68
+
69
+ Confirm the eval catches unsafe outputs before they reach users.
70
+
71
+ ```python
72
+ # HIGH (CRITICAL for vulnerable audiences) — no safety guardrails in eval
73
+ # missing BiasMetric and ToxicityMetric
74
+
75
+ # Correct — safety metrics applied
76
+ bias_metric = BiasMetric(threshold=0.5)
77
+ toxicity_metric = ToxicityMetric(threshold=0.5)
78
+ ```
79
+
80
+ Check for:
81
+ - `BiasMetric` present for any user-facing system
82
+ - `ToxicityMetric` present for any user-facing system
83
+ - Threshold values reviewed for the deployment context — a threshold appropriate for an adult content filter may be too permissive for a children's education tool
84
+ - Whether bias and toxicity metrics are in the gating suite or are only advisory/non-blocking
85
+
86
+ ### Step 5 — Adversarial and red-team coverage audit
87
+
88
+ Confirm the eval includes adversarial inputs, not only happy-path test cases.
89
+
90
+ ```python
91
+ # CRITICAL for agentic / HIGH for others — no adversarial cases
92
+ test_cases = [LLMTestCase(input=normal_query, actual_output=answer)]
93
+ # only benign inputs; no prompt-injection attempts, no jailbreaks
94
+
95
+ # Correct — red-team dataset included
96
+ adversarial_cases = load_dataset("adversarial_prompts.json")
97
+ ```
98
+
99
+ Check for:
100
+ - Presence of adversarial test cases or a red-team dataset (prompt-injection attempts, jailbreak patterns, boundary inputs)
101
+ - For agentic systems: test cases that verify the agent refuses or handles malicious tool-calling instructions
102
+ - Whether adversarial cases are rotated periodically — a static adversarial set becomes predictable (pesticide paradox)
103
+ - Whether adversarial inputs cluster around the topic or domain boundaries of the deployment (defect clustering)
104
+
105
+ ### Step 6 — Agent-specific metrics audit (tool correctness, task completion)
106
+
107
+ For pipelines that include LLM agents, confirm the eval measures agent behavior, not only text quality.
108
+
109
+ ```python
110
+ # HIGH — agent evals check only output text; wrong tool calls pass undetected
111
+ # missing ToolCorrectnessMetric
112
+
113
+ # HIGH — multi-step agent eval has no end-to-end success signal
114
+ # missing TaskCompletionMetric
115
+
116
+ # Correct — both agent metrics present
117
+ tool_correctness = ToolCorrectnessMetric()
118
+ task_completion = TaskCompletionMetric(threshold=0.8)
119
+ agent_test_case = LLMTestCase(
120
+ input=user_request,
121
+ actual_output=final_answer,
122
+ tools_called=agent_tool_log,
123
+ expected_tools=["search", "summarize"]
124
+ )
125
+ ```
126
+
127
+ Check for:
128
+ - `ToolCorrectnessMetric` present when an agent selects or calls tools
129
+ - `TaskCompletionMetric` present for multi-step agentic workflows
130
+ - Whether `tools_called` is logged and passed to tool metrics — without the log the metric cannot evaluate tool use
131
+ - Whether task completion is defined and measurable for the specific agent goal
132
+
133
+ ### Step 7 — Threshold governance and non-determinism audit
134
+
135
+ Confirm thresholds are meaningful and results are statistically reliable.
136
+
137
+ ```python
138
+ # HIGH — for a higher-is-better metric, a threshold of 0 means every output passes; the metric is decorative
139
+ FaithfulnessMetric(threshold=0)
140
+
141
+ # MEDIUM — single run on a non-deterministic model; one lucky sample masks failures
142
+ result = evaluate(dataset, metrics=[hallucination_metric])
143
+
144
+ # Correct — multiple runs aggregated; threshold domain-reviewed
145
+ scores = [evaluate(dataset, metrics=[hallucination_metric]).score for _ in range(5)]
146
+ mean_score = sum(scores) / len(scores)
147
+ # threshold=0.2 reviewed by a domain expert for this medical-chatbot use case
148
+ ```
149
+
150
+ Check for:
151
+ - Any threshold set to 0 or left at default without documented review — flag as HIGH
152
+ - Whether thresholds are documented with a rationale (use case, acceptable failure rate, domain expert sign-off)
153
+ - Whether multi-run aggregation (pass@k, mean score over N runs) is used for non-deterministic outputs
154
+ - Whether thresholds differ appropriately across deployment contexts (production vs. staging, medical vs. entertainment)
155
+
156
+ ### Step 8 — Regression gate audit
157
+
158
+ Confirm the eval detects when a model update silently degrades quality.
159
+
160
+ ```python
161
+ # HIGH — no baseline; a new model can score worse than the old one and ship
162
+ evaluate(dataset, metrics=[hallucination_metric])
163
+ # no comparison to previous run scores
164
+
165
+ # Correct — baseline scores recorded and compared
166
+ baseline = load_baseline("eval_baseline_v1.json")
167
+ current = evaluate(dataset, metrics=[hallucination_metric])
168
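+ assert current.score <= baseline.score + ALLOWED_REGRESSION  # hallucination is lower-is-better; use `>= baseline.score - ALLOWED_REGRESSION` for higher-is-better metrics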
169
+ ```
170
+
171
+ Check for:
172
+ - A golden dataset that is versioned and stable enough to detect regression
173
+ - Baseline scores stored from prior runs and compared against current runs
174
+ - CI or eval step that fails when scores drop below the baseline by more than an allowed delta
175
+ - Whether the golden dataset is ever refreshed — a dataset that never changes stops finding new defect categories (pesticide paradox); rotate or supplement it with synthetic data periodically
176
+
177
+ ---
178
+
179
+ ## Output
180
+
181
+ Return findings in this structure:
182
+
183
+ ```
184
+ ## Verdict
185
+ <one sentence: eval suite gates unsafe outputs / eval runs but gates nothing / partial coverage with gaps>
186
+
187
+ ## Evidence level
188
+ <eval config + test scripts provided | eval config only | documentation-based | inference>
189
+
190
+ ## Findings
191
+
192
+ ### CRITICAL
193
+ - [C1] <finding>: <description> — <remediation>
194
+
195
+ ### HIGH
196
+ - [H1] <finding>: <description> — <remediation>
197
+
198
+ ### MEDIUM
199
+ - [M1] <finding>: <description> — <remediation>
200
+
201
+ ### LOW
202
+ - [L1] <finding>: <description> — <remediation>
203
+
204
+ ## Safe next actions
205
+ 1. <action>
206
+ 2. <action>
207
+
208
+ ## Open questions
209
+ - <question requiring user clarification>
210
+ ```
211
+
212
+ ---
213
+
214
+ ## Security notes
215
+
216
+ - Never request or accept model API keys, inference endpoint URLs, or model weights. Ask for sanitized eval configuration with placeholders.
217
+ - Never call LLM APIs, run evaluations, or contact inference endpoints — this is a static review only.
218
+ - Do not accept eval fixtures containing real user PII or private prompt chains; ask the user to anonymize them first.
219
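+ - A higher-is-better metric (e.g. answer relevancy, faithfulness) with threshold=0 is functionally disabled — it is the eval equivalent of `continue-on-error: true` on a test step. Lead with it when present. For lower-is-better metrics (hallucination, bias, toxicity) the equivalent defect is a threshold of 1.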
220
+ - Bias and toxicity without thresholds reviewed for the actual audience are a false signal of safety; flag the gap and ask what the audience is.
221
+ - Adversarial coverage is the most commonly absent category; absence is not evidence that the model is robust — it is evidence the question was never asked.
@@ -0,0 +1,54 @@
1
+ ---
2
+ name: playwright-e2e-execution-run
3
+ description: Use this skill when an operator wants to actually execute an existing Playwright end-to-end suite against a confirmed non-production target and receive a structured, attested run report — pass/fail counts, flaky tests, durations, and trace artifacts. Trigger when the user asks to "run the e2e suite", "execute the Playwright tests against staging", or hands the agent a Playwright project plus a target base URL. This is the live-execution counterpart to the static-review skill `playwright-e2e-suite-review`. Default mode is static and runs nothing; runtime execution is a per-session opt-in that requires explicit target confirmation.
4
+ allowed-tools: Read Grep Glob Bash(npx playwright test*) Bash(npx playwright install*) Bash(npx playwright show-report*)
5
+ metadata:
6
+ author: "github: Raishin"
7
+ version: "0.1.0"
8
+ updated: "2026-05-17"
9
+ category: delivery
10
+ lifecycle: experimental
11
+ execution_tier: read-only-runtime
12
+ required_egress:
13
+ - operator-confirmed-target-host
14
+ - cdn.playwright.dev
15
+ - playwright.download.prss.microsoft.com
16
+ requires_credentials: []
17
+ output_attestation:
18
+ schema: schemas/attestation.schema.json
19
+ signed_with: none
20
+ ---
21
+
22
+ # Playwright E2E Execution Run
23
+
24
+ ## Purpose
25
+ This skill executes an existing Playwright end-to-end suite against an operator-confirmed non-production target and emits a structured run attestation: total/passed/failed/flaky counts, slowest tests, retry-only passes, and the location of trace and screenshot artifacts. It is the live-execution counterpart to `playwright-e2e-suite-review` (which is static-review only and never runs anything). The skill runs the suite as authored — it does not write the tests, deploy the application, or mutate infrastructure — and it refuses to run against a production target.
26
+
27
+ ## Execution modes
28
+ - **Static (default).** The skill runs nothing. It inspects `playwright.config`, enumerates the project and target, states exactly which command it would execute, and asks the operator for explicit runtime opt-in plus target confirmation.
29
+ - **Runtime (per-session opt-in).** Only after the operator explicitly opts in and confirms a non-production base URL does the skill invoke `npx playwright test`. Runtime mode is never assumed from the request alone.
30
+
31
+ ## Lean operating rules
32
+ - Never execute the suite without an explicit, in-session runtime opt-in AND an operator-confirmed base URL — absent either, stay in static mode and ask.
33
+ - Refuse to run if the target base URL resolves to, or is named like, a production environment (`prod`, `www.`, a customer-facing apex domain). Require a staging, preview, or ephemeral target; state the refusal reason.
34
+ - Never accept credentials, bearer tokens, or a `storageState` file inline. Test credentials must come from the environment or a config the operator already controls; the skill never collects, echoes, or logs their values.
35
+ - Run only the allowlisted commands: `npx playwright test` (with operator-supplied flags), `npx playwright install` (browser binaries), `npx playwright show-report`. Never run deploy, migration, seed, or registry commands.
36
+ - Treat the suite's own side effects as the operator's responsibility — state plainly that E2E tests may create or modify data in the target, which is why a non-production target is mandatory.
37
+ - Do not retry a failed run with raised timeouts or added retries to manufacture a green result — report the failure as observed.
38
+ - Emit the run attestation as JSON conforming to `schemas/attestation.schema.json`; the verdict is one of `pass`, `fail`, or `manual-review`.
39
+ - If browser binaries are missing, run `npx playwright install` only with operator awareness; if egress to the browser CDN is blocked, degrade to `manual-review` rather than reporting a false `fail`.
40
+ - Label the run: command executed, target host (host only, never the full credentialed URL), Playwright version, and wall-clock duration.
41
+
42
+ ## References
43
+ Load these only when needed:
44
+ - [Workflow and output contract](references/workflow-and-output.md) — use when executing the run or formatting the attestation.
45
+
46
+ ## Response minimum
47
+ Return, at minimum:
48
+ - The execution mode used (static or runtime) and why
49
+ - The exact command executed (runtime) or that would be executed (static)
50
+ - The confirmed target host and Playwright version
51
+ - Run results: total / passed / failed / flaky (retry-only pass) counts
52
+ - Trace and screenshot artifact locations for any failure
53
+ - A `pass` / `fail` / `manual-review` verdict with reasons
54
+ - Safe next actions
@@ -0,0 +1,24 @@
1
+ {
2
+ "id": "playwright-e2e-execution-run",
3
+ "name": "Playwright E2E Execution Run",
4
+ "type": "skill",
5
+ "provider": "generic",
6
+ "harnesses": ["claude-code", "cursor"],
7
+ "summary": "Execute an existing Playwright E2E suite against an operator-confirmed non-production target and emit a structured run attestation — pass/fail/flaky counts, slowest tests, and trace artifact locations. Live-execution counterpart to playwright-e2e-suite-review.",
8
+ "source_type": "original",
9
+ "official_docs": [
10
+ "https://playwright.dev/docs/test-cli",
11
+ "https://playwright.dev/docs/running-tests",
12
+ "https://playwright.dev/docs/test-reporters",
13
+ "https://playwright.dev/docs/trace-viewer",
14
+ "https://playwright.dev/docs/ci"
15
+ ],
16
+ "security_notes": "Live-execution skill, read-only-runtime tier. Default mode is static and runs nothing; runtime execution is a per-session opt-in requiring explicit operator confirmation of a non-production target. The Bash allowlist locks invocations to `npx playwright test`, `npx playwright install`, and `npx playwright show-report` — no deploy, migration, seed, or registry commands. Refuses production targets. Never accepts or echoes credentials, tokens, or storageState; test credentials come from the operator-controlled environment. Egress limited to the operator-confirmed target host and the Playwright browser CDN; blocked CDN egress degrades to manual-review rather than a false fail.",
17
+ "last_verified": "2026-05-17",
18
+ "path": "skills/qa/playwright-e2e-execution-run",
19
+ "category": "delivery",
20
+ "lifecycle": "experimental",
21
+ "execution_tier": "read-only-runtime",
22
+ "author": "github: Raishin",
23
+ "version": "0.1.0"
24
+ }