@ruaruababa/vibe-kit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (462)
  1. package/CATALOG.md +317 -0
  2. package/README.md +121 -0
  3. package/aliases.json +65 -0
  4. package/bin/vibe.js +2 -0
  5. package/bundles.json +265 -0
  6. package/catalog.json +1560 -0
  7. package/dist/antigravity-skills/bin/cli.js +438 -0
  8. package/dist/antigravity-skills/lib/skill-utils.js +158 -0
  9. package/dist/antigravity-skills/scripts/build-catalog.js +305 -0
  10. package/dist/antigravity-skills/scripts/normalize-frontmatter.js +144 -0
  11. package/dist/antigravity-skills/scripts/validate-skills.js +230 -0
  12. package/dist/bin/vibe.js +2 -0
  13. package/dist/dist/src/cli/index.js +26 -0
  14. package/dist/lib/skill-utils.js +158 -0
  15. package/dist/scripts/build-catalog.js +50 -0
  16. package/dist/scripts/normalize-frontmatter.js +144 -0
  17. package/dist/scripts/validate-skills.js +56 -0
  18. package/dist/src/cli/index.js +146 -0
  19. package/dist/src/types/index.js +13 -0
  20. package/dist/src/utils/fs.js +1 -0
  21. package/package.json +43 -0
  22. package/skills/accessibility-compliance-accessibility-audit/SKILL.md +42 -0
  23. package/skills/accessibility-compliance-accessibility-audit/resources/implementation-playbook.md +502 -0
  24. package/skills/agent-orchestration-improve-agent/SKILL.md +349 -0
  25. package/skills/agent-orchestration-multi-agent-optimize/SKILL.md +239 -0
  26. package/skills/agent-orchestrator/SKILL.md +24 -0
  27. package/skills/ai-engineer/SKILL.md +171 -0
  28. package/skills/airflow-dag-patterns/SKILL.md +41 -0
  29. package/skills/airflow-dag-patterns/resources/implementation-playbook.md +509 -0
  30. package/skills/angular-migration/SKILL.md +428 -0
  31. package/skills/anti-reversing-techniques/SKILL.md +42 -0
  32. package/skills/anti-reversing-techniques/resources/implementation-playbook.md +539 -0
  33. package/skills/api-design-principles/SKILL.md +37 -0
  34. package/skills/api-design-principles/assets/api-design-checklist.md +155 -0
  35. package/skills/api-design-principles/assets/rest-api-template.py +182 -0
  36. package/skills/api-design-principles/references/graphql-schema-design.md +583 -0
  37. package/skills/api-design-principles/references/rest-best-practices.md +408 -0
  38. package/skills/api-design-principles/resources/implementation-playbook.md +513 -0
  39. package/skills/api-documenter/SKILL.md +184 -0
  40. package/skills/api-testing-observability-api-mock/SKILL.md +46 -0
  41. package/skills/api-testing-observability-api-mock/resources/implementation-playbook.md +1327 -0
  42. package/skills/application-performance-performance-optimization/SKILL.md +154 -0
  43. package/skills/architect-review/SKILL.md +174 -0
  44. package/skills/architecture-decision-records/SKILL.md +441 -0
  45. package/skills/architecture-patterns/SKILL.md +37 -0
  46. package/skills/architecture-patterns/resources/implementation-playbook.md +479 -0
  47. package/skills/arm-cortex-expert/SKILL.md +306 -0
  48. package/skills/async-python-patterns/SKILL.md +39 -0
  49. package/skills/async-python-patterns/resources/implementation-playbook.md +678 -0
  50. package/skills/attack-tree-construction/SKILL.md +38 -0
  51. package/skills/attack-tree-construction/resources/implementation-playbook.md +671 -0
  52. package/skills/auth-implementation-patterns/SKILL.md +39 -0
  53. package/skills/auth-implementation-patterns/resources/implementation-playbook.md +618 -0
  54. package/skills/backend-architect/SKILL.md +333 -0
  55. package/skills/backend-development-feature-development/SKILL.md +180 -0
  56. package/skills/backend-security-coder/SKILL.md +156 -0
  57. package/skills/backtesting-frameworks/SKILL.md +39 -0
  58. package/skills/backtesting-frameworks/resources/implementation-playbook.md +647 -0
  59. package/skills/bash-defensive-patterns/SKILL.md +43 -0
  60. package/skills/bash-defensive-patterns/resources/implementation-playbook.md +517 -0
  61. package/skills/bash-pro/SKILL.md +310 -0
  62. package/skills/bats-testing-patterns/SKILL.md +34 -0
  63. package/skills/bats-testing-patterns/resources/implementation-playbook.md +614 -0
  64. package/skills/bazel-build-optimization/SKILL.md +397 -0
  65. package/skills/billing-automation/SKILL.md +42 -0
  66. package/skills/billing-automation/resources/implementation-playbook.md +544 -0
  67. package/skills/binary-analysis-patterns/SKILL.md +450 -0
  68. package/skills/blockchain-developer/SKILL.md +208 -0
  69. package/skills/business-analyst/SKILL.md +182 -0
  70. package/skills/c-pro/SKILL.md +56 -0
  71. package/skills/c4-architecture-c4-architecture/SKILL.md +389 -0
  72. package/skills/c4-code/SKILL.md +244 -0
  73. package/skills/c4-component/SKILL.md +153 -0
  74. package/skills/c4-container/SKILL.md +171 -0
  75. package/skills/c4-context/SKILL.md +150 -0
  76. package/skills/changelog-automation/SKILL.md +38 -0
  77. package/skills/changelog-automation/resources/implementation-playbook.md +538 -0
  78. package/skills/cicd-automation-workflow-automate/SKILL.md +51 -0
  79. package/skills/cicd-automation-workflow-automate/resources/implementation-playbook.md +1333 -0
  80. package/skills/clean-markdown/SKILL.md +23 -0
  81. package/skills/cloud-architect/SKILL.md +135 -0
  82. package/skills/code-documentation-code-explain/SKILL.md +46 -0
  83. package/skills/code-documentation-code-explain/resources/implementation-playbook.md +802 -0
  84. package/skills/code-documentation-doc-generate/SKILL.md +48 -0
  85. package/skills/code-documentation-doc-generate/resources/implementation-playbook.md +640 -0
  86. package/skills/code-refactoring-context-restore/SKILL.md +179 -0
  87. package/skills/code-refactoring-refactor-clean/SKILL.md +51 -0
  88. package/skills/code-refactoring-refactor-clean/resources/implementation-playbook.md +879 -0
  89. package/skills/code-refactoring-tech-debt/SKILL.md +386 -0
  90. package/skills/code-review-ai-ai-review/SKILL.md +450 -0
  91. package/skills/code-review-excellence/SKILL.md +40 -0
  92. package/skills/code-review-excellence/resources/implementation-playbook.md +515 -0
  93. package/skills/code-reviewer/SKILL.md +178 -0
  94. package/skills/codebase-cleanup-deps-audit/SKILL.md +51 -0
  95. package/skills/codebase-cleanup-deps-audit/resources/implementation-playbook.md +766 -0
  96. package/skills/codebase-cleanup-refactor-clean/SKILL.md +51 -0
  97. package/skills/codebase-cleanup-refactor-clean/resources/implementation-playbook.md +879 -0
  98. package/skills/codebase-cleanup-tech-debt/SKILL.md +386 -0
  99. package/skills/competitive-landscape/SKILL.md +34 -0
  100. package/skills/competitive-landscape/resources/implementation-playbook.md +494 -0
  101. package/skills/comprehensive-review-full-review/SKILL.md +146 -0
  102. package/skills/comprehensive-review-pr-enhance/SKILL.md +46 -0
  103. package/skills/comprehensive-review-pr-enhance/resources/implementation-playbook.md +691 -0
  104. package/skills/conductor-implement/SKILL.md +388 -0
  105. package/skills/conductor-manage/SKILL.md +39 -0
  106. package/skills/conductor-manage/resources/implementation-playbook.md +1120 -0
  107. package/skills/conductor-new-track/SKILL.md +433 -0
  108. package/skills/conductor-revert/SKILL.md +372 -0
  109. package/skills/conductor-setup/SKILL.md +426 -0
  110. package/skills/conductor-status/SKILL.md +338 -0
  111. package/skills/conductor-validator/SKILL.md +62 -0
  112. package/skills/content-marketer/SKILL.md +170 -0
  113. package/skills/context-driven-development/SKILL.md +400 -0
  114. package/skills/context-management-context-restore/SKILL.md +179 -0
  115. package/skills/context-management-context-save/SKILL.md +177 -0
  116. package/skills/context-manager/SKILL.md +185 -0
  117. package/skills/cost-optimization/SKILL.md +286 -0
  118. package/skills/cpp-pro/SKILL.md +59 -0
  119. package/skills/cqrs-implementation/SKILL.md +35 -0
  120. package/skills/cqrs-implementation/resources/implementation-playbook.md +540 -0
  121. package/skills/csharp-pro/SKILL.md +59 -0
  122. package/skills/customer-support/SKILL.md +170 -0
  123. package/skills/data-engineer/SKILL.md +224 -0
  124. package/skills/data-engineering-data-driven-feature/SKILL.md +182 -0
  125. package/skills/data-engineering-data-pipeline/SKILL.md +201 -0
  126. package/skills/data-quality-frameworks/SKILL.md +40 -0
  127. package/skills/data-quality-frameworks/resources/implementation-playbook.md +573 -0
  128. package/skills/data-scientist/SKILL.md +199 -0
  129. package/skills/data-storytelling/SKILL.md +465 -0
  130. package/skills/database-admin/SKILL.md +165 -0
  131. package/skills/database-architect/SKILL.md +268 -0
  132. package/skills/database-cloud-optimization-cost-optimize/SKILL.md +44 -0
  133. package/skills/database-cloud-optimization-cost-optimize/resources/implementation-playbook.md +1441 -0
  134. package/skills/database-migration/SKILL.md +436 -0
  135. package/skills/database-migrations-migration-observability/SKILL.md +420 -0
  136. package/skills/database-migrations-sql-migrations/SKILL.md +53 -0
  137. package/skills/database-migrations-sql-migrations/resources/implementation-playbook.md +499 -0
  138. package/skills/database-optimizer/SKILL.md +167 -0
  139. package/skills/dbt-transformation-patterns/SKILL.md +34 -0
  140. package/skills/dbt-transformation-patterns/resources/implementation-playbook.md +547 -0
  141. package/skills/debugger/SKILL.md +49 -0
  142. package/skills/debugging-strategies/SKILL.md +34 -0
  143. package/skills/debugging-strategies/resources/implementation-playbook.md +511 -0
  144. package/skills/debugging-toolkit-smart-debug/SKILL.md +197 -0
  145. package/skills/defi-protocol-templates/SKILL.md +466 -0
  146. package/skills/dependency-management-deps-audit/SKILL.md +44 -0
  147. package/skills/dependency-management-deps-audit/resources/implementation-playbook.md +766 -0
  148. package/skills/dependency-upgrade/SKILL.md +421 -0
  149. package/skills/deployment-engineer/SKILL.md +170 -0
  150. package/skills/deployment-pipeline-design/SKILL.md +371 -0
  151. package/skills/deployment-validation-config-validate/SKILL.md +496 -0
  152. package/skills/devops-troubleshooter/SKILL.md +161 -0
  153. package/skills/distributed-debugging-debug-trace/SKILL.md +44 -0
  154. package/skills/distributed-debugging-debug-trace/resources/implementation-playbook.md +1307 -0
  155. package/skills/distributed-tracing/SKILL.md +450 -0
  156. package/skills/django-pro/SKILL.md +180 -0
  157. package/skills/docs-architect/SKILL.md +98 -0
  158. package/skills/documentation-generation-doc-generate/SKILL.md +48 -0
  159. package/skills/documentation-generation-doc-generate/resources/implementation-playbook.md +640 -0
  160. package/skills/dotnet-architect/SKILL.md +197 -0
  161. package/skills/dotnet-backend-patterns/SKILL.md +37 -0
  162. package/skills/dotnet-backend-patterns/assets/repository-template.cs +523 -0
  163. package/skills/dotnet-backend-patterns/assets/service-template.cs +336 -0
  164. package/skills/dotnet-backend-patterns/references/dapper-patterns.md +544 -0
  165. package/skills/dotnet-backend-patterns/references/ef-core-best-practices.md +355 -0
  166. package/skills/dotnet-backend-patterns/resources/implementation-playbook.md +799 -0
  167. package/skills/dummy-skill/SKILL.md +5 -0
  168. package/skills/dx-optimizer/SKILL.md +83 -0
  169. package/skills/e2e-testing-patterns/SKILL.md +41 -0
  170. package/skills/e2e-testing-patterns/resources/implementation-playbook.md +531 -0
  171. package/skills/elixir-pro/SKILL.md +59 -0
  172. package/skills/embedding-strategies/SKILL.md +491 -0
  173. package/skills/employment-contract-templates/SKILL.md +39 -0
  174. package/skills/employment-contract-templates/resources/implementation-playbook.md +493 -0
  175. package/skills/error-debugging-error-analysis/SKILL.md +47 -0
  176. package/skills/error-debugging-error-analysis/resources/implementation-playbook.md +1143 -0
  177. package/skills/error-debugging-error-trace/SKILL.md +43 -0
  178. package/skills/error-debugging-error-trace/resources/implementation-playbook.md +1361 -0
  179. package/skills/error-debugging-multi-agent-review/SKILL.md +216 -0
  180. package/skills/error-detective/SKILL.md +53 -0
  181. package/skills/error-diagnostics-error-analysis/SKILL.md +47 -0
  182. package/skills/error-diagnostics-error-analysis/resources/implementation-playbook.md +1143 -0
  183. package/skills/error-diagnostics-error-trace/SKILL.md +48 -0
  184. package/skills/error-diagnostics-error-trace/resources/implementation-playbook.md +1371 -0
  185. package/skills/error-diagnostics-smart-debug/SKILL.md +197 -0
  186. package/skills/error-handling-patterns/SKILL.md +35 -0
  187. package/skills/error-handling-patterns/resources/implementation-playbook.md +635 -0
  188. package/skills/event-sourcing-architect/SKILL.md +58 -0
  189. package/skills/event-store-design/SKILL.md +449 -0
  190. package/skills/fastapi-pro/SKILL.md +192 -0
  191. package/skills/fastapi-templates/SKILL.md +32 -0
  192. package/skills/fastapi-templates/resources/implementation-playbook.md +566 -0
  193. package/skills/final-test/SKILL.md +5 -0
  194. package/skills/firmware-analyst/SKILL.md +320 -0
  195. package/skills/flutter-expert/SKILL.md +200 -0
  196. package/skills/framework-migration-code-migrate/SKILL.md +48 -0
  197. package/skills/framework-migration-code-migrate/resources/implementation-playbook.md +1052 -0
  198. package/skills/framework-migration-deps-upgrade/SKILL.md +48 -0
  199. package/skills/framework-migration-deps-upgrade/resources/implementation-playbook.md +755 -0
  200. package/skills/framework-migration-legacy-modernize/SKILL.md +132 -0
  201. package/skills/frontend-developer/SKILL.md +171 -0
  202. package/skills/frontend-mobile-development-component-scaffold/SKILL.md +403 -0
  203. package/skills/frontend-mobile-security-xss-scan/SKILL.md +322 -0
  204. package/skills/frontend-security-coder/SKILL.md +170 -0
  205. package/skills/full-stack-orchestration-full-stack-feature/SKILL.md +135 -0
  206. package/skills/gdpr-data-handling/SKILL.md +33 -0
  207. package/skills/gdpr-data-handling/resources/implementation-playbook.md +615 -0
  208. package/skills/git-advanced-workflows/SKILL.md +412 -0
  209. package/skills/git-pr-workflows-git-workflow/SKILL.md +140 -0
  210. package/skills/git-pr-workflows-onboard/SKILL.md +416 -0
  211. package/skills/git-pr-workflows-pr-enhance/SKILL.md +48 -0
  212. package/skills/git-pr-workflows-pr-enhance/resources/implementation-playbook.md +701 -0
  213. package/skills/github-actions-templates/SKILL.md +345 -0
  214. package/skills/gitlab-ci-patterns/SKILL.md +283 -0
  215. package/skills/gitops-workflow/SKILL.md +303 -0
  216. package/skills/gitops-workflow/references/argocd-setup.md +134 -0
  217. package/skills/gitops-workflow/references/sync-policies.md +131 -0
  218. package/skills/go-concurrency-patterns/SKILL.md +33 -0
  219. package/skills/go-concurrency-patterns/resources/implementation-playbook.md +654 -0
  220. package/skills/godot-gdscript-patterns/SKILL.md +33 -0
  221. package/skills/godot-gdscript-patterns/resources/implementation-playbook.md +804 -0
  222. package/skills/golang-pro/SKILL.md +179 -0
  223. package/skills/grafana-dashboards/SKILL.md +381 -0
  224. package/skills/graphql-architect/SKILL.md +182 -0
  225. package/skills/haskell-pro/SKILL.md +56 -0
  226. package/skills/helm-chart-scaffolding/SKILL.md +34 -0
  227. package/skills/helm-chart-scaffolding/assets/Chart.yaml.template +42 -0
  228. package/skills/helm-chart-scaffolding/assets/values.yaml.template +185 -0
  229. package/skills/helm-chart-scaffolding/references/chart-structure.md +500 -0
  230. package/skills/helm-chart-scaffolding/resources/implementation-playbook.md +543 -0
  231. package/skills/helm-chart-scaffolding/scripts/validate-chart.sh +244 -0
  232. package/skills/hr-pro/SKILL.md +126 -0
  233. package/skills/hybrid-cloud-architect/SKILL.md +168 -0
  234. package/skills/hybrid-cloud-networking/SKILL.md +238 -0
  235. package/skills/hybrid-search-implementation/SKILL.md +32 -0
  236. package/skills/hybrid-search-implementation/resources/implementation-playbook.md +567 -0
  237. package/skills/incident-responder/SKILL.md +213 -0
  238. package/skills/incident-response-incident-response/SKILL.md +168 -0
  239. package/skills/incident-response-smart-fix/SKILL.md +29 -0
  240. package/skills/incident-response-smart-fix/resources/implementation-playbook.md +838 -0
  241. package/skills/incident-runbook-templates/SKILL.md +395 -0
  242. package/skills/ios-developer/SKILL.md +219 -0
  243. package/skills/istio-traffic-management/SKILL.md +337 -0
  244. package/skills/java-pro/SKILL.md +177 -0
  245. package/skills/javascript-pro/SKILL.md +57 -0
  246. package/skills/javascript-testing-patterns/SKILL.md +35 -0
  247. package/skills/javascript-testing-patterns/resources/implementation-playbook.md +1024 -0
  248. package/skills/javascript-typescript-typescript-scaffold/SKILL.md +361 -0
  249. package/skills/julia-pro/SKILL.md +209 -0
  250. package/skills/k8s-manifest-generator/SKILL.md +35 -0
  251. package/skills/k8s-manifest-generator/assets/configmap-template.yaml +296 -0
  252. package/skills/k8s-manifest-generator/assets/deployment-template.yaml +203 -0
  253. package/skills/k8s-manifest-generator/assets/service-template.yaml +171 -0
  254. package/skills/k8s-manifest-generator/references/deployment-spec.md +753 -0
  255. package/skills/k8s-manifest-generator/references/service-spec.md +724 -0
  256. package/skills/k8s-manifest-generator/resources/implementation-playbook.md +510 -0
  257. package/skills/k8s-security-policies/SKILL.md +346 -0
  258. package/skills/k8s-security-policies/assets/network-policy-template.yaml +177 -0
  259. package/skills/k8s-security-policies/references/rbac-patterns.md +187 -0
  260. package/skills/kpi-dashboard-design/SKILL.md +440 -0
  261. package/skills/kubernetes-architect/SKILL.md +170 -0
  262. package/skills/langchain-architecture/SKILL.md +350 -0
  263. package/skills/legacy-modernizer/SKILL.md +53 -0
  264. package/skills/legal-advisor/SKILL.md +70 -0
  265. package/skills/linkerd-patterns/SKILL.md +321 -0
  266. package/skills/llm-application-dev-ai-assistant/SKILL.md +35 -0
  267. package/skills/llm-application-dev-ai-assistant/resources/implementation-playbook.md +1236 -0
  268. package/skills/llm-application-dev-langchain-agent/SKILL.md +246 -0
  269. package/skills/llm-application-dev-prompt-optimize/SKILL.md +37 -0
  270. package/skills/llm-application-dev-prompt-optimize/resources/implementation-playbook.md +591 -0
  271. package/skills/llm-evaluation/SKILL.md +483 -0
  272. package/skills/machine-learning-ops-ml-pipeline/SKILL.md +314 -0
  273. package/skills/malware-analyst/SKILL.md +247 -0
  274. package/skills/market-sizing-analysis/SKILL.md +425 -0
  275. package/skills/market-sizing-analysis/examples/saas-market-sizing.md +349 -0
  276. package/skills/market-sizing-analysis/references/data-sources.md +360 -0
  277. package/skills/memory-forensics/SKILL.md +491 -0
  278. package/skills/memory-safety-patterns/SKILL.md +33 -0
  279. package/skills/memory-safety-patterns/resources/implementation-playbook.md +603 -0
  280. package/skills/mermaid-expert/SKILL.md +59 -0
  281. package/skills/microservices-patterns/SKILL.md +35 -0
  282. package/skills/microservices-patterns/resources/implementation-playbook.md +607 -0
  283. package/skills/minecraft-bukkit-pro/SKILL.md +126 -0
  284. package/skills/ml-engineer/SKILL.md +168 -0
  285. package/skills/ml-pipeline-workflow/SKILL.md +257 -0
  286. package/skills/mlops-engineer/SKILL.md +219 -0
  287. package/skills/mobile-developer/SKILL.md +205 -0
  288. package/skills/mobile-security-coder/SKILL.md +184 -0
  289. package/skills/modern-javascript-patterns/SKILL.md +35 -0
  290. package/skills/modern-javascript-patterns/resources/implementation-playbook.md +910 -0
  291. package/skills/monorepo-architect/SKILL.md +61 -0
  292. package/skills/monorepo-management/SKILL.md +35 -0
  293. package/skills/monorepo-management/resources/implementation-playbook.md +621 -0
  294. package/skills/mtls-configuration/SKILL.md +359 -0
  295. package/skills/multi-cloud-architecture/SKILL.md +189 -0
  296. package/skills/multi-platform-apps-multi-platform/SKILL.md +203 -0
  297. package/skills/network-engineer/SKILL.md +169 -0
  298. package/skills/nextjs-app-router-patterns/SKILL.md +33 -0
  299. package/skills/nextjs-app-router-patterns/resources/implementation-playbook.md +543 -0
  300. package/skills/nft-standards/SKILL.md +395 -0
  301. package/skills/node-expert/SKILL.md +23 -0
  302. package/skills/nodejs-backend-patterns/SKILL.md +35 -0
  303. package/skills/nodejs-backend-patterns/resources/implementation-playbook.md +1019 -0
  304. package/skills/nx-workspace-patterns/SKILL.md +464 -0
  305. package/skills/observability-engineer/SKILL.md +237 -0
  306. package/skills/observability-monitoring-monitor-setup/SKILL.md +48 -0
  307. package/skills/observability-monitoring-monitor-setup/resources/implementation-playbook.md +505 -0
  308. package/skills/observability-monitoring-slo-implement/SKILL.md +43 -0
  309. package/skills/observability-monitoring-slo-implement/resources/implementation-playbook.md +1077 -0
  310. package/skills/on-call-handoff-patterns/SKILL.md +453 -0
  311. package/skills/openapi-spec-generation/SKILL.md +33 -0
  312. package/skills/openapi-spec-generation/resources/implementation-playbook.md +1027 -0
  313. package/skills/payment-integration/SKILL.md +77 -0
  314. package/skills/paypal-integration/SKILL.md +479 -0
  315. package/skills/pci-compliance/SKILL.md +478 -0
  316. package/skills/performance-engineer/SKILL.md +180 -0
  317. package/skills/performance-testing-review-ai-review/SKILL.md +450 -0
  318. package/skills/performance-testing-review-multi-agent-review/SKILL.md +216 -0
  319. package/skills/php-pro/SKILL.md +63 -0
  320. package/skills/posix-shell-pro/SKILL.md +304 -0
  321. package/skills/postgresql/SKILL.md +230 -0
  322. package/skills/postmortem-writing/SKILL.md +386 -0
  323. package/skills/projection-patterns/SKILL.md +33 -0
  324. package/skills/projection-patterns/resources/implementation-playbook.md +501 -0
  325. package/skills/prometheus-configuration/SKILL.md +404 -0
  326. package/skills/prompt-engineer/SKILL.md +272 -0
  327. package/skills/prompt-engineering-patterns/SKILL.md +213 -0
  328. package/skills/prompt-engineering-patterns/assets/few-shot-examples.json +106 -0
  329. package/skills/prompt-engineering-patterns/assets/prompt-template-library.md +246 -0
  330. package/skills/prompt-engineering-patterns/references/chain-of-thought.md +399 -0
  331. package/skills/prompt-engineering-patterns/references/few-shot-learning.md +369 -0
  332. package/skills/prompt-engineering-patterns/references/prompt-optimization.md +414 -0
  333. package/skills/prompt-engineering-patterns/references/prompt-templates.md +470 -0
  334. package/skills/prompt-engineering-patterns/references/system-prompts.md +189 -0
  335. package/skills/prompt-engineering-patterns/scripts/optimize-prompt.py +279 -0
  336. package/skills/protocol-reverse-engineering/SKILL.md +29 -0
  337. package/skills/protocol-reverse-engineering/resources/implementation-playbook.md +509 -0
  338. package/skills/python-development-python-scaffold/SKILL.md +331 -0
  339. package/skills/python-packaging/SKILL.md +36 -0
  340. package/skills/python-packaging/resources/implementation-playbook.md +869 -0
  341. package/skills/python-performance-optimization/SKILL.md +36 -0
  342. package/skills/python-performance-optimization/resources/implementation-playbook.md +868 -0
  343. package/skills/python-pro/SKILL.md +158 -0
  344. package/skills/python-testing-patterns/SKILL.md +37 -0
  345. package/skills/python-testing-patterns/resources/implementation-playbook.md +906 -0
  346. package/skills/quant-analyst/SKILL.md +53 -0
  347. package/skills/rag-implementation/SKILL.md +421 -0
  348. package/skills/react-modernization/SKILL.md +34 -0
  349. package/skills/react-modernization/resources/implementation-playbook.md +512 -0
  350. package/skills/react-native-architecture/SKILL.md +33 -0
  351. package/skills/react-native-architecture/resources/implementation-playbook.md +670 -0
  352. package/skills/react-state-management/SKILL.md +441 -0
  353. package/skills/reference-builder/SKILL.md +188 -0
  354. package/skills/reverse-engineer/SKILL.md +173 -0
  355. package/skills/risk-manager/SKILL.md +61 -0
  356. package/skills/risk-metrics-calculation/SKILL.md +33 -0
  357. package/skills/risk-metrics-calculation/resources/implementation-playbook.md +554 -0
  358. package/skills/ruby-pro/SKILL.md +56 -0
  359. package/skills/rust-async-patterns/SKILL.md +33 -0
  360. package/skills/rust-async-patterns/resources/implementation-playbook.md +516 -0
  361. package/skills/rust-pro/SKILL.md +178 -0
  362. package/skills/saga-orchestration/SKILL.md +496 -0
  363. package/skills/sales-automator/SKILL.md +55 -0
  364. package/skills/sast-configuration/SKILL.md +212 -0
  365. package/skills/scala-pro/SKILL.md +82 -0
  366. package/skills/screen-reader-testing/SKILL.md +33 -0
  367. package/skills/screen-reader-testing/resources/implementation-playbook.md +544 -0
  368. package/skills/search-specialist/SKILL.md +80 -0
  369. package/skills/secrets-management/SKILL.md +364 -0
  370. package/skills/security-auditor/SKILL.md +169 -0
  371. package/skills/security-compliance-compliance-check/SKILL.md +55 -0
  372. package/skills/security-compliance-compliance-check/resources/implementation-playbook.md +963 -0
  373. package/skills/security-requirement-extraction/SKILL.md +33 -0
  374. package/skills/security-requirement-extraction/resources/implementation-playbook.md +676 -0
  375. package/skills/security-scanning-security-dependencies/SKILL.md +43 -0
  376. package/skills/security-scanning-security-dependencies/resources/implementation-playbook.md +544 -0
  377. package/skills/security-scanning-security-hardening/SKILL.md +147 -0
  378. package/skills/security-scanning-security-sast/SKILL.md +495 -0
  379. package/skills/seo-authority-builder/SKILL.md +136 -0
  380. package/skills/seo-cannibalization-detector/SKILL.md +123 -0
  381. package/skills/seo-content-auditor/SKILL.md +83 -0
  382. package/skills/seo-content-planner/SKILL.md +108 -0
  383. package/skills/seo-content-refresher/SKILL.md +118 -0
  384. package/skills/seo-content-writer/SKILL.md +96 -0
  385. package/skills/seo-keyword-strategist/SKILL.md +95 -0
  386. package/skills/seo-meta-optimizer/SKILL.md +92 -0
  387. package/skills/seo-snippet-hunter/SKILL.md +114 -0
  388. package/skills/seo-structure-architect/SKILL.md +108 -0
  389. package/skills/service-mesh-expert/SKILL.md +58 -0
  390. package/skills/service-mesh-observability/SKILL.md +395 -0
  391. package/skills/shellcheck-configuration/SKILL.md +466 -0
  392. package/skills/similarity-search-patterns/SKILL.md +33 -0
  393. package/skills/similarity-search-patterns/resources/implementation-playbook.md +557 -0
  394. package/skills/slo-implementation/SKILL.md +341 -0
  395. package/skills/solidity-security/SKILL.md +34 -0
  396. package/skills/solidity-security/resources/implementation-playbook.md +524 -0
  397. package/skills/spark-optimization/SKILL.md +427 -0
  398. package/skills/sql-optimization-patterns/SKILL.md +35 -0
  399. package/skills/sql-optimization-patterns/resources/implementation-playbook.md +504 -0
  400. package/skills/sql-pro/SKILL.md +173 -0
  401. package/skills/startup-analyst/SKILL.md +328 -0
  402. package/skills/startup-business-analyst-business-case/SKILL.md +487 -0
  403. package/skills/startup-business-analyst-financial-projections/SKILL.md +353 -0
  404. package/skills/startup-business-analyst-market-opportunity/SKILL.md +240 -0
  405. package/skills/startup-financial-modeling/SKILL.md +467 -0
  406. package/skills/startup-metrics-framework/SKILL.md +34 -0
  407. package/skills/startup-metrics-framework/resources/implementation-playbook.md +500 -0
  408. package/skills/stride-analysis-patterns/SKILL.md +33 -0
  409. package/skills/stride-analysis-patterns/resources/implementation-playbook.md +655 -0
  410. package/skills/stripe-integration/SKILL.md +454 -0
  411. package/skills/systems-programming-rust-project/SKILL.md +440 -0
  412. package/skills/tailwind-design-system/SKILL.md +33 -0
  413. package/skills/tailwind-design-system/resources/implementation-playbook.md +665 -0
  414. package/skills/tdd-orchestrator/SKILL.md +205 -0
  415. package/skills/tdd-workflows-tdd-cycle/SKILL.md +221 -0
  416. package/skills/tdd-workflows-tdd-green/SKILL.md +73 -0
  417. package/skills/tdd-workflows-tdd-green/resources/implementation-playbook.md +870 -0
  418. package/skills/tdd-workflows-tdd-red/SKILL.md +164 -0
  419. package/skills/tdd-workflows-tdd-refactor/SKILL.md +187 -0
  420. package/skills/team-collaboration-issue/SKILL.md +37 -0
  421. package/skills/team-collaboration-issue/resources/implementation-playbook.md +640 -0
  422. package/skills/team-collaboration-standup-notes/SKILL.md +44 -0
  423. package/skills/team-collaboration-standup-notes/resources/implementation-playbook.md +768 -0
  424. package/skills/team-composition-analysis/SKILL.md +413 -0
  425. package/skills/temporal-python-pro/SKILL.md +370 -0
  426. package/skills/temporal-python-testing/SKILL.md +170 -0
  427. package/skills/temporal-python-testing/resources/integration-testing.md +455 -0
  428. package/skills/temporal-python-testing/resources/local-setup.md +553 -0
  429. package/skills/temporal-python-testing/resources/replay-testing.md +462 -0
  430. package/skills/temporal-python-testing/resources/unit-testing.md +328 -0
  431. package/skills/terraform-module-library/SKILL.md +261 -0
  432. package/skills/terraform-module-library/references/aws-modules.md +63 -0
  433. package/skills/terraform-specialist/SKILL.md +166 -0
  434. package/skills/test-automator/SKILL.md +224 -0
  435. package/skills/threat-mitigation-mapping/SKILL.md +33 -0
  436. package/skills/threat-mitigation-mapping/resources/implementation-playbook.md +744 -0
  437. package/skills/threat-modeling-expert/SKILL.md +60 -0
  438. package/skills/track-management/SKILL.md +38 -0
  439. package/skills/track-management/resources/implementation-playbook.md +591 -0
  440. package/skills/turborepo-caching/SKILL.md +419 -0
  441. package/skills/tutorial-engineer/SKILL.md +139 -0
  442. package/skills/typescript-advanced-types/SKILL.md +35 -0
  443. package/skills/typescript-advanced-types/resources/implementation-playbook.md +716 -0
  444. package/skills/typescript-pro/SKILL.md +55 -0
  445. package/skills/ui-minimal/SKILL.md +23 -0
  446. package/skills/ui-ux-designer/SKILL.md +209 -0
  447. package/skills/ui-visual-validator/SKILL.md +214 -0
  448. package/skills/unit-testing-test-generate/SKILL.md +319 -0
  449. package/skills/unity-developer/SKILL.md +230 -0
  450. package/skills/unity-ecs-patterns/SKILL.md +33 -0
  451. package/skills/unity-ecs-patterns/resources/implementation-playbook.md +625 -0
  452. package/skills/uv-package-manager/SKILL.md +37 -0
  453. package/skills/uv-package-manager/resources/implementation-playbook.md +830 -0
  454. package/skills/vector-database-engineer/SKILL.md +60 -0
  455. package/skills/vector-index-tuning/SKILL.md +42 -0
  456. package/skills/vector-index-tuning/resources/implementation-playbook.md +507 -0
  457. package/skills/wcag-audit-patterns/SKILL.md +41 -0
  458. package/skills/wcag-audit-patterns/resources/implementation-playbook.md +541 -0
  459. package/skills/web3-testing/SKILL.md +427 -0
  460. package/skills/workflow-orchestration-patterns/SKILL.md +333 -0
  461. package/skills/workflow-patterns/SKILL.md +38 -0
  462. package/skills/workflow-patterns/resources/implementation-playbook.md +621 -0
@@ -0,0 +1,483 @@
1
+ ---
2
+ name: llm-evaluation
3
+ description: Implement comprehensive evaluation strategies for LLM applications using automated metrics, human feedback, and benchmarking. Use when testing LLM performance, measuring AI application quality, or establishing evaluation frameworks.
4
+ ---
5
+
6
+ # LLM Evaluation
7
+
8
+ Master comprehensive evaluation strategies for LLM applications, from automated metrics to human evaluation and A/B testing.
9
+
10
+ ## Do not use this skill when
11
+
12
+ - The task is unrelated to LLM evaluation
13
+ - You need a different domain or tool outside this scope
14
+
15
+ ## Instructions
16
+
17
+ - Clarify goals, constraints, and required inputs.
18
+ - Apply relevant best practices and validate outcomes.
19
+ - Provide actionable steps and verification.
20
+ - If detailed examples are required, open `resources/implementation-playbook.md`.
21
+
22
+ ## Use this skill when
23
+
24
+ - Measuring LLM application performance systematically
25
+ - Comparing different models or prompts
26
+ - Detecting performance regressions before deployment
27
+ - Validating improvements from prompt changes
28
+ - Building confidence in production systems
29
+ - Establishing baselines and tracking progress over time
30
+ - Debugging unexpected model behavior
31
+
32
+ ## Core Evaluation Types
33
+
34
+ ### 1. Automated Metrics
35
+ Fast, repeatable, scalable evaluation using computed scores.
36
+
37
+ **Text Generation:**
38
+ - **BLEU**: N-gram overlap (translation)
39
+ - **ROUGE**: Recall-oriented (summarization)
40
+ - **METEOR**: Unigram matching with stemming and synonym support
41
+ - **BERTScore**: Embedding-based similarity
42
+ - **Perplexity**: How well a language model predicts the text (lower is better)
43
+
44
+ **Classification:**
45
+ - **Accuracy**: Percentage correct
46
+ - **Precision/Recall/F1**: Class-specific performance
47
+ - **Confusion Matrix**: Error patterns
48
+ - **AUC-ROC**: Ranking quality
49
+
50
+ **Retrieval (RAG):**
51
+ - **MRR**: Mean Reciprocal Rank
52
+ - **NDCG**: Normalized Discounted Cumulative Gain
53
+ - **Precision@K**: Relevant in top K
54
+ - **Recall@K**: Coverage in top K
55
+
56
+ ### 2. Human Evaluation
57
+ Manual assessment for quality aspects difficult to automate.
58
+
59
+ **Dimensions:**
60
+ - **Accuracy**: Factual correctness
61
+ - **Coherence**: Logical flow
62
+ - **Relevance**: Answers the question
63
+ - **Fluency**: Natural language quality
64
+ - **Safety**: No harmful content
65
+ - **Helpfulness**: Useful to the user
66
+
67
+ ### 3. LLM-as-Judge
68
+ Use stronger LLMs to evaluate weaker model outputs.
69
+
70
+ **Approaches:**
71
+ - **Pointwise**: Score individual responses
72
+ - **Pairwise**: Compare two responses
73
+ - **Reference-based**: Compare to gold standard
74
+ - **Reference-free**: Judge without ground truth
75
+
76
+ ## Quick Start
77
+
78
+ ```python
79
+ from llm_eval import EvaluationSuite, Metric
80
+
81
# Define the evaluation suite.
# NOTE(review): `llm_eval` (EvaluationSuite / Metric) looks like an illustrative
# API rather than a published package -- confirm before copying verbatim.
suite = EvaluationSuite([
    Metric.accuracy(),
    Metric.bleu(),
    Metric.bertscore(),
    # Uses the `calculate_groundedness` helper from the "Custom Metrics" section;
    # the previous name `check_groundedness` was not defined anywhere.
    # NOTE(review): confirm the callback signature Metric.custom expects.
    Metric.custom(name="groundedness", fn=calculate_groundedness),
])

# Prepare test cases: each case carries the prompt, the expected answer and the
# supporting context.
test_cases = [
    {
        "input": "What is the capital of France?",
        "expected": "Paris",
        "context": "France is a country in Europe. Paris is its capital."
    },
    # ... more test cases
]

# Run evaluation. `your_model` is a placeholder for the model under test.
results = suite.evaluate(
    model=your_model,
    test_cases=test_cases
)

print(f"Overall Accuracy: {results.metrics['accuracy']}")
print(f"BLEU Score: {results.metrics['bleu']}")
107
+ ```
108
+
109
+ ## Automated Metrics Implementation
110
+
111
+ ### BLEU Score
112
+ ```python
113
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
114
+
115
def calculate_bleu(reference, hypothesis):
    """Return the sentence-level BLEU score of ``hypothesis`` against ``reference``.

    Args:
        reference: Gold text as a single string.
        hypothesis: Generated text as a single string.

    Both strings are tokenised on whitespace, and the reference is wrapped in a
    one-element list because ``sentence_bleu`` takes a list of references.
    Smoothing ``method4`` is passed as the smoothing function.
    """
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing,
    )
124
+
125
# Usage: score one candidate sentence against its reference.
bleu = calculate_bleu(
    hypothesis="A cat is sitting on the mat",
    reference="The cat sat on the mat",
)
130
+ ```
131
+
132
+ ### ROUGE Score
133
+ ```python
134
+ from rouge_score import rouge_scorer
135
+
136
def calculate_rouge(reference, hypothesis):
    """Return the ROUGE-1, ROUGE-2 and ROUGE-L F-measures for one text pair.

    Args:
        reference: Gold text.
        hypothesis: Generated text.

    Returns:
        Dict with keys ``rouge1``, ``rouge2`` and ``rougeL``, each holding the
        ``fmeasure`` of the corresponding score. Stemming is enabled.
    """
    variants = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(variants, use_stemmer=True)
    per_variant = scorer.score(reference, hypothesis)
    return {variant: per_variant[variant].fmeasure for variant in variants}
146
+ ```
147
+
148
+ ### BERTScore
149
+ ```python
150
+ from bert_score import score
151
+
152
def calculate_bertscore(references, hypotheses):
    """Return the mean BERTScore precision, recall and F1 over a batch.

    Args:
        references: Gold texts.
        hypotheses: Generated texts, passed to ``score`` as the candidates.

    Returns:
        Dict with keys ``precision``, ``recall`` and ``f1``; each value is the
        mean of the corresponding result reduced with ``.mean().item()``.

    The scorer is run with ``lang='en'`` and the
    ``microsoft/deberta-xlarge-mnli`` model.
    """
    precision, recall, f1 = score(
        hypotheses,
        references,
        lang='en',
        model_type='microsoft/deberta-xlarge-mnli',
    )
    summary = {}
    summary['precision'] = precision.mean().item()
    summary['recall'] = recall.mean().item()
    summary['f1'] = f1.mean().item()
    return summary
166
+ ```
167
+
168
+ ### Custom Metrics
169
+ ```python
170
def calculate_groundedness(response, context):
    """Score how strongly ``response`` is entailed by ``context`` using an NLI model.

    Args:
        response: Generated text to check.
        context: Source text the response should be grounded in.

    Returns:
        The classifier's score when its top label is ENTAILMENT, otherwise 0.0.

    The pipeline is built on first use and cached on the function object, so
    the model is not reloaded on every call.
    """
    nli = getattr(calculate_groundedness, "_nli", None)
    if nli is None:
        # Imported lazily so that importing this module does not require
        # transformers to be installed.
        from transformers import pipeline

        nli = pipeline("text-classification", model="microsoft/deberta-large-mnli")
        calculate_groundedness._nli = nli

    # NOTE(review): premise and hypothesis are joined with a literal " [SEP] ".
    # Text-classification pipelines also accept a {"text", "text_pair"} input;
    # confirm which form this model expects before relying on the scores.
    result = nli(f"{context} [SEP] {response}")[0]

    # Compare case-insensitively: label casing depends on the model config.
    if str(result['label']).upper() == 'ENTAILMENT':
        return result['score']
    return 0.0
181
+
182
def calculate_toxicity(text):
    """Return the highest toxicity score Detoxify assigns to ``text``.

    Args:
        text: Generated text to score.

    Returns:
        The maximum over all values in the prediction dict.

    The ``Detoxify('original')`` model is created on first use and cached on
    the function object, so the model is not reloaded on every call.
    """
    model = getattr(calculate_toxicity, "_model", None)
    if model is None:
        # Imported lazily so that importing this module does not require
        # detoxify to be installed.
        from detoxify import Detoxify

        model = Detoxify('original')
        calculate_toxicity._model = model

    results = model.predict(text)
    # NOTE(review): assumes `text` is a single string so each value is a
    # scalar -- confirm before passing a batch.
    return max(results.values())  # Return highest toxicity score
188
+
189
def calculate_factuality(claim, knowledge_base):
    """Placeholder for verifying a factual claim against a knowledge base.

    Args:
        claim: Statement to verify.
        knowledge_base: Source of truth to check the claim against.

    Returns:
        Always ``None``; no verification is implemented here.
    """
    # The implementation depends on the knowledge base in use; options include
    # retrieval followed by NLI, or a fact-checking API.
    return None
194
+ ```
195
+
196
+ ## LLM-as-Judge Patterns
197
+
198
+ ### Single Output Evaluation
199
+ ```python
200
def llm_judge_quality(response, question):
    """Use GPT-5 to rate a single response for accuracy, helpfulness and clarity.

    Args:
        response: Model output to be judged.
        question: Question the response is answering.

    Returns:
        The judge's reply parsed from JSON. The prompt asks for the keys
        ``accuracy``, ``helpfulness``, ``clarity`` and ``reasoning``.

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
    """
    # Imported locally so the snippet is self-contained.
    import json
    import openai

    prompt = f"""Rate the following response on a scale of 1-10 for:
1. Accuracy (factually correct)
2. Helpfulness (answers the question)
3. Clarity (well-written and understandable)

Question: {question}
Response: {response}

Provide ratings in JSON format:
{{
"accuracy": <1-10>,
"helpfulness": <1-10>,
"clarity": <1-10>,
"reasoning": "<brief explanation>"
}}
"""

    # `openai.ChatCompletion.create` was removed in openai>=1.0; the
    # replacement is `chat.completions.create`.
    # NOTE(review): some models reject a non-default temperature -- confirm
    # that temperature=0 is accepted by the configured judge model.
    result = openai.chat.completions.create(
        model="gpt-5",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return json.loads(result.choices[0].message.content)
226
+ ```
227
+
228
+ ### Pairwise Comparison
229
+ ```python
230
def compare_responses(question, response_a, response_b):
    """Ask an LLM judge which of two responses answers the question better.

    Args:
        question: Question both responses are answering.
        response_a: First candidate, shown to the judge as "Response A".
        response_b: Second candidate, shown to the judge as "Response B".

    Returns:
        The judge's reply parsed from JSON. The prompt asks for the keys
        ``winner`` ("A", "B" or "tie"), ``reasoning`` and ``confidence``.

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
    """
    # Imported locally so the snippet is self-contained.
    import json
    import openai

    prompt = f"""Compare these two responses to the question and determine which is better.

Question: {question}

Response A: {response_a}

Response B: {response_b}

Which response is better and why? Consider accuracy, helpfulness, and clarity.

Answer with JSON:
{{
"winner": "A" or "B" or "tie",
"reasoning": "<explanation>",
"confidence": <1-10>
}}
"""

    # `openai.ChatCompletion.create` was removed in openai>=1.0; the
    # replacement is `chat.completions.create`.
    # NOTE(review): some models reject a non-default temperature -- confirm
    # that temperature=0 is accepted by the configured judge model.
    result = openai.chat.completions.create(
        model="gpt-5",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    return json.loads(result.choices[0].message.content)
257
+ ```
258
+
259
+ ## Human Evaluation Frameworks
260
+
261
+ ### Annotation Guidelines
262
+ ```python
263
class AnnotationTask:
    """One item handed to a human annotator: a question, a response and optional context."""

    def __init__(self, response, question, context=None):
        self.question = question
        self.context = context
        self.response = response

    def get_annotation_form(self):
        """Build a fresh, blank annotation form for this task.

        Returns:
            Dict holding the question, context and response under review, a
            ``ratings`` section describing three 1-5 scales, an ``issues``
            checklist with every flag set to False, and an empty ``feedback``
            string.
        """
        rating_prompts = (
            ("accuracy", "Is the response factually correct?"),
            ("relevance", "Does it answer the question?"),
            ("coherence", "Is it logically consistent?"),
        )
        issue_flags = ("factual_error", "hallucination", "off_topic", "unsafe_content")

        form = {}
        form["question"] = self.question
        form["context"] = self.context
        form["response"] = self.response
        form["ratings"] = {
            dimension: {"scale": "1-5", "description": description}
            for dimension, description in rating_prompts
        }
        form["issues"] = {flag: False for flag in issue_flags}
        form["feedback"] = ""
        return form
298
+ ```
299
+
300
+ ### Inter-Rater Agreement
301
+ ```python
302
+ from sklearn.metrics import cohen_kappa_score
303
+
304
def calculate_agreement(rater1_scores, rater2_scores):
    """Compute Cohen's kappa between two raters and label its strength.

    Args:
        rater1_scores: Labels assigned by the first rater.
        rater2_scores: Labels assigned by the second rater, in the same order.

    Returns:
        Dict with ``kappa`` (the raw score) and ``interpretation``, one of
        "Poor" (< 0), "Slight" (< 0.2), "Fair" (< 0.4), "Moderate" (< 0.6),
        "Substantial" (< 0.8), "Almost Perfect" (otherwise), or "Undefined"
        when the score is NaN.
    """
    kappa = cohen_kappa_score(rater1_scores, rater2_scores)

    # Ordered (exclusive upper bound, label) bands; the first match wins.
    # The previous dict keyed by boolean expressions collapsed to at most two
    # keys (True/False), so the lookup always returned "Almost Perfect".
    bands = (
        (0.0, "Poor"),
        (0.2, "Slight"),
        (0.4, "Fair"),
        (0.6, "Moderate"),
        (0.8, "Substantial"),
    )

    if kappa != kappa:  # NaN is the only value not equal to itself
        interpretation = "Undefined"
    else:
        interpretation = "Almost Perfect"
        for upper_bound, label in bands:
            if kappa < upper_bound:
                interpretation = label
                break

    return {
        "kappa": kappa,
        "interpretation": interpretation
    }
321
+ ```
322
+
323
+ ## A/B Testing
324
+
325
+ ### Statistical Testing Framework
326
+ ```python
327
+ from scipy import stats
328
+ import numpy as np
329
+
330
class ABTest:
    """Collect per-variant scores and compare two variants with a t-test and Cohen's d."""

    def __init__(self, variant_a_name="A", variant_b_name="B"):
        self.variant_a = {"name": variant_a_name, "scores": []}
        self.variant_b = {"name": variant_b_name, "scores": []}

    def add_result(self, variant, score):
        """Add an evaluation result for a variant.

        Args:
            variant: ``"A"`` or the configured name of variant A records the
                score for A. Any other value is recorded for variant B, as
                before.
            score: Numeric score to record.
        """
        # Accept the custom name as well as the literal "A"; previously a
        # custom variant A name was silently routed to variant B.
        if variant == "A" or variant == self.variant_a["name"]:
            self.variant_a["scores"].append(score)
        else:
            self.variant_b["scores"].append(score)

    def analyze(self, alpha=0.05, equal_var=True):
        """Perform statistical analysis of the collected scores.

        Args:
            alpha: Significance level compared against the p-value.
            equal_var: Passed through to ``scipy.stats.ttest_ind``. The default
                keeps the original equal-variance test; pass False for Welch's.

        Returns:
            Dict with both means, their difference (B - A), the improvement
            relative to A (NaN when A's mean is 0), the p-value, a significance
            flag, Cohen's d with its label, and the variant with the higher
            mean ("A" on ties). Numeric values are plain Python types.

        Raises:
            ValueError: If either variant has fewer than two scores.
        """
        a_scores = np.asarray(self.variant_a["scores"], dtype=float)
        b_scores = np.asarray(self.variant_b["scores"], dtype=float)

        # With fewer than two samples the variance is undefined and every
        # statistic below would come back as NaN.
        if a_scores.size < 2 or b_scores.size < 2:
            raise ValueError("Each variant needs at least two scores to be analyzed")

        mean_a = float(a_scores.mean())
        mean_b = float(b_scores.mean())
        difference = mean_b - mean_a

        # T-test
        _, p_value = stats.ttest_ind(a_scores, b_scores, equal_var=equal_var)
        p_value = float(p_value)

        # Effect size (Cohen's d): pool the sample variances (ddof=1) weighted
        # by degrees of freedom, which also handles unequal group sizes.
        n_a, n_b = a_scores.size, b_scores.size
        pooled_var = (
            (n_a - 1) * a_scores.var(ddof=1) + (n_b - 1) * b_scores.var(ddof=1)
        ) / (n_a + n_b - 2)
        pooled_std = float(np.sqrt(pooled_var))
        if pooled_std > 0:
            cohens_d = difference / pooled_std
        elif difference == 0:
            cohens_d = 0.0  # identical constant scores: no effect
        else:
            cohens_d = float("inf") if difference > 0 else float("-inf")

        # Relative improvement is undefined against a zero baseline.
        relative_improvement = difference / mean_a if mean_a != 0 else float("nan")

        return {
            "variant_a_mean": mean_a,
            "variant_b_mean": mean_b,
            "difference": difference,
            "relative_improvement": relative_improvement,
            "p_value": p_value,
            "statistically_significant": bool(p_value < alpha),
            "cohens_d": cohens_d,
            "effect_size": self.interpret_cohens_d(cohens_d),
            "winner": "B" if mean_b > mean_a else "A"
        }

    @staticmethod
    def interpret_cohens_d(d):
        """Interpret Cohen's d effect size by its absolute value."""
        abs_d = abs(d)
        if abs_d < 0.2:
            return "negligible"
        elif abs_d < 0.5:
            return "small"
        elif abs_d < 0.8:
            return "medium"
        else:
            return "large"
378
+ ```
379
+
380
+ ## Regression Testing
381
+
382
+ ### Regression Detection
383
+ ```python
384
class RegressionDetector:
    """Flag metrics whose score moved in the wrong direction by more than a relative threshold."""

    def __init__(self, baseline_results, threshold=0.05, lower_is_better=()):
        """
        Args:
            baseline_results: Mapping of metric name to baseline score.
            threshold: Relative change (0.05 means 5%) beyond which a move in
                the wrong direction counts as a regression.
            lower_is_better: Names of metrics where an increase is the
                regression (e.g. latency). Empty by default, so every metric
                is treated as higher-is-better, as before.
        """
        self.baseline = baseline_results
        self.threshold = threshold
        self.lower_is_better = frozenset(lower_is_better)

    def check_for_regression(self, new_results):
        """Detect if new results show regression.

        Args:
            new_results: Mapping of metric name to new score. Metrics that are
                missing or None are skipped.

        Returns:
            Dict with ``has_regression`` and ``regressions``, a list of dicts
            holding the metric name, baseline, current value and the signed
            relative change.
        """
        regressions = []

        for metric, baseline_score in self.baseline.items():
            new_score = new_results.get(metric)

            if new_score is None:
                continue

            relative_change = self._relative_change(baseline_score, new_score)

            # Normalize so that a positive value always means "got worse".
            if metric in self.lower_is_better:
                worsening = relative_change
            else:
                worsening = -relative_change

            if worsening > self.threshold:
                regressions.append({
                    "metric": metric,
                    "baseline": baseline_score,
                    "current": new_score,
                    "change": relative_change
                })

        return {
            "has_regression": len(regressions) > 0,
            "regressions": regressions
        }

    @staticmethod
    def _relative_change(baseline_score, new_score):
        """Signed change of new_score relative to the magnitude of baseline_score."""
        delta = new_score - baseline_score
        if baseline_score == 0:
            # A relative change against zero is undefined: report no change
            # when nothing moved, otherwise a signed infinity.
            if delta == 0:
                return 0.0
            return float("inf") if delta > 0 else float("-inf")
        # Divide by the magnitude so a negative baseline keeps the sign of the move.
        return delta / abs(baseline_score)
416
+ ```
417
+
418
+ ## Benchmarking
419
+
420
+ ### Running Benchmarks
421
+ ```python
422
class BenchmarkRunner:
    """Run a model over a benchmark dataset and summarize per-metric scores."""

    def __init__(self, benchmark_dataset):
        self.dataset = benchmark_dataset

    def run_benchmark(self, model, metrics):
        """Run model on benchmark and calculate metrics.

        Args:
            model: Object exposing ``predict(input)``.
            metrics: Iterable of objects exposing ``name`` and
                ``calculate(prediction=..., reference=..., context=...)``.

        Returns:
            Dict mapping each metric name to its ``mean``, ``std``, ``min`` and
            ``max`` over the dataset. All four are None when the dataset
            produced no scores.
        """
        # Imported locally: this snippet has no numpy import of its own.
        import numpy as np

        results = {metric.name: [] for metric in metrics}

        for example in self.dataset:
            # Generate prediction
            prediction = model.predict(example["input"])

            # Calculate each metric
            for metric in metrics:
                score = metric.calculate(
                    prediction=prediction,
                    reference=example["reference"],
                    context=example.get("context")
                )
                results[metric.name].append(score)

        # Aggregate results
        summary = {}
        for name, scores in results.items():
            if not scores:
                # An empty dataset previously crashed in min([]).
                summary[name] = {"mean": None, "std": None, "min": None, "max": None}
                continue
            summary[name] = {
                "mean": float(np.mean(scores)),
                "std": float(np.std(scores)),
                "min": min(scores),
                "max": max(scores)
            }
        return summary
453
+ ```
454
+
455
+ ## Resources
456
+
457
+ - **references/metrics.md**: Comprehensive metric guide
458
+ - **references/human-evaluation.md**: Annotation best practices
459
+ - **references/benchmarking.md**: Standard benchmarks
460
+ - **references/a-b-testing.md**: Statistical testing guide
461
+ - **references/regression-testing.md**: CI/CD integration
462
+ - **assets/evaluation-framework.py**: Complete evaluation harness
463
+ - **assets/benchmark-dataset.jsonl**: Example datasets
464
+ - **scripts/evaluate-model.py**: Automated evaluation runner
465
+
466
+ ## Best Practices
467
+
468
+ 1. **Multiple Metrics**: Use diverse metrics for comprehensive view
469
+ 2. **Representative Data**: Test on real-world, diverse examples
470
+ 3. **Baselines**: Always compare against baseline performance
471
+ 4. **Statistical Rigor**: Use proper statistical tests for comparisons
472
+ 5. **Continuous Evaluation**: Integrate into CI/CD pipeline
473
+ 6. **Human Validation**: Combine automated metrics with human judgment
474
+ 7. **Error Analysis**: Investigate failures to understand weaknesses
475
+ 8. **Version Control**: Track evaluation results over time
476
+
477
+ ## Common Pitfalls
478
+
479
+ - **Single Metric Obsession**: Optimizing for one metric at the expense of others
480
+ - **Small Sample Size**: Drawing conclusions from too few examples
481
+ - **Data Contamination**: Testing on training data
482
+ - **Ignoring Variance**: Not accounting for statistical uncertainty
483
+ - **Metric Mismatch**: Using metrics not aligned with business goals