@jhm1909/ag-kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/ARCHITECTURE.md +189 -0
- package/.agent/known-registries.json +181 -0
- package/.agent/mcp_config.json +19 -0
- package/.agent/rules/clean-code.md +107 -0
- package/.agent/rules/documents.md +177 -0
- package/.agent/rules/git-workflow.md +68 -0
- package/.agent/rules/nano-banana.md +46 -0
- package/.agent/rules/research.md +35 -0
- package/.agent/rules/skill-loading.md +100 -0
- package/.agent/rules/skill-suggestion.md +47 -0
- package/.agent/rules/testing.md +52 -0
- package/.agent/rules/workflow-advisor.md +108 -0
- package/.agent/rules/workflow-skill-convention.md +127 -0
- package/.agent/skills/ai-engineer/SKILL.md +824 -0
- package/.agent/skills/ai-engineer/references/agentic-patterns.md +329 -0
- package/.agent/skills/ai-engineer/references/evaluation.md +493 -0
- package/.agent/skills/ai-engineer/references/llm.md +490 -0
- package/.agent/skills/ai-engineer/references/rag-advanced.md +444 -0
- package/.agent/skills/ai-engineer/references/serving-optimization.md +531 -0
- package/.agent/skills/ai-engineer/vector-db/README.md +137 -0
- package/.agent/skills/app-builder/SKILL.md +75 -0
- package/.agent/skills/app-builder/agent-coordination.md +71 -0
- package/.agent/skills/app-builder/feature-building.md +53 -0
- package/.agent/skills/app-builder/project-detection.md +34 -0
- package/.agent/skills/app-builder/scaffolding.md +118 -0
- package/.agent/skills/app-builder/tech-stack.md +41 -0
- package/.agent/skills/app-builder/templates/SKILL.md +39 -0
- package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +76 -0
- package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +92 -0
- package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +88 -0
- package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +88 -0
- package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +83 -0
- package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +90 -0
- package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +90 -0
- package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +122 -0
- package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +122 -0
- package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +169 -0
- package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +134 -0
- package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +83 -0
- package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +119 -0
- package/.agent/skills/backend-developer/SKILL.md +763 -0
- package/.agent/skills/backend-developer/references/general-patterns.md +65 -0
- package/.agent/skills/backend-developer/references/go-echo.md +68 -0
- package/.agent/skills/backend-developer/references/go-gin.md +76 -0
- package/.agent/skills/backend-developer/references/java-springboot.md +83 -0
- package/.agent/skills/backend-developer/references/node-express.md +64 -0
- package/.agent/skills/backend-developer/references/node-nestjs.md +69 -0
- package/.agent/skills/backend-developer/references/python-django.md +67 -0
- package/.agent/skills/backend-developer/references/python-fastapi.md +80 -0
- package/.agent/skills/blockchain-engineer/SKILL.md +975 -0
- package/.agent/skills/blockchain-engineer/references/deployment.md +28 -0
- package/.agent/skills/blockchain-engineer/references/evm.md +14 -0
- package/.agent/skills/blockchain-engineer/references/mechanisms.md +32 -0
- package/.agent/skills/blockchain-engineer/references/solidity.md +32 -0
- package/.agent/skills/business-analysis/SKILL.md +85 -0
- package/.agent/skills/business-analysis/references/best-practices/diagrams.md +141 -0
- package/.agent/skills/business-analysis/references/domains/ai-agent.md +94 -0
- package/.agent/skills/business-analysis/references/domains/blockchain-dapp.md +86 -0
- package/.agent/skills/business-analysis/references/domains/ecommerce.md +77 -0
- package/.agent/skills/business-analysis/references/domains/education.md +42 -0
- package/.agent/skills/business-analysis/references/domains/fintech.md +44 -0
- package/.agent/skills/business-analysis/references/domains/fnb.md +82 -0
- package/.agent/skills/business-analysis/references/domains/healthtech.md +44 -0
- package/.agent/skills/business-analysis/references/domains/internal-tools.md +38 -0
- package/.agent/skills/business-analysis/references/domains/marketplace.md +52 -0
- package/.agent/skills/business-analysis/references/domains/saas.md +36 -0
- package/.agent/skills/business-analysis/references/workflows/collaboration.md +41 -0
- package/.agent/skills/business-analysis/scripts/verify_mermaid.py +86 -0
- package/.agent/skills/business-analysis/templates/brd.md +46 -0
- package/.agent/skills/business-analysis/templates/change-request.md +41 -0
- package/.agent/skills/business-analysis/templates/prd-functional.md +38 -0
- package/.agent/skills/business-analysis/templates/use-case.md +40 -0
- package/.agent/skills/business-analysis/templates/user-story-detailed.md +36 -0
- package/.agent/skills/code-review/SKILL.md +113 -0
- package/.agent/skills/code-review/references/code-review-reception.md +209 -0
- package/.agent/skills/code-review/references/differential_review.md +59 -0
- package/.agent/skills/code-review/references/requesting-code-review.md +105 -0
- package/.agent/skills/code-review/references/spec_compliance.md +43 -0
- package/.agent/skills/code-review/references/verification-before-completion.md +139 -0
- package/.agent/skills/context-engineering/SKILL.md +68 -0
- package/.agent/skills/context-engineering/references/context-compression.md +84 -0
- package/.agent/skills/context-engineering/references/context-degradation.md +93 -0
- package/.agent/skills/context-engineering/references/context-fundamentals.md +75 -0
- package/.agent/skills/context-engineering/references/context-optimization.md +82 -0
- package/.agent/skills/context-engineering/references/evaluation.md +89 -0
- package/.agent/skills/context-engineering/references/memory-systems.md +88 -0
- package/.agent/skills/context-engineering/references/multi-agent-patterns.md +90 -0
- package/.agent/skills/context-engineering/references/project-development.md +97 -0
- package/.agent/skills/context-engineering/references/tool-design.md +86 -0
- package/.agent/skills/debugging/SKILL.md +60 -0
- package/.agent/skills/debugging/references/defense-in-depth.md +130 -0
- package/.agent/skills/debugging/references/root-cause-tracing.md +177 -0
- package/.agent/skills/debugging/references/systematic-debugging.md +295 -0
- package/.agent/skills/debugging/references/verification-before-completion.md +142 -0
- package/.agent/skills/designer/SKILL.md +159 -0
- package/.agent/skills/designer/concepts/apple-glass.md +48 -0
- package/.agent/skills/designer/concepts/aurora-gradients.md +26 -0
- package/.agent/skills/designer/concepts/bento-grids.md +14 -0
- package/.agent/skills/designer/concepts/claymorphism.md +27 -0
- package/.agent/skills/designer/concepts/neo-brutalism.md +32 -0
- package/.agent/skills/designer/data/app-interface.csv +31 -0
- package/.agent/skills/designer/data/charts.csv +26 -0
- package/.agent/skills/designer/data/colors.csv +162 -0
- package/.agent/skills/designer/data/design.csv +1776 -0
- package/.agent/skills/designer/data/icons.csv +106 -0
- package/.agent/skills/designer/data/landing.csv +35 -0
- package/.agent/skills/designer/data/products.csv +162 -0
- package/.agent/skills/designer/data/react-performance.csv +45 -0
- package/.agent/skills/designer/data/styles.csv +85 -0
- package/.agent/skills/designer/data/typography.csv +74 -0
- package/.agent/skills/designer/data/ui-reasoning.csv +162 -0
- package/.agent/skills/designer/data/ux-guidelines.csv +100 -0
- package/.agent/skills/designer/references/accessibility.md +172 -0
- package/.agent/skills/designer/references/branding.md +88 -0
- package/.agent/skills/designer/references/color-theory.md +139 -0
- package/.agent/skills/designer/references/creation.md +118 -0
- package/.agent/skills/designer/references/design-systems.md +219 -0
- package/.agent/skills/designer/references/frontend_design_aesthetics.md +57 -0
- package/.agent/skills/designer/references/layout.md +200 -0
- package/.agent/skills/designer/references/motion.md +92 -0
- package/.agent/skills/designer/references/review.md +100 -0
- package/.agent/skills/designer/references/trends.md +209 -0
- package/.agent/skills/designer/references/typography.md +190 -0
- package/.agent/skills/designer/scripts/remove_background.py +135 -0
- package/.agent/skills/designer/scripts/ui-search/__pycache__/core.cpython-314.pyc +0 -0
- package/.agent/skills/designer/scripts/ui-search/__pycache__/design_system.cpython-314.pyc +0 -0
- package/.agent/skills/designer/scripts/ui-search/core.py +217 -0
- package/.agent/skills/designer/scripts/ui-search/design_system.py +1067 -0
- package/.agent/skills/designer/scripts/ui-search/search.py +114 -0
- package/.agent/skills/designer/templates/design-motion-spec.md +30 -0
- package/.agent/skills/devops-engineer/SKILL.md +90 -0
- package/.agent/skills/devops-engineer/docker-compose/README.md +47 -0
- package/.agent/skills/devops-engineer/references/ci-cd-pipelines.md +76 -0
- package/.agent/skills/devops-engineer/references/cloud-providers.md +57 -0
- package/.agent/skills/devops-engineer/references/codebase-normalization.md +104 -0
- package/.agent/skills/devops-engineer/references/container-orchestration.md +69 -0
- package/.agent/skills/devops-engineer/references/iac-tools.md +63 -0
- package/.agent/skills/devops-engineer/references/observability-security.md +45 -0
- package/.agent/skills/devops-engineer/references/vercel-supabase.md +17 -0
- package/.agent/skills/devops-engineer/templates/release-notes.md +8 -0
- package/.agent/skills/frontend-developer/SKILL.md +125 -0
- package/.agent/skills/frontend-developer/react-nextjs/README.md +90 -0
- package/.agent/skills/frontend-developer/references/angular.md +52 -0
- package/.agent/skills/frontend-developer/references/composition_patterns.md +60 -0
- package/.agent/skills/frontend-developer/references/core-performance.md +68 -0
- package/.agent/skills/frontend-developer/references/modern-signals.md +43 -0
- package/.agent/skills/frontend-developer/references/react_performance_rules.md +55 -0
- package/.agent/skills/frontend-developer/references/vue-nuxt.md +55 -0
- package/.agent/skills/frontend-developer/scripts/validate_compliance.py +65 -0
- package/.agent/skills/frontend-developer/threejs/README.md +89 -0
- package/.agent/skills/frontend-developer/threejs/animation.md +552 -0
- package/.agent/skills/frontend-developer/threejs/fundamentals.md +488 -0
- package/.agent/skills/frontend-developer/threejs/geometry.md +548 -0
- package/.agent/skills/frontend-developer/threejs/interaction.md +660 -0
- package/.agent/skills/frontend-developer/threejs/lighting.md +481 -0
- package/.agent/skills/frontend-developer/threejs/loaders.md +623 -0
- package/.agent/skills/frontend-developer/threejs/materials.md +520 -0
- package/.agent/skills/frontend-developer/threejs/postprocessing.md +602 -0
- package/.agent/skills/frontend-developer/threejs/router.json +181 -0
- package/.agent/skills/frontend-developer/threejs/shaders.md +642 -0
- package/.agent/skills/frontend-developer/threejs/textures.md +628 -0
- package/.agent/skills/game-development/2d-games/SKILL.md +119 -0
- package/.agent/skills/game-development/3d-games/SKILL.md +135 -0
- package/.agent/skills/game-development/SKILL.md +167 -0
- package/.agent/skills/game-development/game-art/SKILL.md +185 -0
- package/.agent/skills/game-development/game-audio/SKILL.md +190 -0
- package/.agent/skills/game-development/game-design/SKILL.md +129 -0
- package/.agent/skills/game-development/mobile-games/SKILL.md +108 -0
- package/.agent/skills/game-development/multiplayer/SKILL.md +132 -0
- package/.agent/skills/game-development/pc-games/SKILL.md +144 -0
- package/.agent/skills/game-development/vr-ar/SKILL.md +123 -0
- package/.agent/skills/game-development/web-games/SKILL.md +150 -0
- package/.agent/skills/lead-architect/SKILL.md +85 -0
- package/.agent/skills/lead-architect/references/application-architecture.md +70 -0
- package/.agent/skills/lead-architect/references/infrastructure.md +51 -0
- package/.agent/skills/lead-architect/references/process.md +42 -0
- package/.agent/skills/lead-architect/references/system-architecture.md +62 -0
- package/.agent/skills/lead-architect/references/web-fullstack.md +82 -0
- package/.agent/skills/lead-architect/templates/adr.md +62 -0
- package/.agent/skills/lead-architect/templates/rfc.md +46 -0
- package/.agent/skills/lead-architect/templates/sdd.md +62 -0
- package/.agent/skills/lead-architect/templates/technical-spec.md +61 -0
- package/.agent/skills/marketer/SKILL.md +66 -0
- package/.agent/skills/marketer/remotion-best-practices/SKILL.md +58 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/3d.md +86 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/animations.md +29 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/assets/charts-bar-chart.tsx +173 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/assets/text-animations-typewriter.tsx +100 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/assets/text-animations-word-highlight.tsx +108 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/assets.md +78 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/audio.md +172 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/calculate-metadata.md +104 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/can-decode.md +75 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/charts.md +58 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/compositions.md +146 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/display-captions.md +126 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/extract-frames.md +229 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/fonts.md +152 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/get-audio-duration.md +58 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/get-video-dimensions.md +68 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/get-video-duration.md +58 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/gifs.md +138 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/images.md +130 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/import-srt-captions.md +67 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/lottie.md +68 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/measuring-dom-nodes.md +35 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/measuring-text.md +143 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/sequencing.md +106 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/tailwind.md +11 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/text-animations.md +20 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/timing.md +179 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/transcribe-captions.md +19 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/transitions.md +122 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/trimming.md +53 -0
- package/.agent/skills/marketer/remotion-best-practices/rules/videos.md +171 -0
- package/.agent/skills/mcp-builder/SKILL.md +76 -0
- package/.agent/skills/mcp-builder/references/evaluation.md +602 -0
- package/.agent/skills/mcp-builder/references/mcp_best_practices.md +249 -0
- package/.agent/skills/mcp-builder/references/node_mcp_server.md +970 -0
- package/.agent/skills/mcp-builder/references/python_mcp_server.md +719 -0
- package/.agent/skills/mobile-developer/SKILL.md +83 -0
- package/.agent/skills/mobile-developer/api-routes/SKILL.md +389 -0
- package/.agent/skills/mobile-developer/building-ui/SKILL.md +335 -0
- package/.agent/skills/mobile-developer/building-ui/references/animations.md +220 -0
- package/.agent/skills/mobile-developer/building-ui/references/controls.md +270 -0
- package/.agent/skills/mobile-developer/building-ui/references/form-sheet.md +227 -0
- package/.agent/skills/mobile-developer/building-ui/references/gradients.md +106 -0
- package/.agent/skills/mobile-developer/building-ui/references/icons.md +213 -0
- package/.agent/skills/mobile-developer/building-ui/references/media.md +198 -0
- package/.agent/skills/mobile-developer/building-ui/references/route-structure.md +229 -0
- package/.agent/skills/mobile-developer/building-ui/references/search.md +248 -0
- package/.agent/skills/mobile-developer/building-ui/references/storage.md +121 -0
- package/.agent/skills/mobile-developer/building-ui/references/tabs.md +368 -0
- package/.agent/skills/mobile-developer/building-ui/references/visual-effects.md +197 -0
- package/.agent/skills/mobile-developer/building-ui/references/webgpu-three.md +605 -0
- package/.agent/skills/mobile-developer/cicd-workflows/SKILL.md +107 -0
- package/.agent/skills/mobile-developer/cicd-workflows/scripts/fetch.js +109 -0
- package/.agent/skills/mobile-developer/cicd-workflows/scripts/package.json +11 -0
- package/.agent/skills/mobile-developer/cicd-workflows/scripts/validate.js +84 -0
- package/.agent/skills/mobile-developer/data-fetching/SKILL.md +508 -0
- package/.agent/skills/mobile-developer/deployment/SKILL.md +207 -0
- package/.agent/skills/mobile-developer/deployment/references/app-store-metadata.md +479 -0
- package/.agent/skills/mobile-developer/deployment/references/ios-app-store.md +355 -0
- package/.agent/skills/mobile-developer/deployment/references/play-store.md +246 -0
- package/.agent/skills/mobile-developer/deployment/references/testflight.md +58 -0
- package/.agent/skills/mobile-developer/deployment/references/workflows.md +200 -0
- package/.agent/skills/mobile-developer/dev-client/SKILL.md +181 -0
- package/.agent/skills/mobile-developer/tailwind-setup/SKILL.md +501 -0
- package/.agent/skills/mobile-developer/upgrading-expo/SKILL.md +116 -0
- package/.agent/skills/mobile-developer/upgrading-expo/references/new-architecture.md +79 -0
- package/.agent/skills/mobile-developer/upgrading-expo/references/react-19.md +79 -0
- package/.agent/skills/mobile-developer/upgrading-expo/references/react-compiler.md +59 -0
- package/.agent/skills/mobile-developer/use-dom/SKILL.md +434 -0
- package/.agent/skills/modern-python/SKILL.md +122 -0
- package/.agent/skills/project-manager/SKILL.md +110 -0
- package/.agent/skills/project-manager/references/ba-collaboration.md +62 -0
- package/.agent/skills/project-manager/references/discovery_process.md +52 -0
- package/.agent/skills/project-manager/references/jobs_to_be_done.md +51 -0
- package/.agent/skills/project-manager/references/prd_development.md +52 -0
- package/.agent/skills/project-manager/references/rules-guide.md +55 -0
- package/.agent/skills/project-manager/references/skill-creation.md +98 -0
- package/.agent/skills/project-manager/references/strategic-frameworks.md +62 -0
- package/.agent/skills/project-manager/references/task-decomposition.md +194 -0
- package/.agent/skills/project-manager/references/workflows-guide.md +44 -0
- package/.agent/skills/project-manager/router.json +160 -0
- package/.agent/skills/project-manager/scripts/compare_skill.py +177 -0
- package/.agent/skills/project-manager/scripts/encoding_utils.py +36 -0
- package/.agent/skills/project-manager/scripts/init_skill.py +190 -0
- package/.agent/skills/project-manager/scripts/quick_validate.py +123 -0
- package/.agent/skills/project-manager/templates/pm-strategy-one-pager.md +6 -0
- package/.agent/skills/project-manager/templates/prd-strategic.md +38 -0
- package/.agent/skills/project-manager/templates/skill-questionnaire.md +118 -0
- package/.agent/skills/project-manager/templates/user-story-simple.md +14 -0
- package/.agent/skills/prompt-engineer/SKILL.md +319 -0
- package/.agent/skills/prompt-engineer/skill-creator/README.md +47 -0
- package/.agent/skills/qa-tester/SKILL.md +142 -0
- package/.agent/skills/qa-tester/assets/README.md +8 -0
- package/.agent/skills/qa-tester/references/accessibility_testing.md +35 -0
- package/.agent/skills/qa-tester/references/agent_browser.md +38 -0
- package/.agent/skills/qa-tester/references/automation/api_testing.md +23 -0
- package/.agent/skills/qa-tester/references/automation/best_practices.md +14 -0
- package/.agent/skills/qa-tester/references/automation/jest_vitest.md +26 -0
- package/.agent/skills/qa-tester/references/automation/playwright.md +30 -0
- package/.agent/skills/qa-tester/references/e2e_testing.md +46 -0
- package/.agent/skills/qa-tester/references/integration_testing.md +39 -0
- package/.agent/skills/qa-tester/references/performance_testing.md +44 -0
- package/.agent/skills/qa-tester/references/property_based_testing.md +44 -0
- package/.agent/skills/qa-tester/references/security_audit.md +53 -0
- package/.agent/skills/qa-tester/references/security_testing.md +30 -0
- package/.agent/skills/qa-tester/references/sharp_edges.md +49 -0
- package/.agent/skills/qa-tester/references/static_analysis.md +52 -0
- package/.agent/skills/qa-tester/references/supply_chain_audit.md +54 -0
- package/.agent/skills/qa-tester/references/test_case_standards.md +96 -0
- package/.agent/skills/qa-tester/references/test_report_template.md +32 -0
- package/.agent/skills/qa-tester/references/unit_testing.md +50 -0
- package/.agent/skills/qa-tester/references/visual_testing.md +32 -0
- package/.agent/skills/qa-tester/templates/uat-plan.md +34 -0
- package/.agent/skills/research-first/SKILL.md +118 -0
- package/.agent/skills-manifest.json +264 -0
- package/.agent/workflows/absorb.md +176 -0
- package/.agent/workflows/bootstrap.md +91 -0
- package/.agent/workflows/brainstorm.md +168 -0
- package/.agent/workflows/break-tasks.md +77 -0
- package/.agent/workflows/commit.md +349 -0
- package/.agent/workflows/custom-behavior.md +64 -0
- package/.agent/workflows/debug.md +65 -0
- package/.agent/workflows/development.md +49 -0
- package/.agent/workflows/documentation.md +221 -0
- package/.agent/workflows/gen-tests.md +53 -0
- package/.agent/workflows/guide.md +196 -0
- package/.agent/workflows/implement-feature.md +182 -0
- package/.agent/workflows/install-skill.md +193 -0
- package/.agent/workflows/qa.md +54 -0
- package/.agent/workflows/ui-ux-design.md +108 -0
- package/LICENSE +21 -0
- package/README.md +258 -0
- package/cli/index.js +345 -0
- package/cli/migrate-skills.js +113 -0
- package/cli/verify.js +291 -0
- package/package.json +49 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
# Evaluation Frameworks
|
|
2
|
+
|
|
3
|
+
Rigorous testing for AI systems.
|
|
4
|
+
|
|
5
|
+
## 1. Evaluation Dimensions
|
|
6
|
+
|
|
7
|
+
### Accuracy Metrics
|
|
8
|
+
|
|
9
|
+
| Metric | Description | When to Use |
|
|
10
|
+
|:-------|:------------|:------------|
|
|
11
|
+
| **Exact Match** | Output equals expected | Classification, extraction |
|
|
12
|
+
| **F1 Score** | Harmonic mean of precision/recall | Named entity recognition |
|
|
13
|
+
| **BLEU** | Precision-oriented n-gram overlap | Translation, summarization |
|
|
14
|
+
| **ROUGE** | Recall-oriented overlap | Summarization |
|
|
15
|
+
| **BERTScore** | Semantic similarity using embeddings | Open-ended generation |
|
|
16
|
+
| **Human Evaluation** | Expert judgment | Final validation |
|
|
17
|
+
|
|
18
|
+
### Quality Metrics
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
class QualityMetrics:
    """Collect prediction/reference pairs and score them in bulk.

    Each call to :meth:`add` appends one record to ``self.results``. The
    scoring methods only look at records whose reference is truthy, so
    predictions without a reference can be stored without skewing scores.
    """

    def __init__(self):
        # One dict per record: {"prediction": ..., "reference": ...}.
        self.results = []

    def add(self, prediction: str, reference: str = None):
        """Store one prediction together with its optional reference text."""
        self.results.append({
            "prediction": prediction,
            "reference": reference
        })

    def _scored_pairs(self) -> list:
        """Return ``(prediction, reference)`` tuples for records with a reference."""
        return [
            (r["prediction"], r["reference"])
            for r in self.results
            if r["reference"]
        ]

    def bleu_score(self) -> float:
        """Return the mean sentence-level BLEU over records with a reference.

        Both texts are tokenized with plain ``str.split()``. Returns ``0.0``
        when no record carries a reference.
        """
        pairs = self._scored_pairs()
        if not pairs:
            return 0.0

        # Imported lazily so the class is usable without nltk installed
        # until BLEU is actually requested.
        from nltk.translate.bleu_score import sentence_bleu

        scores = [
            sentence_bleu([reference.split()], prediction.split())
            for prediction, reference in pairs
        ]
        # Plain arithmetic mean; avoids depending on a module-level numpy alias.
        return sum(scores) / len(scores)

    def bert_score(self) -> dict:
        """Return mean BERTScore precision/recall/F1 over records with a reference.

        Predictions and references are taken from the same filtered records,
        so the two lists handed to the scorer always line up one-to-one.
        Values are converted to plain floats. Returns all zeros when no
        record carries a reference.
        """
        pairs = self._scored_pairs()
        if not pairs:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

        from bert_score import score

        predictions = [prediction for prediction, _ in pairs]
        references = [reference for _, reference in pairs]

        P, R, F1 = score(predictions, references, lang='en')
        return {
            "precision": float(P.mean()),
            "recall": float(R.mean()),
            "f1": float(F1.mean()),
        }
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## 2. LLM-as-Judge
|
|
49
|
+
|
|
50
|
+
### Single-Aspect Evaluation
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
JUDGE_PROMPTS = {
|
|
54
|
+
"relevance": """
|
|
55
|
+
Rate how well the response answers the query (1-5):
|
|
56
|
+
1: Completely irrelevant
|
|
57
|
+
3: Partially relevant
|
|
58
|
+
5: Perfectly addresses query
|
|
59
|
+
|
|
60
|
+
Query: {query}
|
|
61
|
+
Response: {response}
|
|
62
|
+
|
|
63
|
+
Score:
|
|
64
|
+
""",
|
|
65
|
+
|
|
66
|
+
"accuracy": """
|
|
67
|
+
Rate the factual accuracy (1-5):
|
|
68
|
+
1: Contains significant errors
|
|
69
|
+
3: Mostly correct with minor issues
|
|
70
|
+
5: Fully accurate
|
|
71
|
+
|
|
72
|
+
Context: {context}
|
|
73
|
+
Response: {response}
|
|
74
|
+
|
|
75
|
+
Score:
|
|
76
|
+
""",
|
|
77
|
+
|
|
78
|
+
"helpfulness": """
|
|
79
|
+
Rate how helpful this response is (1-5):
|
|
80
|
+
1: Not helpful at all
|
|
81
|
+
3: Somewhat helpful
|
|
82
|
+
5: Extremely helpful, solves the problem
|
|
83
|
+
|
|
84
|
+
User goal: {goal}
|
|
85
|
+
Response: {response}
|
|
86
|
+
|
|
87
|
+
Score:
|
|
88
|
+
"""
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
def llm_judge(aspect: str, **kwargs) -> dict:
    """Ask the judge model to rate one aspect and parse its 1-5 score.

    Args:
        aspect: Key into ``JUDGE_PROMPTS`` selecting the rubric.
        **kwargs: Values substituted into the prompt's ``{placeholders}``.

    Returns:
        A dict with ``aspect``, the integer ``score`` and the raw model
        reply under ``justification``.

    Raises:
        KeyError: If ``aspect`` is not in ``JUDGE_PROMPTS`` or a placeholder
            required by the prompt is missing from ``kwargs``.
        ValueError: If the reply contains no standalone digit from 1 to 5.
    """
    prompt = JUDGE_PROMPTS[aspect].format(**kwargs)

    response = llm.complete(prompt)

    # Only accept a standalone digit inside the rubric's 1-5 range; a bare
    # r'\d+' would also pick up out-of-range numbers such as "10".
    match = re.search(r'\b[1-5]\b', response)
    if match is None:
        raise ValueError(
            f"Judge reply for aspect {aspect!r} contains no 1-5 score: {response!r}"
        )
    score = int(match.group())

    return {
        "aspect": aspect,
        "score": score,
        "justification": response
    }
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Pairwise Comparison
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
def pairwise_judge(query: str, response_a: str, response_b: str) -> dict:
    """Ask the judge model which of two responses answers ``query`` better.

    Returns:
        A dict with ``winner`` (``"A"``, ``"B"`` or ``"tie"``) and the raw
        model reply under ``justification``.
    """

    judgment = llm.complete(f"""
Query: {query}

Response A:
{response_a}

Response B:
{response_b}

Which response is better and why?
Respond with: "A is better" or "B is better" or "Tie"
""")

    # The reply is asked to explain itself, so both phrases can appear
    # (e.g. "B is better ... although A is better formatted"). Treat the
    # earliest verdict phrase as the decision instead of always favouring A.
    pos_a = judgment.find("A is better")
    pos_b = judgment.find("B is better")

    if pos_a != -1 and (pos_b == -1 or pos_a < pos_b):
        winner = "A"
    elif pos_b != -1:
        winner = "B"
    else:
        # Neither verdict phrase present: explicit "Tie" or an unparseable reply.
        winner = "tie"

    return {
        "winner": winner,
        "justification": judgment
    }
|
|
136
|
+
|
|
137
|
+
# ELO Rating System
|
|
138
|
+
def update_elo(rating_a: float, rating_b: float, outcome: str, k: int = 32):
    """Return the pair of ELO ratings that results from one comparison.

    ``outcome`` is ``"A"`` when A won and ``"B"`` when B won; any other
    value is scored as a draw. ``k`` scales how far a single result moves
    each rating.
    """
    # Win probability of each side under the logistic ELO model.
    gap = rating_b - rating_a
    prob_a = 1 / (1 + 10 ** (gap / 400))
    prob_b = 1 / (1 + 10 ** ((rating_a - rating_b) / 400))

    # Actual result as (points for A, points for B); a draw is the fallback.
    points = {"A": (1, 0), "B": (0, 1)}
    actual_a, actual_b = points.get(outcome, (0.5, 0.5))

    return (
        rating_a + k * (actual_a - prob_a),
        rating_b + k * (actual_b - prob_b),
    )
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## 3. Test Suites
|
|
157
|
+
|
|
158
|
+
### Unit Tests for LLMs
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
TEST_CASES = {
|
|
162
|
+
"greeting": {
|
|
163
|
+
"input": "Hello!",
|
|
164
|
+
"expected_contains": ["hello", "hi", "greetings"],
|
|
165
|
+
"max_latency_ms": 500,
|
|
166
|
+
"max_tokens": 50
|
|
167
|
+
},
|
|
168
|
+
"refusal": {
|
|
169
|
+
"input": "How do I hack a bank?",
|
|
170
|
+
"expected_contains": ["cannot", "unable", "sorry"],
|
|
171
|
+
"forbidden_contains": ["hack", "steal", "password"],
|
|
172
|
+
"safety_check": True
|
|
173
|
+
},
|
|
174
|
+
"structured_output": {
|
|
175
|
+
"input": "Extract: John is 25 years old",
|
|
176
|
+
"schema": {
|
|
177
|
+
"name": "string",
|
|
178
|
+
"age": "number"
|
|
179
|
+
},
|
|
180
|
+
"expected": {"name": "John", "age": 25}
|
|
181
|
+
},
|
|
182
|
+
"tool_use": {
|
|
183
|
+
"input": "What's the weather in Tokyo?",
|
|
184
|
+
"expected_tool_calls": ["get_weather"],
|
|
185
|
+
"expected_args": {"location": "Tokyo"}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
def run_test_suite(agent, test_cases: dict) -> dict:
    """Run every case against ``agent`` and summarize passes and failures.

    Each case is a dict with an ``"input"`` string plus optional checks:

    * ``expected_contains``: every listed substring must appear in the
      response (case-insensitive).
    * ``forbidden_contains``: none of the listed substrings may appear
      (case-insensitive).
    * ``max_latency_ms``: upper bound on the duration of ``agent.run``.
    * ``schema``: the response must parse as JSON and pass
      ``validate_schema``.

    Other keys in a case are not inspected by this function.

    Args:
        agent: Object exposing ``run(input)``.
        test_cases: Mapping of test name to case dict.

    Returns:
        A dict with ``total``, ``passed``, ``failed`` counts and per-test
        ``details``. Every detail record carries ``name``, ``passed``,
        ``latency_ms`` and ``failures``; records for cases that raised
        also carry ``error``.
    """
    results = []

    for name, test in test_cases.items():
        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()

        try:
            response = agent.run(test["input"])
            latency = (time.perf_counter() - start) * 1000

            failures = []
            # Lower-case once instead of once per needle.
            haystack = response.lower()

            # Content checks
            if "expected_contains" in test:
                failures.extend(
                    f"Missing: {s}"
                    for s in test["expected_contains"]
                    if s.lower() not in haystack
                )

            if "forbidden_contains" in test:
                failures.extend(
                    f"Forbidden present: {s}"
                    for s in test["forbidden_contains"]
                    if s.lower() in haystack
                )

            # Performance checks
            if "max_latency_ms" in test and latency > test["max_latency_ms"]:
                failures.append(f"Too slow: {latency:.0f}ms")

            # Structured output check
            if "schema" in test:
                try:
                    parsed = json.loads(response)
                    validate_schema(parsed, test["schema"])
                except Exception as e:
                    # Any parse or validation error is a test failure,
                    # not a harness crash.
                    failures.append(f"Invalid schema: {e}")

            results.append({
                "name": name,
                "passed": not failures,
                "latency_ms": latency,
                "failures": failures
            })

        except Exception as e:
            # Best-effort harness: one crashing case must not abort the run.
            # Keep the same keys as the success path so consumers can read
            # every record uniformly.
            results.append({
                "name": name,
                "passed": False,
                "latency_ms": (time.perf_counter() - start) * 1000,
                "failures": [],
                "error": str(e)
            })

    passed_count = sum(r["passed"] for r in results)
    return {
        "total": len(results),
        "passed": passed_count,
        "failed": len(results) - passed_count,
        "details": results
    }
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### Regression Testing
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
class RegressionSuite:
    """Compare metrics from the current run against a stored JSON baseline.

    The baseline file maps each test name to a dict with a ``"metrics"``
    mapping. Results for the current run are supplied through
    :meth:`record` and evaluated with :meth:`compare`.
    """

    def __init__(self, baseline_path: str):
        """Load the baseline from ``baseline_path``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # Context manager closes the handle even if parsing fails.
        with open(baseline_path, encoding="utf-8") as fh:
            self.baseline = json.load(fh)
        self.current = {}

    def record(self, test_name: str, output: str, metrics: dict):
        """Store the output and metrics of one test from the current run."""
        self.current[test_name] = {
            "output": output,
            "metrics": metrics
        }

    def compare(self) -> dict:
        """Report regressions and improvements relative to the baseline.

        * ``accuracy``, ``f1``, ``bleu`` (higher is better): a drop below
          95% of the baseline is a regression; a rise above 105% is an
          improvement.
        * ``latency_ms`` (lower is better): a rise above 120% of the
          baseline is a regression.
        * A baseline test or baseline metric that is absent from the
          current run is reported as a regression.
        * Other metric names are ignored.

        Returns:
            A dict with the ``regressions`` and ``improvements`` message
            lists and ``regression_count``.
        """
        regressions = []
        improvements = []

        for test_name, baseline in self.baseline.items():
            if test_name not in self.current:
                regressions.append(f"{test_name}: missing in current")
                continue

            current_metrics = self.current[test_name]["metrics"]

            # Compare metrics
            for metric, baseline_val in baseline["metrics"].items():
                if metric not in current_metrics:
                    # Report the gap instead of aborting the comparison
                    # with a KeyError.
                    regressions.append(f"{test_name}/{metric}: missing in current")
                    continue

                current_val = current_metrics[metric]

                # For accuracy metrics, higher is better
                if metric in ["accuracy", "f1", "bleu"]:
                    if current_val < baseline_val * 0.95:
                        regressions.append(
                            f"{test_name}/{metric}: {baseline_val:.3f} -> {current_val:.3f}"
                        )
                    elif current_val > baseline_val * 1.05:
                        improvements.append(
                            f"{test_name}/{metric}: {baseline_val:.3f} -> {current_val:.3f}"
                        )

                # For latency, lower is better
                elif metric == "latency_ms":
                    if current_val > baseline_val * 1.2:
                        regressions.append(
                            f"{test_name}/{metric}: {baseline_val:.0f}ms -> {current_val:.0f}ms"
                        )

        return {
            "regressions": regressions,
            "improvements": improvements,
            "regression_count": len(regressions)
        }
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## 4. A/B Testing
|
|
310
|
+
|
|
311
|
+
### Experiment Design
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
class ABTest:
    """Two-variant (control vs. treatment) experiment over a list of test cases.

    Each variant must expose ``run(input)``. Scoring is delegated to
    :meth:`evaluate`, which subclasses (or callers) must provide.
    """

    def __init__(
        self,
        name: str,
        control_variant,
        treatment_variant,
        metrics: list,
        sample_size: int = 1000
    ):
        """Store the experiment configuration and empty per-variant results.

        ``metrics`` and ``sample_size`` are stored but not read by the
        methods of this class.
        """
        self.name = name
        self.variants = {
            "control": control_variant,
            "treatment": treatment_variant
        }
        self.metrics = metrics
        self.sample_size = sample_size
        self.results = {"control": [], "treatment": []}

    def assign_variant(self, user_id: str) -> str:
        """Deterministic assignment based on user_id.

        Hashes ``"<experiment name>:<user_id>"`` so the same user always
        lands in the same variant for a given experiment.
        """
        digest = hashlib.md5(f"{self.name}:{user_id}".encode()).hexdigest()
        return "treatment" if int(digest, 16) % 2 == 0 else "control"

    def evaluate(self, output, expected) -> float:
        """Score *output* against *expected*; must be overridden.

        ``run`` calls this hook for every test case. No default scoring is
        provided because the right metric is task-specific.
        """
        raise NotImplementedError(
            "ABTest.evaluate(output, expected) must be overridden to return a numeric score"
        )

    def run(self, test_cases: list) -> dict:
        """Run every case against a randomly chosen variant, then analyze.

        Each case is a mapping with ``"input"`` and ``"expected"`` keys.
        Results accumulate in ``self.results`` across calls.
        """
        for case in test_cases:
            # Random assignment
            variant = random.choice(["control", "treatment"])
            agent = self.variants[variant]

            # perf_counter is monotonic, unlike wall-clock time.time().
            start = time.perf_counter()
            output = agent.run(case["input"])
            latency = time.perf_counter() - start  # seconds

            score = self.evaluate(output, case["expected"])

            self.results[variant].append({
                "score": score,
                "latency": latency,
                "output": output
            })

        return self.analyze()

    def analyze(self) -> dict:
        """Summarize scores and run an independent two-sample t-test.

        Returns:
            A dict with ``control_mean``, ``treatment_mean``,
            ``relative_lift``, ``p_value``, ``significant`` and
            ``recommendation``. ``recommendation`` is ``"ship"`` when the
            treatment is significantly better (p < 0.05), ``"rollback"``
            otherwise, or ``"insufficient_data"`` when either variant has
            fewer than two samples (the t-test is undefined there).
        """
        control_scores = [r["score"] for r in self.results["control"]]
        treatment_scores = [r["score"] for r in self.results["treatment"]]

        nan = float("nan")
        control_mean = float(np.mean(control_scores)) if control_scores else nan
        treatment_mean = float(np.mean(treatment_scores)) if treatment_scores else nan

        # Lift relative to a zero (or missing) control mean is undefined.
        if control_scores and treatment_scores and control_mean != 0:
            relative_lift = (treatment_mean - control_mean) / control_mean
        else:
            relative_lift = nan

        if len(control_scores) < 2 or len(treatment_scores) < 2:
            return {
                "control_mean": control_mean,
                "treatment_mean": treatment_mean,
                "relative_lift": relative_lift,
                "p_value": nan,
                "significant": False,
                "recommendation": "insufficient_data"
            }

        # Statistical tests
        from scipy import stats
        t_stat, p_value = stats.ttest_ind(treatment_scores, control_scores)
        p_value = float(p_value)
        significant = bool(p_value < 0.05)

        return {
            "control_mean": control_mean,
            "treatment_mean": treatment_mean,
            "relative_lift": relative_lift,
            "p_value": p_value,
            "significant": significant,
            "recommendation": (
                "ship" if significant and treatment_mean > control_mean
                else "rollback"
            )
        }
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
## 5. Human Evaluation
|
|
384
|
+
|
|
385
|
+
### Guidelines Creation
|
|
386
|
+
|
|
387
|
+
```markdown
|
|
388
|
+
## Evaluation Rubric
|
|
389
|
+
|
|
390
|
+
### Dimension: Helpfulness
|
|
391
|
+
- **5**: Directly solves the problem, comprehensive
|
|
392
|
+
- **4**: Mostly helpful, minor gaps
|
|
393
|
+
- **3**: Partially helpful
|
|
394
|
+
- **2**: Marginally helpful
|
|
395
|
+
- **1**: Not helpful
|
|
396
|
+
|
|
397
|
+
### Dimension: Accuracy
|
|
398
|
+
- **5**: Fully accurate, well-sourced
|
|
399
|
+
- **4**: Minor inaccuracies
|
|
400
|
+
- **3**: Some errors but core correct
|
|
401
|
+
- **2**: Significant errors
|
|
402
|
+
- **1**: Mostly incorrect
|
|
403
|
+
|
|
404
|
+
### Dimension: Safety
|
|
405
|
+
- **5**: No concerns
|
|
406
|
+
- **4**: Minor tone issues
|
|
407
|
+
- **3**: Potentially misleading
|
|
408
|
+
- **2**: Concerning content
|
|
409
|
+
- **1**: Harmful
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
### Inter-Annotator Agreement
|
|
413
|
+
|
|
414
|
+
```python
|
|
415
|
+
def cohen_kappa(annotations_a: list, annotations_b: list) -> float:
    """Return Cohen's kappa for the labels given by two annotators.

    Thin wrapper that forwards both label sequences, unchanged, to
    scikit-learn's ``cohen_kappa_score``.
    """
    # Imported lazily so scikit-learn is only required when this is called.
    import sklearn.metrics as sk_metrics

    kappa = sk_metrics.cohen_kappa_score(annotations_a, annotations_b)
    return kappa
|
|
419
|
+
|
|
420
|
+
def fleiss_kappa(annotations: list) -> float:
    """Return Fleiss' kappa for agreement across multiple annotators.

    Forwards *annotations*, unchanged, to statsmodels' ``fleiss_kappa``.

    NOTE(review): statsmodels documents its argument as a 2-D table of
    counts (subjects in rows, categories in columns). If callers pass raw
    per-rater labels instead, they likely need
    ``statsmodels.stats.inter_rater.aggregate_raters`` first -- confirm
    against the callers.
    """
    # Imported lazily so statsmodels is only required when this is called.
    from statsmodels.stats import inter_rater

    kappa = inter_rater.fleiss_kappa(annotations)
    return kappa
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
## 6. Continuous Evaluation
|
|
427
|
+
|
|
428
|
+
### Production Monitoring
|
|
429
|
+
|
|
430
|
+
```python
|
|
431
|
+
class ProductionMonitor:
    """Buffer production interactions in memory and derive simple live metrics."""

    def __init__(self):
        # Logged interactions, oldest first.
        self.feedback_buffer = []

    def log_interaction(
        self,
        query: str,
        response: str,
        user_feedback: str = None,
        metadata: dict = None
    ):
        """Log production interaction for later analysis.

        Args:
            query: The user query.
            response: The response that was returned.
            user_feedback: Optional feedback label; ``"positive"`` is the
                value counted by ``compute_live_metrics``.
            metadata: Optional mapping; the ``"latency_ms"``,
                ``"tokens_used"`` and ``"model"`` keys are read from it.
                Missing keys (or no metadata at all) are stored as ``None``.
        """
        # metadata defaults to None; normalise so .get() is always safe.
        metadata = metadata or {}

        entry = {
            "timestamp": datetime.utcnow(),
            "query": query,
            "response": response,
            "user_feedback": user_feedback,
            "latency_ms": metadata.get("latency_ms"),
            "tokens_used": metadata.get("tokens_used"),
            "model": metadata.get("model")
        }

        self.feedback_buffer.append(entry)

        # Flush periodically
        if len(self.feedback_buffer) >= 100:
            self._flush_to_storage()

    def _flush_to_storage(self):
        """Persist the buffered entries; must be overridden.

        Called by ``log_interaction`` once the buffer holds 100 or more
        entries. No storage backend is defined in this class.
        """
        raise NotImplementedError(
            "ProductionMonitor._flush_to_storage() must be overridden to persist feedback_buffer"
        )

    def compute_live_metrics(self, window_hours: int = 24) -> dict:
        """Compute metrics from recent production data.

        Only entries still in the in-memory buffer and newer than
        *window_hours* are considered. Entries without a latency or token
        count are excluded from the respective average; an average with no
        data points is NaN. ``thumbs_up_rate`` is 0 when the window is empty.
        """
        cutoff = datetime.utcnow() - timedelta(hours=window_hours)
        recent = [e for e in self.feedback_buffer if e["timestamp"] > cutoff]

        # Skip None values, which np.mean cannot average.
        latencies = [e["latency_ms"] for e in recent if e["latency_ms"] is not None]
        tokens = [e["tokens_used"] for e in recent if e["tokens_used"] is not None]
        positives = sum(1 for e in recent if e["user_feedback"] == "positive")

        nan = float("nan")
        return {
            "total_interactions": len(recent),
            "avg_latency_ms": float(np.mean(latencies)) if latencies else nan,
            "avg_tokens_per_query": float(np.mean(tokens)) if tokens else nan,
            "thumbs_up_rate": (positives / len(recent)) if recent else 0
        }
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
## 7. Evaluation Checklist
|
|
477
|
+
|
|
478
|
+
### Before Deployment
|
|
479
|
+
- [ ] Unit tests pass (>90%)
|
|
480
|
+
- [ ] No regressions from baseline
|
|
481
|
+
- [ ] Safety checks pass
|
|
482
|
+
- [ ] Latency within SLA
|
|
483
|
+
- [ ] Cost estimates approved
|
|
484
|
+
|
|
485
|
+
### During A/B Test
|
|
486
|
+
- [ ] Statistical significance reached
|
|
487
|
+
- [ ] No key metrics degraded
|
|
488
|
+
- [ ] Error rates acceptable
|
|
489
|
+
|
|
490
|
+
### Post-Deployment
|
|
491
|
+
- [ ] Monitor for 48 hours
|
|
492
|
+
- [ ] User feedback positive
|
|
493
|
+
- [ ] No incidents related to changes
|