@ngxtm/devkit 3.6.1 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/SKILLS_INDEX.md +2419 -634
- package/cli/update.js +145 -77
- package/merged-commands/3d-web-experience.md +254 -0
- package/merged-commands/ab-test-setup.md +232 -0
- package/merged-commands/accessibility-compliance-accessibility-audit.md +42 -0
- package/merged-commands/active-directory-attacks.md +383 -0
- package/merged-commands/address-github-comments.md +55 -0
- package/merged-commands/aesthetic.md +134 -0
- package/merged-commands/agent-evaluation.md +64 -0
- package/merged-commands/agent-framework-azure-hosted-agents.md +332 -0
- package/merged-commands/agent-manager-skill.md +40 -0
- package/merged-commands/agent-memory-mcp.md +82 -0
- package/merged-commands/agent-memory-systems.md +67 -0
- package/merged-commands/agent-orchestration-improve-agent.md +349 -0
- package/merged-commands/agent-orchestration-multi-agent-optimize.md +239 -0
- package/merged-commands/agent-tool-builder.md +53 -0
- package/merged-commands/agile-product-owner.md +31 -0
- package/merged-commands/ai-agents-architect.md +90 -0
- package/merged-commands/ai-artist.md +75 -0
- package/merged-commands/ai-engineer.md +171 -0
- package/merged-commands/ai-multimodal.md +109 -0
- package/merged-commands/ai-product.md +54 -0
- package/merged-commands/ai-wrapper-product.md +273 -0
- package/merged-commands/airflow-dag-patterns.md +41 -0
- package/merged-commands/algolia-search.md +66 -0
- package/merged-commands/algorithmic-art.md +405 -0
- package/merged-commands/analytics-tracking.md +404 -0
- package/merged-commands/angular-architect.md +97 -0
- package/merged-commands/angular-migration.md +428 -0
- package/merged-commands/anti-reversing-techniques.md +42 -0
- package/merged-commands/api-design-principles.md +37 -0
- package/merged-commands/api-designer.md +101 -0
- package/merged-commands/api-documentation-generator.md +484 -0
- package/merged-commands/api-documenter.md +184 -0
- package/merged-commands/api-fuzzing-bug-bounty.md +433 -0
- package/merged-commands/api-patterns.md +81 -0
- package/merged-commands/api-security-best-practices.md +907 -0
- package/merged-commands/api-testing-observability-api-mock.md +46 -0
- package/merged-commands/app-builder.md +75 -0
- package/merged-commands/app-store-optimization.md +403 -0
- package/merged-commands/application-performance-performance-optimization.md +154 -0
- package/merged-commands/architect-review.md +174 -0
- package/merged-commands/architecture-decision-records.md +441 -0
- package/merged-commands/architecture-designer.md +89 -0
- package/merged-commands/architecture-patterns.md +37 -0
- package/merged-commands/architecture.md +55 -0
- package/merged-commands/arm-cortex-expert.md +306 -0
- package/merged-commands/artifacts-builder.md +74 -0
- package/merged-commands/ask-questions-if-underspecified.md +81 -0
- package/merged-commands/async-python-patterns.md +39 -0
- package/merged-commands/atlassian-mcp.md +100 -0
- package/merged-commands/attack-tree-construction.md +38 -0
- package/merged-commands/auth-implementation-patterns.md +39 -0
- package/merged-commands/automate-whatsapp.md +257 -0
- package/merged-commands/autonomous-agent-patterns.md +761 -0
- package/merged-commands/autonomous-agents.md +68 -0
- package/merged-commands/avalonia-layout-zafiro.md +59 -0
- package/merged-commands/avalonia-viewmodels-zafiro.md +29 -0
- package/merged-commands/avalonia-zafiro-development.md +29 -0
- package/merged-commands/aws-agentic-ai.md +117 -0
- package/merged-commands/aws-cdk-development.md +278 -0
- package/merged-commands/aws-cost-operations.md +317 -0
- package/merged-commands/aws-penetration-testing.md +405 -0
- package/merged-commands/aws-serverless-eda.md +757 -0
- package/merged-commands/aws-serverless.md +323 -0
- package/merged-commands/aws-skills.md +22 -0
- package/merged-commands/azd-deployment.md +296 -0
- package/merged-commands/azure-ai-agents-python.md +277 -0
- package/merged-commands/azure-ai-search-python.md +198 -0
- package/merged-commands/azure-ai-voicelive-skill.md +294 -0
- package/merged-commands/azure-functions.md +42 -0
- package/merged-commands/backend-architect.md +333 -0
- package/merged-commands/backend-dev-guidelines.md +342 -0
- package/merged-commands/backend-development-feature-development.md +180 -0
- package/merged-commands/backend-development.md +155 -0
- package/merged-commands/backend-security-coder.md +156 -0
- package/merged-commands/backtesting-frameworks.md +39 -0
- package/merged-commands/bash-defensive-patterns.md +43 -0
- package/merged-commands/bash-linux.md +199 -0
- package/merged-commands/bash-pro.md +310 -0
- package/merged-commands/bats-testing-patterns.md +34 -0
- package/merged-commands/bazel-build-optimization.md +397 -0
- package/merged-commands/beautiful-prose.md +22 -0
- package/merged-commands/behavioral-modes.md +242 -0
- package/merged-commands/best-practices.md +500 -0
- package/merged-commands/better-auth.md +204 -0
- package/merged-commands/billing-automation.md +42 -0
- package/merged-commands/binary-analysis-patterns.md +450 -0
- package/merged-commands/blockchain-developer.md +208 -0
- package/merged-commands/blockrun.md +292 -0
- package/merged-commands/brainstorming.md +230 -0
- package/merged-commands/brand-guidelines-anthropic.md +73 -0
- package/merged-commands/brand-guidelines-community.md +73 -0
- package/merged-commands/brand-guidelines.md +73 -0
- package/merged-commands/broken-authentication.md +476 -0
- package/merged-commands/browser-automation.md +70 -0
- package/merged-commands/browser-extension-builder.md +261 -0
- package/merged-commands/building-ai-agent-on-cloudflare.md +391 -0
- package/merged-commands/building-mcp-server-on-cloudflare.md +265 -0
- package/merged-commands/bullmq-specialist.md +57 -0
- package/merged-commands/bun-development.md +691 -0
- package/merged-commands/burp-suite-testing.md +380 -0
- package/merged-commands/business-analyst.md +182 -0
- package/merged-commands/busybox-on-windows.md +30 -0
- package/merged-commands/c-pro.md +56 -0
- package/merged-commands/c4-architecture-c4-architecture.md +389 -0
- package/merged-commands/c4-code.md +244 -0
- package/merged-commands/c4-component.md +153 -0
- package/merged-commands/c4-container.md +171 -0
- package/merged-commands/c4-context.md +150 -0
- package/merged-commands/canvas-design.md +130 -0
- package/merged-commands/cc-skill-backend-patterns.md +584 -0
- package/merged-commands/cc-skill-clickhouse-io.md +431 -0
- package/merged-commands/cc-skill-coding-standards.md +522 -0
- package/merged-commands/cc-skill-continuous-learning.md +10 -0
- package/merged-commands/cc-skill-frontend-patterns.md +633 -0
- package/merged-commands/cc-skill-project-guidelines-example.md +352 -0
- package/merged-commands/cc-skill-security-review.md +496 -0
- package/merged-commands/cc-skill-strategic-compact.md +10 -0
- package/merged-commands/changelog-automation.md +38 -0
- package/merged-commands/changelog-generator.md +104 -0
- package/merged-commands/chaos-engineer.md +98 -0
- package/merged-commands/chrome-devtools.md +407 -0
- package/merged-commands/cicd-automation-workflow-automate.md +51 -0
- package/merged-commands/clarity-gate.md +22 -0
- package/merged-commands/claude-ally-health.md +22 -0
- package/merged-commands/claude-code-guide.md +68 -0
- package/merged-commands/claude-d3js-skill.md +820 -0
- package/merged-commands/claude-scientific-skills.md +22 -0
- package/merged-commands/claude-speed-reader.md +22 -0
- package/merged-commands/claude-win11-speckit-update-skill.md +22 -0
- package/merged-commands/clean-code.md +201 -0
- package/merged-commands/clerk-auth.md +56 -0
- package/merged-commands/cli-developer.md +97 -0
- package/merged-commands/cloud-architect.md +135 -0
- package/merged-commands/cloud-penetration-testing.md +501 -0
- package/merged-commands/cloudflare-expert.md +227 -0
- package/merged-commands/code-documentation-code-explain.md +46 -0
- package/merged-commands/code-documentation-doc-generate.md +48 -0
- package/merged-commands/code-documentation.md +263 -0
- package/merged-commands/code-documenter.md +95 -0
- package/merged-commands/code-refactoring-context-restore.md +179 -0
- package/merged-commands/code-refactoring-refactor-clean.md +51 -0
- package/merged-commands/code-refactoring-tech-debt.md +386 -0
- package/merged-commands/code-refactoring.md +209 -0
- package/merged-commands/code-review-ai-ai-review.md +450 -0
- package/merged-commands/code-review-checklist.md +444 -0
- package/merged-commands/code-review-excellence.md +40 -0
- package/merged-commands/code-review.md +121 -0
- package/merged-commands/code-reviewer.md +178 -0
- package/merged-commands/codebase-cleanup-deps-audit.md +51 -0
- package/merged-commands/codebase-cleanup-refactor-clean.md +51 -0
- package/merged-commands/codebase-cleanup-tech-debt.md +386 -0
- package/merged-commands/codex-review.md +37 -0
- package/merged-commands/commit.md +171 -0
- package/merged-commands/competitive-ads-extractor.md +293 -0
- package/merged-commands/competitive-landscape.md +34 -0
- package/merged-commands/competitor-alternatives.md +750 -0
- package/merged-commands/comprehensive-review-full-review.md +146 -0
- package/merged-commands/comprehensive-review-pr-enhance.md +46 -0
- package/merged-commands/computer-use-agents.md +315 -0
- package/merged-commands/concise-planning.md +62 -0
- package/merged-commands/conductor-implement.md +388 -0
- package/merged-commands/conductor-manage.md +39 -0
- package/merged-commands/conductor-new-track.md +433 -0
- package/merged-commands/conductor-revert.md +372 -0
- package/merged-commands/conductor-setup.md +426 -0
- package/merged-commands/conductor-status.md +338 -0
- package/merged-commands/conductor-validator.md +62 -0
- package/merged-commands/content-creator.md +248 -0
- package/merged-commands/content-marketer.md +170 -0
- package/merged-commands/content-research-writer.md +538 -0
- package/merged-commands/context-compression.md +266 -0
- package/merged-commands/context-degradation.md +238 -0
- package/merged-commands/context-driven-development.md +400 -0
- package/merged-commands/context-engineering.md +107 -0
- package/merged-commands/context-fundamentals.md +192 -0
- package/merged-commands/context-management-context-restore.md +179 -0
- package/merged-commands/context-management-context-save.md +177 -0
- package/merged-commands/context-manager.md +185 -0
- package/merged-commands/context-optimization.md +186 -0
- package/merged-commands/context-window-management.md +53 -0
- package/merged-commands/context7-auto-research.md +36 -0
- package/merged-commands/conversation-memory.md +61 -0
- package/merged-commands/copy-editing.md +439 -0
- package/merged-commands/copywriting.md +225 -0
- package/merged-commands/core-components.md +264 -0
- package/merged-commands/cosmos-db-python-skill.md +198 -0
- package/merged-commands/cost-optimization.md +286 -0
- package/merged-commands/cpp-pro.md +59 -0
- package/merged-commands/cqrs-implementation.md +35 -0
- package/merged-commands/create-pr.md +192 -0
- package/merged-commands/crewai.md +243 -0
- package/merged-commands/csharp-developer.md +94 -0
- package/merged-commands/csharp-pro.md +59 -0
- package/merged-commands/culture-index.md +43 -0
- package/merged-commands/customer-support.md +170 -0
- package/merged-commands/daily-news-report.md +356 -0
- package/merged-commands/data-engineer.md +224 -0
- package/merged-commands/data-engineering-data-driven-feature.md +182 -0
- package/merged-commands/data-engineering-data-pipeline.md +201 -0
- package/merged-commands/data-quality-frameworks.md +40 -0
- package/merged-commands/data-scientist.md +199 -0
- package/merged-commands/data-storytelling.md +465 -0
- package/merged-commands/database-admin.md +165 -0
- package/merged-commands/database-architect.md +268 -0
- package/merged-commands/database-cloud-optimization-cost-optimize.md +44 -0
- package/merged-commands/database-design.md +52 -0
- package/merged-commands/database-migration.md +436 -0
- package/merged-commands/database-migrations-migration-observability.md +420 -0
- package/merged-commands/database-migrations-sql-migrations.md +53 -0
- package/merged-commands/database-optimizer.md +167 -0
- package/merged-commands/databases.md +232 -0
- package/merged-commands/dbt-transformation-patterns.md +34 -0
- package/merged-commands/debugger.md +49 -0
- package/merged-commands/debugging-strategies.md +34 -0
- package/merged-commands/debugging-toolkit-smart-debug.md +197 -0
- package/merged-commands/debugging-wizard.md +93 -0
- package/merged-commands/debugging.md +84 -0
- package/merged-commands/deep-research.md +114 -0
- package/merged-commands/defi-protocol-templates.md +466 -0
- package/merged-commands/dependency-management-deps-audit.md +44 -0
- package/merged-commands/dependency-upgrade.md +421 -0
- package/merged-commands/deployment-engineer.md +170 -0
- package/merged-commands/deployment-pipeline-design.md +371 -0
- package/merged-commands/deployment-procedures.md +241 -0
- package/merged-commands/deployment-validation-config-validate.md +496 -0
- package/merged-commands/design-md.md +178 -0
- package/merged-commands/design-orchestration.md +167 -0
- package/merged-commands/developer-growth-analysis.md +322 -0
- package/merged-commands/devops-engineer.md +92 -0
- package/merged-commands/devops-troubleshooter.md +161 -0
- package/merged-commands/devops.md +285 -0
- package/merged-commands/discord-bot-architect.md +277 -0
- package/merged-commands/dispatching-parallel-agents.md +180 -0
- package/merged-commands/distributed-debugging-debug-trace.md +44 -0
- package/merged-commands/distributed-tracing.md +450 -0
- package/merged-commands/django-expert.md +89 -0
- package/merged-commands/django-pro.md +180 -0
- package/merged-commands/doc-coauthoring.md +375 -0
- package/merged-commands/docker-expert.md +409 -0
- package/merged-commands/docs-architect.md +98 -0
- package/merged-commands/docs-seeker.md +102 -0
- package/merged-commands/documentation-generation-doc-generate.md +48 -0
- package/merged-commands/documentation-templates.md +194 -0
- package/merged-commands/docx-official.md +197 -0
- package/merged-commands/docx.md +197 -0
- package/merged-commands/domain-name-brainstormer.md +212 -0
- package/merged-commands/dotnet-architect.md +197 -0
- package/merged-commands/dotnet-backend-patterns.md +37 -0
- package/merged-commands/dotnet-core-expert.md +96 -0
- package/merged-commands/dx-optimizer.md +83 -0
- package/merged-commands/e2e-testing-patterns.md +41 -0
- package/merged-commands/elixir-pro.md +59 -0
- package/merged-commands/email-sequence.md +925 -0
- package/merged-commands/email-systems.md +54 -0
- package/merged-commands/embedded-systems.md +98 -0
- package/merged-commands/embedding-strategies.md +491 -0
- package/merged-commands/employment-contract-templates.md +39 -0
- package/merged-commands/environment-setup-guide.md +479 -0
- package/merged-commands/error-debugging-error-analysis.md +47 -0
- package/merged-commands/error-debugging-error-trace.md +43 -0
- package/merged-commands/error-debugging-multi-agent-review.md +216 -0
- package/merged-commands/error-detective.md +53 -0
- package/merged-commands/error-diagnostics-error-analysis.md +47 -0
- package/merged-commands/error-diagnostics-error-trace.md +48 -0
- package/merged-commands/error-diagnostics-smart-debug.md +197 -0
- package/merged-commands/error-handling-patterns.md +35 -0
- package/merged-commands/ethical-hacking-methodology.md +466 -0
- package/merged-commands/evaluation.md +238 -0
- package/merged-commands/event-sourcing-architect.md +58 -0
- package/merged-commands/event-store-design.md +449 -0
- package/merged-commands/exa-search.md +36 -0
- package/merged-commands/executing-plans.md +76 -0
- package/merged-commands/expo-app-design.md +22 -0
- package/merged-commands/expo-deployment.md +72 -0
- package/merged-commands/fal-audio.md +22 -0
- package/merged-commands/fal-generate.md +22 -0
- package/merged-commands/fal-image-edit.md +22 -0
- package/merged-commands/fal-platform.md +22 -0
- package/merged-commands/fal-upscale.md +22 -0
- package/merged-commands/fal-workflow.md +22 -0
- package/merged-commands/fastapi-expert.md +93 -0
- package/merged-commands/fastapi-pro.md +192 -0
- package/merged-commands/fastapi-router.md +52 -0
- package/merged-commands/fastapi-templates.md +32 -0
- package/merged-commands/feature-forge.md +90 -0
- package/merged-commands/ffuf-claude-skill.md +22 -0
- package/merged-commands/file-organizer.md +250 -0
- package/merged-commands/file-path-traversal.md +486 -0
- package/merged-commands/file-uploads.md +22 -0
- package/merged-commands/find-bugs.md +86 -0
- package/merged-commands/fine-tuning-expert.md +98 -0
- package/merged-commands/finishing-a-development-branch.md +200 -0
- package/merged-commands/firebase.md +56 -0
- package/merged-commands/firecrawl-scraper.md +37 -0
- package/merged-commands/firmware-analyst.md +320 -0
- package/merged-commands/fix-review.md +53 -0
- package/merged-commands/fixing.md +72 -0
- package/merged-commands/flutter-expert.md +200 -0
- package/merged-commands/form-cro.md +441 -0
- package/merged-commands/foundry-iq-agent.md +15 -0
- package/merged-commands/foundry-iq-python.md +275 -0
- package/merged-commands/foundry-nextgen-frontend.md +555 -0
- package/merged-commands/foundry-sdk-python.md +290 -0
- package/merged-commands/fp-ts-errors.md +856 -0
- package/merged-commands/fp-ts-pragmatic.md +598 -0
- package/merged-commands/fp-ts-react.md +796 -0
- package/merged-commands/framework-migration-code-migrate.md +48 -0
- package/merged-commands/framework-migration-deps-upgrade.md +48 -0
- package/merged-commands/framework-migration-legacy-modernize.md +132 -0
- package/merged-commands/free-tool-strategy.md +576 -0
- package/merged-commands/frontend-design.md +272 -0
- package/merged-commands/frontend-dev-guidelines.md +359 -0
- package/merged-commands/frontend-developer.md +171 -0
- package/merged-commands/frontend-development.md +399 -0
- package/merged-commands/frontend-mobile-development-component-scaffold.md +403 -0
- package/merged-commands/frontend-mobile-security-xss-scan.md +322 -0
- package/merged-commands/frontend-security-coder.md +170 -0
- package/merged-commands/frontend-slides.md +770 -0
- package/merged-commands/full-stack-orchestration-full-stack-feature.md +135 -0
- package/merged-commands/fullstack-guardian.md +99 -0
- package/merged-commands/game-developer.md +94 -0
- package/merged-commands/game-development.md +167 -0
- package/merged-commands/gcp-cloud-run.md +288 -0
- package/merged-commands/gdpr-data-handling.md +33 -0
- package/merged-commands/geo-fundamentals.md +156 -0
- package/merged-commands/git-advanced-workflows.md +412 -0
- package/merged-commands/git-pr-workflows-git-workflow.md +140 -0
- package/merged-commands/git-pr-workflows-onboard.md +416 -0
- package/merged-commands/git-pr-workflows-pr-enhance.md +48 -0
- package/merged-commands/git-pushing.md +33 -0
- package/merged-commands/github-actions-templates.md +345 -0
- package/merged-commands/github-workflow-automation.md +846 -0
- package/merged-commands/gitlab-ci-patterns.md +283 -0
- package/merged-commands/gitops-workflow.md +303 -0
- package/merged-commands/go-concurrency-patterns.md +33 -0
- package/merged-commands/godot-gdscript-patterns.md +33 -0
- package/merged-commands/golang-pro.md +179 -0
- package/merged-commands/google-adk-python.md +243 -0
- package/merged-commands/grafana-dashboards.md +381 -0
- package/merged-commands/graphql-architect.md +182 -0
- package/merged-commands/graphql.md +68 -0
- package/merged-commands/haskell-pro.md +56 -0
- package/merged-commands/helm-chart-scaffolding.md +34 -0
- package/merged-commands/hr-pro.md +126 -0
- package/merged-commands/html-injection-testing.md +498 -0
- package/merged-commands/hubspot-integration.md +42 -0
- package/merged-commands/hugging-face-cli.md +198 -0
- package/merged-commands/hugging-face-jobs.md +1038 -0
- package/merged-commands/hybrid-cloud-architect.md +168 -0
- package/merged-commands/hybrid-cloud-networking.md +238 -0
- package/merged-commands/hybrid-search-implementation.md +32 -0
- package/merged-commands/i18n-localization.md +154 -0
- package/merged-commands/idor-testing.md +442 -0
- package/merged-commands/image-enhancer.md +99 -0
- package/merged-commands/imagen.md +77 -0
- package/merged-commands/incident-responder.md +213 -0
- package/merged-commands/incident-response-incident-response.md +168 -0
- package/merged-commands/incident-response-smart-fix.md +29 -0
- package/merged-commands/incident-runbook-templates.md +395 -0
- package/merged-commands/infinite-gratitude.md +26 -0
- package/merged-commands/inngest.md +55 -0
- package/merged-commands/interactive-portfolio.md +223 -0
- package/merged-commands/internal-comms-anthropic.md +32 -0
- package/merged-commands/internal-comms-community.md +32 -0
- package/merged-commands/internal-comms.md +32 -0
- package/merged-commands/invoice-organizer.md +446 -0
- package/merged-commands/ios-developer.md +219 -0
- package/merged-commands/issue-creator.md +137 -0
- package/merged-commands/istio-traffic-management.md +337 -0
- package/merged-commands/iterate-pr.md +150 -0
- package/merged-commands/java-architect.md +95 -0
- package/merged-commands/java-pro.md +177 -0
- package/merged-commands/javascript-mastery.md +645 -0
- package/merged-commands/javascript-pro.md +57 -0
- package/merged-commands/javascript-testing-patterns.md +35 -0
- package/merged-commands/javascript-typescript-typescript-scaffold.md +361 -0
- package/merged-commands/javascript-typescript.md +142 -0
- package/merged-commands/jira-issues.md +181 -0
- package/merged-commands/job-application.md +90 -0
- package/merged-commands/julia-pro.md +209 -0
- package/merged-commands/k8s-manifest-generator.md +35 -0
- package/merged-commands/k8s-security-policies.md +346 -0
- package/merged-commands/kaizen.md +730 -0
- package/merged-commands/kotlin-specialist.md +94 -0
- package/merged-commands/kpi-dashboard-design.md +440 -0
- package/merged-commands/kubernetes-architect.md +170 -0
- package/merged-commands/kubernetes-specialist.md +117 -0
- package/merged-commands/langchain-architecture.md +350 -0
- package/merged-commands/langfuse.md +238 -0
- package/merged-commands/langgraph.md +287 -0
- package/merged-commands/laravel-specialist.md +101 -0
- package/merged-commands/last30days.md +421 -0
- package/merged-commands/launch-strategy.md +344 -0
- package/merged-commands/lead-research-assistant.md +199 -0
- package/merged-commands/learn.md +476 -0
- package/merged-commands/legacy-modernizer.md +53 -0
- package/merged-commands/legal-advisor.md +70 -0
- package/merged-commands/linear-claude-skill.md +543 -0
- package/merged-commands/linkerd-patterns.md +321 -0
- package/merged-commands/lint-and-validate.md +45 -0
- package/merged-commands/linux-privilege-escalation.md +504 -0
- package/merged-commands/linux-shell-scripting.md +504 -0
- package/merged-commands/llm-app-patterns.md +760 -0
- package/merged-commands/llm-application-dev-ai-assistant.md +35 -0
- package/merged-commands/llm-application-dev-langchain-agent.md +246 -0
- package/merged-commands/llm-application-dev-prompt-optimize.md +37 -0
- package/merged-commands/llm-application-dev.md +216 -0
- package/merged-commands/llm-evaluation.md +483 -0
- package/merged-commands/loki-mode.md +721 -0
- package/merged-commands/machine-learning-ops-ml-pipeline.md +314 -0
- package/merged-commands/makepad-skills.md +22 -0
- package/merged-commands/malware-analyst.md +247 -0
- package/merged-commands/markdown-novel-viewer.md +281 -0
- package/merged-commands/market-sizing-analysis.md +425 -0
- package/merged-commands/marketing-ideas.md +221 -0
- package/merged-commands/marketing-psychology.md +255 -0
- package/merged-commands/mcp-builder.md +236 -0
- package/merged-commands/mcp-developer.md +94 -0
- package/merged-commands/mcp-management.md +209 -0
- package/merged-commands/media-processing.md +358 -0
- package/merged-commands/meeting-insights-analyzer.md +327 -0
- package/merged-commands/memory-forensics.md +491 -0
- package/merged-commands/memory-safety-patterns.md +33 -0
- package/merged-commands/memory-systems.md +228 -0
- package/merged-commands/mermaid-expert.md +59 -0
- package/merged-commands/mermaidjs-v11.md +115 -0
- package/merged-commands/metasploit-framework.md +478 -0
- package/merged-commands/micro-saas-launcher.md +212 -0
- package/merged-commands/microservices-architect.md +102 -0
- package/merged-commands/microservices-patterns.md +35 -0
- package/merged-commands/minecraft-bukkit-pro.md +126 -0
- package/merged-commands/ml-engineer.md +168 -0
- package/merged-commands/ml-pipeline-workflow.md +257 -0
- package/merged-commands/ml-pipeline.md +111 -0
- package/merged-commands/mlops-engineer.md +219 -0
- package/merged-commands/mobile-design.md +284 -0
- package/merged-commands/mobile-developer.md +205 -0
- package/merged-commands/mobile-development.md +212 -0
- package/merged-commands/mobile-security-coder.md +184 -0
- package/merged-commands/modern-javascript-patterns.md +35 -0
- package/merged-commands/monitoring-expert.md +92 -0
- package/merged-commands/monorepo-architect.md +61 -0
- package/merged-commands/monorepo-management.md +35 -0
- package/merged-commands/moodle-external-api-development.md +597 -0
- package/merged-commands/mtls-configuration.md +359 -0
- package/merged-commands/multi-agent-brainstorming.md +256 -0
- package/merged-commands/multi-agent-patterns.md +262 -0
- package/merged-commands/multi-cloud-architecture.md +189 -0
- package/merged-commands/multi-platform-apps-multi-platform.md +203 -0
- package/merged-commands/n8n-code-python.md +750 -0
- package/merged-commands/n8n-mcp-tools-expert.md +654 -0
- package/merged-commands/n8n-node-configuration.md +796 -0
- package/merged-commands/nanobanana-ppt-skills.md +22 -0
- package/merged-commands/neon-postgres.md +56 -0
- package/merged-commands/nestjs-expert.md +552 -0
- package/merged-commands/network-101.md +342 -0
- package/merged-commands/network-engineer.md +169 -0
- package/merged-commands/nextjs-app-router-patterns.md +33 -0
- package/merged-commands/nextjs-best-practices.md +203 -0
- package/merged-commands/nextjs-developer.md +97 -0
- package/merged-commands/nextjs-supabase-auth.md +56 -0
- package/merged-commands/nft-standards.md +395 -0
- package/merged-commands/nodejs-backend-patterns.md +35 -0
- package/merged-commands/nodejs-best-practices.md +333 -0
- package/merged-commands/nosql-expert.md +111 -0
- package/merged-commands/notebooklm-skill.md +269 -0
- package/merged-commands/notebooklm.md +269 -0
- package/merged-commands/notion-template-business.md +216 -0
- package/merged-commands/nx-workspace-patterns.md +464 -0
- package/merged-commands/observability-engineer.md +237 -0
- package/merged-commands/observability-monitoring-monitor-setup.md +48 -0
- package/merged-commands/observability-monitoring-slo-implement.md +43 -0
- package/merged-commands/observe-whatsapp.md +109 -0
- package/merged-commands/obsidian-clipper-template-creator.md +64 -0
- package/merged-commands/on-call-handoff-patterns.md +453 -0
- package/merged-commands/onboarding-cro.md +433 -0
- package/merged-commands/openapi-spec-generation.md +33 -0
- package/merged-commands/page-cro.md +343 -0
- package/merged-commands/paid-ads.md +551 -0
- package/merged-commands/pandas-pro.md +96 -0
- package/merged-commands/parallel-agents.md +175 -0
- package/merged-commands/payment-integration.md +77 -0
- package/merged-commands/paypal-integration.md +479 -0
- package/merged-commands/paywall-upgrade-cro.md +570 -0
- package/merged-commands/pci-compliance.md +478 -0
- package/merged-commands/pdf-official.md +294 -0
- package/merged-commands/pdf.md +294 -0
- package/merged-commands/pentest-checklist.md +334 -0
- package/merged-commands/pentest-commands.md +438 -0
- package/merged-commands/performance-engineer.md +180 -0
- package/merged-commands/performance-profiling.md +143 -0
- package/merged-commands/performance-testing-review-ai-review.md +450 -0
- package/merged-commands/performance-testing-review-multi-agent-review.md +216 -0
- package/merged-commands/personal-tool-builder.md +289 -0
- package/merged-commands/php-pro.md +63 -0
- package/merged-commands/plaid-fintech.md +50 -0
- package/merged-commands/plan-writing.md +152 -0
- package/merged-commands/planning-with-files.md +211 -0
- package/merged-commands/planning.md +95 -0
- package/merged-commands/plans-kanban.md +166 -0
- package/merged-commands/playwright-expert.md +87 -0
- package/merged-commands/playwright-skill.md +453 -0
- package/merged-commands/podcast-generation.md +121 -0
- package/merged-commands/popup-cro.md +346 -0
- package/merged-commands/posix-shell-pro.md +304 -0
- package/merged-commands/postgres-best-practices.md +57 -0
- package/merged-commands/postgres-pro.md +98 -0
- package/merged-commands/postgresql.md +230 -0
- package/merged-commands/postmortem-writing.md +386 -0
- package/merged-commands/powershell-windows.md +167 -0
- package/merged-commands/pptx-official.md +484 -0
- package/merged-commands/pptx.md +484 -0
- package/merged-commands/pricing-strategy.md +356 -0
- package/merged-commands/prisma-expert.md +355 -0
- package/merged-commands/privilege-escalation-methods.md +333 -0
- package/merged-commands/problem-solving.md +96 -0
- package/merged-commands/product-manager-toolkit.md +351 -0
- package/merged-commands/product-strategist.md +26 -0
- package/merged-commands/production-code-audit.md +540 -0
- package/merged-commands/programmatic-seo.md +351 -0
- package/merged-commands/projection-patterns.md +33 -0
- package/merged-commands/prometheus-configuration.md +404 -0
- package/merged-commands/prompt-caching.md +61 -0
- package/merged-commands/prompt-engineer.md +272 -0
- package/merged-commands/prompt-engineering-patterns.md +213 -0
- package/merged-commands/prompt-engineering.md +171 -0
- package/merged-commands/prompt-library.md +322 -0
- package/merged-commands/protocol-reverse-engineering.md +29 -0
- package/merged-commands/pydantic-models.md +58 -0
- package/merged-commands/pypict-skill.md +22 -0
- package/merged-commands/python-development-python-scaffold.md +331 -0
- package/merged-commands/python-development.md +139 -0
- package/merged-commands/python-packaging.md +36 -0
- package/merged-commands/python-patterns.md +441 -0
- package/merged-commands/python-performance-optimization.md +36 -0
- package/merged-commands/python-pro.md +158 -0
- package/merged-commands/python-testing-patterns.md +37 -0
- package/merged-commands/qa-regression.md +337 -0
- package/merged-commands/quant-analyst.md +53 -0
- package/merged-commands/radix-ui-design-system.md +847 -0
- package/merged-commands/raffle-winner-picker.md +159 -0
- package/merged-commands/rag-architect.md +100 -0
- package/merged-commands/rag-engineer.md +90 -0
- package/merged-commands/rag-implementation.md +421 -0
- package/merged-commands/rails-expert.md +97 -0
- package/merged-commands/react-best-practices.md +121 -0
- package/merged-commands/react-expert.md +98 -0
- package/merged-commands/react-flow-node.md +66 -0
- package/merged-commands/react-modernization.md +34 -0
- package/merged-commands/react-native-architecture.md +33 -0
- package/merged-commands/react-native-expert.md +88 -0
- package/merged-commands/react-patterns.md +198 -0
- package/merged-commands/react-state-management.md +441 -0
- package/merged-commands/react-ui-patterns.md +289 -0
- package/merged-commands/readme.md +775 -0
- package/merged-commands/receiving-code-review.md +213 -0
- package/merged-commands/red-team-tactics.md +199 -0
- package/merged-commands/red-team-tools.md +310 -0
- package/merged-commands/reference-builder.md +188 -0
- package/merged-commands/referral-program.md +602 -0
- package/merged-commands/remotion-best-practices.md +45 -0
- package/merged-commands/repomix.md +275 -0
- package/merged-commands/requesting-code-review.md +105 -0
- package/merged-commands/research-engineer.md +135 -0
- package/merged-commands/research.md +191 -0
- package/merged-commands/reverse-engineer.md +173 -0
- package/merged-commands/risk-manager.md +61 -0
- package/merged-commands/risk-metrics-calculation.md +33 -0
- package/merged-commands/ruby-pro.md +56 -0
- package/merged-commands/rust-async-patterns.md +33 -0
- package/merged-commands/rust-engineer.md +96 -0
- package/merged-commands/rust-pro.md +178 -0
- package/merged-commands/saga-orchestration.md +496 -0
- package/merged-commands/sales-automator.md +55 -0
- package/merged-commands/salesforce-developer.md +105 -0
- package/merged-commands/salesforce-development.md +51 -0
- package/merged-commands/sast-configuration.md +212 -0
- package/merged-commands/scala-pro.md +82 -0
- package/merged-commands/scanning-tools.md +589 -0
- package/merged-commands/schema-markup.md +360 -0
- package/merged-commands/screen-reader-testing.md +33 -0
- package/merged-commands/screenshots.md +401 -0
- package/merged-commands/scroll-experience.md +263 -0
- package/merged-commands/search-specialist.md +80 -0
- package/merged-commands/secrets-management.md +364 -0
- package/merged-commands/secure-code-guardian.md +93 -0
- package/merged-commands/security-auditor.md +169 -0
- package/merged-commands/security-bluebook-builder.md +22 -0
- package/merged-commands/security-compliance-compliance-check.md +55 -0
- package/merged-commands/security-requirement-extraction.md +33 -0
- package/merged-commands/security-reviewer.md +94 -0
- package/merged-commands/security-scanning-security-dependencies.md +43 -0
- package/merged-commands/security-scanning-security-hardening.md +147 -0
- package/merged-commands/security-scanning-security-sast.md +495 -0
- package/merged-commands/segment-cdp.md +50 -0
- package/merged-commands/senior-architect.md +209 -0
- package/merged-commands/senior-backend.md +209 -0
- package/merged-commands/senior-computer-vision.md +226 -0
- package/merged-commands/senior-data-engineer.md +226 -0
- package/merged-commands/senior-data-scientist.md +226 -0
- package/merged-commands/senior-devops.md +209 -0
- package/merged-commands/senior-frontend.md +209 -0
- package/merged-commands/senior-fullstack.md +209 -0
- package/merged-commands/senior-ml-engineer.md +226 -0
- package/merged-commands/senior-prompt-engineer.md +226 -0
- package/merged-commands/senior-qa.md +209 -0
- package/merged-commands/senior-secops.md +209 -0
- package/merged-commands/senior-security.md +209 -0
- package/merged-commands/seo-audit.md +487 -0
- package/merged-commands/seo-authority-builder.md +136 -0
- package/merged-commands/seo-cannibalization-detector.md +123 -0
- package/merged-commands/seo-content-auditor.md +83 -0
- package/merged-commands/seo-content-planner.md +108 -0
- package/merged-commands/seo-content-refresher.md +118 -0
- package/merged-commands/seo-content-writer.md +96 -0
- package/merged-commands/seo-fundamentals.md +173 -0
- package/merged-commands/seo-keyword-strategist.md +95 -0
- package/merged-commands/seo-meta-optimizer.md +92 -0
- package/merged-commands/seo-snippet-hunter.md +114 -0
- package/merged-commands/seo-structure-architect.md +108 -0
- package/merged-commands/sequential-thinking.md +94 -0
- package/merged-commands/server-management.md +161 -0
- package/merged-commands/service-mesh-expert.md +58 -0
- package/merged-commands/service-mesh-observability.md +395 -0
- package/merged-commands/sharp-edges.md +70 -0
- package/merged-commands/shellcheck-configuration.md +466 -0
- package/merged-commands/shodan-reconnaissance.md +503 -0
- package/merged-commands/shopify-apps.md +42 -0
- package/merged-commands/shopify-development.md +366 -0
- package/merged-commands/shopify-expert.md +102 -0
- package/merged-commands/signup-flow-cro.md +355 -0
- package/merged-commands/similarity-search-patterns.md +33 -0
- package/merged-commands/skill-creator.md +356 -0
- package/merged-commands/skill-developer.md +426 -0
- package/merged-commands/skill-rails-upgrade.md +408 -0
- package/merged-commands/skill-seekers.md +22 -0
- package/merged-commands/skill-share.md +80 -0
- package/merged-commands/slack-bot-builder.md +264 -0
- package/merged-commands/slack-gif-creator.md +254 -0
- package/merged-commands/slo-implementation.md +341 -0
- package/merged-commands/smtp-penetration-testing.md +500 -0
- package/merged-commands/social-content.md +807 -0
- package/merged-commands/software-architecture.md +75 -0
- package/merged-commands/solidity-security.md +34 -0
- package/merged-commands/spark-engineer.md +100 -0
- package/merged-commands/spark-optimization.md +427 -0
- package/merged-commands/spec-miner.md +88 -0
- package/merged-commands/spring-boot-engineer.md +104 -0
- package/merged-commands/sql-injection-testing.md +448 -0
- package/merged-commands/sql-optimization-patterns.md +35 -0
- package/merged-commands/sql-pro.md +173 -0
- package/merged-commands/sqlmap-database-pentesting.md +400 -0
- package/merged-commands/sre-engineer.md +98 -0
- package/merged-commands/ssh-penetration-testing.md +488 -0
- package/merged-commands/startup-analyst.md +328 -0
- package/merged-commands/startup-business-analyst-business-case.md +487 -0
- package/merged-commands/startup-business-analyst-financial-projections.md +353 -0
- package/merged-commands/startup-business-analyst-market-opportunity.md +240 -0
- package/merged-commands/startup-financial-modeling.md +467 -0
- package/merged-commands/startup-metrics-framework.md +34 -0
- package/merged-commands/stitch-ui-design.md +378 -0
- package/merged-commands/stride-analysis-patterns.md +33 -0
- package/merged-commands/stripe-integration.md +454 -0
- package/merged-commands/subagent-driven-development.md +240 -0
- package/merged-commands/superpowers-lab.md +22 -0
- package/merged-commands/swift-expert.md +94 -0
- package/merged-commands/swiftui-expert-skill.md +275 -0
- package/merged-commands/systematic-debugging.md +296 -0
- package/merged-commands/systems-programming-rust-project.md +440 -0
- package/merged-commands/tailwind-design-system.md +33 -0
- package/merged-commands/tailwind-patterns.md +269 -0
- package/merged-commands/tavily-web.md +36 -0
- package/merged-commands/tdd-orchestrator.md +205 -0
- package/merged-commands/tdd-workflow.md +149 -0
- package/merged-commands/tdd-workflows-tdd-cycle.md +221 -0
- package/merged-commands/tdd-workflows-tdd-green.md +73 -0
- package/merged-commands/tdd-workflows-tdd-red.md +164 -0
- package/merged-commands/tdd-workflows-tdd-refactor.md +187 -0
- package/merged-commands/team-collaboration-issue.md +37 -0
- package/merged-commands/team-collaboration-standup-notes.md +44 -0
- package/merged-commands/team-composition-analysis.md +413 -0
- package/merged-commands/telegram-bot-builder.md +254 -0
- package/merged-commands/telegram-mini-app.md +279 -0
- package/merged-commands/template-skill.md +6 -0
- package/merged-commands/temporal-python-pro.md +370 -0
- package/merged-commands/temporal-python-testing.md +170 -0
- package/merged-commands/terraform-engineer.md +97 -0
- package/merged-commands/terraform-module-library.md +261 -0
- package/merged-commands/terraform-skill.md +517 -0
- package/merged-commands/terraform-specialist.md +166 -0
- package/merged-commands/test-automator.md +224 -0
- package/merged-commands/test-driven-development.md +371 -0
- package/merged-commands/test-fixing.md +119 -0
- package/merged-commands/test-master.md +104 -0
- package/merged-commands/testing-patterns.md +259 -0
- package/merged-commands/theme-factory.md +59 -0
- package/merged-commands/threat-mitigation-mapping.md +33 -0
- package/merged-commands/threat-modeling-expert.md +60 -0
- package/merged-commands/threejs-skills.md +22 -0
- package/merged-commands/threejs.md +89 -0
- package/merged-commands/tool-design.md +318 -0
- package/merged-commands/top-web-vulnerabilities.md +543 -0
- package/merged-commands/track-management.md +38 -0
- package/merged-commands/trigger-dev.md +67 -0
- package/merged-commands/turborepo-caching.md +419 -0
- package/merged-commands/tutorial-engineer.md +139 -0
- package/merged-commands/twilio-communications.md +295 -0
- package/merged-commands/typescript-advanced-types.md +35 -0
- package/merged-commands/typescript-expert.md +429 -0
- package/merged-commands/typescript-pro.md +55 -0
- package/merged-commands/ui-design-system.md +32 -0
- package/merged-commands/ui-skills.md +22 -0
- package/merged-commands/ui-styling.md +321 -0
- package/merged-commands/ui-ux-designer.md +209 -0
- package/merged-commands/ui-ux-pro-max.md +351 -0
- package/merged-commands/ui-visual-validator.md +214 -0
- package/merged-commands/unit-testing-test-generate.md +319 -0
- package/merged-commands/unity-developer.md +230 -0
- package/merged-commands/unity-ecs-patterns.md +33 -0
- package/merged-commands/unreal-engine-cpp-pro.md +114 -0
- package/merged-commands/upgrading-expo.md +118 -0
- package/merged-commands/upstash-qstash.md +68 -0
- package/merged-commands/using-git-worktrees.md +217 -0
- package/merged-commands/using-neon.md +84 -0
- package/merged-commands/using-superpowers.md +87 -0
- package/merged-commands/uv-package-manager.md +37 -0
- package/merged-commands/ux-researcher-designer.md +30 -0
- package/merged-commands/varlock-claude-skill.md +22 -0
- package/merged-commands/vector-database-engineer.md +60 -0
- package/merged-commands/vector-index-tuning.md +42 -0
- package/merged-commands/vercel-deploy-claimable.md +120 -0
- package/merged-commands/vercel-deploy.md +22 -0
- package/merged-commands/vercel-deployment.md +79 -0
- package/merged-commands/verification-before-completion.md +139 -0
- package/merged-commands/vexor.md +22 -0
- package/merged-commands/video-downloader.md +106 -0
- package/merged-commands/viral-generator-builder.md +199 -0
- package/merged-commands/voice-agents.md +68 -0
- package/merged-commands/voice-ai-development.md +302 -0
- package/merged-commands/voice-ai-engine-development.md +721 -0
- package/merged-commands/vue-expert-js.md +91 -0
- package/merged-commands/vue-expert.md +374 -0
- package/merged-commands/vulnerability-scanner.md +276 -0
- package/merged-commands/wcag-audit-patterns.md +41 -0
- package/merged-commands/web-artifacts-builder.md +74 -0
- package/merged-commands/web-design-guidelines.md +36 -0
- package/merged-commands/web-frameworks.md +324 -0
- package/merged-commands/web-performance-optimization.md +646 -0
- package/merged-commands/web3-testing.md +427 -0
- package/merged-commands/webapp-testing.md +96 -0
- package/merged-commands/websocket-engineer.md +96 -0
- package/merged-commands/windows-privilege-escalation.md +496 -0
- package/merged-commands/wireshark-analysis.md +497 -0
- package/merged-commands/wordpress-penetration-testing.md +485 -0
- package/merged-commands/wordpress-pro.md +105 -0
- package/merged-commands/workflow-automation.md +68 -0
- package/merged-commands/workflow-orchestration-patterns.md +333 -0
- package/merged-commands/workflow-patterns.md +38 -0
- package/merged-commands/writing-plans.md +116 -0
- package/merged-commands/writing-skills.md +125 -0
- package/merged-commands/x-article-publisher-skill.md +22 -0
- package/merged-commands/xlsx-official.md +289 -0
- package/merged-commands/xlsx.md +289 -0
- package/merged-commands/xss-html-injection.md +499 -0
- package/merged-commands/youtube-transcript.md +415 -0
- package/merged-commands/zapier-make-patterns.md +67 -0
- package/merged-commands/zustand-store.md +68 -0
- package/package.json +1 -1
- package/scripts/generate-index.js +3 -1
- package/scripts/merge-commands.js +21 -0
- package/skills-index.json +2248 -463
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: llm-evaluation
|
|
3
|
+
description: Implement comprehensive evaluation strategies for LLM applications using automated metrics, human feedback, and benchmarking. Use when testing LLM performance, measuring AI application quality, or establishing evaluation frameworks.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# LLM Evaluation
|
|
7
|
+
|
|
8
|
+
Master comprehensive evaluation strategies for LLM applications, from automated metrics to human evaluation and A/B testing.
|
|
9
|
+
|
|
10
|
+
## Do not use this skill when
|
|
11
|
+
|
|
12
|
+
- The task is unrelated to LLM evaluation
|
|
13
|
+
- You need a different domain or tool outside this scope
|
|
14
|
+
|
|
15
|
+
## Instructions
|
|
16
|
+
|
|
17
|
+
- Clarify goals, constraints, and required inputs.
|
|
18
|
+
- Apply relevant best practices and validate outcomes.
|
|
19
|
+
- Provide actionable steps and verification.
|
|
20
|
+
- If detailed examples are required, open `resources/implementation-playbook.md`.
|
|
21
|
+
|
|
22
|
+
## Use this skill when
|
|
23
|
+
|
|
24
|
+
- Measuring LLM application performance systematically
|
|
25
|
+
- Comparing different models or prompts
|
|
26
|
+
- Detecting performance regressions before deployment
|
|
27
|
+
- Validating improvements from prompt changes
|
|
28
|
+
- Building confidence in production systems
|
|
29
|
+
- Establishing baselines and tracking progress over time
|
|
30
|
+
- Debugging unexpected model behavior
|
|
31
|
+
|
|
32
|
+
## Core Evaluation Types
|
|
33
|
+
|
|
34
|
+
### 1. Automated Metrics
|
|
35
|
+
Fast, repeatable, scalable evaluation using computed scores.
|
|
36
|
+
|
|
37
|
+
**Text Generation:**
|
|
38
|
+
- **BLEU**: N-gram overlap (translation)
|
|
39
|
+
- **ROUGE**: Recall-oriented (summarization)
|
|
40
|
+
- **METEOR**: Unigram matching with stemming and synonym support
|
|
41
|
+
- **BERTScore**: Embedding-based similarity
|
|
42
|
+
- **Perplexity**: Language model confidence
|
|
43
|
+
|
|
44
|
+
**Classification:**
|
|
45
|
+
- **Accuracy**: Percentage correct
|
|
46
|
+
- **Precision/Recall/F1**: Class-specific performance
|
|
47
|
+
- **Confusion Matrix**: Error patterns
|
|
48
|
+
- **AUC-ROC**: Ranking quality
|
|
49
|
+
|
|
50
|
+
**Retrieval (RAG):**
|
|
51
|
+
- **MRR**: Mean Reciprocal Rank
|
|
52
|
+
- **NDCG**: Normalized Discounted Cumulative Gain
|
|
53
|
+
- **Precision@K**: Relevant in top K
|
|
54
|
+
- **Recall@K**: Coverage in top K
|
|
55
|
+
|
|
56
|
+
### 2. Human Evaluation
|
|
57
|
+
Manual assessment for quality aspects difficult to automate.
|
|
58
|
+
|
|
59
|
+
**Dimensions:**
|
|
60
|
+
- **Accuracy**: Factual correctness
|
|
61
|
+
- **Coherence**: Logical flow
|
|
62
|
+
- **Relevance**: Answers the question
|
|
63
|
+
- **Fluency**: Natural language quality
|
|
64
|
+
- **Safety**: No harmful content
|
|
65
|
+
- **Helpfulness**: Useful to the user
|
|
66
|
+
|
|
67
|
+
### 3. LLM-as-Judge
|
|
68
|
+
Use stronger LLMs to evaluate weaker model outputs.
|
|
69
|
+
|
|
70
|
+
**Approaches:**
|
|
71
|
+
- **Pointwise**: Score individual responses
|
|
72
|
+
- **Pairwise**: Compare two responses
|
|
73
|
+
- **Reference-based**: Compare to gold standard
|
|
74
|
+
- **Reference-free**: Judge without ground truth
|
|
75
|
+
|
|
76
|
+
## Quick Start
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from llm_eval import EvaluationSuite, Metric
|
|
80
|
+
|
|
81
|
+
# Define evaluation suite
metrics = [
    Metric.accuracy(),
    Metric.bleu(),
    Metric.bertscore(),
    Metric.custom(name="groundedness", fn=check_groundedness),
]
suite = EvaluationSuite(metrics)

# Prepare test cases: each entry carries an input, the expected answer,
# and the context the answer should be grounded in.
test_cases = [
    dict(
        input="What is the capital of France?",
        expected="Paris",
        context="France is a country in Europe. Paris is its capital.",
    ),
    # ... more test cases
]

# Run evaluation
results = suite.evaluate(model=your_model, test_cases=test_cases)

# Report the headline metrics
scores = results.metrics
print(f"Overall Accuracy: {scores['accuracy']}")
print(f"BLEU Score: {scores['bleu']}")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Automated Metrics Implementation
|
|
110
|
+
|
|
111
|
+
### BLEU Score
|
|
112
|
+
```python
|
|
113
|
+
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
|
|
114
|
+
|
|
115
|
+
def calculate_bleu(reference, hypothesis):
    """Return the sentence-level BLEU score of ``hypothesis`` against ``reference``.

    Both arguments are strings and are tokenized on whitespace with
    ``str.split``. NLTK's ``SmoothingFunction().method4`` is used as the
    smoothing function.
    """
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()

    # sentence_bleu expects a list of references, so wrap the single one.
    bleu_score = sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smoothing,
    )
    return bleu_score


# Usage
bleu = calculate_bleu(
    reference="The cat sat on the mat",
    hypothesis="A cat is sitting on the mat",
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### ROUGE Score
|
|
133
|
+
```python
|
|
134
|
+
from rouge_score import rouge_scorer
|
|
135
|
+
|
|
136
|
+
def calculate_rouge(reference, hypothesis):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a hypothesis.

    Returns a dict keyed by 'rouge1', 'rouge2' and 'rougeL', each holding the
    ``fmeasure`` of the corresponding score. Stemming is enabled.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    all_scores = scorer.score(reference, hypothesis)

    return {rouge_type: all_scores[rouge_type].fmeasure for rouge_type in rouge_types}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### BERTScore
|
|
149
|
+
```python
|
|
150
|
+
from bert_score import score
|
|
151
|
+
|
|
152
|
+
def calculate_bertscore(references, hypotheses):
    """Calculate mean BERTScore precision, recall and F1 over all pairs.

    ``hypotheses`` are passed as the candidates and ``references`` as the
    references to ``bert_score.score``, using the
    'microsoft/deberta-xlarge-mnli' model with ``lang='en'``.
    """
    precision, recall, f1 = score(
        hypotheses,
        references,
        lang='en',
        model_type='microsoft/deberta-xlarge-mnli',
    )

    # Each result is reduced to a plain Python number via mean().item().
    averaged = {}
    for key, values in (('precision', precision), ('recall', recall), ('f1', f1)):
        averaged[key] = values.mean().item()
    return averaged
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Custom Metrics
|
|
169
|
+
```python
|
|
170
|
+
# Lazily created NLI pipeline, shared across calls so the model is loaded once.
_NLI_PIPELINE = None


def _default_nli_pipeline():
    """Build (once) and return the default DeBERTa MNLI text-classification pipeline."""
    global _NLI_PIPELINE
    if _NLI_PIPELINE is None:
        from transformers import pipeline

        _NLI_PIPELINE = pipeline("text-classification", model="microsoft/deberta-large-mnli")
    return _NLI_PIPELINE


def calculate_groundedness(response, context, nli=None):
    """Check if response is grounded in provided context.

    Args:
        response: Generated text, used as the NLI hypothesis.
        context: Source text, used as the NLI premise.
        nli: Optional classifier callable. It receives a list containing one
            ``{"text": ..., "text_pair": ...}`` dict and must return a list
            whose first item is a dict with ``label`` and ``score``. When
            omitted, a cached ``microsoft/deberta-large-mnli`` pipeline is used.

    Returns:
        The classifier's score when the predicted label is ENTAILMENT,
        otherwise 0.0.
    """
    if nli is None:
        # Loading the model is expensive; reuse one instance across calls.
        nli = _default_nli_pipeline()

    # Pass premise/hypothesis as a proper text pair so the tokenizer inserts
    # its own separator, instead of embedding a literal "[SEP]" in one string.
    result = nli([{"text": context, "text_pair": response}])[0]

    # Return confidence that response is entailed by context
    return result['score'] if result['label'].upper() == 'ENTAILMENT' else 0.0
|
|
181
|
+
|
|
182
|
+
# Lazily created Detoxify model, shared across calls so the checkpoint loads once.
_DETOXIFY_MODEL = None


def calculate_toxicity(text, model=None):
    """Measure toxicity in generated text.

    Args:
        text: Text to score.
        model: Optional object exposing ``predict(text)`` that returns a dict
            of category -> score. When omitted, a cached ``Detoxify('original')``
            instance is used.

    Returns:
        The highest score across all categories returned by the model.
    """
    global _DETOXIFY_MODEL
    if model is None:
        if _DETOXIFY_MODEL is None:
            from detoxify import Detoxify

            # Constructing Detoxify loads a checkpoint; do it only once.
            _DETOXIFY_MODEL = Detoxify('original')
        model = _DETOXIFY_MODEL

    results = model.predict(text)
    return max(results.values())  # Return highest toxicity score
|
|
188
|
+
|
|
189
|
+
def calculate_factuality(claim, knowledge_base):
    """Placeholder for verifying a factual claim against a knowledge base.

    Nothing is verified here: the function ignores its arguments and
    returns None until a real implementation is supplied.
    """
    # Implementation depends on your knowledge base
    # Could use retrieval + NLI, or fact-checking API
    return None
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## LLM-as-Judge Patterns
|
|
197
|
+
|
|
198
|
+
### Single Output Evaluation
|
|
199
|
+
```python
|
|
200
|
+
def llm_judge_quality(response, question, model="gpt-5", client=None, temperature=0):
    """Use an LLM judge to rate response quality.

    Args:
        response: The model output being judged.
        question: The question the response is answering.
        model: Judge model name sent to the chat completions API.
        client: Optional client exposing ``chat.completions.create(**kwargs)``.
            When omitted, ``openai.OpenAI()`` is constructed.
        temperature: Sampling temperature; pass None to omit it from the request.
            NOTE(review): some models are reported to accept only their default
            temperature -- confirm for the chosen judge model.

    Returns:
        The judge's reply parsed from JSON (expected keys: accuracy,
        helpfulness, clarity, reasoning).

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
    """
    import json

    if client is None:
        # openai>=1.0 removed the module-level ``openai.ChatCompletion`` API;
        # requests go through a client instance instead.
        from openai import OpenAI

        client = OpenAI()

    prompt = f"""Rate the following response on a scale of 1-10 for:
1. Accuracy (factually correct)
2. Helpfulness (answers the question)
3. Clarity (well-written and understandable)

Question: {question}
Response: {response}

Provide ratings in JSON format:
{{
"accuracy": <1-10>,
"helpfulness": <1-10>,
"clarity": <1-10>,
"reasoning": "<brief explanation>"
}}
"""

    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Ask for a JSON object so the reply can be parsed without stripping
        # prose or code fences around it.
        "response_format": {"type": "json_object"},
    }
    if temperature is not None:
        request["temperature"] = temperature

    result = client.chat.completions.create(**request)

    return json.loads(result.choices[0].message.content)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
### Pairwise Comparison
|
|
229
|
+
```python
|
|
230
|
+
def compare_responses(question, response_a, response_b, model="gpt-5", client=None, temperature=0):
    """Compare two responses using LLM judge.

    Args:
        question: The question both responses answer.
        response_a: First candidate, labelled "A" in the prompt.
        response_b: Second candidate, labelled "B" in the prompt.
        model: Judge model name sent to the chat completions API.
        client: Optional client exposing ``chat.completions.create(**kwargs)``.
            When omitted, ``openai.OpenAI()`` is constructed.
        temperature: Sampling temperature; pass None to omit it from the request.
            NOTE(review): some models are reported to accept only their default
            temperature -- confirm for the chosen judge model.

    Returns:
        The judge's reply parsed from JSON (expected keys: winner, reasoning,
        confidence).

    Raises:
        json.JSONDecodeError: If the judge's reply is not valid JSON.
    """
    import json

    if client is None:
        # openai>=1.0 removed the module-level ``openai.ChatCompletion`` API;
        # requests go through a client instance instead.
        from openai import OpenAI

        client = OpenAI()

    prompt = f"""Compare these two responses to the question and determine which is better.

Question: {question}

Response A: {response_a}

Response B: {response_b}

Which response is better and why? Consider accuracy, helpfulness, and clarity.

Answer with JSON:
{{
"winner": "A" or "B" or "tie",
"reasoning": "<explanation>",
"confidence": <1-10>
}}
"""

    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        # Ask for a JSON object so the reply can be parsed without stripping
        # prose or code fences around it.
        "response_format": {"type": "json_object"},
    }
    if temperature is not None:
        request["temperature"] = temperature

    result = client.chat.completions.create(**request)

    return json.loads(result.choices[0].message.content)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## Human Evaluation Frameworks
|
|
260
|
+
|
|
261
|
+
### Annotation Guidelines
|
|
262
|
+
```python
|
|
263
|
+
class AnnotationTask:
    """Bundle a response with its question and optional context for human annotation."""

    def __init__(self, response, question, context=None):
        self.response, self.question, self.context = response, question, context

    def get_annotation_form(self):
        """Build a blank annotation form for this task.

        The form echoes the question, context and response, lists three
        1-5 rating dimensions, four issue flags initialised to False, and an
        empty free-text feedback field.
        """
        rating_questions = (
            ("accuracy", "Is the response factually correct?"),
            ("relevance", "Does it answer the question?"),
            ("coherence", "Is it logically consistent?"),
        )
        issue_flags = ("factual_error", "hallucination", "off_topic", "unsafe_content")

        form = {
            "question": self.question,
            "context": self.context,
            "response": self.response,
        }
        form["ratings"] = {
            dimension: {"scale": "1-5", "description": description}
            for dimension, description in rating_questions
        }
        form["issues"] = dict.fromkeys(issue_flags, False)
        form["feedback"] = ""
        return form
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### Inter-Rater Agreement
|
|
301
|
+
```python
|
|
302
|
+
from sklearn.metrics import cohen_kappa_score
|
|
303
|
+
|
|
304
|
+
def calculate_agreement(rater1_scores, rater2_scores):
    """Calculate inter-rater agreement.

    Args:
        rater1_scores: Labels assigned by the first rater.
        rater2_scores: Labels assigned by the second rater, in the same order.

    Returns:
        A dict with the raw ``kappa`` from ``cohen_kappa_score`` and a textual
        ``interpretation``: "Poor" (< 0), "Slight" (< 0.2), "Fair" (< 0.4),
        "Moderate" (< 0.6), "Substantial" (< 0.8), otherwise "Almost Perfect".
        A NaN kappa is reported as "Undefined".
    """
    kappa = cohen_kappa_score(rater1_scores, rater2_scores)

    # An ordered chain is required here: a dict keyed by the comparison results
    # collapses to the keys True/False, so the last true entry always wins.
    if kappa != kappa:
        # NaN compares unequal to itself; no band applies.
        interpretation = "Undefined"
    elif kappa < 0:
        interpretation = "Poor"
    elif kappa < 0.2:
        interpretation = "Slight"
    elif kappa < 0.4:
        interpretation = "Fair"
    elif kappa < 0.6:
        interpretation = "Moderate"
    elif kappa < 0.8:
        interpretation = "Substantial"
    else:
        interpretation = "Almost Perfect"

    return {
        "kappa": kappa,
        "interpretation": interpretation
    }
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## A/B Testing
|
|
324
|
+
|
|
325
|
+
### Statistical Testing Framework
|
|
326
|
+
```python
|
|
327
|
+
from scipy import stats
|
|
328
|
+
import numpy as np
|
|
329
|
+
|
|
330
|
+
class ABTest:
    """Collect per-variant scores and compare two variants statistically."""

    def __init__(self, variant_a_name="A", variant_b_name="B"):
        self.variant_a = {"name": variant_a_name, "scores": []}
        self.variant_b = {"name": variant_b_name, "scores": []}

    def add_result(self, variant, score):
        """Add evaluation result for a variant.

        Args:
            variant: "A"/"B", or the custom name given to the constructor.
            score: Numeric score to record.

        Raises:
            ValueError: If ``variant`` matches neither variant.
        """
        # Accept the configured names as well as the literal "A"/"B"; routing
        # everything that is not "A" to B would misfile a renamed variant A.
        if variant == "A" or variant == self.variant_a["name"]:
            self.variant_a["scores"].append(score)
        elif variant == "B" or variant == self.variant_b["name"]:
            self.variant_b["scores"].append(score)
        else:
            raise ValueError(
                f"Unknown variant {variant!r}; expected 'A', 'B', "
                f"{self.variant_a['name']!r} or {self.variant_b['name']!r}"
            )

    def analyze(self, alpha=0.05):
        """Perform statistical analysis.

        Runs an independent two-sample t-test and computes Cohen's d using the
        sample-size-weighted pooled sample standard deviation.

        Args:
            alpha: Significance level compared against the p-value.

        Returns:
            Dict with both means, their difference (B - A), the improvement
            relative to A (NaN when A's mean is 0), the p-value, a significance
            flag, Cohen's d with its interpretation, and ``winner`` -- the
            variant with the higher mean ("A" on ties). ``winner`` is reported
            regardless of significance; read it with ``statistically_significant``.

        Raises:
            ValueError: If either variant has fewer than 2 scores.
        """
        a_scores = np.asarray(self.variant_a["scores"], dtype=float)
        b_scores = np.asarray(self.variant_b["scores"], dtype=float)
        n_a, n_b = a_scores.size, b_scores.size
        if n_a < 2 or n_b < 2:
            raise ValueError("Each variant needs at least 2 scores to be analyzed")

        mean_a = float(a_scores.mean())
        mean_b = float(b_scores.mean())
        difference = mean_b - mean_a

        # T-test
        t_stat, p_value = stats.ttest_ind(a_scores, b_scores)

        # Effect size (Cohen's d): pool the unbiased (ddof=1) variances weighted
        # by degrees of freedom so unequal group sizes are handled correctly.
        pooled_var = (
            (n_a - 1) * a_scores.var(ddof=1) + (n_b - 1) * b_scores.var(ddof=1)
        ) / (n_a + n_b - 2)
        pooled_std = float(np.sqrt(pooled_var))
        if pooled_std > 0:
            cohens_d = difference / pooled_std
        elif difference == 0:
            cohens_d = 0.0
        else:
            # No spread at all but different means: unbounded effect size.
            cohens_d = float(np.copysign(np.inf, difference))

        # Relative change is undefined against a zero baseline.
        relative_improvement = difference / mean_a if mean_a != 0 else float("nan")

        return {
            "variant_a_mean": mean_a,
            "variant_b_mean": mean_b,
            "difference": difference,
            "relative_improvement": relative_improvement,
            "p_value": p_value,
            "statistically_significant": bool(p_value < alpha),
            "cohens_d": cohens_d,
            "effect_size": self.interpret_cohens_d(cohens_d),
            "winner": "B" if mean_b > mean_a else "A"
        }

    @staticmethod
    def interpret_cohens_d(d):
        """Interpret Cohen's d effect size.

        Uses the magnitude of ``d``: < 0.2 "negligible", < 0.5 "small",
        < 0.8 "medium", otherwise "large".
        """
        abs_d = abs(d)
        if abs_d < 0.2:
            return "negligible"
        elif abs_d < 0.5:
            return "small"
        elif abs_d < 0.8:
            return "medium"
        else:
            return "large"
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
## Regression Testing
|
|
381
|
+
|
|
382
|
+
### Regression Detection
|
|
383
|
+
```python
|
|
384
|
+
class RegressionDetector:
    """Flag metrics whose score dropped relative to a stored baseline.

    Only decreases are flagged, so every metric is treated as higher-is-better.
    """

    def __init__(self, baseline_results, threshold=0.05):
        # baseline_results maps metric name -> baseline score.
        self.baseline = baseline_results
        # Relative drop that counts as a regression.
        self.threshold = threshold

    def check_for_regression(self, new_results):
        """Detect if new results show regression.

        Args:
            new_results: Mapping of metric name -> new score. Metrics missing
                from it (or mapped to None) are skipped.

        Returns:
            Dict with ``has_regression`` (bool) and ``regressions``, a list of
            dicts holding the metric name, baseline score, current score and
            the computed change.
        """
        regressions = []

        for metric, baseline_score in self.baseline.items():
            new_score = new_results.get(metric)

            if new_score is None:
                continue

            delta = new_score - baseline_score
            if baseline_score != 0:
                # Divide by the magnitude so a drop stays negative even when
                # the baseline itself is negative.
                relative_change = delta / abs(baseline_score)
            else:
                # A relative change is undefined against a zero baseline;
                # fall back to the absolute change instead of dividing by 0.
                relative_change = delta

            # Flag if significant decrease
            if relative_change < -self.threshold:
                regressions.append({
                    "metric": metric,
                    "baseline": baseline_score,
                    "current": new_score,
                    "change": relative_change
                })

        return {
            "has_regression": bool(regressions),
            "regressions": regressions
        }
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
## Benchmarking
|
|
419
|
+
|
|
420
|
+
### Running Benchmarks
|
|
421
|
+
```python
|
|
422
|
+
class BenchmarkRunner:
    """Run a model over a benchmark dataset and summarise metric scores."""

    def __init__(self, benchmark_dataset):
        # Iterable of examples; each is indexed with "input" and "reference"
        # and may provide "context" via .get().
        self.dataset = benchmark_dataset

    def run_benchmark(self, model, metrics):
        """Run model on benchmark and calculate metrics.

        Args:
            model: Object exposing ``predict(input)``.
            metrics: Iterable of objects exposing ``name`` and
                ``calculate(prediction=..., reference=..., context=...)``.

        Returns:
            Dict mapping each metric name to its ``mean``, ``std``, ``min`` and
            ``max`` over the dataset. All four are NaN for a metric that
            received no scores (empty dataset).
        """
        # Imported here so the snippet is self-contained.
        import numpy as np

        results = {metric.name: [] for metric in metrics}

        for example in self.dataset:
            # Generate prediction
            prediction = model.predict(example["input"])

            # Calculate each metric
            for metric in metrics:
                score = metric.calculate(
                    prediction=prediction,
                    reference=example["reference"],
                    context=example.get("context")
                )
                results[metric.name].append(score)

        # Aggregate results
        summary = {}
        for metric_name, scores in results.items():
            if scores:
                summary[metric_name] = {
                    "mean": np.mean(scores),
                    "std": np.std(scores),
                    "min": min(scores),
                    "max": max(scores)
                }
            else:
                # min()/max() raise on an empty sequence; report NaN instead.
                nan = float("nan")
                summary[metric_name] = {"mean": nan, "std": nan, "min": nan, "max": nan}
        return summary
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
## Resources
|
|
456
|
+
|
|
457
|
+
- **references/metrics.md**: Comprehensive metric guide
|
|
458
|
+
- **references/human-evaluation.md**: Annotation best practices
|
|
459
|
+
- **references/benchmarking.md**: Standard benchmarks
|
|
460
|
+
- **references/a-b-testing.md**: Statistical testing guide
|
|
461
|
+
- **references/regression-testing.md**: CI/CD integration
|
|
462
|
+
- **assets/evaluation-framework.py**: Complete evaluation harness
|
|
463
|
+
- **assets/benchmark-dataset.jsonl**: Example datasets
|
|
464
|
+
- **scripts/evaluate-model.py**: Automated evaluation runner
|
|
465
|
+
|
|
466
|
+
## Best Practices
|
|
467
|
+
|
|
468
|
+
1. **Multiple Metrics**: Use diverse metrics for comprehensive view
|
|
469
|
+
2. **Representative Data**: Test on real-world, diverse examples
|
|
470
|
+
3. **Baselines**: Always compare against baseline performance
|
|
471
|
+
4. **Statistical Rigor**: Use proper statistical tests for comparisons
|
|
472
|
+
5. **Continuous Evaluation**: Integrate into CI/CD pipeline
|
|
473
|
+
6. **Human Validation**: Combine automated metrics with human judgment
|
|
474
|
+
7. **Error Analysis**: Investigate failures to understand weaknesses
|
|
475
|
+
8. **Version Control**: Track evaluation results over time
|
|
476
|
+
|
|
477
|
+
## Common Pitfalls
|
|
478
|
+
|
|
479
|
+
- **Single Metric Obsession**: Optimizing for one metric at the expense of others
|
|
480
|
+
- **Small Sample Size**: Drawing conclusions from too few examples
|
|
481
|
+
- **Data Contamination**: Testing on training data
|
|
482
|
+
- **Ignoring Variance**: Not accounting for statistical uncertainty
|
|
483
|
+
- **Metric Mismatch**: Using metrics not aligned with business goals
|