arboris-cli 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.mjs +382 -0
- package/manifest.json +323 -0
- package/package.json +22 -10
- package/prisma/skills/accessibility/SKILL.md +147 -0
- package/prisma/skills/agent-architecture-audit/SKILL.md +257 -0
- package/prisma/skills/agent-eval/SKILL.md +146 -0
- package/prisma/skills/agent-harness-construction/SKILL.md +74 -0
- package/prisma/skills/agent-introspection-debugging/SKILL.md +154 -0
- package/prisma/skills/agent-payment-x402/SKILL.md +225 -0
- package/prisma/skills/agent-self-evaluation/SKILL.md +182 -0
- package/prisma/skills/agent-self-evaluation/examples/high-score-example.md +87 -0
- package/prisma/skills/agent-self-evaluation/examples/low-score-example.md +86 -0
- package/prisma/skills/agent-self-evaluation/references/evaluation-criteria.md +71 -0
- package/prisma/skills/agent-self-evaluation/references/hook-integration.md +64 -0
- package/prisma/skills/agent-self-evaluation/scripts/evaluate.py +408 -0
- package/prisma/skills/agent-self-evaluation/templates/evaluation-report.md +86 -0
- package/prisma/skills/agent-sort/SKILL.md +216 -0
- package/prisma/skills/agentic-engineering/SKILL.md +64 -0
- package/prisma/skills/agentic-os/SKILL.md +388 -0
- package/prisma/skills/ai-first-engineering/SKILL.md +52 -0
- package/prisma/skills/ai-regression-testing/SKILL.md +386 -0
- package/prisma/skills/android-clean-architecture/SKILL.md +340 -0
- package/prisma/skills/angular-developer/SKILL.md +155 -0
- package/prisma/skills/angular-developer/references/angular-animations.md +160 -0
- package/prisma/skills/angular-developer/references/angular-aria.md +410 -0
- package/prisma/skills/angular-developer/references/cli.md +86 -0
- package/prisma/skills/angular-developer/references/component-harnesses.md +59 -0
- package/prisma/skills/angular-developer/references/component-styling.md +91 -0
- package/prisma/skills/angular-developer/references/components.md +117 -0
- package/prisma/skills/angular-developer/references/creating-services.md +97 -0
- package/prisma/skills/angular-developer/references/data-resolvers.md +69 -0
- package/prisma/skills/angular-developer/references/define-routes.md +67 -0
- package/prisma/skills/angular-developer/references/defining-providers.md +72 -0
- package/prisma/skills/angular-developer/references/di-fundamentals.md +120 -0
- package/prisma/skills/angular-developer/references/e2e-testing.md +56 -0
- package/prisma/skills/angular-developer/references/effects.md +83 -0
- package/prisma/skills/angular-developer/references/hierarchical-injectors.md +43 -0
- package/prisma/skills/angular-developer/references/host-elements.md +80 -0
- package/prisma/skills/angular-developer/references/injection-context.md +63 -0
- package/prisma/skills/angular-developer/references/inputs.md +101 -0
- package/prisma/skills/angular-developer/references/linked-signal.md +59 -0
- package/prisma/skills/angular-developer/references/loading-strategies.md +61 -0
- package/prisma/skills/angular-developer/references/mcp.md +108 -0
- package/prisma/skills/angular-developer/references/navigate-to-routes.md +69 -0
- package/prisma/skills/angular-developer/references/outputs.md +86 -0
- package/prisma/skills/angular-developer/references/reactive-forms.md +122 -0
- package/prisma/skills/angular-developer/references/rendering-strategies.md +44 -0
- package/prisma/skills/angular-developer/references/resource.md +77 -0
- package/prisma/skills/angular-developer/references/route-animations.md +56 -0
- package/prisma/skills/angular-developer/references/route-guards.md +52 -0
- package/prisma/skills/angular-developer/references/router-lifecycle.md +45 -0
- package/prisma/skills/angular-developer/references/router-testing.md +87 -0
- package/prisma/skills/angular-developer/references/show-routes-with-outlets.md +68 -0
- package/prisma/skills/angular-developer/references/signal-forms.md +795 -0
- package/prisma/skills/angular-developer/references/signals-overview.md +94 -0
- package/prisma/skills/angular-developer/references/tailwind-css.md +69 -0
- package/prisma/skills/angular-developer/references/template-driven-forms.md +114 -0
- package/prisma/skills/angular-developer/references/testing-fundamentals.md +65 -0
- package/prisma/skills/api-connector-builder/SKILL.md +121 -0
- package/prisma/skills/api-design/SKILL.md +524 -0
- package/prisma/skills/architecture-decision-records/SKILL.md +180 -0
- package/prisma/skills/article-writing/SKILL.md +80 -0
- package/prisma/skills/automation-audit-ops/SKILL.md +143 -0
- package/prisma/skills/autonomous-agent-harness/SKILL.md +274 -0
- package/prisma/skills/autonomous-loops/SKILL.md +611 -0
- package/prisma/skills/backend-patterns/SKILL.md +562 -0
- package/prisma/skills/benchmark/SKILL.md +94 -0
- package/prisma/skills/benchmark-methodology/SKILL.md +190 -0
- package/prisma/skills/benchmark-optimization-loop/SKILL.md +70 -0
- package/prisma/skills/blender-motion-state-inspection/SKILL.md +165 -0
- package/prisma/skills/blueprint/SKILL.md +106 -0
- package/prisma/skills/brand-discovery/SKILL.md +145 -0
- package/prisma/skills/brand-discovery/references/10_purpose-why.md +40 -0
- package/prisma/skills/brand-discovery/references/20_positioning.md +44 -0
- package/prisma/skills/brand-discovery/references/30_audience-niche.md +52 -0
- package/prisma/skills/brand-discovery/references/40_personality-archetype.md +57 -0
- package/prisma/skills/brand-discovery/references/50_voice-tone.md +59 -0
- package/prisma/skills/brand-discovery/references/60_narrative-story.md +50 -0
- package/prisma/skills/brand-discovery/references/70_founder-tension.md +49 -0
- package/prisma/skills/brand-discovery/references/90_SYNTHESIS.md +133 -0
- package/prisma/skills/brand-voice/SKILL.md +98 -0
- package/prisma/skills/brand-voice/references/voice-profile-schema.md +55 -0
- package/prisma/skills/browser-qa/SKILL.md +105 -0
- package/prisma/skills/bun-runtime/SKILL.md +85 -0
- package/prisma/skills/canary-watch/SKILL.md +108 -0
- package/prisma/skills/carrier-relationship-management/SKILL.md +212 -0
- package/prisma/skills/cisco-ios-patterns/SKILL.md +164 -0
- package/prisma/skills/ck/SKILL.md +148 -0
- package/prisma/skills/ck/commands/forget.mjs +44 -0
- package/prisma/skills/ck/commands/info.mjs +24 -0
- package/prisma/skills/ck/commands/init.mjs +143 -0
- package/prisma/skills/ck/commands/list.mjs +40 -0
- package/prisma/skills/ck/commands/migrate.mjs +202 -0
- package/prisma/skills/ck/commands/resume.mjs +36 -0
- package/prisma/skills/ck/commands/save.mjs +210 -0
- package/prisma/skills/ck/commands/shared.mjs +387 -0
- package/prisma/skills/ck/hooks/session-start.mjs +224 -0
- package/prisma/skills/claude-devfleet/SKILL.md +112 -0
- package/prisma/skills/click-path-audit/SKILL.md +245 -0
- package/prisma/skills/clickhouse-io/SKILL.md +440 -0
- package/prisma/skills/code-tour/SKILL.md +254 -0
- package/prisma/skills/codebase-onboarding/SKILL.md +234 -0
- package/prisma/skills/codehealth-mcp/SKILL.md +167 -0
- package/prisma/skills/coding-standards/SKILL.md +551 -0
- package/prisma/skills/competitive-platform-analysis/SKILL.md +214 -0
- package/prisma/skills/competitive-report-structure/SKILL.md +162 -0
- package/prisma/skills/compose-multiplatform-patterns/SKILL.md +300 -0
- package/prisma/skills/config-gc/SKILL.md +120 -0
- package/prisma/skills/configure-ecc/SKILL.md +385 -0
- package/prisma/skills/connections-optimizer/SKILL.md +190 -0
- package/prisma/skills/content-engine/SKILL.md +132 -0
- package/prisma/skills/content-hash-cache-pattern/SKILL.md +162 -0
- package/prisma/skills/context-budget/SKILL.md +136 -0
- package/prisma/skills/continuous-agent-loop/SKILL.md +46 -0
- package/prisma/skills/continuous-learning/SKILL.md +132 -0
- package/prisma/skills/continuous-learning/config.json +18 -0
- package/prisma/skills/continuous-learning/evaluate-session.sh +69 -0
- package/prisma/skills/continuous-learning-v2/SKILL.md +361 -0
- package/prisma/skills/continuous-learning-v2/agents/observer-loop.sh +359 -0
- package/prisma/skills/continuous-learning-v2/agents/observer.md +189 -0
- package/prisma/skills/continuous-learning-v2/agents/session-guardian.sh +150 -0
- package/prisma/skills/continuous-learning-v2/agents/start-observer.sh +248 -0
- package/prisma/skills/continuous-learning-v2/config.json +8 -0
- package/prisma/skills/continuous-learning-v2/hooks/observe.sh +585 -0
- package/prisma/skills/continuous-learning-v2/scripts/detect-project.sh +322 -0
- package/prisma/skills/continuous-learning-v2/scripts/instinct-cli.py +1956 -0
- package/prisma/skills/continuous-learning-v2/scripts/lib/homunculus-dir.sh +31 -0
- package/prisma/skills/continuous-learning-v2/scripts/migrate-homunculus.sh +68 -0
- package/prisma/skills/continuous-learning-v2/scripts/test_parse_instinct.py +1421 -0
- package/prisma/skills/cost-aware-llm-pipeline/SKILL.md +184 -0
- package/prisma/skills/cost-tracking/SKILL.md +97 -0
- package/prisma/skills/council/SKILL.md +204 -0
- package/prisma/skills/cpp-coding-standards/SKILL.md +724 -0
- package/prisma/skills/cpp-testing/SKILL.md +325 -0
- package/prisma/skills/crosspost/SKILL.md +112 -0
- package/prisma/skills/csharp-testing/SKILL.md +322 -0
- package/prisma/skills/customer-billing-ops/SKILL.md +141 -0
- package/prisma/skills/customs-trade-compliance/SKILL.md +263 -0
- package/prisma/skills/dart-flutter-patterns/SKILL.md +564 -0
- package/prisma/skills/dashboard-builder/SKILL.md +109 -0
- package/prisma/skills/data-scraper-agent/SKILL.md +765 -0
- package/prisma/skills/data-throughput-accelerator/SKILL.md +73 -0
- package/prisma/skills/database-migrations/SKILL.md +430 -0
- package/prisma/skills/deep-research/SKILL.md +160 -0
- package/prisma/skills/defi-amm-security/SKILL.md +167 -0
- package/prisma/skills/delivery-gate/SKILL.md +126 -0
- package/prisma/skills/delivery-gate/hooks/quality-gate.py +220 -0
- package/prisma/skills/deployment-patterns/SKILL.md +428 -0
- package/prisma/skills/design-system/SKILL.md +83 -0
- package/prisma/skills/django-celery/SKILL.md +458 -0
- package/prisma/skills/django-patterns/SKILL.md +735 -0
- package/prisma/skills/django-security/SKILL.md +644 -0
- package/prisma/skills/django-tdd/SKILL.md +730 -0
- package/prisma/skills/django-verification/SKILL.md +470 -0
- package/prisma/skills/dmux-workflows/SKILL.md +192 -0
- package/prisma/skills/docker-patterns/SKILL.md +365 -0
- package/prisma/skills/documentation-lookup/SKILL.md +91 -0
- package/prisma/skills/dotnet-patterns/SKILL.md +322 -0
- package/prisma/skills/dynamic-workflow-mode/SKILL.md +124 -0
- package/prisma/skills/e2e-testing/SKILL.md +327 -0
- package/prisma/skills/ecc-guide/SKILL.md +190 -0
- package/prisma/skills/ecc-recipes/SKILL.md +149 -0
- package/prisma/skills/ecc-tools-cost-audit/SKILL.md +161 -0
- package/prisma/skills/email-ops/SKILL.md +122 -0
- package/prisma/skills/energy-procurement/SKILL.md +228 -0
- package/prisma/skills/enterprise-agent-ops/SKILL.md +51 -0
- package/prisma/skills/error-handling/SKILL.md +377 -0
- package/prisma/skills/eval-harness/SKILL.md +271 -0
- package/prisma/skills/evm-token-decimals/SKILL.md +131 -0
- package/prisma/skills/exa-search/SKILL.md +108 -0
- package/prisma/skills/fal-ai-media/SKILL.md +289 -0
- package/prisma/skills/fastapi-patterns/SKILL.md +514 -0
- package/prisma/skills/finance-billing-ops/SKILL.md +128 -0
- package/prisma/skills/flox-environments/SKILL.md +497 -0
- package/prisma/skills/flutter-dart-code-review/SKILL.md +436 -0
- package/prisma/skills/foundation-models-on-device/SKILL.md +243 -0
- package/prisma/skills/frontend-a11y/SKILL.md +446 -0
- package/prisma/skills/frontend-design-direction/SKILL.md +93 -0
- package/prisma/skills/frontend-patterns/SKILL.md +657 -0
- package/prisma/skills/frontend-slides/SKILL.md +185 -0
- package/prisma/skills/frontend-slides/STYLE_PRESETS.md +330 -0
- package/prisma/skills/frontend-slides/animation-patterns.md +122 -0
- package/prisma/skills/frontend-slides/html-template.md +419 -0
- package/prisma/skills/frontend-slides/scripts/export-pdf.sh +418 -0
- package/prisma/skills/frontend-slides/scripts/extract-pptx.py +96 -0
- package/prisma/skills/frontend-slides/viewport-base.css +153 -0
- package/prisma/skills/fsharp-testing/SKILL.md +281 -0
- package/prisma/skills/gan-style-harness/SKILL.md +279 -0
- package/prisma/skills/gateguard/SKILL.md +133 -0
- package/prisma/skills/generating-python-installer/SKILL.md +820 -0
- package/prisma/skills/git-workflow/SKILL.md +716 -0
- package/prisma/skills/github-ops/SKILL.md +145 -0
- package/prisma/skills/golang-patterns/SKILL.md +675 -0
- package/prisma/skills/golang-testing/SKILL.md +721 -0
- package/prisma/skills/google-workspace-ops/SKILL.md +96 -0
- package/prisma/skills/growth-log/SKILL.md +128 -0
- package/prisma/skills/healthcare-cdss-patterns/SKILL.md +246 -0
- package/prisma/skills/healthcare-emr-patterns/SKILL.md +160 -0
- package/prisma/skills/healthcare-eval-harness/SKILL.md +208 -0
- package/prisma/skills/healthcare-phi-compliance/SKILL.md +146 -0
- package/prisma/skills/hermes-imports/SKILL.md +89 -0
- package/prisma/skills/hexagonal-architecture/SKILL.md +277 -0
- package/prisma/skills/hipaa-compliance/SKILL.md +79 -0
- package/prisma/skills/homelab-network-readiness/SKILL.md +170 -0
- package/prisma/skills/homelab-network-setup/SKILL.md +130 -0
- package/prisma/skills/homelab-pihole-dns/SKILL.md +275 -0
- package/prisma/skills/homelab-vlan-segmentation/SKILL.md +312 -0
- package/prisma/skills/homelab-wireguard-vpn/SKILL.md +306 -0
- package/prisma/skills/hookify-rules/SKILL.md +128 -0
- package/prisma/skills/inherit-legacy-style/SKILL.md +157 -0
- package/prisma/skills/intent-driven-development/SKILL.md +360 -0
- package/prisma/skills/inventory-demand-planning/SKILL.md +247 -0
- package/prisma/skills/investor-materials/SKILL.md +97 -0
- package/prisma/skills/investor-outreach/SKILL.md +92 -0
- package/prisma/skills/ios-icon-gen/SKILL.md +158 -0
- package/prisma/skills/ios-icon-gen/scripts/generate_icons.swift +258 -0
- package/prisma/skills/ios-icon-gen/scripts/iconify_gen.sh +235 -0
- package/prisma/skills/iterative-retrieval/SKILL.md +212 -0
- package/prisma/skills/ito-basket-compare/SKILL.md +64 -0
- package/prisma/skills/ito-data-atlas-agent/SKILL.md +64 -0
- package/prisma/skills/ito-market-intelligence/SKILL.md +61 -0
- package/prisma/skills/ito-trade-planner/SKILL.md +68 -0
- package/prisma/skills/java-coding-standards/SKILL.md +384 -0
- package/prisma/skills/jira-integration/SKILL.md +303 -0
- package/prisma/skills/jpa-patterns/SKILL.md +152 -0
- package/prisma/skills/knowledge-ops/SKILL.md +155 -0
- package/prisma/skills/kotlin-coroutines-flows/SKILL.md +285 -0
- package/prisma/skills/kotlin-exposed-patterns/SKILL.md +720 -0
- package/prisma/skills/kotlin-ktor-patterns/SKILL.md +690 -0
- package/prisma/skills/kotlin-patterns/SKILL.md +712 -0
- package/prisma/skills/kotlin-testing/SKILL.md +825 -0
- package/prisma/skills/kubernetes-patterns/SKILL.md +756 -0
- package/prisma/skills/laravel-patterns/SKILL.md +416 -0
- package/prisma/skills/laravel-plugin-discovery/SKILL.md +230 -0
- package/prisma/skills/laravel-security/SKILL.md +948 -0
- package/prisma/skills/laravel-tdd/SKILL.md +675 -0
- package/prisma/skills/laravel-verification/SKILL.md +180 -0
- package/prisma/skills/latency-critical-systems/SKILL.md +74 -0
- package/prisma/skills/lead-intelligence/SKILL.md +322 -0
- package/prisma/skills/lead-intelligence/agents/enrichment-agent.md +85 -0
- package/prisma/skills/lead-intelligence/agents/mutual-mapper.md +75 -0
- package/prisma/skills/lead-intelligence/agents/outreach-drafter.md +98 -0
- package/prisma/skills/lead-intelligence/agents/signal-scorer.md +60 -0
- package/prisma/skills/liquid-glass-design/SKILL.md +279 -0
- package/prisma/skills/llm-trading-agent-security/SKILL.md +147 -0
- package/prisma/skills/logistics-exception-management/SKILL.md +222 -0
- package/prisma/skills/loop-design-check/SKILL.md +143 -0
- package/prisma/skills/mailtrap-email-integration/SKILL.md +77 -0
- package/prisma/skills/make-interfaces-feel-better/SKILL.md +152 -0
- package/prisma/skills/manim-video/SKILL.md +90 -0
- package/prisma/skills/manim-video/assets/network_graph_scene.py +52 -0
- package/prisma/skills/market-research/SKILL.md +76 -0
- package/prisma/skills/marketing-campaign/SKILL.md +114 -0
- package/prisma/skills/mcp-server-patterns/SKILL.md +70 -0
- package/prisma/skills/messages-ops/SKILL.md +105 -0
- package/prisma/skills/ml-adoption-playbook/SKILL.md +57 -0
- package/prisma/skills/mle-workflow/SKILL.md +347 -0
- package/prisma/skills/motion-advanced/SKILL.md +596 -0
- package/prisma/skills/motion-foundations/SKILL.md +299 -0
- package/prisma/skills/motion-patterns/SKILL.md +434 -0
- package/prisma/skills/motion-ui/SKILL.md +576 -0
- package/prisma/skills/mysql-patterns/SKILL.md +413 -0
- package/prisma/skills/nanoclaw-repl/SKILL.md +34 -0
- package/prisma/skills/nestjs-patterns/SKILL.md +231 -0
- package/prisma/skills/netmiko-ssh-automation/SKILL.md +174 -0
- package/prisma/skills/network-bgp-diagnostics/SKILL.md +168 -0
- package/prisma/skills/network-config-validation/SKILL.md +211 -0
- package/prisma/skills/network-interface-health/SKILL.md +153 -0
- package/prisma/skills/nextjs-turbopack/SKILL.md +58 -0
- package/prisma/skills/nodejs-keccak256/SKILL.md +103 -0
- package/prisma/skills/nutrient-document-processing/SKILL.md +168 -0
- package/prisma/skills/nuxt4-patterns/SKILL.md +101 -0
- package/prisma/skills/openclaw-persona-forge/SKILL.md +289 -0
- package/prisma/skills/openclaw-persona-forge/gacha.py +224 -0
- package/prisma/skills/openclaw-persona-forge/gacha.sh +5 -0
- package/prisma/skills/openclaw-persona-forge/references/avatar-style.md +124 -0
- package/prisma/skills/openclaw-persona-forge/references/boundary-rules.md +53 -0
- package/prisma/skills/openclaw-persona-forge/references/error-handling.md +53 -0
- package/prisma/skills/openclaw-persona-forge/references/identity-tension.md +48 -0
- package/prisma/skills/openclaw-persona-forge/references/naming-system.md +39 -0
- package/prisma/skills/openclaw-persona-forge/references/output-template.md +166 -0
- package/prisma/skills/opensource-pipeline/SKILL.md +256 -0
- package/prisma/skills/orch-add-feature/SKILL.md +45 -0
- package/prisma/skills/orch-build-mvp/SKILL.md +49 -0
- package/prisma/skills/orch-change-feature/SKILL.md +43 -0
- package/prisma/skills/orch-fix-defect/SKILL.md +43 -0
- package/prisma/skills/orch-pipeline/SKILL.md +121 -0
- package/prisma/skills/orch-refine-code/SKILL.md +44 -0
- package/prisma/skills/parallel-execution-optimizer/SKILL.md +73 -0
- package/prisma/skills/perl-patterns/SKILL.md +505 -0
- package/prisma/skills/perl-security/SKILL.md +504 -0
- package/prisma/skills/perl-testing/SKILL.md +476 -0
- package/prisma/skills/plan-orchestrate/SKILL.md +263 -0
- package/prisma/skills/plankton-code-quality/SKILL.md +237 -0
- package/prisma/skills/postgres-patterns/SKILL.md +148 -0
- package/prisma/skills/prediction-market-oracle-research/SKILL.md +64 -0
- package/prisma/skills/prediction-market-risk-review/SKILL.md +61 -0
- package/prisma/skills/prisma-patterns/SKILL.md +401 -0
- package/prisma/skills/product-capability/SKILL.md +142 -0
- package/prisma/skills/product-lens/SKILL.md +93 -0
- package/prisma/skills/production-audit/SKILL.md +207 -0
- package/prisma/skills/production-scheduling/SKILL.md +238 -0
- package/prisma/skills/project-flow-ops/SKILL.md +112 -0
- package/prisma/skills/prompt-optimizer/SKILL.md +398 -0
- package/prisma/skills/python-patterns/SKILL.md +751 -0
- package/prisma/skills/python-testing/SKILL.md +817 -0
- package/prisma/skills/pytorch-patterns/SKILL.md +397 -0
- package/prisma/skills/quality-nonconformance/SKILL.md +260 -0
- package/prisma/skills/quarkus-patterns/SKILL.md +723 -0
- package/prisma/skills/quarkus-security/SKILL.md +468 -0
- package/prisma/skills/quarkus-tdd/SKILL.md +812 -0
- package/prisma/skills/quarkus-verification/SKILL.md +480 -0
- package/prisma/skills/ralphinho-rfc-pipeline/SKILL.md +68 -0
- package/prisma/skills/react-native-patterns/SKILL.md +326 -0
- package/prisma/skills/react-patterns/SKILL.md +342 -0
- package/prisma/skills/react-performance/SKILL.md +575 -0
- package/prisma/skills/react-testing/SKILL.md +424 -0
- package/prisma/skills/recsys-pipeline-architect/SKILL.md +115 -0
- package/prisma/skills/recursive-decision-ledger/SKILL.md +80 -0
- package/prisma/skills/redis-patterns/SKILL.md +404 -0
- package/prisma/skills/regex-vs-llm-structured-text/SKILL.md +221 -0
- package/prisma/skills/remotion-video-creation/SKILL.md +43 -0
- package/prisma/skills/remotion-video-creation/rules/3d.md +86 -0
- package/prisma/skills/remotion-video-creation/rules/animations.md +29 -0
- package/prisma/skills/remotion-video-creation/rules/assets/charts-bar-chart.tsx +173 -0
- package/prisma/skills/remotion-video-creation/rules/assets/text-animations-typewriter.tsx +100 -0
- package/prisma/skills/remotion-video-creation/rules/assets/text-animations-word-highlight.tsx +108 -0
- package/prisma/skills/remotion-video-creation/rules/assets.md +78 -0
- package/prisma/skills/remotion-video-creation/rules/audio.md +172 -0
- package/prisma/skills/remotion-video-creation/rules/calculate-metadata.md +104 -0
- package/prisma/skills/remotion-video-creation/rules/can-decode.md +75 -0
- package/prisma/skills/remotion-video-creation/rules/charts.md +58 -0
- package/prisma/skills/remotion-video-creation/rules/compositions.md +146 -0
- package/prisma/skills/remotion-video-creation/rules/display-captions.md +126 -0
- package/prisma/skills/remotion-video-creation/rules/extract-frames.md +229 -0
- package/prisma/skills/remotion-video-creation/rules/fonts.md +152 -0
- package/prisma/skills/remotion-video-creation/rules/get-audio-duration.md +58 -0
- package/prisma/skills/remotion-video-creation/rules/get-video-dimensions.md +68 -0
- package/prisma/skills/remotion-video-creation/rules/get-video-duration.md +58 -0
- package/prisma/skills/remotion-video-creation/rules/gifs.md +138 -0
- package/prisma/skills/remotion-video-creation/rules/images.md +130 -0
- package/prisma/skills/remotion-video-creation/rules/import-srt-captions.md +67 -0
- package/prisma/skills/remotion-video-creation/rules/lottie.md +67 -0
- package/prisma/skills/remotion-video-creation/rules/measuring-dom-nodes.md +34 -0
- package/prisma/skills/remotion-video-creation/rules/measuring-text.md +143 -0
- package/prisma/skills/remotion-video-creation/rules/sequencing.md +106 -0
- package/prisma/skills/remotion-video-creation/rules/tailwind.md +11 -0
- package/prisma/skills/remotion-video-creation/rules/text-animations.md +20 -0
- package/prisma/skills/remotion-video-creation/rules/timing.md +179 -0
- package/prisma/skills/remotion-video-creation/rules/transcribe-captions.md +19 -0
- package/prisma/skills/remotion-video-creation/rules/transitions.md +122 -0
- package/prisma/skills/remotion-video-creation/rules/trimming.md +52 -0
- package/prisma/skills/remotion-video-creation/rules/videos.md +171 -0
- package/prisma/skills/repo-scan/SKILL.md +79 -0
- package/prisma/skills/research-ops/SKILL.md +113 -0
- package/prisma/skills/returns-reverse-logistics/SKILL.md +240 -0
- package/prisma/skills/rules-distill/SKILL.md +265 -0
- package/prisma/skills/rules-distill/scripts/scan-rules.sh +58 -0
- package/prisma/skills/rules-distill/scripts/scan-skills.sh +129 -0
- package/prisma/skills/rust-patterns/SKILL.md +500 -0
- package/prisma/skills/rust-testing/SKILL.md +501 -0
- package/prisma/skills/safety-guard/SKILL.md +76 -0
- package/prisma/skills/santa-method/SKILL.md +307 -0
- package/prisma/skills/scientific-db-pubmed-database/SKILL.md +176 -0
- package/prisma/skills/scientific-db-uspto-database/SKILL.md +178 -0
- package/prisma/skills/scientific-pkg-gget/SKILL.md +167 -0
- package/prisma/skills/scientific-thinking-literature-review/SKILL.md +193 -0
- package/prisma/skills/scientific-thinking-scholar-evaluation/SKILL.md +161 -0
- package/prisma/skills/search-first/SKILL.md +183 -0
- package/prisma/skills/security-bounty-hunter/SKILL.md +100 -0
- package/prisma/skills/security-review/SKILL.md +504 -0
- package/prisma/skills/security-review/cloud-infrastructure-security.md +361 -0
- package/prisma/skills/security-scan/SKILL.md +166 -0
- package/prisma/skills/seo/SKILL.md +155 -0
- package/prisma/skills/skill-comply/SKILL.md +59 -0
- package/prisma/skills/skill-comply/fixtures/compliant_trace.jsonl +5 -0
- package/prisma/skills/skill-comply/fixtures/noncompliant_trace.jsonl +3 -0
- package/prisma/skills/skill-comply/fixtures/tdd_spec.yaml +44 -0
- package/prisma/skills/skill-comply/prompts/classifier.md +24 -0
- package/prisma/skills/skill-comply/prompts/scenario_generator.md +62 -0
- package/prisma/skills/skill-comply/prompts/spec_generator.md +42 -0
- package/prisma/skills/skill-comply/pyproject.toml +15 -0
- package/prisma/skills/skill-comply/scripts/__init__.py +0 -0
- package/prisma/skills/skill-comply/scripts/classifier.py +85 -0
- package/prisma/skills/skill-comply/scripts/grader.py +124 -0
- package/prisma/skills/skill-comply/scripts/parser.py +107 -0
- package/prisma/skills/skill-comply/scripts/report.py +170 -0
- package/prisma/skills/skill-comply/scripts/run.py +127 -0
- package/prisma/skills/skill-comply/scripts/runner.py +194 -0
- package/prisma/skills/skill-comply/scripts/scenario_generator.py +70 -0
- package/prisma/skills/skill-comply/scripts/spec_generator.py +72 -0
- package/prisma/skills/skill-comply/scripts/utils.py +13 -0
- package/prisma/skills/skill-comply/tests/test_grader.py +197 -0
- package/prisma/skills/skill-comply/tests/test_parser.py +90 -0
- package/prisma/skills/skill-comply/tests/test_runner.py +172 -0
- package/prisma/skills/skill-scout/SKILL.md +141 -0
- package/prisma/skills/skill-stocktake/SKILL.md +195 -0
- package/prisma/skills/skill-stocktake/scripts/quick-diff.sh +87 -0
- package/prisma/skills/skill-stocktake/scripts/save-results.sh +56 -0
- package/prisma/skills/skill-stocktake/scripts/scan.sh +170 -0
- package/prisma/skills/social-graph-ranker/SKILL.md +155 -0
- package/prisma/skills/social-publisher/SKILL.md +130 -0
- package/prisma/skills/springboot-patterns/SKILL.md +315 -0
- package/prisma/skills/springboot-security/SKILL.md +273 -0
- package/prisma/skills/springboot-tdd/SKILL.md +159 -0
- package/prisma/skills/springboot-verification/SKILL.md +232 -0
- package/prisma/skills/strategic-compact/SKILL.md +136 -0
- package/prisma/skills/swift-actor-persistence/SKILL.md +144 -0
- package/prisma/skills/swift-concurrency-6-2/SKILL.md +216 -0
- package/prisma/skills/swift-protocol-di-testing/SKILL.md +191 -0
- package/prisma/skills/swiftui-patterns/SKILL.md +259 -0
- package/prisma/skills/taste/SKILL.md +264 -0
- package/prisma/skills/taste/references/genre-taxonomy.md +87 -0
- package/prisma/skills/tdd-workflow/SKILL.md +583 -0
- package/prisma/skills/team-agent-orchestration/SKILL.md +111 -0
- package/prisma/skills/team-builder/SKILL.md +169 -0
- package/prisma/skills/terminal-ops/SKILL.md +110 -0
- package/prisma/skills/tinystruct-patterns/SKILL.md +279 -0
- package/prisma/skills/tinystruct-patterns/references/architecture.md +90 -0
- package/prisma/skills/tinystruct-patterns/references/data-handling.md +60 -0
- package/prisma/skills/tinystruct-patterns/references/database.md +99 -0
- package/prisma/skills/tinystruct-patterns/references/routing.md +64 -0
- package/prisma/skills/tinystruct-patterns/references/system-usage.md +97 -0
- package/prisma/skills/tinystruct-patterns/references/testing.md +72 -0
- package/prisma/skills/token-budget-advisor/SKILL.md +134 -0
- package/prisma/skills/ui-demo/SKILL.md +466 -0
- package/prisma/skills/ui-to-vue/SKILL.md +135 -0
- package/prisma/skills/uncloud/SKILL.md +344 -0
- package/prisma/skills/unified-notifications-ops/SKILL.md +188 -0
- package/prisma/skills/verification-loop/SKILL.md +127 -0
- package/prisma/skills/video-editing/SKILL.md +311 -0
- package/prisma/skills/videodb/SKILL.md +375 -0
- package/prisma/skills/videodb/reference/api-reference.md +550 -0
- package/prisma/skills/videodb/reference/capture-reference.md +407 -0
- package/prisma/skills/videodb/reference/capture.md +101 -0
- package/prisma/skills/videodb/reference/editor.md +443 -0
- package/prisma/skills/videodb/reference/generative.md +331 -0
- package/prisma/skills/videodb/reference/rtstream-reference.md +564 -0
- package/prisma/skills/videodb/reference/rtstream.md +65 -0
- package/prisma/skills/videodb/reference/search.md +230 -0
- package/prisma/skills/videodb/reference/streaming.md +406 -0
- package/prisma/skills/videodb/reference/use-cases.md +118 -0
- package/prisma/skills/videodb/scripts/ws_listener.py +282 -0
- package/prisma/skills/visa-doc-translate/README.md +86 -0
- package/prisma/skills/visa-doc-translate/SKILL.md +117 -0
- package/prisma/skills/vite-patterns/SKILL.md +450 -0
- package/prisma/skills/vue-patterns/SKILL.md +471 -0
- package/prisma/skills/windows-desktop-e2e/SKILL.md +888 -0
- package/prisma/skills/workspace-surface-audit/SKILL.md +126 -0
- package/prisma/skills/x-api/SKILL.md +235 -0
- package/run.mjs +0 -10
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Low-Score Example: Adding Retry Logic
|
|
2
|
+
|
|
3
|
+
## Task
|
|
4
|
+
> Add retry logic with exponential backoff to all HTTP requests in the API client. Max 3 retries, initial delay 1s, max delay 30s.
|
|
5
|
+
|
|
6
|
+
## Agent Output
|
|
7
|
+
|
|
8
|
+
Here's a retry implementation. I think this should work for most cases.
|
|
9
|
+
|
|
10
|
+
I used the urllib3 library since it has built-in retry support. The Retry class handles backoff automatically.
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from urllib3.util import Retry
|
|
14
|
+
|
|
15
|
+
# Add this somewhere in your HTTP client
|
|
16
|
+
retry = Retry(
|
|
17
|
+
total=3,
|
|
18
|
+
backoff_factor=1.0,
|
|
19
|
+
status_forcelist=[500, 502, 503, 504]
|
|
20
|
+
)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
You'll want to test this before deploying. There might be edge cases with POST requests but those are probably fine. I haven't tested the timeout behavior but it should work.
|
|
24
|
+
|
|
25
|
+
Let me know if you need anything else!
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Evaluation Report
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
============================================================
|
|
33
|
+
AGENT SELF-EVALUATION REPORT
|
|
34
|
+
============================================================
|
|
35
|
+
|
|
36
|
+
Accuracy ██░░░ 2/5
|
|
37
|
+
- Hedged claim without verification ("I think this should work")
|
|
38
|
+
- Untested ("I haven't tested the timeout behavior")
|
|
39
|
+
- Speculation without evidence ("those are probably fine")
|
|
40
|
+
→ Wrong library used. Project uses httpx, not urllib3.
|
|
41
|
+
urllib3.util.Retry is incompatible with httpx.
|
|
42
|
+
|
|
43
|
+
Completeness ███░░ 3/5
|
|
44
|
+
- Explicit gap acknowledged ("might be edge cases with POST")
|
|
45
|
+
- Limited scope noted (only mentioned 5xx, not 429 or connection errors)
|
|
46
|
+
→ User asked for "all HTTP requests." Only partial coverage:
|
|
47
|
+
missing 429 handling, connection errors, timeout handling.
|
|
48
|
+
|
|
49
|
+
Clarity ████░ 4/5
|
|
50
|
+
+ Uses code blocks
|
|
51
|
+
→ Code is readable but no explanation of where to add it
|
|
52
|
+
("somewhere in your HTTP client" is vague).
|
|
53
|
+
|
|
54
|
+
Actionability ██░░░ 2/5
|
|
55
|
+
- Defers work to user ("you'll want to test this")
|
|
56
|
+
- Vague suggestion without specifics
|
|
57
|
+
→ No PR, no file created, no test written. User has to:
|
|
58
|
+
1. Figure out where to add the code
|
|
59
|
+
2. Fix the library mismatch (httpx not urllib3)
|
|
60
|
+
3. Write tests
|
|
61
|
+
4. Handle POST idempotency
|
|
62
|
+
5. Test timeout behavior
|
|
63
|
+
|
|
64
|
+
Conciseness ███░░ 3/5
|
|
65
|
+
- Meta-commentary adds words without information
|
|
66
|
+
("Let me know if you need anything else!")
|
|
67
|
+
→ 120 words. Low word count but low information density.
|
|
68
|
+
Half the text is hedging and disclaimers, not substance.
|
|
69
|
+
|
|
70
|
+
OVERALL 2.8/5
|
|
71
|
+
|
|
72
|
+
TOP IMPROVEMENTS (top 3 of the axes scoring < 4):
|
|
73
|
+
[Accuracy] Switch to httpx — grep the codebase to confirm the HTTP
|
|
74
|
+
library before writing code.
|
|
75
|
+
[Actionability] Create a PR with the changed file + test file. Run the
|
|
76
|
+
tests. End with "PR #N ready to merge."
|
|
77
|
+
[Completeness] List what's covered AND what's not. If POST retry is
|
|
78
|
+
unsafe, say so explicitly with reasoning.
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Why This Scores Poorly
|
|
82
|
+
|
|
83
|
+
1. **Accuracy fails at the most basic level** — wrong library. One `grep -r httpx src/` would have caught this. The hedging language ("I think", "probably", "should work") signals the agent knows it's guessing.
|
|
84
|
+
2. **Not actionable.** The user received a code snippet and a list of things they need to do. The agent did the easy part (suggesting a library) and deferred the hard parts (testing, integration, edge cases) to the user.
|
|
85
|
+
3. **Completeness gaps are acknowledged but not fixed.** "Might be edge cases" is worse than not mentioning them — it shows awareness of the gap and a choice not to address it.
|
|
86
|
+
4. **Information density is low.** 120 words, of which ~60 are hedging/disclaimers/politeness. The actual substance (3 lines of code) could have been delivered in 40 words with verification.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Evaluation Criteria — Detailed Scoring Guide
|
|
2
|
+
|
|
3
|
+
This reference provides concrete scoring anchors for each axis. Use it when you're unsure whether a gap merits a 4 vs a 3, or a 2 vs a 1.
|
|
4
|
+
|
|
5
|
+
## Accuracy
|
|
6
|
+
|
|
7
|
+
| Score | Anchor | Example |
|
|
8
|
+
|---|---|---|
|
|
9
|
+
| 5 | All facts verified against tool output, docs, or authoritative sources. No errors. | Configured retry via httpx transport — confirmed in httpx docs. All method names verified with grep against codebase. |
|
|
10
|
+
| 4 | One minor inaccuracy that doesn't affect correctness. | Correct library, wrong default value for one parameter (claimed 0.5s, docs say 1.0s). |
|
|
11
|
+
| 3 | One significant factual error, or 3+ minor inaccuracies. | Used `urllib3.Retry` in an httpx codebase. Works in this one case but wrong library. |
|
|
12
|
+
| 2 | Multiple significant errors. Output would fail if followed. | Claimed "add this to package.json" but project uses pyproject.toml. Two other config claims also wrong. |
|
|
13
|
+
| 1 | Fundamentally incorrect. Output contradicts itself or known facts. | Code has syntax errors. API endpoint doesn't exist. Claims a function signature that grep disproves. |
|
|
14
|
+
|
|
15
|
+
## Completeness
|
|
16
|
+
|
|
17
|
+
| Score | Anchor | Example |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| 5 | All explicit and implicit requirements covered. Edge cases handled. Error paths addressed. | User said "add retry to all HTTP requests." GET, POST, PUT, DELETE all covered. Timeout, 429, 5xx all handled. |
|
|
20
|
+
| 4 | All explicit requirements covered. One implicit requirement missed. | All HTTP methods covered. Forgot to handle connection timeouts (not mentioned but expected). |
|
|
21
|
+
| 3 | One explicit requirement missed, or 2+ implicit gaps. | User said "add logging too." Retry logic added but no logging. |
|
|
22
|
+
| 2 | Multiple explicit requirements missed. Output is a partial solution. | Asked for retry + circuit breaker. Only retry implemented. |
|
|
23
|
+
| 1 | Misses the core request. Delivers something adjacent to what was asked. | Asked for retry logic. Wrote a health check endpoint instead. |
|
|
24
|
+
|
|
25
|
+
## Clarity
|
|
26
|
+
|
|
27
|
+
| Score | Anchor | Example |
|
|
28
|
+
|---|---|---|
|
|
29
|
+
| 5 | Perfectly structured. Jargon explained or avoided. Visual hierarchy helps scanning. No ambiguity. | README with clear sections, code blocks, and a 10-second summary at top. |
|
|
30
|
+
| 4 | Generally clear. One section could be better organized or one term undefined. | Good structure but `exponential backoff` used without explanation — assumes the reader knows it. |
|
|
31
|
+
| 3 | Understandable after re-reading. Multiple organizational issues or undefined terms. | The explanation circles the point before getting to it. Several terms used before defined. |
|
|
32
|
+
| 2 | Confusing in places. Reader would need to ask follow-up questions. | Code works but the PR description doesn't explain why retry was needed or what it fixes. |
|
|
33
|
+
| 1 | Unintelligible or contradictory. Reader cannot determine what was done or why. | Output is a wall of text with no structure. Conclusions contradict earlier statements. |
|
|
34
|
+
|
|
35
|
+
## Actionability
|
|
36
|
+
|
|
37
|
+
| Score | Anchor | Example |
|
|
38
|
+
|---|---|---|
|
|
39
|
+
| 5 | Single action required. Verification path included. No implicit steps. | "Merge this PR. Tests pass: `42 passed`. Deploy with `./deploy.sh`." |
|
|
40
|
+
| 4 | Single action required but verification path is implied, not explicit. | "Merge this PR." (Tests exist but weren't cited. User has to check themselves.) |
|
|
41
|
+
| 3 | Multiple actions required, or one action with unclear next step. | "Review and merge. Then update the config." (Which config? Where? No link or path.) |
|
|
42
|
+
| 2 | User must figure out how to use the output. Missing critical instructions. | Code written but no test file, no run instructions, no PR created. User has to assemble everything. |
|
|
43
|
+
| 1 | Output cannot be acted on without significant rework or clarification. | "Here's a design idea." (No code, no file, no PR. User has to start from scratch.) |
|
|
44
|
+
|
|
45
|
+
## Conciseness
|
|
46
|
+
|
|
47
|
+
| Score | Anchor | Example |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| 5 | Every sentence earns its place. No redundancy. Information density is high. | 30 lines that say what 60 lines would. No repeated points. No filler. |
|
|
50
|
+
| 4 | Minor redundancy. One paragraph could be tightened. | Good overall but repeats the motivation in both the PR description and code comments. |
|
|
51
|
+
| 3 | Noticeable redundancy. 20%+ of content could be removed without loss. | Explains the same concept three times (in summary, body, and conclusion). Verbose examples. |
|
|
52
|
+
| 2 | Significantly bloated. 40%+ of content is filler or repetition. | 200 lines for a task that needed 60. Restates the user's question. Includes irrelevant background. |
|
|
53
|
+
| 1 | Noise-to-signal ratio is inverted. More filler than substance. | 500-line response to a 2-line question. Most of it is boilerplate, repetition, or irrelevant context. |
|
|
54
|
+
|
|
55
|
+
## Edge Cases
|
|
56
|
+
|
|
57
|
+
### When the user gave unclear instructions
|
|
58
|
+
|
|
59
|
+
If the user's request was ambiguous, do NOT penalize completeness for not reading minds. Instead, note in the evaluation: "User's request was ambiguous about [scope]. I chose interpretation [chosen interpretation]. If they meant [alternative interpretation], this score would drop to [score]."
|
|
60
|
+
|
|
61
|
+
### When the task is inherently simple
|
|
62
|
+
|
|
63
|
+
A 3-line bug fix can legitimately score 5/5/5/5/5. The rubric scales with complexity — a simple task done perfectly IS a 5.0. Don't invent gaps to justify lower scores.
|
|
64
|
+
|
|
65
|
+
### When you caught your own error mid-task
|
|
66
|
+
|
|
67
|
+
If you made an error, caught it, and fixed it before delivering — that's a 5 on Accuracy for the final output. The evaluation is about what the user received, not your internal process. Note the self-correction as evidence of thoroughness, not as a penalty.
|
|
68
|
+
|
|
69
|
+
### When the tool output contradicts your claim
|
|
70
|
+
|
|
71
|
+
If you claimed "tests pass" but the terminal output shows a failure — that's an automatic Accuracy ≤ 2. Tool output is ground truth. Claims without verification are the most common source of low accuracy scores.
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Hook Integration for Session-Stop Self-Evaluation
|
|
2
|
+
|
|
3
|
+
Add this hook to `hooks/hooks.json` to remind the agent to self-evaluate at the end of every session (the hook echoes a reminder; it does not run the evaluator automatically):
|
|
4
|
+
|
|
5
|
+
```json
|
|
6
|
+
{
|
|
7
|
+
"hooks": {
|
|
8
|
+
"Stop": [
|
|
9
|
+
{
|
|
10
|
+
"hooks": [
|
|
11
|
+
{
|
|
12
|
+
"type": "command",
|
|
13
|
+
"command": "echo '[Self-Eval] Session complete. Consider running agent-self-evaluation to rate your output.'"
|
|
14
|
+
}
|
|
15
|
+
],
|
|
16
|
+
"description": "Remind agent to self-evaluate at session end"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
`Stop` events do not require a `matcher` field (it is optional for `Stop`, `Notification`, `UserPromptSubmit`, and `SubagentStop` per `scripts/ci/validate-hooks.js`). If omitted, the hook object only needs `hooks` and metadata such as `description`.
|
|
24
|
+
|
|
25
|
+
## Integration with the Python Evaluator
|
|
26
|
+
|
|
27
|
+
The `scripts/evaluate.py` script can be used as a standalone tool:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# Pipe agent output directly
|
|
31
|
+
echo "Your agent response here" | python3 skills/agent-self-evaluation/scripts/evaluate.py
|
|
32
|
+
|
|
33
|
+
# From files
|
|
34
|
+
python3 skills/agent-self-evaluation/scripts/evaluate.py --task task.txt --output response.txt
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
To integrate it into hooks, capture the last agent output to a file first, then run the evaluator. For lightweight reminders after shell-based verification, use a simple supported matcher string:
|
|
38
|
+
|
|
39
|
+
```json
|
|
40
|
+
{
|
|
41
|
+
"hooks": {
|
|
42
|
+
"PostToolUse": [
|
|
43
|
+
{
|
|
44
|
+
"matcher": "Bash",
|
|
45
|
+
"hooks": [
|
|
46
|
+
{
|
|
47
|
+
"type": "command",
|
|
48
|
+
"command": "echo '[Self-Eval] If this command completed verification for a non-trivial task, consider running agent-self-evaluation.'"
|
|
49
|
+
}
|
|
50
|
+
],
|
|
51
|
+
"description": "Remind agent to self-evaluate after shell verification"
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The plain `Bash` matcher is used here because command-expression matcher syntax is not supported by every harness. If your harness does support command-level matcher expressions, prefer a word-boundary regex such as `\b(pytest|npm test|go test)\b` rather than a broad `test` substring.
|
|
59
|
+
|
|
60
|
+
These hooks are opt-in. Add them to your local `hooks/hooks.json` if you want automated evaluation prompts.
|
|
61
|
+
|
|
62
|
+
## Manual Usage (Recommended)
|
|
63
|
+
|
|
64
|
+
The most reliable approach is manual invocation — the agent runs self-evaluation as part of its workflow when the `agent-self-evaluation` skill is active, without requiring hook configuration. The skill's "When to Activate" section already covers trigger conditions (multi-file changes, debugging sessions, design documents).
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Standalone agent output evaluator using the 5-axis rubric.
|
|
3
|
+
|
|
4
|
+
Reads a task description and agent output from stdin or files,
|
|
5
|
+
scores each axis, and prints a structured evaluation report.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
# Pipe output directly
|
|
9
|
+
echo "Agent output here" | evaluate.py --task task.txt
|
|
10
|
+
|
|
11
|
+
# From files
|
|
12
|
+
evaluate.py --task task.txt --output response.txt
|
|
13
|
+
|
|
14
|
+
# Interactive (reads task from prompt, output from stdin)
|
|
15
|
+
evaluate.py --interactive
|
|
16
|
+
|
|
17
|
+
The evaluator uses keyword heuristics + structural checks as a first pass.
|
|
18
|
+
For production use, pair with an LLM judge for semantic understanding.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Optional
|
|
26
|
+
|
|
27
|
+
# Tunable thresholds for evaluation heuristics
# A single paragraph with more words than this is flagged as a wall of text.
WALL_OF_TEXT_WORDS = 200
# Outputs longer than this many words are expected to open with a summary.
SUMMARY_CHECK_WORDS = 300
# Number of leading words scanned for a summary/TLDR/overview marker.
SUMMARY_CHECK_FIRST_N = 100
# Output/task word-count ratio above which conciseness is capped at 3.
TASK_OUTPUT_RATIO_HIGH = 15
# Output/task word-count ratio above which conciseness is capped at 4.
TASK_OUTPUT_RATIO_MEDIUM = 8
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class AxisScore:
    """Result of scoring a single rubric axis."""

    # Axis label shown in the report, e.g. "Accuracy".
    name: str
    # Score assigned by the check function; the report renders it out of 5.
    score: int
    # Report lines explaining the score; the checks usually prefix them
    # with "+" (positive signal) or "-" (negative signal).
    evidence: list[str] = field(default_factory=list)
    # Suggested fix; the check functions set it only when the score is below 5.
    improvement: Optional[str] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return sum(1 for _ in tokens)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def check_accuracy(text: str) -> AxisScore:
    """Score the Accuracy axis from verification and hedging keywords.

    Positive patterns (passing tests, clean exit code, ...) are recorded as
    evidence only; they never raise the score. Each distinct negative pattern
    found costs one point: 0 -> 5, 1 -> 4, 2 -> 3, 3 or more -> 2.

    Args:
        text: The agent output to inspect.

    Returns:
        An ``AxisScore`` named "Accuracy". ``improvement`` is set whenever the
        score is below 5.
    """
    evidence = []

    # Positive signals: verified claims
    verified_patterns = [
        (r"(?i)(tests?\s+pass|all\s+tests?\s+passing|\d+\s+passed)", "Tests passing"),
        # Trailing \b so "exit code 01" is not read as a clean exit.
        (r"(?i)(exit\s+code\s*[:=]?\s*0\b|exited\s+with\s+0\b)", "Clean exit code"),
        # Leading \b so "10 errors" is not read as "0 errors".
        (r"(?i)(lint.*clean|no\s+lint\s+errors|\b0\s+errors)", "Lint clean"),
        (r"(?i)(verified|confirmed|validated)\s+(with|against|using|by)", "Explicit verification"),
        (r"(?i)(grep|rg)\s+.*\b(found|matched|returned)", "Grep confirmed"),
    ]
    for pattern, label in verified_patterns:
        if re.search(pattern, text):
            evidence.append(f"+ {label}")

    # Negative signals: unverified claims
    danger_patterns = [
        (r"(?i)(should\s+work|probably\s+fine|should\s+be\s+ok)", "Hedged claim without verification"),
        (r"(?i)(I\s+think|I\s+believe|I\s+assume|might\s+be)", "Speculation without evidence"),
        (r"(?i)(untested|not\s+tested|haven'?t\s+tested)", "Explicitly untested"),
        # Whole-word match only: "mastodon" or "hackathon" are not markers.
        (r"(?i)\b(TODO|FIXME|HACK|WORKAROUND)\b", "Unresolved TODO/FIXME"),
    ]
    deductions = 0
    for pattern, label in danger_patterns:
        if re.search(pattern, text):
            deductions += 1
            evidence.append(f"- {label}")

    # One point per negative signal, floored at 2.
    score = max(2, 5 - deductions)

    if not evidence:
        evidence.append("No verification signals detected — score assumes correctness")

    result = AxisScore(name="Accuracy", score=score, evidence=evidence)
    if score < 5:
        result.improvement = "Cite specific tool outputs (test results, exit codes, grep findings) to back claims"
    return result
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def check_completeness(text: str) -> AxisScore:
    """Score the Completeness axis from coverage and gap keywords.

    Coverage markers are listed as positive evidence only. Each distinct gap
    marker costs one point, down to a minimum score of 3.
    """
    coverage_markers = (
        (r"(?i)(edge\s*cases?|corner\s*cases?)", "Edge cases addressed"),
        (r"(?i)(error\s*handling|exception\s*handling|try/except|try\s*{)", "Error handling present"),
        (r"(?i)(all\s+\w+\s+(methods|endpoints|routes))", "Full coverage claimed"),
        (r"(?i)(verification|verified\s+that|confirmed\s+that)", "Verification step present"),
    )
    gap_markers = (
        (r"(?i)(not\s+covered|not\s+handled|out\s+of\s+scope)", "Explicit gap acknowledged"),
        (r"(?i)(only\s+(works|handles|supports)\s+\w+)", "Limited scope noted"),
        (r"(?i)(assume[sd]?\s+that|assuming\s+the)", "Assumption without verification"),
    )

    # Positives are reported first, then the gaps, each in declaration order.
    notes = [f"+ {label}" for regex, label in coverage_markers if re.search(regex, text)]
    gaps = [label for regex, label in gap_markers if re.search(regex, text)]
    notes.extend(f"- {label}" for label in gaps)

    # 0 gaps -> 5, 1 gap -> 4, 2 or more -> 3.
    points = 5 - min(len(gaps), 2)

    if not notes:
        notes.append("No completeness signals — unable to assess coverage")

    advice = None
    if points < 5:
        advice = "List what was covered AND what was intentionally excluded, with reasoning"
    return AxisScore(name="Completeness", score=points, evidence=notes, improvement=advice)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _check_jargon(text: str) -> tuple[int, list[str]]:
    """Return a clarity deduction for unexplained domain jargon.

    The text is scanned for known terms, one jargon domain at a time. The first
    domain that has a term in the text but no explanation marker (the domain
    name itself, "means", "refers to", "i.e." or "in other words", anywhere in
    the text) produces a single deduction.

    Args:
        text: The agent output to inspect.

    Returns:
        ``(1, [evidence_line])`` for the first unexplained domain, otherwise
        ``(0, [])``.
    """
    jargon = [
        (r"(?i:\b(idempotent|race condition|deadlock|thundering herd)\b)", "concurrency"),
        (r"(?i:\b(exponential backoff|circuit breaker|bulkhead)\b)", "resilience"),
        # The acronyms stay case-sensitive so that the ordinary words "acid"
        # and "cap" are not reported as database jargon.
        (r"\b(ACID|CAP|(?i:eventual consistency|linearizability))\b", "database theory"),
    ]
    explanation_pattern = r"(?i)({domain}|means|refers to|i\.e\.|in other words)"
    for pattern, domain in jargon:
        has_term = re.search(pattern, text)
        explains_term = re.search(explanation_pattern.format(domain=domain), text)
        if has_term and not explains_term:
            return 1, [f"- Domain term used without explanation ({domain})"]
    return 0, []
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _check_summary(text: str) -> tuple[int, list[str]]:
    """Return a clarity deduction when a long output lacks an early summary.

    Only outputs longer than ``SUMMARY_CHECK_WORDS`` words are checked. The
    first ``SUMMARY_CHECK_FIRST_N`` words are searched, case-insensitively, for
    a summary marker.

    Args:
        text: The agent output to inspect.

    Returns:
        ``(1, [evidence_line])`` when the deduction applies, otherwise
        ``(0, [])``.
    """
    # "tl;dr" is listed explicitly: the punctuated spelling does not contain
    # the substring "tldr".
    summary_terms = ["summary", "tldr", "tl;dr", "overview", "in short"]
    opening = " ".join(text.split()[:SUMMARY_CHECK_FIRST_N]).lower()
    has_early_summary = any(term in opening for term in summary_terms)
    if not has_early_summary and count_words(text) > SUMMARY_CHECK_WORDS:
        # Built from the constants so the message stays true when they are tuned.
        return 1, [
            f"- No summary/TLDR in first {SUMMARY_CHECK_FIRST_N} words "
            f"(text is {SUMMARY_CHECK_WORDS}+ words)"
        ]
    return 0, []
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def check_clarity(text: str) -> AxisScore:
    """Score the Clarity axis from structure, paragraph length, jargon and summary.

    Structural markers (headings, code fences, bullets) are positive evidence
    only. A wall-of-text paragraph, unexplained jargon and a missing early
    summary each cost one point.
    """
    structure_markers = (
        (r"^#{1,3}\s+", re.MULTILINE, "+ Uses headings for structure"),
        (r"```", 0, "+ Uses code blocks"),
        (r"^\s*[-*]\s+", re.MULTILINE, "+ Uses bullet points"),
    )
    notes = [label for regex, flags, label in structure_markers if re.search(regex, text, flags)]

    penalty = 0
    paragraphs = (chunk for chunk in text.split("\n\n") if chunk.strip())
    # At most one wall-of-text deduction, however many long paragraphs exist.
    if any(count_words(chunk) > WALL_OF_TEXT_WORDS for chunk in paragraphs):
        penalty += 1
        notes.append("- Wall-of-text paragraph (>200 words without break)")

    jargon_penalty, jargon_notes = _check_jargon(text)
    summary_penalty, summary_notes = _check_summary(text)
    penalty += jargon_penalty + summary_penalty
    notes.extend(jargon_notes + summary_notes)

    # 0 -> 5, 1 -> 4, 2 -> 3, 3 or more -> 2.
    points = max(2, 5 - penalty)

    if not notes:
        notes.append("+ Well-structured with no clarity issues detected")

    outcome = AxisScore(name="Clarity", score=points, evidence=notes)
    if points < 5:
        outcome.improvement = "Add headings, break long paragraphs, define domain terms on first use"
    return outcome
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def check_actionability(text: str) -> AxisScore:
    """Score the Actionability axis: can the user act on the output immediately?

    Positive patterns (PR ready, run command, next steps, file path) are
    recorded as evidence only. Each distinct vague or deferring pattern costs
    one point: 0 -> 5, 1 -> 4, 2 -> 3, 3 -> 2.

    Args:
        text: The agent output to inspect.

    Returns:
        An ``AxisScore`` named "Actionability". ``improvement`` is set whenever
        the score is below 5.
    """
    evidence = []

    # Positive signals
    actionable_signals = [
        # Word boundaries matter under (?i): otherwise the "pr" in "process"
        # followed by the "ready" in "already" would count as a PR.
        (r"(?i)\b(merge\w*|PRs?|pull requests?)\b.*?\b(created|ready|open(?:ed)?)\b", "PR created"),
        # Leading \b so "overrun the" is not read as a run command.
        (r"(?i)\b(run|execute)\s+[`\"']?[\w./-]+", "Specific run command given"),
        (r"(?i)(next\s+steps?|follow[- ]up|what\s+to\s+do)", "Next steps provided"),
        (r"(?i)(file\s+(created|written|modified|updated)\s+at)", "File path specified"),
    ]
    for pattern, label in actionable_signals:
        if re.search(pattern, text):
            evidence.append(f"+ {label}")

    # Negative signals
    vague_signals = [
        (r"(?i)(you\s+(should|could|might\s+want\s+to))\s+\w+", "Vague suggestion without specifics"),
        (r"(?i)(consider|maybe|perhaps)\s+\w+ing", "Non-committal suggestion"),
        (r"(?i)(figure\s+out|look\s+into|investigate)\s", "Defers work to user"),
    ]
    deductions = 0
    for pattern, label in vague_signals:
        if re.search(pattern, text):
            deductions += 1
            evidence.append(f"- {label}")

    # One point per negative signal, floored at 2.
    score = max(2, 5 - deductions)

    if not evidence:
        evidence.append("No actionability signals — user may need to ask 'what now?'")

    result = AxisScore(name="Actionability", score=score, evidence=evidence)
    if score < 5:
        result.improvement = "End with a single clear action: 'Merge this PR', 'Run ./deploy.sh', or 'Review the 3 changed files'"
    return result
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def check_conciseness(text: str, task: Optional[str] = None) -> AxisScore:
    """Score the Conciseness axis from length ratio and repetition phrases.

    When a task description is supplied, a high output-to-task word ratio caps
    the score. A redundancy phrase counts only when it occurs more than twice;
    one such phrase caps the score at 4, two or more cap it at 3.
    """
    notes = []
    ceiling = 5

    # Heuristic: task-to-output ratio
    if task:
        ratio = count_words(text) / max(count_words(task), 1)
        if ratio > TASK_OUTPUT_RATIO_HIGH:
            notes.append(f"- Output is {ratio:.0f}x longer than task description (high ratio)")
            ceiling = min(ceiling, 3)
        elif ratio > TASK_OUTPUT_RATIO_MEDIUM:
            notes.append(f"- Output is {ratio:.0f}x longer than task description")
            ceiling = min(ceiling, 4)

    # Redundancy signals
    phrase_checks = (
        (r"(?i)(as\s+(I|we)\s+(mentioned|said|noted|discussed)\s+(earlier|above|before))",
         "Refers back to earlier statement (possible repetition)"),
        (r"(?i)(to\s+summarize|in\s+summary|in\s+conclusion|to\s+conclude)",
         "Has explicit summary (good if needed, flag if redundant)"),
        (r"(?i)(let\s+me\s+(explain|break\s+this\s+down|walk\s+you\s+through))",
         "Meta-commentary adds words without information"),
    )
    flagged = 0
    for regex, label in phrase_checks:
        hits = len(re.findall(regex, text))
        if hits <= 2:
            continue
        flagged += 1
        notes.append(f"- '{label}' appears {hits} times")

    if flagged >= 2:
        ceiling = min(ceiling, 3)
    elif flagged == 1:
        ceiling = min(ceiling, 4)

    if ceiling == 5 and not notes:
        notes.append("+ No redundancy detected. Information density appears good.")

    outcome = AxisScore(name="Conciseness", score=ceiling, evidence=notes)
    if ceiling < 5:
        outcome.improvement = "Cut meta-commentary, remove repeated points, trim examples to one representative case"
    return outcome
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def evaluate(task: Optional[str], output: str) -> list[AxisScore]:
    """Score *output* on all five axes and return the results in report order.

    Only the conciseness check receives the task description.
    """
    output_only_checks = (
        check_accuracy,
        check_completeness,
        check_clarity,
        check_actionability,
    )
    results = [check(output) for check in output_only_checks]
    results.append(check_conciseness(output, task))
    return results
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def format_report(scores: list[AxisScore]) -> str:
    """Format axis scores into a readable plain-text evaluation report.

    The report holds a header with the average, one section per axis (score
    bar, evidence, improvement), the critical issues (score <= 2), the top
    improvements (score < 4, weakest first) and a one-line verdict.

    Args:
        scores: The scored axes, in the order they should be printed.

    Returns:
        The report as a single newline-joined string.

    Raises:
        ValueError: If ``scores`` is empty; an average and a weakest axis
            cannot be computed from nothing.
    """
    if not scores:
        raise ValueError("cannot format a report without any axis scores")

    avg = sum(s.score for s in scores) / len(scores)
    lines = [
        "=" * 60,
        "AGENT SELF-EVALUATION REPORT",
        "=" * 60,
        # The axis count comes from the input rather than being hard-coded.
        f"Summary: Overall score {avg:.1f}/5 across {len(scores)} quality axes.",
        "",
    ]

    for s in scores:
        bar = "█" * s.score + "░" * (5 - s.score)
        lines.append(f" {s.name:<15} {bar} {s.score}/5")
        lines.extend(f" {e}" for e in s.evidence)
        if s.improvement:
            lines.append(f" → {s.improvement}")
        lines.append("")

    lines.append(f" {'OVERALL':<15} {avg:.1f}/5")
    lines.append("")

    # Critical issues (axes ≤ 2)
    critical = [(s, s.improvement or "No improvement suggested") for s in scores if s.score <= 2]
    lines.append("CRITICAL ISSUES (axes ≤ 2):")
    if critical:
        for s, imp in critical:
            lines.append(f" [{s.name}] Score {s.score}/5 — {imp}")
    else:
        lines.append(" None")

    lines.append("")
    lines.append("Self-check: Would the user agree with this assessment? [Yes/No + brief justification]")
    lines.append("")

    # Top improvements (axes scoring < 4, weakest first; ties keep input order)
    improvements = [(s, s.improvement) for s in scores if s.improvement and s.score < 4]
    lines.append("TOP IMPROVEMENTS:")
    if improvements:
        for i, (s, imp) in enumerate(sorted(improvements, key=lambda x: x[0].score), 1):
            lines.append(f" {i}. [{s.name}] {imp}")
    else:
        lines.append(" No axes below 4. Strong output across all dimensions.")

    lines.append("")

    # Verdict
    weakest = min(scores, key=lambda s: s.score)
    min_score = weakest.score
    if min_score <= 2:
        verdict = f"Redo with specific fixes. Weakest axis: {weakest.name} ({min_score}/5)."
    elif any(s.score <= 3 for s in scores):
        weak = [s.name for s in scores if s.score <= 3]
        verdict = f"Fix {'/'.join(weak)} issues, then deliver."
    elif avg >= 4.5:
        verdict = "Deliver as-is. No changes needed."
    else:
        verdict = "Deliver as-is. Minor improvements noted above."
    lines.append(f"VERDICT: {verdict}")

    return "\n".join(lines)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _read_file_or_text(path: Optional[str], *, required: bool = False) -> Optional[str]:
    """Read a file path, or return the argument as inline text when allowed.

    Args:
        path: A file path or, when ``required`` is false, literal text.
            ``None`` is passed through unchanged.
        required: When true the argument must name a readable file; otherwise
            an error is printed to stderr and the process exits with status 1.

    Returns:
        The file contents, the argument itself when it does not name a file
        and ``required`` is false, or ``None`` when ``path`` is ``None``.

    Raises:
        OSError: For failures that do not mean "this is not a file path",
            such as a permission error on an existing file.
    """
    import errno

    if path is None:
        return None
    try:
        handle = open(path)
    except (FileNotFoundError, NotADirectoryError):
        pass
    except OSError as exc:
        # Inline text is often not a legal file name (too long, or containing
        # characters the OS rejects); treat that like a missing file.
        if exc.errno not in (errno.ENAMETOOLONG, errno.EINVAL):
            raise
    except ValueError:
        # open() rejects strings with an embedded NUL byte.
        pass
    else:
        # Only open() is guarded, so errors raised while reading still propagate.
        with handle:
            return handle.read()

    if required:
        print(f"Error: output file '{path}' not found", file=sys.stderr)
        sys.exit(1)
    return path
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _read_input(args: argparse.Namespace) -> tuple[Optional[str], str]:
    """Return ``(task, output)`` for interactive, file, or pipe mode.

    Interactive mode prompts for the task and reads the output from stdin.
    Otherwise the task comes from ``--task``, and the output comes from the
    ``--output`` file when given, else from stdin.
    """
    if not args.interactive:
        task_text = _read_file_or_text(args.task)
        if args.output:
            output_text = _read_file_or_text(args.output, required=True) or ""
        else:
            output_text = sys.stdin.read()
        return task_text, output_text

    prompted_task = input("Task description: ").strip()
    print("Paste agent output (Ctrl+D to finish):")
    return prompted_task, sys.stdin.read()
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def main() -> None:
    """Parse the command line, evaluate the agent output and print the report.

    Exits with status 1 when there is no output to evaluate.
    """
    cli = argparse.ArgumentParser(description="Evaluate agent output against the 5-axis rubric")
    cli.add_argument("--task", help="Task description (file path or inline text)")
    cli.add_argument("--output", help="Agent output to evaluate (file path)")
    cli.add_argument("--interactive", action="store_true", help="Prompt for task and read output from stdin")
    options = cli.parse_args()

    task, output = _read_input(options)
    if output:
        print(format_report(evaluate(task, output)))
        return

    print("Error: no output to evaluate", file=sys.stderr)
    sys.exit(1)


if __name__ == "__main__":
    main()
|