npm - aw-ecc - Versions diffs - 1.4.32 → 1.4.47 - Mend

aw-ecc 1.4.32 → 1.4.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (258) hide show

package/.claude-plugin/plugin.json +1 -1
package/.codex/hooks/aw-post-tool-use.sh +8 -2
package/.codex/hooks/aw-session-start.sh +11 -4
package/.codex/hooks/aw-stop.sh +8 -2
package/.codex/hooks/aw-user-prompt-submit.sh +10 -2
package/.codex/hooks.json +8 -8
package/.cursor/INSTALL.md +7 -5
package/.cursor/hooks/adapter.js +41 -4
package/.cursor/hooks/after-agent-response.js +62 -0
package/.cursor/hooks/before-submit-prompt.js +7 -1
package/.cursor/hooks/post-tool-use-failure.js +21 -0
package/.cursor/hooks/post-tool-use.js +39 -0
package/.cursor/hooks/shared/aw-phase-definitions.js +53 -0
package/.cursor/hooks/shared/aw-phase-runner.js +3 -1
package/.cursor/hooks/subagent-start.js +22 -4
package/.cursor/hooks/subagent-stop.js +18 -1
package/.cursor/hooks.json +23 -2
package/.opencode/package.json +1 -1
package/AGENTS.md +3 -3
package/README.md +5 -5
package/commands/adk.md +52 -0
package/commands/build.md +22 -9
package/commands/deploy.md +12 -0
package/commands/execute.md +9 -0
package/commands/feature.md +333 -0
package/commands/investigate.md +18 -5
package/commands/plan.md +23 -9
package/commands/publish.md +65 -0
package/commands/review.md +12 -0
package/commands/ship.md +12 -0
package/commands/test.md +12 -0
package/commands/verify.md +9 -0
package/hooks/hooks.json +36 -0
package/manifests/install-components.json +8 -0
package/manifests/install-modules.json +83 -0
package/manifests/install-profiles.json +7 -0
package/package.json +1 -1
package/scripts/ci/validate-rules.js +51 -0
package/scripts/cursor-aw-home/hooks.json +23 -2
package/scripts/cursor-aw-hooks/adapter.js +41 -4
package/scripts/cursor-aw-hooks/before-submit-prompt.js +7 -1
package/scripts/hooks/aw-usage-commit-created.js +32 -0
package/scripts/hooks/aw-usage-post-tool-use-failure.js +56 -0
package/scripts/hooks/aw-usage-post-tool-use.js +242 -0
package/scripts/hooks/aw-usage-prompt-submit.js +112 -0
package/scripts/hooks/aw-usage-session-start.js +48 -0
package/scripts/hooks/aw-usage-stop.js +182 -0
package/scripts/hooks/aw-usage-telemetry-send.js +84 -0
package/scripts/hooks/cost-tracker.js +3 -23
package/scripts/hooks/shared/aw-phase-definitions.js +53 -0
package/scripts/hooks/shared/aw-phase-runner.js +3 -1
package/scripts/lib/aw-hook-contract.js +2 -2
package/scripts/lib/aw-pricing.js +306 -0
package/scripts/lib/aw-usage-telemetry.js +472 -0
package/scripts/lib/codex-hook-config.js +8 -8
package/scripts/lib/cursor-hook-config.js +25 -10
package/scripts/lib/install-targets/cursor-project.js +3 -0
package/scripts/lib/install-targets/helpers.js +20 -3
package/skills/aw-adk/SKILL.md +317 -0
package/skills/aw-adk/agents/analyzer.md +113 -0
package/skills/aw-adk/agents/comparator.md +113 -0
package/skills/aw-adk/agents/grader.md +115 -0
package/skills/aw-adk/assets/eval_review.html +76 -0
package/skills/aw-adk/eval-viewer/generate_review.py +164 -0
package/skills/aw-adk/eval-viewer/viewer.html +181 -0
package/skills/aw-adk/evals/eval-colocated-placement.md +84 -0
package/skills/aw-adk/evals/eval-create-agent.md +90 -0
package/skills/aw-adk/evals/eval-create-command.md +98 -0
package/skills/aw-adk/evals/eval-create-eval.md +89 -0
package/skills/aw-adk/evals/eval-create-rule.md +99 -0
package/skills/aw-adk/evals/eval-create-skill.md +97 -0
package/skills/aw-adk/evals/eval-delete-agent.md +79 -0
package/skills/aw-adk/evals/eval-delete-command.md +89 -0
package/skills/aw-adk/evals/eval-delete-rule.md +86 -0
package/skills/aw-adk/evals/eval-delete-skill.md +90 -0
package/skills/aw-adk/evals/eval-meta-eval-coverage.md +78 -0
package/skills/aw-adk/evals/eval-meta-eval-determinism.md +81 -0
package/skills/aw-adk/evals/eval-meta-eval-false-pass.md +81 -0
package/skills/aw-adk/evals/eval-score-accuracy.md +95 -0
package/skills/aw-adk/evals/eval-type-redirect.md +68 -0
package/skills/aw-adk/evals/evals.json +96 -0
package/skills/aw-adk/references/artifact-wiring.md +162 -0
package/skills/aw-adk/references/cross-ide-mapping.md +71 -0
package/skills/aw-adk/references/eval-placement-guide.md +183 -0
package/skills/aw-adk/references/external-resources.md +75 -0
package/skills/aw-adk/references/getting-started.md +66 -0
package/skills/aw-adk/references/registry-structure.md +152 -0
package/skills/aw-adk/references/rubric-agent.md +36 -0
package/skills/aw-adk/references/rubric-command.md +36 -0
package/skills/aw-adk/references/rubric-eval.md +36 -0
package/skills/aw-adk/references/rubric-meta-eval.md +132 -0
package/skills/aw-adk/references/rubric-rule.md +36 -0
package/skills/aw-adk/references/rubric-skill.md +36 -0
package/skills/aw-adk/references/schemas.md +222 -0
package/skills/aw-adk/references/template-agent.md +251 -0
package/skills/aw-adk/references/template-command.md +279 -0
package/skills/aw-adk/references/template-eval.md +176 -0
package/skills/aw-adk/references/template-rule.md +119 -0
package/skills/aw-adk/references/template-skill.md +123 -0
package/skills/aw-adk/references/type-classifier.md +98 -0
package/skills/aw-adk/references/writing-good-agents.md +227 -0
package/skills/aw-adk/references/writing-good-commands.md +258 -0
package/skills/aw-adk/references/writing-good-evals.md +271 -0
package/skills/aw-adk/references/writing-good-rules.md +214 -0
package/skills/aw-adk/references/writing-good-skills.md +159 -0
package/skills/aw-adk/scripts/aggregate-benchmark.py +190 -0
package/skills/aw-adk/scripts/lint-artifact.sh +211 -0
package/skills/aw-adk/scripts/score-artifact.sh +179 -0
package/skills/aw-adk/scripts/trigger-eval.py +192 -0
package/skills/aw-build/SKILL.md +19 -2
package/skills/aw-deploy/SKILL.md +65 -3
package/skills/aw-design/SKILL.md +156 -0
package/skills/aw-design/references/highrise-tokens.md +394 -0
package/skills/aw-design/references/micro-interactions.md +76 -0
package/skills/aw-design/references/prompt-template.md +160 -0
package/skills/aw-design/references/quality-checklist.md +70 -0
package/skills/aw-design/references/self-review.md +497 -0
package/skills/aw-design/references/stitch-workflow.md +127 -0
package/skills/aw-feature/SKILL.md +293 -0
package/skills/aw-investigate/SKILL.md +17 -0
package/skills/aw-plan/SKILL.md +34 -3
package/skills/aw-publish/SKILL.md +300 -0
package/skills/aw-publish/evals/eval-confirmation-gate.md +60 -0
package/skills/aw-publish/evals/eval-intent-detection.md +111 -0
package/skills/aw-publish/evals/eval-push-modes.md +67 -0
package/skills/aw-publish/evals/eval-rules-push.md +60 -0
package/skills/aw-publish/evals/evals.json +29 -0
package/skills/aw-publish/references/push-modes.md +38 -0
package/skills/aw-review/SKILL.md +88 -9
package/skills/aw-rules-review/SKILL.md +124 -0
package/skills/aw-rules-review/agents/openai.yaml +3 -0
package/skills/aw-rules-review/scripts/generate-review-template.mjs +323 -0
package/skills/aw-ship/SKILL.md +16 -0
package/skills/aw-spec/SKILL.md +15 -0
package/skills/aw-tasks/SKILL.md +15 -0
package/skills/aw-test/SKILL.md +16 -0
package/skills/aw-yolo/SKILL.md +4 -0
package/skills/diagnose/SKILL.md +121 -0
package/skills/diagnose/scripts/hitl-loop.template.sh +41 -0
package/skills/finish-only-when-green/SKILL.md +265 -0
package/skills/grill-me/SKILL.md +24 -0
package/skills/grill-with-docs/SKILL.md +92 -0
package/skills/grill-with-docs/adr-format.md +47 -0
package/skills/grill-with-docs/context-format.md +67 -0
package/skills/improve-codebase-architecture/SKILL.md +75 -0
package/skills/improve-codebase-architecture/deepening.md +37 -0
package/skills/improve-codebase-architecture/interface-design.md +44 -0
package/skills/improve-codebase-architecture/language.md +53 -0
package/skills/local-ghl-setup-from-screenshot/SKILL.md +538 -0
package/skills/tdd/SKILL.md +115 -0
package/skills/tdd/deep-modules.md +33 -0
package/skills/tdd/interface-design.md +31 -0
package/skills/tdd/mocking.md +59 -0
package/skills/tdd/refactoring.md +10 -0
package/skills/tdd/tests.md +61 -0
package/skills/to-issues/SKILL.md +62 -0
package/skills/to-prd/SKILL.md +75 -0
package/skills/using-aw-skills/SKILL.md +170 -237
package/skills/using-aw-skills/hooks/session-start.sh +11 -41
package/skills/zoom-out/SKILL.md +24 -0
package/.cursor/rules/common-agents.md +0 -53
package/.cursor/rules/common-aw-routing.md +0 -43
package/.cursor/rules/common-coding-style.md +0 -52
package/.cursor/rules/common-development-workflow.md +0 -33
package/.cursor/rules/common-git-workflow.md +0 -28
package/.cursor/rules/common-hooks.md +0 -34
package/.cursor/rules/common-patterns.md +0 -35
package/.cursor/rules/common-performance.md +0 -59
package/.cursor/rules/common-security.md +0 -33
package/.cursor/rules/common-testing.md +0 -33
package/.cursor/skills/api-and-interface-design/SKILL.md +0 -75
package/.cursor/skills/article-writing/SKILL.md +0 -85
package/.cursor/skills/aw-brainstorm/SKILL.md +0 -115
package/.cursor/skills/aw-build/SKILL.md +0 -152
package/.cursor/skills/aw-build/evals/build-stage-cases.json +0 -28
package/.cursor/skills/aw-debug/SKILL.md +0 -49
package/.cursor/skills/aw-deploy/SKILL.md +0 -101
package/.cursor/skills/aw-deploy/evals/deploy-stage-cases.json +0 -32
package/.cursor/skills/aw-execute/SKILL.md +0 -47
package/.cursor/skills/aw-execute/references/mode-code.md +0 -47
package/.cursor/skills/aw-execute/references/mode-docs.md +0 -28
package/.cursor/skills/aw-execute/references/mode-infra.md +0 -44
package/.cursor/skills/aw-execute/references/mode-migration.md +0 -58
package/.cursor/skills/aw-execute/references/worker-implementer.md +0 -26
package/.cursor/skills/aw-execute/references/worker-parallel-worker.md +0 -23
package/.cursor/skills/aw-execute/references/worker-quality-reviewer.md +0 -23
package/.cursor/skills/aw-execute/references/worker-spec-reviewer.md +0 -23
package/.cursor/skills/aw-execute/scripts/build-worker-bundle.js +0 -229
package/.cursor/skills/aw-finish/SKILL.md +0 -111
package/.cursor/skills/aw-investigate/SKILL.md +0 -109
package/.cursor/skills/aw-plan/SKILL.md +0 -368
package/.cursor/skills/aw-prepare/SKILL.md +0 -118
package/.cursor/skills/aw-review/SKILL.md +0 -118
package/.cursor/skills/aw-ship/SKILL.md +0 -115
package/.cursor/skills/aw-spec/SKILL.md +0 -104
package/.cursor/skills/aw-tasks/SKILL.md +0 -138
package/.cursor/skills/aw-test/SKILL.md +0 -118
package/.cursor/skills/aw-verify/SKILL.md +0 -51
package/.cursor/skills/aw-yolo/SKILL.md +0 -111
package/.cursor/skills/browser-testing-with-devtools/SKILL.md +0 -81
package/.cursor/skills/bun-runtime/SKILL.md +0 -84
package/.cursor/skills/ci-cd-and-automation/SKILL.md +0 -71
package/.cursor/skills/code-simplification/SKILL.md +0 -74
package/.cursor/skills/content-engine/SKILL.md +0 -88
package/.cursor/skills/context-engineering/SKILL.md +0 -74
package/.cursor/skills/deprecation-and-migration/SKILL.md +0 -75
package/.cursor/skills/documentation-and-adrs/SKILL.md +0 -75
package/.cursor/skills/documentation-lookup/SKILL.md +0 -90
package/.cursor/skills/frontend-slides/SKILL.md +0 -184
package/.cursor/skills/frontend-slides/STYLE_PRESETS.md +0 -330
package/.cursor/skills/frontend-ui-engineering/SKILL.md +0 -68
package/.cursor/skills/git-workflow-and-versioning/SKILL.md +0 -75
package/.cursor/skills/idea-refine/SKILL.md +0 -84
package/.cursor/skills/incremental-implementation/SKILL.md +0 -75
package/.cursor/skills/investor-materials/SKILL.md +0 -96
package/.cursor/skills/investor-outreach/SKILL.md +0 -76
package/.cursor/skills/market-research/SKILL.md +0 -75
package/.cursor/skills/mcp-server-patterns/SKILL.md +0 -67
package/.cursor/skills/nextjs-turbopack/SKILL.md +0 -44
package/.cursor/skills/performance-optimization/SKILL.md +0 -77
package/.cursor/skills/security-and-hardening/SKILL.md +0 -70
package/.cursor/skills/using-aw-skills/SKILL.md +0 -290
package/.cursor/skills/using-aw-skills/evals/skill-trigger-cases.tsv +0 -25
package/.cursor/skills/using-aw-skills/evals/test-skill-triggers.sh +0 -171
package/.cursor/skills/using-aw-skills/hooks/hooks.json +0 -9
package/.cursor/skills/using-aw-skills/hooks/session-start.sh +0 -67
package/.cursor/skills/using-platform-skills/SKILL.md +0 -163
package/.cursor/skills/using-platform-skills/evals/platform-selection-cases.json +0 -52
/package/.cursor/rules/{golang-coding-style.md → golang-coding-style.mdc} +0 -0
/package/.cursor/rules/{golang-hooks.md → golang-hooks.mdc} +0 -0
/package/.cursor/rules/{golang-patterns.md → golang-patterns.mdc} +0 -0
/package/.cursor/rules/{golang-security.md → golang-security.mdc} +0 -0
/package/.cursor/rules/{golang-testing.md → golang-testing.mdc} +0 -0
/package/.cursor/rules/{kotlin-coding-style.md → kotlin-coding-style.mdc} +0 -0
/package/.cursor/rules/{kotlin-hooks.md → kotlin-hooks.mdc} +0 -0
/package/.cursor/rules/{kotlin-patterns.md → kotlin-patterns.mdc} +0 -0
/package/.cursor/rules/{kotlin-security.md → kotlin-security.mdc} +0 -0
/package/.cursor/rules/{kotlin-testing.md → kotlin-testing.mdc} +0 -0
/package/.cursor/rules/{php-coding-style.md → php-coding-style.mdc} +0 -0
/package/.cursor/rules/{php-hooks.md → php-hooks.mdc} +0 -0
/package/.cursor/rules/{php-patterns.md → php-patterns.mdc} +0 -0
/package/.cursor/rules/{php-security.md → php-security.mdc} +0 -0
/package/.cursor/rules/{php-testing.md → php-testing.mdc} +0 -0
/package/.cursor/rules/{python-coding-style.md → python-coding-style.mdc} +0 -0
/package/.cursor/rules/{python-hooks.md → python-hooks.mdc} +0 -0
/package/.cursor/rules/{python-patterns.md → python-patterns.mdc} +0 -0
/package/.cursor/rules/{python-security.md → python-security.mdc} +0 -0
/package/.cursor/rules/{python-testing.md → python-testing.mdc} +0 -0
/package/.cursor/rules/{swift-coding-style.md → swift-coding-style.mdc} +0 -0
/package/.cursor/rules/{swift-hooks.md → swift-hooks.mdc} +0 -0
/package/.cursor/rules/{swift-patterns.md → swift-patterns.mdc} +0 -0
/package/.cursor/rules/{swift-security.md → swift-security.mdc} +0 -0
/package/.cursor/rules/{swift-testing.md → swift-testing.mdc} +0 -0
/package/.cursor/rules/{typescript-coding-style.md → typescript-coding-style.mdc} +0 -0
/package/.cursor/rules/{typescript-hooks.md → typescript-hooks.mdc} +0 -0
/package/.cursor/rules/{typescript-patterns.md → typescript-patterns.mdc} +0 -0
/package/.cursor/rules/{typescript-security.md → typescript-security.mdc} +0 -0
/package/.cursor/rules/{typescript-testing.md → typescript-testing.mdc} +0 -0

package/skills/aw-adk/evals/eval-delete-skill.md ADDED Viewed

@@ -0,0 +1,90 @@
+---
+name: eval-delete-skill
+target: skill/aw-adk
+category: functional
+difficulty: intermediate
+---
+# Eval: Delete Skill — Reverse Reference Cleanup in Agents
+## Task
+Test that deleting a skill also finds and cleans up agents that reference it in their `skills:` frontmatter, preventing phantom dependencies.
+### Prompt
+```
+First, create a temporary skill called temp-delete-test-patterns in the platform/data namespace. It teaches temporary testing patterns for data pipelines. It needs no scripts or references — just a simple SKILL.md.
+Then create a temporary agent called temp-data-tester in the platform/data namespace. Tools: Read, Grep. Model: haiku. Skills: [platform-data-temp-delete-test-patterns]. Description: "Temporary agent that uses the temp skill."
+After both are created, delete the skill temp-delete-test-patterns using the ADK delete flow. When warned about the agent reference, confirm you want to clean it up too. Confirm deletion when prompted.
+```
+## Context
+| Field | Value |
+|-------|-------|
+| **Namespace** | `platform/data` |
+| **Domain** | `data` |
+| **Target artifact** | `skills/aw-adk/SKILL.md` |
+| **Target type** | `skill` (create then delete) |
+## Expected Outcomes
+- [ ] **Skill created** at `.aw/.aw_registry/platform/data/skills/temp-delete-test-patterns/SKILL.md`
+- [ ] **Agent created** referencing the skill in `skills:` frontmatter
+- [ ] **Delete flow initiated** for the skill
+- [ ] **Reverse reference scan** — finds the agent that references this skill
+- [ ] **Warning shown** — "temp-data-tester references this skill in its skills: frontmatter"
+- [ ] **User asked** whether to clean up the reference
+- [ ] **Skill file + evals deleted**
+- [ ] **Agent's skills: frontmatter updated** — reference to the deleted skill removed
+- [ ] **No phantom dependencies remain** — agent no longer references a non-existent skill
+- [ ] **`aw link` ran**
+## Grading Criteria
+### PASS
+- All 10 outcomes met
+- Agent file still exists but no longer references the deleted skill
+### PARTIAL
+- Skill deleted but agent's skills: frontmatter not updated (phantom created)
+- OR no reverse reference scan performed
+### FAIL
+- Skill not deleted
+- Agent also deleted (overkill — should only remove the reference)
+- No warning about the dependent agent
+## Evaluation Method
+**Type:** hybrid
+### Deterministic Checks
+```bash
+# Skill should be gone
+test ! -d ".aw/.aw_registry/platform/data/skills/temp-delete-test-patterns" || echo "FAIL: skill still exists"
+# Agent should still exist
+test -f ".aw/.aw_registry/platform/data/agents/temp-data-tester.md" || echo "FAIL: agent was deleted (should only clean reference)"
+# Agent should NOT reference the deleted skill
+grep -q "temp-delete-test-patterns" ".aw/.aw_registry/platform/data/agents/temp-data-tester.md" 2>/dev/null && echo "FAIL: phantom reference in agent"
+```
+### Model-Based Checks
+- Did the ADK warn about the agent dependency before deleting?
+- Did it offer to clean up the reference rather than silently deleting?
+## Baseline Expectations
+- Without ADK: Skill deleted, agent left with phantom reference that breaks at runtime.
+- With ADK: Reverse reference scan catches the dependency, cleans it up.
+- **Expected delta:** 0 phantom references with ADK vs. 1+ without

package/skills/aw-adk/evals/eval-meta-eval-coverage.md ADDED Viewed

@@ -0,0 +1,78 @@
+---
+name: eval-meta-eval-coverage
+target: skill/aw-adk
+category: structural
+difficulty: intermediate
+---
+# Eval: Meta-Eval — Scenario Coverage
+## Task
+Test that evals created by the ADK cover both happy-path AND failure scenarios — not just the happy path. The ADK's eval gate requires "happy path + at least one failure scenario." This meta-eval verifies that the requirement is actually met.
+### Prompt
+```
+Create a skill for Redis caching patterns in the platform/data namespace.
+```
+## Context
+| Field | Value |
+|-------|-------|
+| **Namespace** | `platform/data` |
+| **Domain** | `data` |
+| **Target artifact** | evals created by ADK during skill creation |
+| **Target type** | `eval` (meta) |
+## Expected Outcomes
+- [ ] **Skill created** with 2+ colocated evals
+- [ ] **At least one happy-path eval** — tests the skill working correctly with valid input
+- [ ] **At least one failure-scenario eval** — tests error handling, edge cases, or invalid input
+- [ ] **Failure eval is not just "minimal input"** — it tests a genuinely different scenario (not the happy path with fewer words)
+- [ ] **Eval purposes are distinct** — the two evals test meaningfully different aspects, not the same scenario with different wording
+- [ ] **Each eval has PASS/FAIL criteria** that are independently verifiable
+## Grading Criteria
+### PASS
+- 2+ evals exist
+- At least one is clearly a failure/edge-case scenario (not a relabeled happy path)
+- Each has distinct, verifiable pass/fail criteria
+### PARTIAL
+- 2+ evals exist but both are variations of happy path
+- OR only 1 eval created
+### FAIL
+- No evals created
+- OR all evals test the same scenario
+## Evaluation Method
+**Type:** hybrid
+### Deterministic Checks
+```bash
+# Verify 2+ eval files
+EVAL_COUNT=$(ls .aw/.aw_registry/platform/data/skills/redis-caching-patterns/evals/eval-*.md 2>/dev/null | wc -l)
+[[ "$EVAL_COUNT" -ge 2 ]] || echo "FAIL: fewer than 2 evals"
+```
+### Model-Based Checks
+- Read each eval's scenario: are they testing genuinely different cases?
+- Does at least one eval describe a failure condition (invalid input, missing data, error state)?
+- Would a broken skill pass all evals? (If yes → insufficient coverage)
+## Baseline Expectations
+- Without ADK: Single happy-path eval or no evals at all.
+- With ADK: 2+ evals with distinct scenarios covering happy path and failure.
+- **Expected delta:** 100% coverage of both paths with ADK

package/skills/aw-adk/evals/eval-meta-eval-determinism.md ADDED Viewed

@@ -0,0 +1,81 @@
+---
+name: eval-meta-eval-determinism
+target: skill/aw-adk
+category: behavioral
+difficulty: advanced
+---
+# Eval: Meta-Eval — Scoring Determinism
+## Task
+Test that the ADK's scoring produces consistent results. The same artifact scored twice should receive the same tier and similar per-dimension scores. Flaky scoring undermines trust in the entire rubric system.
+### Prompt (run twice)
+```
+Score this skill: .aw/.aw_registry/platform/data/skills/redis-caching-patterns/SKILL.md
+```
+Run the exact same scoring prompt twice against the same artifact. Compare the two score outputs.
+## Context
+| Field | Value |
+|-------|-------|
+| **Target artifact** | any existing skill with stable content |
+| **Target type** | `skill` |
+## Expected Outcomes
+- [ ] **Both runs produce a 10-dimension score table**
+- [ ] **Same tier in both runs** — if run 1 is B-Tier, run 2 must also be B-Tier
+- [ ] **Per-dimension scores within ±1 point** — a dimension scored 7 in run 1 should be 6-8 in run 2
+- [ ] **Total score within ±5 points** — e.g., 72 and 76 is acceptable; 72 and 85 is not
+- [ ] **Overlapping improvement suggestions** — at least 2 of the top 3 gaps identified should match between runs
+- [ ] **Both runs reference rubric-skill.md** — scoring is rubric-based, not ad-hoc
+## Grading Criteria
+### PASS
+- Same tier in both runs
+- Total score difference ≤ 5 points
+- Per-dimension scores within ±1
+- Top 3 improvement suggestions overlap (at least 2 of 3 match)
+### PARTIAL
+- Same tier but total score difference 6-10 points
+- OR different tier but adjacent (B vs C, not B vs D)
+### FAIL
+- Different tiers separated by 2+ levels (e.g., B-Tier vs D-Tier)
+- Total score difference > 10 points
+- Improvement suggestions are completely different between runs
+## Evaluation Method
+**Type:** hybrid
+### Deterministic Checks
+```bash
+# Compare total scores from both runs (requires parsing the output)
+# Tier must match exactly
+# Total must be within ±5
+```
+### Model-Based Checks
+- Extract the total score from each run's output
+- Compare tier assignments
+- Compare per-dimension scores
+- Compare improvement suggestions for overlap
+## Baseline Expectations
+- Without ADK: Scoring is ad-hoc, varies wildly between runs (±20+ points).
+- With ADK: Rubric-anchored scoring with ≤5 point variance.
+- **Expected delta:** 75%+ reduction in score variance

package/skills/aw-adk/evals/eval-meta-eval-false-pass.md ADDED Viewed

@@ -0,0 +1,81 @@
+---
+name: eval-meta-eval-false-pass
+target: skill/aw-adk
+category: behavioral
+difficulty: advanced
+---
+# Eval: Meta-Eval — False Pass Resistance
+## Task
+Test that evals created by the ADK can actually detect bad artifacts. The ADK creates an agent, then creates evals for it. Then a known-bad version of the agent (missing critical sections, wrong structure) is fed to those evals. The evals must FAIL the bad agent — not give it a false pass.
+This is a meta-eval: it tests the quality of evals that the ADK produces, not the ADK's create flow itself.
+### Prompt (two-step)
+**Step 1:** Create an agent for log analysis in the platform/infra namespace.
+**Step 2:** Take the evals that were just created. Run them against this known-bad agent:
+```markdown
+---
+name: log-analyzer
+description: "Analyzes logs"
+---
+# Log Analyzer
+Looks at logs and finds problems.
+```
+## Context
+| Field | Value |
+|-------|-------|
+| **Namespace** | `platform/infra` |
+| **Target artifact** | evals created by ADK in step 1 |
+| **Target type** | `eval` (meta) |
+## Expected Outcomes
+- [ ] **Step 1 completes** — a well-structured agent is created with evals
+- [ ] **Known-bad agent is structurally deficient** — missing: tools, model, category, squad, skills, identity section, core mission, critical rules, process, deliverables
+- [ ] **Evals FAIL the known-bad agent** — at least 1 eval produces a FAIL verdict
+- [ ] **Failure reasons are specific** — "missing Identity section" not just "low quality"
+- [ ] **Evals don't false-pass** — a clearly deficient agent must not get PASS or even PARTIAL
+## Grading Criteria
+### PASS
+- At least 1 eval FAILs the known-bad agent
+- Failure reasons reference specific missing sections or frontmatter fields
+- The well-structured agent from step 1 would PASS the same evals
+### PARTIAL
+- Evals give PARTIAL (not PASS) to the known-bad agent
+- Some discrimination but not full rejection
+### FAIL
+- Evals PASS the known-bad agent (false pass)
+- OR evals can't be run against the bad agent (no mechanism)
+- OR evals only check surface features (file exists, has frontmatter) that the bad agent satisfies
+## Evaluation Method
+**Type:** model-based
+### Model-Based Checks
+- Do the evals contain assertions that the known-bad agent would fail?
+- Are assertions specific enough to distinguish good from bad?
+- Would substituting the bad agent into the eval's expected outcomes produce FAIL?
+## Baseline Expectations
+- Without ADK: Evals are always-pass stubs that accept any output.
+- With ADK: Evals have discriminating assertions that catch missing sections and thin content.
+- **Expected delta:** 80%+ false-pass detection rate with ADK evals

package/skills/aw-adk/evals/eval-score-accuracy.md ADDED Viewed

@@ -0,0 +1,95 @@
+---
+name: eval-score-accuracy
+target: skill/aw-adk
+category: behavioral
+difficulty: intermediate
+---
+# Eval: Score Accuracy — Deliberately Minimal Agent Gets Low Score
+## Task
+Test that the ADK's scoring is calibrated — a deliberately minimal agent (missing most sections) should receive a D-Tier score rather than being graded generously. This targets the self-scoring bias where the LLM creates an artifact and then scores it highly.
+### Prompt
+```
+Score this agent: .aw/.aw_registry/revex/reselling/backend/agents/stub-agent.md
+```
+### Setup (pre-condition)
+Create a minimal stub agent before running the eval:
+```markdown
+---
+name: stub-agent
+description: "Does stuff"
+tools: [Read]
+---
+# Stub Agent
+This agent helps with things.
+```
+## Context
+| Field | Value |
+|-------|-------|
+| **Namespace** | `revex/reselling` |
+| **Domain** | `backend` |
+| **Target artifact** | `skills/aw-adk/SKILL.md` |
+| **Target type** | `agent` |
+## Expected Outcomes
+- [ ] **ADK reads rubric-agent.md** before scoring
+- [ ] **Score table has 10 dimensions** — not a summary number
+- [ ] **Total score is D-Tier (below 40)** — the stub agent is missing: identity section, core mission, critical rules, process, deliverables, communication style, success metrics, model field, category field, squad field
+- [ ] **Missing dimensions identified** — the score output names specific gaps (not "looks good overall")
+- [ ] **No inflation** — scores for missing sections are 0, not 3-5 "because the section could be implied"
+- [ ] **Improvement suggestions provided** — concrete next steps to raise the score
+## Grading Criteria
+### PASS
+- Total score is D-Tier (0-39)
+- At least 6 dimensions scored 0 (missing entirely)
+- Missing sections explicitly named
+### PARTIAL
+- Total score is C-Tier (40-59) — some inflation but identifies gaps
+- OR correct D-Tier but fewer than 6 zero-scored dimensions
+### FAIL
+- Total score is B-Tier or above (60+) — severe inflation
+- OR no per-dimension breakdown (just a summary score)
+- OR does not read the rubric before scoring
+## Evaluation Method
+**Type:** hybrid
+### Deterministic Checks
+```bash
+# Verify the score output contains a table with 10 rows
+# (model-based check needed to parse the actual scores)
+```
+### Model-Based Checks
+- Is the total score below 40?
+- Are missing sections scored 0 (not given partial credit)?
+- Did the executor read rubric-agent.md before scoring?
+- Are improvement suggestions specific (not "add more content")?
+## Baseline Expectations
+- Without ADK: Model says "looks good, 7/10" with no rubric reference.
+- With ADK: Calibrated score using rubric-agent.md, D-Tier for stub, specific gaps identified.
+- **Expected delta:** 30+ point difference in score accuracy

package/skills/aw-adk/evals/eval-type-redirect.md ADDED Viewed

@@ -0,0 +1,68 @@
+---
+name: eval-type-redirect
+target: skill/aw-adk
+category: behavioral
+difficulty: advanced
+---
+# Eval: Type Redirect — Command Request That Should Be a Skill
+## Task
+Test that the ADK's type classifier catches misclassifications. The prompt asks to "create a command" but the subject matter (static knowledge, best practices) is actually a skill. The ADK should redirect or at minimum flag the mismatch during the interview.
+### Prompt
+```
+Create a command for React best practices in the platform/frontend namespace. It should cover component patterns, hooks usage, state management, and performance optimization tips.
+```
+## Context
+| Field | Value |
+|-------|-------|
+| **Namespace** | `platform/frontend` |
+| **Domain** | `frontend` |
+| **Target artifact** | `skills/aw-adk/SKILL.md` |
+| **Target type** | `skill` (despite user saying "command") |
+## Expected Outcomes
+- [ ] **Type redirect detected** — the ADK recognizes "React best practices" is static knowledge (skill), not a multi-phase workflow (command)
+- [ ] **User informed of redirect** — explains why this is a skill, not a command (commands automate workflows with agents and phases; skills encode knowledge)
+- [ ] **Proceeds as skill** — after redirect, follows the skill create flow
+- [ ] **OR asks user to confirm** (acceptable alternative to proceeding directly as a skill) — "This sounds like a skill (knowledge reference) rather than a command (workflow). Should I create it as a skill?"
+- [ ] **Does NOT blindly create a command** — a "React best practices command" with forced phases and agent roster would be the wrong artifact type
+## Grading Criteria
+### PASS
+- Redirect detected and communicated to user
+- Proceeds with correct type (skill) after confirmation
+### PARTIAL
+- Creates the artifact but notes during interview that it might be a skill
+- OR creates a skill without explaining the redirect
+### FAIL
+- Creates a command with forced multi-phase structure for static knowledge
+- No mention of type mismatch
+## Evaluation Method
+**Type:** model-based
+### Model-Based Checks
+- Did the executor question the "command" classification?
+- Did it explain the difference between commands (workflow) and skills (knowledge)?
+- Did it ultimately create a skill (or ask user to choose)?
+## Baseline Expectations
+- Without ADK: Creates whatever the user asked for literally — a forced "command" with fake phases.
+- With ADK: Type classifier catches the mismatch and redirects.
+- **Expected delta:** correct type 90%+ with ADK vs. literal compliance without

package/skills/aw-adk/evals/evals.json ADDED Viewed

@@ -0,0 +1,96 @@
+{
+  "artifact_name": "aw-adk",
+  "artifact_type": "skill",
+  "evals": [
+    {
+      "name": "eval-create-skill",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "create-mode"
+    },
+    {
+      "name": "eval-create-agent",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "create-mode"
+    },
+    {
+      "name": "eval-create-command",
+      "category": "functional",
+      "difficulty": "advanced",
+      "group": "create-mode"
+    },
+    {
+      "name": "eval-create-rule",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "create-mode"
+    },
+    {
+      "name": "eval-create-eval",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "create-mode"
+    },
+    {
+      "name": "eval-type-redirect",
+      "category": "behavioral",
+      "difficulty": "advanced",
+      "group": "cross-cutting"
+    },
+    {
+      "name": "eval-score-accuracy",
+      "category": "behavioral",
+      "difficulty": "intermediate",
+      "group": "cross-cutting"
+    },
+    {
+      "name": "eval-colocated-placement",
+      "category": "structural",
+      "difficulty": "basic",
+      "group": "cross-cutting"
+    },
+    {
+      "name": "eval-meta-eval-false-pass",
+      "category": "behavioral",
+      "difficulty": "advanced",
+      "group": "meta-evals"
+    },
+    {
+      "name": "eval-meta-eval-coverage",
+      "category": "structural",
+      "difficulty": "intermediate",
+      "group": "meta-evals"
+    },
+    {
+      "name": "eval-meta-eval-determinism",
+      "category": "behavioral",
+      "difficulty": "advanced",
+      "group": "meta-evals"
+    },
+    {
+      "name": "eval-delete-agent",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "delete-mode"
+    },
+    {
+      "name": "eval-delete-rule",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "delete-mode"
+    },
+    {
+      "name": "eval-delete-skill",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "delete-mode"
+    },
+    {
+      "name": "eval-delete-command",
+      "category": "functional",
+      "difficulty": "intermediate",
+      "group": "delete-mode"
+    }
+  ]
+}