aw-ecc 1.4.32 → 1.4.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/.codex/hooks/aw-post-tool-use.sh +8 -2
  3. package/.codex/hooks/aw-session-start.sh +11 -4
  4. package/.codex/hooks/aw-stop.sh +8 -2
  5. package/.codex/hooks/aw-user-prompt-submit.sh +10 -2
  6. package/.codex/hooks.json +8 -8
  7. package/.cursor/INSTALL.md +7 -5
  8. package/.cursor/hooks/adapter.js +41 -4
  9. package/.cursor/hooks/after-agent-response.js +62 -0
  10. package/.cursor/hooks/before-submit-prompt.js +7 -1
  11. package/.cursor/hooks/post-tool-use-failure.js +21 -0
  12. package/.cursor/hooks/post-tool-use.js +39 -0
  13. package/.cursor/hooks/shared/aw-phase-definitions.js +53 -0
  14. package/.cursor/hooks/shared/aw-phase-runner.js +3 -1
  15. package/.cursor/hooks/subagent-start.js +22 -4
  16. package/.cursor/hooks/subagent-stop.js +18 -1
  17. package/.cursor/hooks.json +23 -2
  18. package/.opencode/package.json +1 -1
  19. package/AGENTS.md +3 -3
  20. package/README.md +5 -5
  21. package/commands/adk.md +52 -0
  22. package/commands/build.md +22 -9
  23. package/commands/deploy.md +12 -0
  24. package/commands/execute.md +9 -0
  25. package/commands/feature.md +333 -0
  26. package/commands/investigate.md +18 -5
  27. package/commands/plan.md +23 -9
  28. package/commands/publish.md +65 -0
  29. package/commands/review.md +12 -0
  30. package/commands/ship.md +12 -0
  31. package/commands/test.md +12 -0
  32. package/commands/verify.md +9 -0
  33. package/hooks/hooks.json +36 -0
  34. package/manifests/install-components.json +8 -0
  35. package/manifests/install-modules.json +83 -0
  36. package/manifests/install-profiles.json +7 -0
  37. package/package.json +1 -1
  38. package/scripts/ci/validate-rules.js +51 -0
  39. package/scripts/cursor-aw-home/hooks.json +23 -2
  40. package/scripts/cursor-aw-hooks/adapter.js +41 -4
  41. package/scripts/cursor-aw-hooks/before-submit-prompt.js +7 -1
  42. package/scripts/hooks/aw-usage-commit-created.js +32 -0
  43. package/scripts/hooks/aw-usage-post-tool-use-failure.js +56 -0
  44. package/scripts/hooks/aw-usage-post-tool-use.js +242 -0
  45. package/scripts/hooks/aw-usage-prompt-submit.js +112 -0
  46. package/scripts/hooks/aw-usage-session-start.js +48 -0
  47. package/scripts/hooks/aw-usage-stop.js +182 -0
  48. package/scripts/hooks/aw-usage-telemetry-send.js +84 -0
  49. package/scripts/hooks/cost-tracker.js +3 -23
  50. package/scripts/hooks/shared/aw-phase-definitions.js +53 -0
  51. package/scripts/hooks/shared/aw-phase-runner.js +3 -1
  52. package/scripts/lib/aw-hook-contract.js +2 -2
  53. package/scripts/lib/aw-pricing.js +306 -0
  54. package/scripts/lib/aw-usage-telemetry.js +472 -0
  55. package/scripts/lib/codex-hook-config.js +8 -8
  56. package/scripts/lib/cursor-hook-config.js +25 -10
  57. package/scripts/lib/install-targets/cursor-project.js +3 -0
  58. package/scripts/lib/install-targets/helpers.js +20 -3
  59. package/skills/aw-adk/SKILL.md +317 -0
  60. package/skills/aw-adk/agents/analyzer.md +113 -0
  61. package/skills/aw-adk/agents/comparator.md +113 -0
  62. package/skills/aw-adk/agents/grader.md +115 -0
  63. package/skills/aw-adk/assets/eval_review.html +76 -0
  64. package/skills/aw-adk/eval-viewer/generate_review.py +164 -0
  65. package/skills/aw-adk/eval-viewer/viewer.html +181 -0
  66. package/skills/aw-adk/evals/eval-colocated-placement.md +84 -0
  67. package/skills/aw-adk/evals/eval-create-agent.md +90 -0
  68. package/skills/aw-adk/evals/eval-create-command.md +98 -0
  69. package/skills/aw-adk/evals/eval-create-eval.md +89 -0
  70. package/skills/aw-adk/evals/eval-create-rule.md +99 -0
  71. package/skills/aw-adk/evals/eval-create-skill.md +97 -0
  72. package/skills/aw-adk/evals/eval-delete-agent.md +79 -0
  73. package/skills/aw-adk/evals/eval-delete-command.md +89 -0
  74. package/skills/aw-adk/evals/eval-delete-rule.md +86 -0
  75. package/skills/aw-adk/evals/eval-delete-skill.md +90 -0
  76. package/skills/aw-adk/evals/eval-meta-eval-coverage.md +78 -0
  77. package/skills/aw-adk/evals/eval-meta-eval-determinism.md +81 -0
  78. package/skills/aw-adk/evals/eval-meta-eval-false-pass.md +81 -0
  79. package/skills/aw-adk/evals/eval-score-accuracy.md +95 -0
  80. package/skills/aw-adk/evals/eval-type-redirect.md +68 -0
  81. package/skills/aw-adk/evals/evals.json +96 -0
  82. package/skills/aw-adk/references/artifact-wiring.md +162 -0
  83. package/skills/aw-adk/references/cross-ide-mapping.md +71 -0
  84. package/skills/aw-adk/references/eval-placement-guide.md +183 -0
  85. package/skills/aw-adk/references/external-resources.md +75 -0
  86. package/skills/aw-adk/references/getting-started.md +66 -0
  87. package/skills/aw-adk/references/registry-structure.md +152 -0
  88. package/skills/aw-adk/references/rubric-agent.md +36 -0
  89. package/skills/aw-adk/references/rubric-command.md +36 -0
  90. package/skills/aw-adk/references/rubric-eval.md +36 -0
  91. package/skills/aw-adk/references/rubric-meta-eval.md +132 -0
  92. package/skills/aw-adk/references/rubric-rule.md +36 -0
  93. package/skills/aw-adk/references/rubric-skill.md +36 -0
  94. package/skills/aw-adk/references/schemas.md +222 -0
  95. package/skills/aw-adk/references/template-agent.md +251 -0
  96. package/skills/aw-adk/references/template-command.md +279 -0
  97. package/skills/aw-adk/references/template-eval.md +176 -0
  98. package/skills/aw-adk/references/template-rule.md +119 -0
  99. package/skills/aw-adk/references/template-skill.md +123 -0
  100. package/skills/aw-adk/references/type-classifier.md +98 -0
  101. package/skills/aw-adk/references/writing-good-agents.md +227 -0
  102. package/skills/aw-adk/references/writing-good-commands.md +258 -0
  103. package/skills/aw-adk/references/writing-good-evals.md +271 -0
  104. package/skills/aw-adk/references/writing-good-rules.md +214 -0
  105. package/skills/aw-adk/references/writing-good-skills.md +159 -0
  106. package/skills/aw-adk/scripts/aggregate-benchmark.py +190 -0
  107. package/skills/aw-adk/scripts/lint-artifact.sh +211 -0
  108. package/skills/aw-adk/scripts/score-artifact.sh +179 -0
  109. package/skills/aw-adk/scripts/trigger-eval.py +192 -0
  110. package/skills/aw-build/SKILL.md +19 -2
  111. package/skills/aw-deploy/SKILL.md +65 -3
  112. package/skills/aw-design/SKILL.md +156 -0
  113. package/skills/aw-design/references/highrise-tokens.md +394 -0
  114. package/skills/aw-design/references/micro-interactions.md +76 -0
  115. package/skills/aw-design/references/prompt-template.md +160 -0
  116. package/skills/aw-design/references/quality-checklist.md +70 -0
  117. package/skills/aw-design/references/self-review.md +497 -0
  118. package/skills/aw-design/references/stitch-workflow.md +127 -0
  119. package/skills/aw-feature/SKILL.md +293 -0
  120. package/skills/aw-investigate/SKILL.md +17 -0
  121. package/skills/aw-plan/SKILL.md +34 -3
  122. package/skills/aw-publish/SKILL.md +300 -0
  123. package/skills/aw-publish/evals/eval-confirmation-gate.md +60 -0
  124. package/skills/aw-publish/evals/eval-intent-detection.md +111 -0
  125. package/skills/aw-publish/evals/eval-push-modes.md +67 -0
  126. package/skills/aw-publish/evals/eval-rules-push.md +60 -0
  127. package/skills/aw-publish/evals/evals.json +29 -0
  128. package/skills/aw-publish/references/push-modes.md +38 -0
  129. package/skills/aw-review/SKILL.md +88 -9
  130. package/skills/aw-rules-review/SKILL.md +124 -0
  131. package/skills/aw-rules-review/agents/openai.yaml +3 -0
  132. package/skills/aw-rules-review/scripts/generate-review-template.mjs +323 -0
  133. package/skills/aw-ship/SKILL.md +16 -0
  134. package/skills/aw-spec/SKILL.md +15 -0
  135. package/skills/aw-tasks/SKILL.md +15 -0
  136. package/skills/aw-test/SKILL.md +16 -0
  137. package/skills/aw-yolo/SKILL.md +4 -0
  138. package/skills/diagnose/SKILL.md +121 -0
  139. package/skills/diagnose/scripts/hitl-loop.template.sh +41 -0
  140. package/skills/finish-only-when-green/SKILL.md +265 -0
  141. package/skills/grill-me/SKILL.md +24 -0
  142. package/skills/grill-with-docs/SKILL.md +92 -0
  143. package/skills/grill-with-docs/adr-format.md +47 -0
  144. package/skills/grill-with-docs/context-format.md +67 -0
  145. package/skills/improve-codebase-architecture/SKILL.md +75 -0
  146. package/skills/improve-codebase-architecture/deepening.md +37 -0
  147. package/skills/improve-codebase-architecture/interface-design.md +44 -0
  148. package/skills/improve-codebase-architecture/language.md +53 -0
  149. package/skills/local-ghl-setup-from-screenshot/SKILL.md +538 -0
  150. package/skills/tdd/SKILL.md +115 -0
  151. package/skills/tdd/deep-modules.md +33 -0
  152. package/skills/tdd/interface-design.md +31 -0
  153. package/skills/tdd/mocking.md +59 -0
  154. package/skills/tdd/refactoring.md +10 -0
  155. package/skills/tdd/tests.md +61 -0
  156. package/skills/to-issues/SKILL.md +62 -0
  157. package/skills/to-prd/SKILL.md +75 -0
  158. package/skills/using-aw-skills/SKILL.md +170 -237
  159. package/skills/using-aw-skills/hooks/session-start.sh +11 -41
  160. package/skills/zoom-out/SKILL.md +24 -0
  161. package/.cursor/rules/common-agents.md +0 -53
  162. package/.cursor/rules/common-aw-routing.md +0 -43
  163. package/.cursor/rules/common-coding-style.md +0 -52
  164. package/.cursor/rules/common-development-workflow.md +0 -33
  165. package/.cursor/rules/common-git-workflow.md +0 -28
  166. package/.cursor/rules/common-hooks.md +0 -34
  167. package/.cursor/rules/common-patterns.md +0 -35
  168. package/.cursor/rules/common-performance.md +0 -59
  169. package/.cursor/rules/common-security.md +0 -33
  170. package/.cursor/rules/common-testing.md +0 -33
  171. package/.cursor/skills/api-and-interface-design/SKILL.md +0 -75
  172. package/.cursor/skills/article-writing/SKILL.md +0 -85
  173. package/.cursor/skills/aw-brainstorm/SKILL.md +0 -115
  174. package/.cursor/skills/aw-build/SKILL.md +0 -152
  175. package/.cursor/skills/aw-build/evals/build-stage-cases.json +0 -28
  176. package/.cursor/skills/aw-debug/SKILL.md +0 -49
  177. package/.cursor/skills/aw-deploy/SKILL.md +0 -101
  178. package/.cursor/skills/aw-deploy/evals/deploy-stage-cases.json +0 -32
  179. package/.cursor/skills/aw-execute/SKILL.md +0 -47
  180. package/.cursor/skills/aw-execute/references/mode-code.md +0 -47
  181. package/.cursor/skills/aw-execute/references/mode-docs.md +0 -28
  182. package/.cursor/skills/aw-execute/references/mode-infra.md +0 -44
  183. package/.cursor/skills/aw-execute/references/mode-migration.md +0 -58
  184. package/.cursor/skills/aw-execute/references/worker-implementer.md +0 -26
  185. package/.cursor/skills/aw-execute/references/worker-parallel-worker.md +0 -23
  186. package/.cursor/skills/aw-execute/references/worker-quality-reviewer.md +0 -23
  187. package/.cursor/skills/aw-execute/references/worker-spec-reviewer.md +0 -23
  188. package/.cursor/skills/aw-execute/scripts/build-worker-bundle.js +0 -229
  189. package/.cursor/skills/aw-finish/SKILL.md +0 -111
  190. package/.cursor/skills/aw-investigate/SKILL.md +0 -109
  191. package/.cursor/skills/aw-plan/SKILL.md +0 -368
  192. package/.cursor/skills/aw-prepare/SKILL.md +0 -118
  193. package/.cursor/skills/aw-review/SKILL.md +0 -118
  194. package/.cursor/skills/aw-ship/SKILL.md +0 -115
  195. package/.cursor/skills/aw-spec/SKILL.md +0 -104
  196. package/.cursor/skills/aw-tasks/SKILL.md +0 -138
  197. package/.cursor/skills/aw-test/SKILL.md +0 -118
  198. package/.cursor/skills/aw-verify/SKILL.md +0 -51
  199. package/.cursor/skills/aw-yolo/SKILL.md +0 -111
  200. package/.cursor/skills/browser-testing-with-devtools/SKILL.md +0 -81
  201. package/.cursor/skills/bun-runtime/SKILL.md +0 -84
  202. package/.cursor/skills/ci-cd-and-automation/SKILL.md +0 -71
  203. package/.cursor/skills/code-simplification/SKILL.md +0 -74
  204. package/.cursor/skills/content-engine/SKILL.md +0 -88
  205. package/.cursor/skills/context-engineering/SKILL.md +0 -74
  206. package/.cursor/skills/deprecation-and-migration/SKILL.md +0 -75
  207. package/.cursor/skills/documentation-and-adrs/SKILL.md +0 -75
  208. package/.cursor/skills/documentation-lookup/SKILL.md +0 -90
  209. package/.cursor/skills/frontend-slides/SKILL.md +0 -184
  210. package/.cursor/skills/frontend-slides/STYLE_PRESETS.md +0 -330
  211. package/.cursor/skills/frontend-ui-engineering/SKILL.md +0 -68
  212. package/.cursor/skills/git-workflow-and-versioning/SKILL.md +0 -75
  213. package/.cursor/skills/idea-refine/SKILL.md +0 -84
  214. package/.cursor/skills/incremental-implementation/SKILL.md +0 -75
  215. package/.cursor/skills/investor-materials/SKILL.md +0 -96
  216. package/.cursor/skills/investor-outreach/SKILL.md +0 -76
  217. package/.cursor/skills/market-research/SKILL.md +0 -75
  218. package/.cursor/skills/mcp-server-patterns/SKILL.md +0 -67
  219. package/.cursor/skills/nextjs-turbopack/SKILL.md +0 -44
  220. package/.cursor/skills/performance-optimization/SKILL.md +0 -77
  221. package/.cursor/skills/security-and-hardening/SKILL.md +0 -70
  222. package/.cursor/skills/using-aw-skills/SKILL.md +0 -290
  223. package/.cursor/skills/using-aw-skills/evals/skill-trigger-cases.tsv +0 -25
  224. package/.cursor/skills/using-aw-skills/evals/test-skill-triggers.sh +0 -171
  225. package/.cursor/skills/using-aw-skills/hooks/hooks.json +0 -9
  226. package/.cursor/skills/using-aw-skills/hooks/session-start.sh +0 -67
  227. package/.cursor/skills/using-platform-skills/SKILL.md +0 -163
  228. package/.cursor/skills/using-platform-skills/evals/platform-selection-cases.json +0 -52
  229. /package/.cursor/rules/{golang-coding-style.md → golang-coding-style.mdc} +0 -0
  230. /package/.cursor/rules/{golang-hooks.md → golang-hooks.mdc} +0 -0
  231. /package/.cursor/rules/{golang-patterns.md → golang-patterns.mdc} +0 -0
  232. /package/.cursor/rules/{golang-security.md → golang-security.mdc} +0 -0
  233. /package/.cursor/rules/{golang-testing.md → golang-testing.mdc} +0 -0
  234. /package/.cursor/rules/{kotlin-coding-style.md → kotlin-coding-style.mdc} +0 -0
  235. /package/.cursor/rules/{kotlin-hooks.md → kotlin-hooks.mdc} +0 -0
  236. /package/.cursor/rules/{kotlin-patterns.md → kotlin-patterns.mdc} +0 -0
  237. /package/.cursor/rules/{kotlin-security.md → kotlin-security.mdc} +0 -0
  238. /package/.cursor/rules/{kotlin-testing.md → kotlin-testing.mdc} +0 -0
  239. /package/.cursor/rules/{php-coding-style.md → php-coding-style.mdc} +0 -0
  240. /package/.cursor/rules/{php-hooks.md → php-hooks.mdc} +0 -0
  241. /package/.cursor/rules/{php-patterns.md → php-patterns.mdc} +0 -0
  242. /package/.cursor/rules/{php-security.md → php-security.mdc} +0 -0
  243. /package/.cursor/rules/{php-testing.md → php-testing.mdc} +0 -0
  244. /package/.cursor/rules/{python-coding-style.md → python-coding-style.mdc} +0 -0
  245. /package/.cursor/rules/{python-hooks.md → python-hooks.mdc} +0 -0
  246. /package/.cursor/rules/{python-patterns.md → python-patterns.mdc} +0 -0
  247. /package/.cursor/rules/{python-security.md → python-security.mdc} +0 -0
  248. /package/.cursor/rules/{python-testing.md → python-testing.mdc} +0 -0
  249. /package/.cursor/rules/{swift-coding-style.md → swift-coding-style.mdc} +0 -0
  250. /package/.cursor/rules/{swift-hooks.md → swift-hooks.mdc} +0 -0
  251. /package/.cursor/rules/{swift-patterns.md → swift-patterns.mdc} +0 -0
  252. /package/.cursor/rules/{swift-security.md → swift-security.mdc} +0 -0
  253. /package/.cursor/rules/{swift-testing.md → swift-testing.mdc} +0 -0
  254. /package/.cursor/rules/{typescript-coding-style.md → typescript-coding-style.mdc} +0 -0
  255. /package/.cursor/rules/{typescript-hooks.md → typescript-hooks.mdc} +0 -0
  256. /package/.cursor/rules/{typescript-patterns.md → typescript-patterns.mdc} +0 -0
  257. /package/.cursor/rules/{typescript-security.md → typescript-security.mdc} +0 -0
  258. /package/.cursor/rules/{typescript-testing.md → typescript-testing.mdc} +0 -0
@@ -0,0 +1,90 @@
1
+ ---
2
+ name: eval-delete-skill
3
+ target: skill/aw-adk
4
+ category: functional
5
+ difficulty: intermediate
6
+ ---
7
+
8
+ # Eval: Delete Skill — Reverse Reference Cleanup in Agents
9
+
10
+ ## Task
11
+
12
+ Test that deleting a skill also finds and cleans up agents that reference it in their `skills:` frontmatter, preventing phantom dependencies.
13
+
14
+ ### Prompt
15
+
16
+ ```
17
+ First, create a temporary skill called temp-delete-test-patterns in the platform/data namespace. It teaches temporary testing patterns for data pipelines. It needs no scripts or references — just a simple SKILL.md.
18
+
19
+ Then create a temporary agent called temp-data-tester in the platform/data namespace. Tools: Read, Grep. Model: haiku. Skills: [platform-data-temp-delete-test-patterns]. Description: "Temporary agent that uses the temp skill."
20
+
21
+ After both are created, delete the skill temp-delete-test-patterns using the ADK delete flow. When warned about the agent reference, confirm you want to clean it up too. Confirm deletion when prompted.
22
+ ```
23
+
24
+ ## Context
25
+
26
+ | Field | Value |
27
+ |-------|-------|
28
+ | **Namespace** | `platform/data` |
29
+ | **Domain** | `data` |
30
+ | **Target artifact** | `skills/aw-adk/SKILL.md` |
31
+ | **Target type** | `skill` (create then delete) |
32
+
33
+ ## Expected Outcomes
34
+
35
+ - [ ] **Skill created** at `.aw/.aw_registry/platform/data/skills/temp-delete-test-patterns/SKILL.md`
36
+ - [ ] **Agent created** referencing the skill in `skills:` frontmatter
37
+ - [ ] **Delete flow initiated** for the skill
38
+ - [ ] **Reverse reference scan** — finds the agent that references this skill
39
+ - [ ] **Warning shown** — "temp-data-tester references this skill in its skills: frontmatter"
40
+ - [ ] **User asked** whether to clean up the reference
41
+ - [ ] **Skill file + evals deleted**
42
+ - [ ] **Agent's `skills:` frontmatter updated** — reference to the deleted skill removed
43
+ - [ ] **No phantom dependencies remain** — agent no longer references a non-existent skill
44
+ - [ ] **`aw link` ran**
45
+
46
+ ## Grading Criteria
47
+
48
+ ### PASS
49
+
50
+ - All 10 outcomes met
51
+ - Agent file still exists but no longer references the deleted skill
52
+
53
+ ### PARTIAL
54
+
55
+ - Skill deleted but agent's `skills:` frontmatter not updated (phantom reference left behind)
56
+ - OR no reverse reference scan performed
57
+
58
+ ### FAIL
59
+
60
+ - Skill not deleted
61
+ - Agent also deleted (overkill — should only remove the reference)
62
+ - No warning about the dependent agent
63
+
64
+ ## Evaluation Method
65
+
66
+ **Type:** hybrid
67
+
68
+ ### Deterministic Checks
69
+
70
+ ```bash
71
+ # Skill should be gone
72
+ test ! -d ".aw/.aw_registry/platform/data/skills/temp-delete-test-patterns" || echo "FAIL: skill still exists"
73
+
74
+ # Agent should still exist
75
+ test -f ".aw/.aw_registry/platform/data/agents/temp-data-tester.md" || echo "FAIL: agent was deleted (should only clean reference)"
76
+
77
+ # Agent should NOT reference the deleted skill
78
+ grep -q "temp-delete-test-patterns" ".aw/.aw_registry/platform/data/agents/temp-data-tester.md" 2>/dev/null && echo "FAIL: phantom reference in agent"
79
+ ```
80
+
81
+ ### Model-Based Checks
82
+
83
+ - Did the ADK warn about the agent dependency before deleting?
84
+ - Did it offer to clean up the reference rather than silently deleting?
85
+
86
+ ## Baseline Expectations
87
+
88
+ - Without ADK: Skill deleted, agent left with phantom reference that breaks at runtime.
89
+ - With ADK: Reverse reference scan catches the dependency, cleans it up.
90
+ - **Expected delta:** 0 phantom references with ADK vs. 1+ without
@@ -0,0 +1,78 @@
1
+ ---
2
+ name: eval-meta-eval-coverage
3
+ target: skill/aw-adk
4
+ category: structural
5
+ difficulty: intermediate
6
+ ---
7
+
8
+ # Eval: Meta-Eval — Scenario Coverage
9
+
10
+ ## Task
11
+
12
+ Test that evals created by the ADK cover both happy-path AND failure scenarios, not the happy path alone. The ADK's eval gate requires "happy path + at least one failure scenario." This meta-eval verifies that the requirement is actually met.
13
+
14
+ ### Prompt
15
+
16
+ ```
17
+ Create a skill for Redis caching patterns in the platform/data namespace.
18
+ ```
19
+
20
+ ## Context
21
+
22
+ | Field | Value |
23
+ |-------|-------|
24
+ | **Namespace** | `platform/data` |
25
+ | **Domain** | `data` |
26
+ | **Target artifact** | evals created by ADK during skill creation |
27
+ | **Target type** | `eval` (meta) |
28
+
29
+ ## Expected Outcomes
30
+
31
+ - [ ] **Skill created** with 2+ colocated evals
32
+ - [ ] **At least one happy-path eval** — tests the skill working correctly with valid input
33
+ - [ ] **At least one failure-scenario eval** — tests error handling, edge cases, or invalid input
34
+ - [ ] **Failure eval is not just "minimal input"** — it tests a genuinely different scenario (not the happy path with fewer words)
35
+ - [ ] **Eval purposes are distinct** — the two evals test meaningfully different aspects, not the same scenario with different wording
36
+ - [ ] **Each eval has PASS/FAIL criteria** that are independently verifiable
37
+
38
+ ## Grading Criteria
39
+
40
+ ### PASS
41
+
42
+ - 2+ evals exist
43
+ - At least one is clearly a failure/edge-case scenario (not a relabeled happy path)
44
+ - Each has distinct, verifiable pass/fail criteria
45
+
46
+ ### PARTIAL
47
+
48
+ - 2+ evals exist but both are variations of happy path
49
+ - OR only 1 eval created
50
+
51
+ ### FAIL
52
+
53
+ - No evals created
54
+ - OR all evals test the same scenario
55
+
56
+ ## Evaluation Method
57
+
58
+ **Type:** hybrid
59
+
60
+ ### Deterministic Checks
61
+
62
+ ```bash
63
+ # Verify 2+ eval files
64
+ EVAL_COUNT=$(ls .aw/.aw_registry/platform/data/skills/redis-caching-patterns/evals/eval-*.md 2>/dev/null | wc -l)
65
+ [[ "$EVAL_COUNT" -ge 2 ]] || echo "FAIL: fewer than 2 evals"
66
+ ```
67
+
68
+ ### Model-Based Checks
69
+
70
+ - Read each eval's scenario: are they testing genuinely different cases?
71
+ - Does at least one eval describe a failure condition (invalid input, missing data, error state)?
72
+ - Would a broken skill pass all evals? (If yes → insufficient coverage)
73
+
74
+ ## Baseline Expectations
75
+
76
+ - Without ADK: Single happy-path eval or no evals at all.
77
+ - With ADK: 2+ evals with distinct scenarios covering happy path and failure.
78
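+ - **Expected delta:** with ADK, both the happy path and at least one failure scenario are covered (100% of the two required paths), versus happy-path-only or no evals without it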
@@ -0,0 +1,81 @@
1
+ ---
2
+ name: eval-meta-eval-determinism
3
+ target: skill/aw-adk
4
+ category: behavioral
5
+ difficulty: advanced
6
+ ---
7
+
8
+ # Eval: Meta-Eval — Scoring Determinism
9
+
10
+ ## Task
11
+
12
+ Test that the ADK's scoring produces consistent results. The same artifact scored twice should receive the same tier and similar per-dimension scores. Flaky scoring undermines trust in the entire rubric system.
13
+
14
+ ### Prompt (run twice)
15
+
16
+ ```
17
+ Score this skill: .aw/.aw_registry/platform/data/skills/redis-caching-patterns/SKILL.md
18
+ ```
19
+
20
+ Run the exact same scoring prompt twice against the same artifact. Compare the two score outputs.
21
+
22
+ ## Context
23
+
24
+ | Field | Value |
25
+ |-------|-------|
26
+ | **Target artifact** | any existing skill with stable content |
27
+ | **Target type** | `skill` |
28
+
29
+ ## Expected Outcomes
30
+
31
+ - [ ] **Both runs produce a 10-dimension score table**
32
+ - [ ] **Same tier in both runs** — if run 1 is B-Tier, run 2 must also be B-Tier
33
+ - [ ] **Per-dimension scores within ±1 point** — a dimension scored 7 in run 1 should be 6-8 in run 2
34
+ - [ ] **Total score within ±5 points** — e.g., 72 and 76 is acceptable; 72 and 85 is not
35
+ - [ ] **Same improvement suggestions** — the top 3 gaps identified should overlap between runs
36
+ - [ ] **Both runs reference rubric-skill.md** — scoring is rubric-based, not ad-hoc
37
+
38
+ ## Grading Criteria
39
+
40
+ ### PASS
41
+
42
+ - Same tier in both runs
43
+ - Total score difference ≤ 5 points
44
+ - Per-dimension scores within ±1
45
+ - Top 3 improvement suggestions overlap (at least 2 of 3 match)
46
+
47
+ ### PARTIAL
48
+
49
+ - Same tier but total score difference 6-10 points
50
+ - OR different tier but adjacent (B vs C, not B vs D)
51
+
52
+ ### FAIL
53
+
54
+ - Different tiers separated by 2+ levels (e.g., B-Tier vs D-Tier)
55
+ - Total score difference > 10 points
56
+ - Improvement suggestions are completely different between runs
57
+
58
+ ## Evaluation Method
59
+
60
+ **Type:** hybrid
61
+
62
+ ### Deterministic Checks
63
+
64
+ ```bash
65
+ # Compare total scores from both runs (requires parsing the output)
66
+ # Tier must match exactly
67
+ # Total must be within ±5
68
+ ```
69
+
70
+ ### Model-Based Checks
71
+
72
+ - Extract the total score from each run's output
73
+ - Compare tier assignments
74
+ - Compare per-dimension scores
75
+ - Compare improvement suggestions for overlap
76
+
77
+ ## Baseline Expectations
78
+
79
+ - Without ADK: Scoring is ad-hoc, varies wildly between runs (±20+ points).
80
+ - With ADK: Rubric-anchored scoring with ≤5 point variance.
81
+ - **Expected delta:** 75%+ reduction in score variance
@@ -0,0 +1,81 @@
1
+ ---
2
+ name: eval-meta-eval-false-pass
3
+ target: skill/aw-adk
4
+ category: behavioral
5
+ difficulty: advanced
6
+ ---
7
+
8
+ # Eval: Meta-Eval — False Pass Resistance
9
+
10
+ ## Task
11
+
12
+ Test that evals created by the ADK can actually detect bad artifacts. The ADK creates an agent, then creates evals for it. Then a known-bad version of the agent (missing critical sections, wrong structure) is fed to those evals. The evals must FAIL the bad agent — not give it a false pass.
13
+
14
+ This is a meta-eval: it tests the quality of evals that the ADK produces, not the ADK's create flow itself.
15
+
16
+ ### Prompt (two-step)
17
+
18
+ **Step 1:** Create an agent for log analysis in the platform/infra namespace.
19
+
20
+ **Step 2:** Take the evals that were just created. Run them against this known-bad agent:
21
+
22
+ ```markdown
23
+ ---
24
+ name: log-analyzer
25
+ description: "Analyzes logs"
26
+ ---
27
+
28
+ # Log Analyzer
29
+ Looks at logs and finds problems.
30
+ ```
31
+
32
+ ## Context
33
+
34
+ | Field | Value |
35
+ |-------|-------|
36
+ | **Namespace** | `platform/infra` |
37
+ | **Target artifact** | evals created by ADK in step 1 |
38
+ | **Target type** | `eval` (meta) |
39
+
40
+ ## Expected Outcomes
41
+
42
+ - [ ] **Step 1 completes** — a well-structured agent is created with evals
43
+ - [ ] **Known-bad agent is structurally deficient** — missing: tools, model, category, squad, skills, identity section, core mission, critical rules, process, deliverables
44
+ - [ ] **Evals FAIL the known-bad agent** — at least 1 eval produces a FAIL verdict
45
+ - [ ] **Failure reasons are specific** — "missing Identity section" not just "low quality"
46
+ - [ ] **Evals don't false-pass** — a clearly deficient agent must not get PASS or even PARTIAL
47
+
48
+ ## Grading Criteria
49
+
50
+ ### PASS
51
+
52
+ - At least 1 eval FAILs the known-bad agent
53
+ - Failure reasons reference specific missing sections or frontmatter fields
54
+ - The well-structured agent from step 1 would PASS the same evals
55
+
56
+ ### PARTIAL
57
+
58
+ - Evals give PARTIAL (not PASS) to the known-bad agent
59
+ - Some discrimination but not full rejection
60
+
61
+ ### FAIL
62
+
63
+ - Evals PASS the known-bad agent (false pass)
64
+ - OR evals can't be run against the bad agent (no mechanism)
65
+ - OR evals only check surface features (file exists, has frontmatter) that the bad agent satisfies
66
+
67
+ ## Evaluation Method
68
+
69
+ **Type:** model-based
70
+
71
+ ### Model-Based Checks
72
+
73
+ - Do the evals contain assertions that the known-bad agent would fail?
74
+ - Are assertions specific enough to distinguish good from bad?
75
+ - Would substituting the bad agent into the eval's expected outcomes produce FAIL?
76
+
77
+ ## Baseline Expectations
78
+
79
+ - Without ADK: Evals are always-pass stubs that accept any output.
80
+ - With ADK: Evals have discriminating assertions that catch missing sections and thin content.
81
+ - **Expected delta:** 80%+ false-pass detection rate with ADK evals
@@ -0,0 +1,95 @@
1
+ ---
2
+ name: eval-score-accuracy
3
+ target: skill/aw-adk
4
+ category: behavioral
5
+ difficulty: intermediate
6
+ ---
7
+
8
+ # Eval: Score Accuracy — Deliberately Minimal Agent Gets Low Score
9
+
10
+ ## Task
11
+
12
+ Test that the ADK's scoring is calibrated — a deliberately minimal agent (missing most sections) should receive a D-Tier score, not be generously graded. This targets the self-scoring bias where the LLM creates an artifact and then scores it high.
13
+
14
+ ### Prompt
15
+
16
+ ```
17
+ Score this agent: .aw/.aw_registry/revex/reselling/backend/agents/stub-agent.md
18
+ ```
19
+
20
+ ### Setup (pre-condition)
21
+
22
+ Create a minimal stub agent before running the eval:
23
+
24
+ ```markdown
25
+ ---
26
+ name: stub-agent
27
+ description: "Does stuff"
28
+ tools: [Read]
29
+ ---
30
+
31
+ # Stub Agent
32
+
33
+ This agent helps with things.
34
+ ```
35
+
36
+ ## Context
37
+
38
+ | Field | Value |
39
+ |-------|-------|
40
+ | **Namespace** | `revex/reselling` |
41
+ | **Domain** | `backend` |
42
+ | **Target artifact** | `skills/aw-adk/SKILL.md` |
43
+ | **Target type** | `agent` |
44
+
45
+ ## Expected Outcomes
46
+
47
+ - [ ] **ADK reads rubric-agent.md** before scoring
48
+ - [ ] **Score table has 10 dimensions** — not a summary number
49
+ - [ ] **Total score is D-Tier (below 40)** — the stub agent is missing: identity section, core mission, critical rules, process, deliverables, communication style, success metrics, model field, category field, squad field
50
+ - [ ] **Missing dimensions identified** — the score output names specific gaps (not "looks good overall")
51
+ - [ ] **No inflation** — scores for missing sections are 0, not 3-5 "because the section could be implied"
52
+ - [ ] **Improvement suggestions provided** — concrete next steps to raise the score
53
+
54
+ ## Grading Criteria
55
+
56
+ ### PASS
57
+
58
+ - Total score is D-Tier (0-39)
59
+ - At least 6 dimensions scored 0 (missing entirely)
60
+ - Missing sections explicitly named
61
+
62
+ ### PARTIAL
63
+
64
+ - Total score is C-Tier (40-59) — some inflation but identifies gaps
65
+ - OR correct D-Tier but fewer than 6 zero-scored dimensions
66
+
67
+ ### FAIL
68
+
69
+ - Total score is B-Tier or above (60+) — severe inflation
70
+ - OR no per-dimension breakdown (just a summary score)
71
+ - OR does not read the rubric before scoring
72
+
73
+ ## Evaluation Method
74
+
75
+ **Type:** hybrid
76
+
77
+ ### Deterministic Checks
78
+
79
+ ```bash
80
+ # Verify the score output contains a table with 10 rows
81
+ # (model-based check needed to parse the actual scores)
82
+ ```
83
+
84
+ ### Model-Based Checks
85
+
86
+ - Is the total score below 40?
87
+ - Are missing sections scored 0 (not given partial credit)?
88
+ - Did the executor read rubric-agent.md before scoring?
89
+ - Are improvement suggestions specific (not "add more content")?
90
+
91
+ ## Baseline Expectations
92
+
93
+ - Without ADK: Model says "looks good, 7/10" with no rubric reference.
94
+ - With ADK: Calibrated score using rubric-agent.md, D-Tier for stub, specific gaps identified.
95
+ - **Expected delta:** 30+ point difference in score accuracy
@@ -0,0 +1,68 @@
1
+ ---
2
+ name: eval-type-redirect
3
+ target: skill/aw-adk
4
+ category: behavioral
5
+ difficulty: advanced
6
+ ---
7
+
8
+ # Eval: Type Redirect — Command Request That Should Be a Skill
9
+
10
+ ## Task
11
+
12
+ Test that the ADK's type classifier catches misclassifications. The prompt asks to "create a command" but the subject matter (static knowledge, best practices) is actually a skill. The ADK should redirect or at minimum flag the mismatch during the interview.
13
+
14
+ ### Prompt
15
+
16
+ ```
17
+ Create a command for React best practices in the platform/frontend namespace. It should cover component patterns, hooks usage, state management, and performance optimization tips.
18
+ ```
19
+
20
+ ## Context
21
+
22
+ | Field | Value |
23
+ |-------|-------|
24
+ | **Namespace** | `platform/frontend` |
25
+ | **Domain** | `frontend` |
26
+ | **Target artifact** | `skills/aw-adk/SKILL.md` |
27
+ | **Target type** | `skill` (despite user saying "command") |
28
+
29
+ ## Expected Outcomes
30
+
31
+ - [ ] **Type redirect detected** — the ADK recognizes "React best practices" is static knowledge (skill), not a multi-phase workflow (command)
32
+ - [ ] **User informed of redirect** — explains why this is a skill, not a command (commands automate workflows with agents and phases; skills encode knowledge)
33
+ - [ ] **Proceeds as skill** — after redirect, follows the skill create flow
34
+ - [ ] **OR asks user to confirm** — "This sounds like a skill (knowledge reference) rather than a command (workflow). Should I create it as a skill?"
35
+ - [ ] **Does NOT blindly create a command** — a "React best practices command" with forced phases and agent roster would be the wrong artifact type
36
+
37
+ ## Grading Criteria
38
+
39
+ ### PASS
40
+
41
+ - Redirect detected and communicated to user
42
+ - Proceeds with correct type (skill) after confirmation
43
+
44
+ ### PARTIAL
45
+
46
+ - Creates the command as requested but notes during the interview that it might be better suited as a skill
47
+ - OR creates a skill without explaining the redirect
48
+
49
+ ### FAIL
50
+
51
+ - Creates a command with forced multi-phase structure for static knowledge
52
+ - AND makes no mention of the type mismatch
53
+
54
+ ## Evaluation Method
55
+
56
+ **Type:** model-based
57
+
58
+ ### Model-Based Checks
59
+
60
+ - Did the executor question the "command" classification?
61
+ - Did it explain the difference between commands (workflow) and skills (knowledge)?
62
+ - Did it ultimately create a skill (or ask user to choose)?
63
+
64
+ ## Baseline Expectations
65
+
66
+ - Without ADK: Creates whatever the user asked for literally — a forced "command" with fake phases.
67
+ - With ADK: Type classifier catches the mismatch and redirects.
68
+ - **Expected delta:** correct type 90%+ with ADK vs. literal compliance without
@@ -0,0 +1,96 @@
1
+ {
2
+ "artifact_name": "aw-adk",
3
+ "artifact_type": "skill",
4
+ "evals": [
5
+ {
6
+ "name": "eval-create-skill",
7
+ "category": "functional",
8
+ "difficulty": "intermediate",
9
+ "group": "create-mode"
10
+ },
11
+ {
12
+ "name": "eval-create-agent",
13
+ "category": "functional",
14
+ "difficulty": "intermediate",
15
+ "group": "create-mode"
16
+ },
17
+ {
18
+ "name": "eval-create-command",
19
+ "category": "functional",
20
+ "difficulty": "advanced",
21
+ "group": "create-mode"
22
+ },
23
+ {
24
+ "name": "eval-create-rule",
25
+ "category": "functional",
26
+ "difficulty": "intermediate",
27
+ "group": "create-mode"
28
+ },
29
+ {
30
+ "name": "eval-create-eval",
31
+ "category": "functional",
32
+ "difficulty": "intermediate",
33
+ "group": "create-mode"
34
+ },
35
+ {
36
+ "name": "eval-type-redirect",
37
+ "category": "behavioral",
38
+ "difficulty": "advanced",
39
+ "group": "cross-cutting"
40
+ },
41
+ {
42
+ "name": "eval-score-accuracy",
43
+ "category": "behavioral",
44
+ "difficulty": "intermediate",
45
+ "group": "cross-cutting"
46
+ },
47
+ {
48
+ "name": "eval-colocated-placement",
49
+ "category": "structural",
50
+ "difficulty": "basic",
51
+ "group": "cross-cutting"
52
+ },
53
+ {
54
+ "name": "eval-meta-eval-false-pass",
55
+ "category": "behavioral",
56
+ "difficulty": "advanced",
57
+ "group": "meta-evals"
58
+ },
59
+ {
60
+ "name": "eval-meta-eval-coverage",
61
+ "category": "structural",
62
+ "difficulty": "intermediate",
63
+ "group": "meta-evals"
64
+ },
65
+ {
66
+ "name": "eval-meta-eval-determinism",
67
+ "category": "behavioral",
68
+ "difficulty": "advanced",
69
+ "group": "meta-evals"
70
+ },
71
+ {
72
+ "name": "eval-delete-agent",
73
+ "category": "functional",
74
+ "difficulty": "intermediate",
75
+ "group": "delete-mode"
76
+ },
77
+ {
78
+ "name": "eval-delete-rule",
79
+ "category": "functional",
80
+ "difficulty": "intermediate",
81
+ "group": "delete-mode"
82
+ },
83
+ {
84
+ "name": "eval-delete-skill",
85
+ "category": "functional",
86
+ "difficulty": "intermediate",
87
+ "group": "delete-mode"
88
+ },
89
+ {
90
+ "name": "eval-delete-command",
91
+ "category": "functional",
92
+ "difficulty": "intermediate",
93
+ "group": "delete-mode"
94
+ }
95
+ ]
96
+ }