aw-ecc 1.4.31 → 1.4.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/.codex/hooks/aw-post-tool-use.sh +8 -2
  3. package/.codex/hooks/aw-session-start.sh +11 -4
  4. package/.codex/hooks/aw-stop.sh +8 -2
  5. package/.codex/hooks/aw-user-prompt-submit.sh +10 -2
  6. package/.codex/hooks.json +8 -8
  7. package/.cursor/INSTALL.md +7 -5
  8. package/.cursor/hooks/adapter.js +41 -4
  9. package/.cursor/hooks/after-agent-response.js +62 -0
  10. package/.cursor/hooks/before-submit-prompt.js +7 -1
  11. package/.cursor/hooks/post-tool-use-failure.js +21 -0
  12. package/.cursor/hooks/post-tool-use.js +39 -0
  13. package/.cursor/hooks/shared/aw-phase-definitions.js +53 -0
  14. package/.cursor/hooks/shared/aw-phase-runner.js +3 -1
  15. package/.cursor/hooks/subagent-start.js +22 -4
  16. package/.cursor/hooks/subagent-stop.js +18 -1
  17. package/.cursor/hooks.json +23 -2
  18. package/.opencode/package.json +1 -1
  19. package/AGENTS.md +3 -3
  20. package/README.md +5 -5
  21. package/commands/adk.md +52 -0
  22. package/commands/build.md +22 -9
  23. package/commands/deploy.md +12 -0
  24. package/commands/execute.md +9 -0
  25. package/commands/feature.md +333 -0
  26. package/commands/investigate.md +18 -5
  27. package/commands/plan.md +23 -9
  28. package/commands/publish.md +65 -0
  29. package/commands/review.md +12 -0
  30. package/commands/ship.md +12 -0
  31. package/commands/test.md +12 -0
  32. package/commands/verify.md +9 -0
  33. package/hooks/hooks.json +36 -0
  34. package/manifests/install-components.json +8 -0
  35. package/manifests/install-modules.json +83 -0
  36. package/manifests/install-profiles.json +7 -0
  37. package/package.json +1 -1
  38. package/scripts/ci/validate-rules.js +51 -0
  39. package/scripts/cursor-aw-home/hooks.json +23 -2
  40. package/scripts/cursor-aw-hooks/adapter.js +41 -4
  41. package/scripts/cursor-aw-hooks/before-submit-prompt.js +7 -1
  42. package/scripts/hooks/aw-usage-commit-created.js +32 -0
  43. package/scripts/hooks/aw-usage-post-tool-use-failure.js +56 -0
  44. package/scripts/hooks/aw-usage-post-tool-use.js +242 -0
  45. package/scripts/hooks/aw-usage-prompt-submit.js +112 -0
  46. package/scripts/hooks/aw-usage-session-start.js +48 -0
  47. package/scripts/hooks/aw-usage-stop.js +182 -0
  48. package/scripts/hooks/aw-usage-telemetry-send.js +84 -0
  49. package/scripts/hooks/cost-tracker.js +3 -23
  50. package/scripts/hooks/shared/aw-phase-definitions.js +53 -0
  51. package/scripts/hooks/shared/aw-phase-runner.js +3 -1
  52. package/scripts/lib/aw-hook-contract.js +2 -2
  53. package/scripts/lib/aw-pricing.js +306 -0
  54. package/scripts/lib/aw-usage-telemetry.js +472 -0
  55. package/scripts/lib/codex-hook-config.js +8 -8
  56. package/scripts/lib/cursor-hook-config.js +25 -10
  57. package/scripts/lib/install-targets/codex-home.js +7 -0
  58. package/scripts/lib/install-targets/cursor-project.js +3 -0
  59. package/scripts/lib/install-targets/helpers.js +20 -3
  60. package/skills/aw-adk/SKILL.md +317 -0
  61. package/skills/aw-adk/agents/analyzer.md +113 -0
  62. package/skills/aw-adk/agents/comparator.md +113 -0
  63. package/skills/aw-adk/agents/grader.md +115 -0
  64. package/skills/aw-adk/assets/eval_review.html +76 -0
  65. package/skills/aw-adk/eval-viewer/generate_review.py +164 -0
  66. package/skills/aw-adk/eval-viewer/viewer.html +181 -0
  67. package/skills/aw-adk/evals/eval-colocated-placement.md +84 -0
  68. package/skills/aw-adk/evals/eval-create-agent.md +90 -0
  69. package/skills/aw-adk/evals/eval-create-command.md +98 -0
  70. package/skills/aw-adk/evals/eval-create-eval.md +89 -0
  71. package/skills/aw-adk/evals/eval-create-rule.md +99 -0
  72. package/skills/aw-adk/evals/eval-create-skill.md +97 -0
  73. package/skills/aw-adk/evals/eval-delete-agent.md +79 -0
  74. package/skills/aw-adk/evals/eval-delete-command.md +89 -0
  75. package/skills/aw-adk/evals/eval-delete-rule.md +86 -0
  76. package/skills/aw-adk/evals/eval-delete-skill.md +90 -0
  77. package/skills/aw-adk/evals/eval-meta-eval-coverage.md +78 -0
  78. package/skills/aw-adk/evals/eval-meta-eval-determinism.md +81 -0
  79. package/skills/aw-adk/evals/eval-meta-eval-false-pass.md +81 -0
  80. package/skills/aw-adk/evals/eval-score-accuracy.md +95 -0
  81. package/skills/aw-adk/evals/eval-type-redirect.md +68 -0
  82. package/skills/aw-adk/evals/evals.json +96 -0
  83. package/skills/aw-adk/references/artifact-wiring.md +162 -0
  84. package/skills/aw-adk/references/cross-ide-mapping.md +71 -0
  85. package/skills/aw-adk/references/eval-placement-guide.md +183 -0
  86. package/skills/aw-adk/references/external-resources.md +75 -0
  87. package/skills/aw-adk/references/getting-started.md +66 -0
  88. package/skills/aw-adk/references/registry-structure.md +152 -0
  89. package/skills/aw-adk/references/rubric-agent.md +36 -0
  90. package/skills/aw-adk/references/rubric-command.md +36 -0
  91. package/skills/aw-adk/references/rubric-eval.md +36 -0
  92. package/skills/aw-adk/references/rubric-meta-eval.md +132 -0
  93. package/skills/aw-adk/references/rubric-rule.md +36 -0
  94. package/skills/aw-adk/references/rubric-skill.md +36 -0
  95. package/skills/aw-adk/references/schemas.md +222 -0
  96. package/skills/aw-adk/references/template-agent.md +251 -0
  97. package/skills/aw-adk/references/template-command.md +279 -0
  98. package/skills/aw-adk/references/template-eval.md +176 -0
  99. package/skills/aw-adk/references/template-rule.md +119 -0
  100. package/skills/aw-adk/references/template-skill.md +123 -0
  101. package/skills/aw-adk/references/type-classifier.md +98 -0
  102. package/skills/aw-adk/references/writing-good-agents.md +227 -0
  103. package/skills/aw-adk/references/writing-good-commands.md +258 -0
  104. package/skills/aw-adk/references/writing-good-evals.md +271 -0
  105. package/skills/aw-adk/references/writing-good-rules.md +214 -0
  106. package/skills/aw-adk/references/writing-good-skills.md +159 -0
  107. package/skills/aw-adk/scripts/aggregate-benchmark.py +190 -0
  108. package/skills/aw-adk/scripts/lint-artifact.sh +211 -0
  109. package/skills/aw-adk/scripts/score-artifact.sh +179 -0
  110. package/skills/aw-adk/scripts/trigger-eval.py +192 -0
  111. package/skills/aw-build/SKILL.md +19 -2
  112. package/skills/aw-deploy/SKILL.md +65 -3
  113. package/skills/aw-design/SKILL.md +156 -0
  114. package/skills/aw-design/references/highrise-tokens.md +394 -0
  115. package/skills/aw-design/references/micro-interactions.md +76 -0
  116. package/skills/aw-design/references/prompt-template.md +160 -0
  117. package/skills/aw-design/references/quality-checklist.md +70 -0
  118. package/skills/aw-design/references/self-review.md +497 -0
  119. package/skills/aw-design/references/stitch-workflow.md +127 -0
  120. package/skills/aw-feature/SKILL.md +293 -0
  121. package/skills/aw-investigate/SKILL.md +17 -0
  122. package/skills/aw-plan/SKILL.md +34 -3
  123. package/skills/aw-publish/SKILL.md +300 -0
  124. package/skills/aw-publish/evals/eval-confirmation-gate.md +60 -0
  125. package/skills/aw-publish/evals/eval-intent-detection.md +111 -0
  126. package/skills/aw-publish/evals/eval-push-modes.md +67 -0
  127. package/skills/aw-publish/evals/eval-rules-push.md +60 -0
  128. package/skills/aw-publish/evals/evals.json +29 -0
  129. package/skills/aw-publish/references/push-modes.md +38 -0
  130. package/skills/aw-review/SKILL.md +88 -9
  131. package/skills/aw-rules-review/SKILL.md +124 -0
  132. package/skills/aw-rules-review/agents/openai.yaml +3 -0
  133. package/skills/aw-rules-review/scripts/generate-review-template.mjs +323 -0
  134. package/skills/aw-ship/SKILL.md +16 -0
  135. package/skills/aw-spec/SKILL.md +15 -0
  136. package/skills/aw-tasks/SKILL.md +15 -0
  137. package/skills/aw-test/SKILL.md +16 -0
  138. package/skills/aw-yolo/SKILL.md +4 -0
  139. package/skills/diagnose/SKILL.md +121 -0
  140. package/skills/diagnose/scripts/hitl-loop.template.sh +41 -0
  141. package/skills/finish-only-when-green/SKILL.md +265 -0
  142. package/skills/grill-me/SKILL.md +24 -0
  143. package/skills/grill-with-docs/SKILL.md +92 -0
  144. package/skills/grill-with-docs/adr-format.md +47 -0
  145. package/skills/grill-with-docs/context-format.md +67 -0
  146. package/skills/improve-codebase-architecture/SKILL.md +75 -0
  147. package/skills/improve-codebase-architecture/deepening.md +37 -0
  148. package/skills/improve-codebase-architecture/interface-design.md +44 -0
  149. package/skills/improve-codebase-architecture/language.md +53 -0
  150. package/skills/local-ghl-setup-from-screenshot/SKILL.md +538 -0
  151. package/skills/tdd/SKILL.md +115 -0
  152. package/skills/tdd/deep-modules.md +33 -0
  153. package/skills/tdd/interface-design.md +31 -0
  154. package/skills/tdd/mocking.md +59 -0
  155. package/skills/tdd/refactoring.md +10 -0
  156. package/skills/tdd/tests.md +61 -0
  157. package/skills/to-issues/SKILL.md +62 -0
  158. package/skills/to-prd/SKILL.md +75 -0
  159. package/skills/using-aw-skills/SKILL.md +170 -237
  160. package/skills/using-aw-skills/hooks/session-start.sh +11 -41
  161. package/skills/zoom-out/SKILL.md +24 -0
  162. package/.cursor/rules/common-agents.md +0 -53
  163. package/.cursor/rules/common-aw-routing.md +0 -43
  164. package/.cursor/rules/common-coding-style.md +0 -52
  165. package/.cursor/rules/common-development-workflow.md +0 -33
  166. package/.cursor/rules/common-git-workflow.md +0 -28
  167. package/.cursor/rules/common-hooks.md +0 -34
  168. package/.cursor/rules/common-patterns.md +0 -35
  169. package/.cursor/rules/common-performance.md +0 -59
  170. package/.cursor/rules/common-security.md +0 -33
  171. package/.cursor/rules/common-testing.md +0 -33
  172. package/.cursor/skills/api-and-interface-design/SKILL.md +0 -75
  173. package/.cursor/skills/article-writing/SKILL.md +0 -85
  174. package/.cursor/skills/aw-brainstorm/SKILL.md +0 -115
  175. package/.cursor/skills/aw-build/SKILL.md +0 -152
  176. package/.cursor/skills/aw-build/evals/build-stage-cases.json +0 -28
  177. package/.cursor/skills/aw-debug/SKILL.md +0 -49
  178. package/.cursor/skills/aw-deploy/SKILL.md +0 -101
  179. package/.cursor/skills/aw-deploy/evals/deploy-stage-cases.json +0 -32
  180. package/.cursor/skills/aw-execute/SKILL.md +0 -47
  181. package/.cursor/skills/aw-execute/references/mode-code.md +0 -47
  182. package/.cursor/skills/aw-execute/references/mode-docs.md +0 -28
  183. package/.cursor/skills/aw-execute/references/mode-infra.md +0 -44
  184. package/.cursor/skills/aw-execute/references/mode-migration.md +0 -58
  185. package/.cursor/skills/aw-execute/references/worker-implementer.md +0 -26
  186. package/.cursor/skills/aw-execute/references/worker-parallel-worker.md +0 -23
  187. package/.cursor/skills/aw-execute/references/worker-quality-reviewer.md +0 -23
  188. package/.cursor/skills/aw-execute/references/worker-spec-reviewer.md +0 -23
  189. package/.cursor/skills/aw-execute/scripts/build-worker-bundle.js +0 -229
  190. package/.cursor/skills/aw-finish/SKILL.md +0 -111
  191. package/.cursor/skills/aw-investigate/SKILL.md +0 -109
  192. package/.cursor/skills/aw-plan/SKILL.md +0 -368
  193. package/.cursor/skills/aw-prepare/SKILL.md +0 -118
  194. package/.cursor/skills/aw-review/SKILL.md +0 -118
  195. package/.cursor/skills/aw-ship/SKILL.md +0 -115
  196. package/.cursor/skills/aw-spec/SKILL.md +0 -104
  197. package/.cursor/skills/aw-tasks/SKILL.md +0 -138
  198. package/.cursor/skills/aw-test/SKILL.md +0 -118
  199. package/.cursor/skills/aw-verify/SKILL.md +0 -51
  200. package/.cursor/skills/aw-yolo/SKILL.md +0 -111
  201. package/.cursor/skills/browser-testing-with-devtools/SKILL.md +0 -81
  202. package/.cursor/skills/bun-runtime/SKILL.md +0 -84
  203. package/.cursor/skills/ci-cd-and-automation/SKILL.md +0 -71
  204. package/.cursor/skills/code-simplification/SKILL.md +0 -74
  205. package/.cursor/skills/content-engine/SKILL.md +0 -88
  206. package/.cursor/skills/context-engineering/SKILL.md +0 -74
  207. package/.cursor/skills/deprecation-and-migration/SKILL.md +0 -75
  208. package/.cursor/skills/documentation-and-adrs/SKILL.md +0 -75
  209. package/.cursor/skills/documentation-lookup/SKILL.md +0 -90
  210. package/.cursor/skills/frontend-slides/SKILL.md +0 -184
  211. package/.cursor/skills/frontend-slides/STYLE_PRESETS.md +0 -330
  212. package/.cursor/skills/frontend-ui-engineering/SKILL.md +0 -68
  213. package/.cursor/skills/git-workflow-and-versioning/SKILL.md +0 -75
  214. package/.cursor/skills/idea-refine/SKILL.md +0 -84
  215. package/.cursor/skills/incremental-implementation/SKILL.md +0 -75
  216. package/.cursor/skills/investor-materials/SKILL.md +0 -96
  217. package/.cursor/skills/investor-outreach/SKILL.md +0 -76
  218. package/.cursor/skills/market-research/SKILL.md +0 -75
  219. package/.cursor/skills/mcp-server-patterns/SKILL.md +0 -67
  220. package/.cursor/skills/nextjs-turbopack/SKILL.md +0 -44
  221. package/.cursor/skills/performance-optimization/SKILL.md +0 -77
  222. package/.cursor/skills/security-and-hardening/SKILL.md +0 -70
  223. package/.cursor/skills/using-aw-skills/SKILL.md +0 -290
  224. package/.cursor/skills/using-aw-skills/evals/skill-trigger-cases.tsv +0 -25
  225. package/.cursor/skills/using-aw-skills/evals/test-skill-triggers.sh +0 -171
  226. package/.cursor/skills/using-aw-skills/hooks/hooks.json +0 -9
  227. package/.cursor/skills/using-aw-skills/hooks/session-start.sh +0 -67
  228. package/.cursor/skills/using-platform-skills/SKILL.md +0 -163
  229. package/.cursor/skills/using-platform-skills/evals/platform-selection-cases.json +0 -52
  230. /package/.cursor/rules/{golang-coding-style.md → golang-coding-style.mdc} +0 -0
  231. /package/.cursor/rules/{golang-hooks.md → golang-hooks.mdc} +0 -0
  232. /package/.cursor/rules/{golang-patterns.md → golang-patterns.mdc} +0 -0
  233. /package/.cursor/rules/{golang-security.md → golang-security.mdc} +0 -0
  234. /package/.cursor/rules/{golang-testing.md → golang-testing.mdc} +0 -0
  235. /package/.cursor/rules/{kotlin-coding-style.md → kotlin-coding-style.mdc} +0 -0
  236. /package/.cursor/rules/{kotlin-hooks.md → kotlin-hooks.mdc} +0 -0
  237. /package/.cursor/rules/{kotlin-patterns.md → kotlin-patterns.mdc} +0 -0
  238. /package/.cursor/rules/{kotlin-security.md → kotlin-security.mdc} +0 -0
  239. /package/.cursor/rules/{kotlin-testing.md → kotlin-testing.mdc} +0 -0
  240. /package/.cursor/rules/{php-coding-style.md → php-coding-style.mdc} +0 -0
  241. /package/.cursor/rules/{php-hooks.md → php-hooks.mdc} +0 -0
  242. /package/.cursor/rules/{php-patterns.md → php-patterns.mdc} +0 -0
  243. /package/.cursor/rules/{php-security.md → php-security.mdc} +0 -0
  244. /package/.cursor/rules/{php-testing.md → php-testing.mdc} +0 -0
  245. /package/.cursor/rules/{python-coding-style.md → python-coding-style.mdc} +0 -0
  246. /package/.cursor/rules/{python-hooks.md → python-hooks.mdc} +0 -0
  247. /package/.cursor/rules/{python-patterns.md → python-patterns.mdc} +0 -0
  248. /package/.cursor/rules/{python-security.md → python-security.mdc} +0 -0
  249. /package/.cursor/rules/{python-testing.md → python-testing.mdc} +0 -0
  250. /package/.cursor/rules/{swift-coding-style.md → swift-coding-style.mdc} +0 -0
  251. /package/.cursor/rules/{swift-hooks.md → swift-hooks.mdc} +0 -0
  252. /package/.cursor/rules/{swift-patterns.md → swift-patterns.mdc} +0 -0
  253. /package/.cursor/rules/{swift-security.md → swift-security.mdc} +0 -0
  254. /package/.cursor/rules/{swift-testing.md → swift-testing.mdc} +0 -0
  255. /package/.cursor/rules/{typescript-coding-style.md → typescript-coding-style.mdc} +0 -0
  256. /package/.cursor/rules/{typescript-hooks.md → typescript-hooks.mdc} +0 -0
  257. /package/.cursor/rules/{typescript-patterns.md → typescript-patterns.mdc} +0 -0
  258. /package/.cursor/rules/{typescript-security.md → typescript-security.mdc} +0 -0
  259. /package/.cursor/rules/{typescript-testing.md → typescript-testing.mdc} +0 -0
package/skills/aw-adk/SKILL.md
@@ -0,0 +1,317 @@
+ ---
+ name: aw-adk
+ description: "Agent Development Kit — create, improve, fix, score, comply, audit, and health-check any CASRE artifact (Command, Agent, Skill, Rule, Eval) in the AW registry. Use this skill whenever the user wants to author, scaffold, score, audit, improve, or fix registry artifacts. Also triggers on: 'ADK', 'developer kit', 'create an agent/skill/command/rule/eval', 'score my skill', 'audit all agents', 'make this better', 'fix lint errors'."
+ trigger: when the user says /aw:adk, asks to create/add/update/improve/fix/score/audit any CASRE artifact, or wants to author registry content
+ ---
+
+ # Agent Development Kit (ADK)
+
+ Unified authoring tool for all AW registry artifacts. One entry point, five artifact types, eight modes.
+
+ ## When to Use
+
+ - **Create**: User wants a new command, agent, skill, rule, or eval
+ - **Improve**: User wants to enrich an existing artifact (add examples, references, sections)
+ - **Fix**: User wants to resolve lint/rubric failures on an existing artifact
+ - **Score**: User wants to audit an artifact against its quality rubric
+ - **Comply**: User wants a compliance check against the spec
+ - **Audit**: User wants a batch score across all artifacts of a type
+ - **Health**: User wants a dashboard of success rates, failure clusters, pending fixes
+ - **Delete**: User wants to remove an artifact and clean up all its references
+
+ ## Type × Mode Matrix
+
+ ```
+ /aw:adk [type] [mode] [target]
+
+ Types: command | agent | skill | rule | eval
+ Modes: create | improve | fix | score | comply | audit | health | delete
+
+ Examples:
+ /aw:adk → interactive: ask type, then mode
+ /aw:adk agent create → create a new agent (guided)
+ /aw:adk skill improve my-skill → enrich an existing skill
+ /aw:adk agent fix my-agent → resolve lint failures
+ /aw:adk skill score my-skill → score against rubric
+ /aw:adk rule audit all → audit all rules
+ /aw:adk eval create my-agent → create evals for existing agent
+ /aw:adk agent delete my-agent → remove agent + its evals + references
+ ```
+
+ ## CASRE Type Classifier
+
+ Before any work, classify what the user wants. Read [type-classifier.md](references/type-classifier.md) for the full decision tree.
+
+ **Quick classifier:**
+
+ | User wants... | Type | Why |
+ |---|---|---|
+ | Reusable domain knowledge, patterns, checklists | **Skill** | Static knowledge loaded on demand |
+ | A persona that makes decisions, has judgment, uses tools | **Agent** | Has identity, model tier, and skills |
+ | A multi-phase workflow orchestrating multiple agents | **Command** | Pipeline with phases and agent assignments |
+ | An enforceable standard with WRONG/RIGHT examples | **Rule** | Constraint with severity and automation path |
+ | Validation scenarios for an existing artifact | **Eval** | Tests that the artifact works correctly |
+
+ **Common misclassifications:**
+ - "Create a command for MongoDB best practices" → That's a **skill** (static knowledge)
+ - "Create a command that reviews security" → Likely a **skill** unless it's a multi-phase pipeline
+ - "Create a command that acts as a database expert" → That's an **agent** (persona)
+
+ If misclassified: explain WHY, suggest the correct type, offer to redirect.
+
+ ## Create Flow
+
+ The create flow follows an eval-driven iteration loop modeled after skill-creator: draft → test → review → improve → repeat.
+
+ ### Steps
+
+ 1. **TYPE GATE** — classify using the decision tree above
+ 2. **REQUIREMENTS INTERVIEW** — ask 3-5 type-specific questions (one at a time)
+    - Read the type-specific section below for which questions to ask
+ 3. **NAMESPACE RESOLUTION** — construct the exact target path
+    - Read [registry-structure.md](references/registry-structure.md) for the path resolution decision tree
+    - Walk the decision tree to produce the exact filesystem path (every combination resolves to exactly one path)
+    - Example: platform + review domain + agent → `.aw/.aw_registry/platform/review/agents/<slug>.md`
+ 4. **SCAFFOLD** — generate from template
+    - Read the appropriate `references/template-<type>.md`
+    - Consult `references/writing-good-<type>s.md` for quality guidance
+    - To reference existing artifacts in the same domain, construct their path the same way (e.g., to see existing agents in platform/data: `ls .aw/.aw_registry/platform/data/agents/`). The registry structure is deterministic — use direct paths, not broad searches.
+    - **No phantom dependencies.** Every name you put in frontmatter or body is a real pointer — if the target doesn't exist, the artifact breaks at runtime. Before finalizing any artifact, verify its dependencies actually exist. If something doesn't exist yet, either create it first or remove the reference. (A minimal verification sketch follows this list.)
+
+    **Examples of what to check:**
+    - Creating an **agent** with `skills: [revex-reselling-redis-patterns]` → run `ls .aw/.aw_registry/revex/reselling/skills/redis-patterns/SKILL.md`. If it doesn't exist, create the skill first or drop it from the list.
+    - Creating a **command** with agents in the roster → you just created those agents, so they exist. But each agent may list skills in *its* `skills:` frontmatter — check those too. The chain is command → agents → skills, and every link must resolve.
+    - Creating a **skill** that says "run `scripts/validate.sh`" → does `scripts/validate.sh` actually exist in the skill directory? Same for `references/` links in the body.
+    - Follow the "explain the why" principle: explain reasoning, not just MUST/NEVER
+ 5. **CHECKPOINT** — before moving on, output this for the user:
+    > **Remaining steps for `<type>`:** LINT → SCORE (rubric-`<type>`.md) → EVALS (2+) → REGISTRY UPDATES → SYNC
+    This applies to every type equally — commands, agents, skills, rules, and evals all go through lint, scoring, and eval creation. Rules are not simpler; they just have different checks.
+ 6. **LINT** — validate the artifact
+    - Run `bash skills/aw-adk/scripts/lint-artifact.sh <path> <type>`
+ 7. **SCORE** — apply the rubric
+    - Read the appropriate `references/rubric-<type>.md`
+    - Score conservatively — when you created the artifact yourself, there's a natural bias toward generous scoring. If a section exists but is thin or uses placeholder content, score it lower (3-5), not full marks.
+    - Must achieve B-Tier (60+) minimum for new artifacts
+ 8. **EVAL GATE** — create 2+ colocated eval files
+    - Read [eval-placement-guide.md](references/eval-placement-guide.md) for placement rules
+    - Each eval must cover: happy path + at least one failure scenario
+    - **Eval prompts must be self-contained.** Include all context inline (interview answers, config values, expected behavior) so the eval can run non-interactively in any AI tool (Claude Code, Cursor, Codex, etc.). Never write an eval prompt that requires the runner to answer follow-up questions.
+    - Include at least one eval that validates the dependency chain — e.g., "all agents in the command's roster exist and all skills in those agents' frontmatter resolve to real files." This catches phantom references before they reach production.
+    - **Derive evals from the artifact's own structure, not just generic categories.** Look at what you built — phases, human checkpoints, agent roster, error paths — and create evals that exercise those specific mechanisms:
+      - **Commands with human checkpoints:** create at least one eval per checkpoint covering both approve AND reject paths. Human gates are the highest-risk behavior — if they don't block, the command's safety guarantee is void.
+      - **Commands with parallel agents:** create an eval where one agent fails while others pass — does the command handle mixed results correctly?
+      - **Agents with skills:** create an eval that exercises the skill-loaded behavior vs. skill-missing fallback.
+      - **Multi-phase commands:** ensure at least one eval tests a mid-pipeline failure (not just phase 1 or the final phase).
+ 9. **TEST RUNS** — spawn subagents to validate
+    - For each eval: spawn with-artifact + baseline subagents in parallel
+    - Collect outputs to `<artifact>-workspace/iteration-<N>/`
+    - Grade via `agents/grader.md` — read [schemas.md](references/schemas.md) for JSON structures
+    - Aggregate via `scripts/aggregate-benchmark.py`
+    - Launch `eval-viewer/generate_review.py` for human review
+ 10. **ITERATION LOOP** — review → improve → re-test
+     - Read feedback from `feedback.json`
+     - Improve artifact based on weak dimensions
+     - Re-run test prompts into `iteration-<N+1>/`
+     - Repeat until: user satisfied, all feedback empty, or no meaningful progress
+ 11. **DESCRIPTION OPTIMIZATION** — (skills and agents only, optional)
+     - Generate 10 should-trigger + 10 should-not-trigger queries
+     - User reviews via `assets/eval_review.html`
+     - Run `scripts/trigger-eval.py` with train/test split
+     - Apply best description to frontmatter
+ 12. **CROSS-IDE EXPLANATION** — show where the artifact lands
+     - Read [cross-ide-mapping.md](references/cross-ide-mapping.md)
+ 13. **REGISTRY UPDATES** — mandatory bookkeeping, do not skip:
+     - **If type is rule:** two updates are required — both mandatory:
+       1. Add/update the entry in `.aw/.aw_rules/rule-manifest.json` (id, severity, domains, rule path, description, principle). Without this the rule is invisible to the enforcement system.
+       2. Add a bullet point to `.aw/.aw_rules/platform/<domain>/AGENTS.md` in the appropriate section (Always, Never, or Prefer). This is the file the session-start hook reads at runtime — if the rule isn't listed here, it will never be enforced. Match the format of existing bullets: `- <rule description>. [MUST/SHOULD/MAY]` with a reference link at the bottom.
+     - **If the artifact's namespace is not in the `include` array of `.aw/.aw_registry/.sync-config.json`:** add it. Without this, the creator won't receive future updates to the namespace they just created when teammates push to it.
+ 14. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate the new artifact to all IDE workspaces (.claude/, .cursor/, .codex/). This is mandatory after every create — do not skip, do not ask the user, just run it.
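+ A minimal sketch of the phantom-dependency check from step 4, as a helper an author might run by hand. The skill-id → path mapping is inferred from the `revex-reselling-redis-patterns` example above (namespace, then domain, then skill name) — treat it as an assumption and defer to registry-structure.md:
+
+ ```python
+ #!/usr/bin/env python3
+ # check-skill-deps.py (hypothetical helper): verify that every skill listed in
+ # an agent's `skills:` frontmatter resolves to a real SKILL.md in the registry.
+ import re, sys
+ from pathlib import Path
+
+ REGISTRY = Path(".aw/.aw_registry")
+
+ def skill_path(skill_id: str) -> Path:
+     # Assumed scheme: <namespace>-<domain>-<name>, e.g. revex-reselling-redis-patterns
+     ns, domain, name = skill_id.split("-", 2)
+     return REGISTRY / ns / domain / "skills" / name / "SKILL.md"
+
+ def main(agent_file: str) -> int:
+     match = re.search(r"^skills:\s*\[(.*?)\]", Path(agent_file).read_text(), re.M)
+     skill_ids = [s.strip() for s in match.group(1).split(",")] if match else []
+     missing = [s for s in skill_ids if s and not skill_path(s).exists()]
+     for skill_id in missing:
+         print(f"PHANTOM: {skill_id} -> {skill_path(skill_id)} does not exist")
+     return 1 if missing else 0
+
+ if __name__ == "__main__":
+     sys.exit(main(sys.argv[1]))
+ ```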
+
+ ### Type-Specific Interview Questions
+
+ **Command:**
+ 1. What workflow does this automate?
+ 2. How many phases? What are they?
+ 3. Which agents participate in each phase?
+ 4. Where are human checkpoints needed?
+ 5. What namespace? (platform or team)
+
+ **Agent:**
+ 1. What domain does this agent cover?
+ 2. What expertise and tools does it need?
+ 3. What squad does it belong to?
+ 4. What skills should it load?
+ 5. What namespace?
+
+ **Skill:**
+ 1. What domain knowledge does this teach?
+ 2. When should this skill trigger? (3+ scenarios)
+ 3. What namespace?
+ 4. Does it need scripts or references?
+
+ **Rule:**
+ 1. What does this rule prevent? What's the real-world consequence when it's violated?
+ 2. What domain does it belong to? (backend, frontend, security, universal, data, infra, sdet, mobile, api-design, or another domain)
+ 3. What severity? (MUST = blocks / SHOULD = warns / MAY = advisory)
+ 4. Can you give a WRONG and RIGHT code example? (concrete, copy-pasteable — not pseudocode)
+ 5. What file patterns trigger this rule? (e.g., `*.service.ts`, `*.worker.ts`)
+ 6. Are there exceptions where the violation is acceptable? Document them.
+
+ Rules go through the same full flow as commands, agents, and skills: SCAFFOLD → CHECKPOINT → LINT → SCORE (`rubric-rule.md`) → EVALS (2+) → REGISTRY UPDATES (manifest + AGENTS.md bullet) → SYNC. None of these steps are optional.
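+ For orientation, a sketch of the two registry updates a rule requires. The exact manifest key names are an assumption — the fields (id, severity, domains, rule path, description, principle) come from Create step 13, and the rule id is hypothetical:
+
+ ```json
+ {
+   "id": "backend-no-floating-promises",
+   "severity": "MUST",
+   "domains": ["backend"],
+   "rule": ".aw/.aw_rules/platform/backend/no-floating-promises.md",
+   "description": "Await or explicitly handle every Promise",
+   "principle": "Unhandled rejections fail silently and crash workers later, far from the cause"
+ }
+ ```
+
+ The matching AGENTS.md bullet (Never section) would read: `- Never leave a Promise unawaited or unhandled. [MUST]` with a reference link at the bottom of the file.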
+
+ **Eval:**
+ 1. Which parent artifact does this test?
+ 2. What scenarios should it cover?
+ 3. What grader type? (deterministic script / model-based / hybrid)
+
+ ## Improve Flow
+
+ For enriching existing artifacts. Mirrors skill-creator's iteration pattern.
+
+ 1. **LOCATE** — construct the artifact path from name + type + namespace using [registry-structure.md](references/registry-structure.md). For example, to find skill `my-skill` in platform/data: `.aw/.aw_registry/platform/data/skills/my-skill/SKILL.md`. If the name is ambiguous, `ls` the type directory to list candidates.
+ 2. **SNAPSHOT** — copy current version to workspace (baseline for A/B comparison)
+ 3. **SCORE** — apply type rubric, identify lowest-scoring dimensions
+ 4. **CONSULT AUTHORING GUIDE** — read `references/writing-good-<type>s.md`
+ 5. **ENRICH** — add missing sections, expand thin examples, add references
+    - Follow "explain the why" principle throughout
+    - Keep the prompt lean — remove what isn't pulling its weight
+    - Generalize from feedback — don't overfit to specific examples
+ 6. **RE-SCORE** — show before/after tier delta
+ 7. **TEST RUNS** — run evals against improved version + snapshot baseline
+    - Optionally use `agents/comparator.md` for blind A/B comparison
+    - Use `agents/analyzer.md` to understand why one version scores higher
+ 8. **ITERATE** — if user has feedback, improve and re-test
+ 9. **DESCRIPTION OPTIMIZATION** — if skill/agent, optionally re-optimize trigger
+ 10. **REGISTRY UPDATES** — if type is rule, update `rule-manifest.json`. If namespace changed, update `.sync-config.json`. Mandatory, do not skip.
+ 11. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate changes to all IDE workspaces. Mandatory — do not skip, do not ask the user, just run it.
+
+ ## Fix Flow
+
+ For resolving lint and rubric failures on existing artifacts.
+
+ 1. **LOCATE** — construct the artifact path using [registry-structure.md](references/registry-structure.md) (same as improve flow)
+ 2. **LINT** — run `scripts/lint-artifact.sh` to identify all failures
+ 3. **AUTO-FIX** — apply mechanical fixes (missing frontmatter fields, section stubs, name alignment)
+ 4. **RE-LINT** — confirm all checks pass
+ 5. **REPORT** — list what was fixed and any remaining manual items
+ 6. **REGISTRY UPDATES** — if type is rule, update `rule-manifest.json`. Mandatory, do not skip.
+ 7. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate fixes to all IDE workspaces. Mandatory — do not skip, do not ask the user, just run it.
+
+ ## Score Flow
+
+ 1. Read the artifact completely
+ 2. Read the appropriate `references/rubric-<type>.md`
+ 3. Score each dimension 0-10
+ 4. Calculate total, assign tier
+ 5. List specific gaps and rewrite suggestions for lowest dimensions
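+ The arithmetic behind steps 3-5, as a minimal sketch. The input shape is illustrative; the only threshold taken from this skill is the B-Tier floor (60+) required of new artifacts:
+
+ ```python
+ def summarize(scores: dict[str, int]) -> dict:
+     """scores: rubric dimension -> 0-10, ten dimensions, total out of 100."""
+     total = sum(scores.values())
+     weakest = sorted(scores, key=scores.get)[:3]  # surface these for rewrite suggestions
+     return {
+         "total": total,
+         "meets_b_tier_floor": total >= 60,  # minimum for new artifacts (Create step 7)
+         "weakest_dimensions": weakest,
+     }
+ ```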
+
+ ## Comply Flow
+
+ Delegates to `skill-comply` for compliance checking against the spec.
+
+ ## Audit Flow
+
+ Batch score all artifacts of a type. Produces a portfolio report with:
+ - Per-artifact scores and tiers
+ - Average score by category
+ - Artifacts needing improvement (< 60)
+ - Reference artifacts (highest scores)
+
+ ## Health Flow
+
+ Dashboard showing: success rates, failure clusters, pending fixes, score trends.
+
+ ## Delete Flow
+
+ For removing an artifact and all its associated files. Destructive — requires explicit user confirmation.
+
+ 1. **LOCATE** — construct the artifact path using [registry-structure.md](references/registry-structure.md)
+ 2. **INVENTORY** — list everything that will be deleted:
+    - The artifact file itself
+    - Colocated evals directory (e.g., `agents/evals/<slug>/`)
+    - Any workspace directories (`<artifact>-workspace/`)
+    - **If type is rule:** the `rule-manifest.json` entry AND the `AGENTS.md` bullet
+    - **If type is command:** agents created exclusively for this command (ask user — they may be shared)
+    - **If type is agent:** check if any command references this agent in its roster (warn if so)
+    - **If type is skill:** check if any agent lists this skill in its `skills:` frontmatter (warn if so)
+ 3. **REVERSE REFERENCE SCAN** — find everything that points TO this artifact and would become a phantom reference after deletion:
+    - **Agent being deleted:** scan all commands for this agent name in their `## Agent Roster` section
+    - **Skill being deleted:** scan all agents for this skill name in their `skills:` frontmatter
+    - **Command being deleted:** check if any other command or skill references it
+    - **Rule being deleted:** the manifest entry and AGENTS.md bullet (these are cleaned up in step 6)
+    - **Eval being deleted:** just the parent artifact's eval directory (no reverse references)
+    - For each reference found, show it to the user: "WARNING: <file> references this artifact. Deleting will create a phantom dependency. Remove the reference too? (yes/skip)"
+ 4. **CONFIRM** — show the full inventory (files to delete + references to clean) and ask: "This will delete N files and update M references. Proceed? (yes/no)". Never delete without explicit confirmation.
+ 5. **DELETE** — remove all inventoried files AND clean up confirmed reverse references (remove the artifact from `skills:` arrays, agent roster rows, etc.)
+ 6. **REGISTRY CLEANUP**:
+    - **If type is rule:** remove the entry from `rule-manifest.json` and the bullet from `.aw/.aw_rules/platform/<domain>/AGENTS.md`
+    - **If namespace is now empty:** remove the namespace directory (but check `.sync-config.json` — if other artifacts exist in sibling type directories, leave it)
+ 7. **SYNC** — run `aw link` to propagate the removal to all IDE workspaces
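+ A crude sketch of the step-3 scan. The registry root comes from this skill; the match is deliberately over-broad (any mention) so nothing is missed, and narrowing it to `skills:` arrays and `## Agent Roster` rows is left to the implementer:
+
+ ```python
+ #!/usr/bin/env python3
+ # reverse-refs.py (hypothetical helper): list registry files that still mention
+ # an artifact slated for deletion, so the user can confirm each cleanup.
+ import sys
+ from pathlib import Path
+
+ def reverse_refs(artifact_name: str) -> list[Path]:
+     hits = []
+     for path in Path(".aw/.aw_registry").rglob("*.md"):
+         if artifact_name in path.read_text(errors="ignore"):
+             hits.append(path)
+     return hits
+
+ if __name__ == "__main__":
+     name = sys.argv[1]
+     for hit in reverse_refs(name):
+         print(f"WARNING: {hit} references {name} — clean up or it becomes a phantom")
+ ```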
+
+ ## Writing Philosophy
+
+ These principles shape every artifact the ADK produces. They come from skill-creator (75k+ forks) and are the reason its artifacts work at scale.
+
+ 1. **Explain the why** — If you find yourself writing ALWAYS or NEVER in caps, stop. Explain the reasoning instead. LLMs are smart; give them understanding, not just compliance rules. A model that understands *why* will handle edge cases better than one following rigid directives.
+
+ 2. **Keep it lean** — Remove instructions that aren't pulling their weight. Read test run transcripts: if the model wastes time on unproductive steps, trim the instructions causing it.
+
+ 3. **Generalize from feedback** — When improving an artifact based on test results, don't overfit to the specific test cases. Think about the million future invocations. Fiddly, example-specific fixes produce brittle artifacts.
+
+ 4. **Bundle repeated work** — If test runs consistently produce similar helper scripts or take the same multi-step approach, bundle that as a script in the artifact's `scripts/` directory.
+
+ 5. **Theory of mind** — Write for the model's understanding. Use metaphors, explain context, describe the user's situation. Generic, narrow instructions produce generic, narrow results.
+
+ ## Subagents
+
+ The ADK uses three subagents for eval-driven iteration (read before spawning):
+
+ - [agents/grader.md](agents/grader.md) — Evaluates assertions against outputs. Also critiques eval quality.
+ - [agents/comparator.md](agents/comparator.md) — Blind A/B comparison between artifact versions.
+ - [agents/analyzer.md](agents/analyzer.md) — Analyzes benchmark results, surfaces patterns that aggregate stats hide.
+
+ ## Scripts
+
+ Deterministic tooling for validation and benchmarking:
+
+ - `scripts/lint-artifact.sh <path> <type>` — Validates frontmatter, sections, naming, paths
+ - `scripts/score-artifact.sh <path> <type>` — Applies rubric, produces tier + scores (JSON)
+ - `scripts/aggregate-benchmark.py <workspace>/iteration-N --artifact-name <name>` — Aggregates eval results
+ - `scripts/trigger-eval.py --eval-set <path> --skill-path <path>` — Tests description triggering accuracy
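+ A hypothetical end-to-end pass over one skill, using only the signatures above (the skill name, workspace, and eval-set path are illustrative):
+
+ ```
+ bash scripts/lint-artifact.sh .aw/.aw_registry/platform/data/skills/my-skill/SKILL.md skill
+ bash scripts/score-artifact.sh .aw/.aw_registry/platform/data/skills/my-skill/SKILL.md skill
+ python3 scripts/aggregate-benchmark.py my-skill-workspace/iteration-1 --artifact-name my-skill
+ python3 scripts/trigger-eval.py --eval-set my-skill-workspace/trigger-set.json --skill-path .aw/.aw_registry/platform/data/skills/my-skill/SKILL.md
+ ```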
+
+ ## References
+
+ Deep content loaded on demand. Do NOT load all at once — read only what the current mode needs.
+
+ ### Registry & Structure
+ - [registry-structure.md](references/registry-structure.md) — Namespace/domain/path resolution
+ - [cross-ide-mapping.md](references/cross-ide-mapping.md) — How artifacts appear in .claude/.cursor/.codex
+ - [type-classifier.md](references/type-classifier.md) — CASRE decision tree with examples
+ - [artifact-wiring.md](references/artifact-wiring.md) — How CASRE artifacts reference each other
+ - [eval-placement-guide.md](references/eval-placement-guide.md) — Colocated eval placement rules
+
+ ### Quality Rubrics (one per type)
+ - [rubric-command.md](references/rubric-command.md) — 10 dimensions, /100
+ - [rubric-agent.md](references/rubric-agent.md) — 10 dimensions, /100
+ - [rubric-skill.md](references/rubric-skill.md) — 10 dimensions, /100
+ - [rubric-rule.md](references/rubric-rule.md) — 10 dimensions, /100
+ - [rubric-eval.md](references/rubric-eval.md) — 10 dimensions, /100
+ - [rubric-meta-eval.md](references/rubric-meta-eval.md) — 5 dimensions, /50
+
+ ### Templates (one per type)
+ - [template-command.md](references/template-command.md)
+ - [template-agent.md](references/template-agent.md)
+ - [template-skill.md](references/template-skill.md)
+ - [template-rule.md](references/template-rule.md)
+ - [template-eval.md](references/template-eval.md)
+
+ ### Authoring Guides (how to write good artifacts)
+ - [writing-good-skills.md](references/writing-good-skills.md)
+ - [writing-good-agents.md](references/writing-good-agents.md)
+ - [writing-good-commands.md](references/writing-good-commands.md)
+ - [writing-good-rules.md](references/writing-good-rules.md)
+ - [writing-good-evals.md](references/writing-good-evals.md)
+
+ ### Meta
+ - [schemas.md](references/schemas.md) — JSON structures for evals, grading, benchmarks
+ - [external-resources.md](references/external-resources.md) — Curated external references
package/skills/aw-adk/agents/analyzer.md
@@ -0,0 +1,113 @@
+ # ADK Post-hoc Analyzer Agent
+
+ Analyze benchmark results to surface patterns and generate improvement suggestions for CASRE artifacts.
+
+ ## Role
+
+ The Analyzer has two modes:
+
+ 1. **Post-comparison analysis** — After a blind comparison, "unblinds" results to explain WHY the winner won and generate actionable improvements for the loser.
+ 2. **Benchmark analysis** — Reviews aggregate eval results to surface patterns that aggregate stats hide.
+
+ ---
+
+ ## Mode 1: Post-Comparison Analysis
+
+ ### Inputs
+
+ - **winner**: "A" or "B" (from blind comparison)
+ - **winner_artifact_path**: Path to the winning artifact
+ - **loser_artifact_path**: Path to the losing artifact
+ - **comparison_result_path**: Path to the comparator's output JSON
+ - **output_path**: Where to save analysis results
+
+ ### Process
+
+ 1. **Read comparison result** — note winner, reasoning, and per-dimension scores
+ 2. **Read both artifacts** — identify structural differences in instructions, examples, edge case handling
+ 3. **Read test transcripts** (if available) — compare execution patterns
+ 4. **Identify winner strengths** — what specific content led to better outcomes?
+ 5. **Identify loser weaknesses** — what gaps caused worse performance?
+ 6. **Generate improvement suggestions** — prioritized by impact
+
+ ### Output Format
+
+ ```json
+ {
+   "comparison_summary": {
+     "winner": "A",
+     "winner_artifact": "path/to/winner",
+     "loser_artifact": "path/to/loser",
+     "score_delta": 13
+   },
+   "winner_strengths": [
+     "Clear step-by-step process with input/output per phase",
+     "Concrete code examples using actual package names"
+   ],
+   "loser_weaknesses": [
+     "Vague 'handle appropriately' instruction led to inconsistent behavior",
+     "No code examples — agent had to improvise patterns"
+   ],
+   "improvement_suggestions": [
+     {
+       "priority": "high",
+       "category": "instructions",
+       "suggestion": "Replace 'handle edge cases' with specific numbered steps for each edge case type",
+       "expected_impact": "Would eliminate ambiguity in 3 lowest-scoring dimensions"
+     }
+   ]
+ }
+ ```
+
+ ### Suggestion Categories
+
+ | Category | Description |
+ |---|---|
+ | `instructions` | Changes to the artifact's prose instructions |
+ | `examples` | Code examples or before/after patterns to add |
+ | `structure` | Reorganization of sections or content |
+ | `references` | External docs or reference files to add |
+ | `frontmatter` | Metadata improvements (description, trigger, etc.) |
+ | `error_handling` | Guidance for handling failures or edge cases |
+
+ ---
+
+ ## Mode 2: Benchmark Analysis
+
+ ### Inputs
+
+ - **benchmark_data_path**: Path to benchmark.json with all run results
+ - **artifact_path**: Path to the artifact being benchmarked
+ - **output_path**: Where to save notes (JSON array of strings)
+
+ ### Process
+
+ 1. **Read benchmark.json** — note configurations, per-run results, aggregates
+ 2. **Analyze per-assertion patterns**:
+    - Always passes in both configs? → may not differentiate artifact value
+    - Always fails in both? → may be broken or beyond capability
+    - Passes with artifact, fails without? → artifact clearly adds value
+    - Fails with artifact, passes without? → artifact may be hurting
+    - Highly variable? → flaky assertion or non-deterministic behavior
+ 3. **Analyze cross-eval patterns** — which eval types are consistently harder/easier?
+ 4. **Analyze metrics patterns** — time, tokens, tool calls; outliers that skew aggregates
+ 5. **Generate notes** — specific observations grounded in data
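+ The step-2 decision table, restated as a sketch. The 0.9/0.1 cutoffs and the 0.5 delta are illustrative assumptions; the pattern labels are the ones above:
+
+ ```python
+ def classify(pass_with: float, pass_without: float) -> str:
+     """Classify one assertion from its pass rates (0-1) with/without the artifact."""
+     if pass_with >= 0.9 and pass_without >= 0.9:
+         return "always passes: may not differentiate artifact value"
+     if pass_with <= 0.1 and pass_without <= 0.1:
+         return "always fails: may be broken or beyond capability"
+     if pass_with - pass_without >= 0.5:
+         return "artifact clearly adds value"
+     if pass_without - pass_with >= 0.5:
+         return "artifact may be hurting"
+     return "highly variable: flaky assertion or non-deterministic behavior"
+ ```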
+
+ ### Output Format
+
+ ```json
+ [
+   "Assertion 'Agent has Identity section' passes 100% in both configs - doesn't differentiate artifact value",
+   "Eval 2 (complex multi-phase command) shows high variance (40% ± 30%) - may be flaky",
+   "Without-artifact runs consistently fail on eval placement checks (0% pass rate)",
+   "Artifact adds 8s average execution time but improves pass rate by 45%"
+ ]
+ ```
+
+ ### Guidelines
+
+ - **Report what you observe** — be specific about which evals, assertions, or runs
+ - **Surface hidden patterns** — things aggregate metrics would hide
+ - **Do NOT suggest improvements** — that's for the improvement step, not benchmarking
+ - **Do NOT repeat aggregates** — the user can read those in run_summary
+ - **Think about generalization** — would this pattern hold across more test cases?
package/skills/aw-adk/agents/comparator.md
@@ -0,0 +1,113 @@
+ # ADK Blind Comparator Agent
+
+ Compare two versions of a CASRE artifact WITHOUT knowing which is the improved version.
+
+ ## Role
+
+ The Blind Comparator judges which artifact version better accomplishes its purpose. You receive two artifacts labeled A and B, but you do NOT know which is the original and which is improved. This prevents bias toward the "new" version.
+
+ Your judgment is based purely on artifact quality against its type's rubric dimensions.
+
+ ## Inputs
+
+ - **artifact_a_path**: Path to the first artifact version
+ - **artifact_b_path**: Path to the second artifact version
+ - **artifact_type**: One of: command, agent, skill, rule, eval
+ - **rubric_path**: Path to the type-specific rubric (e.g., `references/rubric-agent.md`)
+
+ ## Process
+
+ ### Step 1: Read Both Artifacts
+
+ 1. Read artifact A completely
+ 2. Read artifact B completely
+ 3. Note structure, sections, depth, and quality of each
+
+ ### Step 2: Read the Rubric
+
+ 1. Read the type-specific rubric
+ 2. Understand the 10 scoring dimensions and what excellent looks like
+ 3. This is your evaluation framework — judge both artifacts against it
+
+ ### Step 3: Score Each Artifact
+
+ For each of the 10 rubric dimensions:
+ 1. Score artifact A (0-10)
+ 2. Score artifact B (0-10)
+ 3. Note specific evidence for each score
+
+ ### Step 4: Determine the Winner
+
+ Compare A and B:
+ 1. **Primary**: Total rubric score (sum of 10 dimensions)
+ 2. **Secondary**: Depth of the weakest dimension (higher floor wins)
+ 3. **Tiebreaker**: If truly equal, declare TIE
+
+ Be decisive — ties should be rare.
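+ The step-4 ordering as a sketch (the comparison key mirrors the primary/secondary rules above; the input shape is illustrative):
+
+ ```python
+ def winner(a: dict[str, int], b: dict[str, int]) -> str:
+     """a, b: dimension -> 0-10 scores. Primary: total; secondary: weakest dimension."""
+     key_a = (sum(a.values()), min(a.values()))
+     key_b = (sum(b.values()), min(b.values()))
+     if key_a > key_b:
+         return "A"
+     if key_b > key_a:
+         return "B"
+     return "TIE"  # should be rare — be decisive
+ ```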
+
+ ### Step 5: Write Comparison Results
+
+ Save to the specified output path.
+
+ ## Output Format
+
+ ```json
+ {
+   "winner": "A",
+   "reasoning": "Artifact A has stronger Identity section with concrete personality traits and a more comprehensive Process workflow with code examples. Artifact B has better metrics but weaker rules.",
+   "rubric": {
+     "A": {
+       "dimensions": {
+         "1_frontmatter": 8,
+         "2_identity": 9,
+         "3_mission": 7,
+         "4_rules": 8,
+         "5_process": 9,
+         "6_deliverables": 7,
+         "7_communication": 8,
+         "8_code_examples": 6,
+         "9_metrics": 5,
+         "10_advanced": 7
+       },
+       "total": 74,
+       "tier": "B"
+     },
+     "B": {
+       "dimensions": {
+         "1_frontmatter": 7,
+         "2_identity": 5,
+         "3_mission": 6,
+         "4_rules": 6,
+         "5_process": 7,
+         "6_deliverables": 6,
+         "7_communication": 5,
+         "8_code_examples": 5,
+         "9_metrics": 8,
+         "10_advanced": 6
+       },
+       "total": 61,
+       "tier": "B"
+     }
+   },
+   "output_quality": {
+     "A": {
+       "score": 74,
+       "strengths": ["Rich identity section", "Step-by-step process with examples"],
+       "weaknesses": ["Metrics lack specific thresholds"]
+     },
+     "B": {
+       "score": 61,
+       "strengths": ["Strong metrics with numbers"],
+       "weaknesses": ["Vague identity", "Process lacks code examples"]
+     }
+   }
+ }
+ ```
+
+ ## Guidelines
+
+ - **Stay blind**: Do NOT try to infer which version is "original" vs "improved"
+ - **Use the rubric**: Score against the type-specific dimensions, not your preferences
+ - **Be specific**: Quote sections when explaining strengths and weaknesses
+ - **Be decisive**: Choose a winner unless artifacts are genuinely equivalent
+ - **Think about usability**: The artifact will be consumed by an LLM — which version would produce better behavior?
package/skills/aw-adk/agents/grader.md
@@ -0,0 +1,115 @@
+ # ADK Grader Agent
+
+ Evaluate assertions against an execution transcript and outputs for CASRE artifacts.
+
+ ## Role
+
+ The Grader reviews a transcript and output files from an ADK create/improve run, then determines whether each assertion passes or fails. Beyond grading, you also critique the evals themselves — a passing grade on a weak assertion creates false confidence, which is worse than no eval at all.
+
+ ## Inputs
+
+ - **expectations**: List of assertions to evaluate (strings)
+ - **transcript_path**: Path to the execution transcript
+ - **outputs_dir**: Directory containing output files (the generated artifact, evals, lint results)
+ - **artifact_type**: One of: command, agent, skill, rule, eval
+
+ ## Process
+
+ ### Step 1: Read the Transcript
+
+ 1. Read the transcript completely
+ 2. Note: which ADK steps ran (type gate, interview, namespace, scaffold, lint, score)
+ 3. Identify any errors, skipped steps, or unexpected behavior
+
+ ### Step 2: Examine Output Files
+
+ 1. List all files in outputs_dir
+ 2. Read each file relevant to the assertions
+ 3. For CASRE artifacts specifically check:
+    - Frontmatter completeness (name, description, trigger/severity fields)
+    - Required sections present for the artifact type
+    - Colocated evals exist in the correct directory pattern
+    - Naming conventions match (kebab-case, domain prefix)
+
+ ### Step 3: Evaluate Each Assertion
+
+ For each expectation:
+
+ 1. **Search for evidence** in the transcript and outputs
+ 2. **Determine verdict**:
+    - **PASS**: Clear evidence the assertion is true AND reflects genuine quality, not surface compliance
+    - **FAIL**: No evidence, contradicted, or only superficially satisfied
+ 3. **Cite the evidence**: Quote specific text or describe what you found
+
+ The burden of proof to pass is on the assertion. When uncertain, FAIL.
+
+ ### Step 4: Extract and Verify Claims
+
+ Beyond predefined assertions, extract implicit claims from outputs:
+
+ - **Structural claims**: "The agent has 10 sections" → count them
+ - **Quality claims**: "Scores B-Tier" → verify against rubric
+ - **Completeness claims**: "All required frontmatter present" → check each field
+
+ Flag unverifiable claims.
+
+ ### Step 5: Critique the Evals
+
+ After grading, consider whether the assertions themselves could be improved:
+
+ - An assertion that passes but would also pass for a clearly wrong artifact (checking filename but not content)
+ - An important outcome no assertion covers (e.g., no check that colocated evals were created)
+ - An assertion that can't be verified from available outputs
+
+ Keep the bar high — only flag things the eval author would say "good catch" about.
+
+ ### Step 6: Write Grading Results
+
+ Save to `{outputs_dir}/../grading.json`. Use the schema from [schemas.md](../references/schemas.md).
+
+ ## Output Format
+
+ ```json
+ {
+   "expectations": [
+     {
+       "text": "The agent has a Core Mission section",
+       "passed": true,
+       "evidence": "Found '## Core Mission' at line 42 with 3 sentences describing domain and outcomes"
+     }
+   ],
+   "summary": {
+     "passed": 8,
+     "failed": 2,
+     "total": 10,
+     "pass_rate": 0.80
+   },
+   "claims": [
+     {
+       "claim": "Agent scores B-Tier (65/100)",
+       "type": "quality",
+       "verified": true,
+       "evidence": "Rubric scoring confirms: Identity 8 + Mission 7 + Rules 6 + ... = 65"
+     }
+   ],
+   "eval_feedback": {
+     "suggestions": [
+       {
+         "assertion": "The agent file exists",
+         "reason": "Too weak — a file with only frontmatter would pass. Check for minimum section count."
+       }
+     ],
+     "overall": "Assertions cover structure but not behavioral quality. Consider adding rubric-based checks."
+   }
+ }
+ ```
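+ Since downstream tooling trusts the `summary` block, a minimal consistency check is cheap insurance (a sketch; the keys are the ones shown above, and the rounding tolerance is an assumption):
+
+ ```python
+ import json
+
+ def check_summary(grading_path: str) -> None:
+     """Recompute pass/fail counts from `expectations` and compare to `summary`."""
+     doc = json.load(open(grading_path))
+     passed = sum(1 for e in doc["expectations"] if e["passed"])
+     total = len(doc["expectations"])
+     s = doc["summary"]
+     assert (s["passed"], s["failed"], s["total"]) == (passed, total - passed, total)
+     assert abs(s["pass_rate"] - passed / total) < 0.005  # allow two-decimal rounding
+ ```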
+
+ ## CASRE-Specific Grading Notes
+
+ | Type | Key things to verify beyond assertions |
+ |---|---|
+ | Command | AW-PROTOCOL reference, skill loading gate, phase I/O, human checkpoints, every agent in roster exists and their `skills:` dependencies resolve |
+ | Agent | Identity section (4 fields), every skill in `skills:` frontmatter exists in registry, model tier appropriate |
+ | Skill | Progressive disclosure (SKILL.md < 5k words), trigger scenarios (3+) |
+ | Rule | WRONG/RIGHT examples present, severity specified, manifest entry |
+ | Eval | Happy path + failure scenario, grader type specified, parent artifact referenced |