aw-ecc 1.4.31 → 1.4.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.codex/hooks/aw-post-tool-use.sh +8 -2
- package/.codex/hooks/aw-session-start.sh +11 -4
- package/.codex/hooks/aw-stop.sh +8 -2
- package/.codex/hooks/aw-user-prompt-submit.sh +10 -2
- package/.codex/hooks.json +8 -8
- package/.cursor/INSTALL.md +7 -5
- package/.cursor/hooks/adapter.js +41 -4
- package/.cursor/hooks/after-agent-response.js +62 -0
- package/.cursor/hooks/before-submit-prompt.js +7 -1
- package/.cursor/hooks/post-tool-use-failure.js +21 -0
- package/.cursor/hooks/post-tool-use.js +39 -0
- package/.cursor/hooks/shared/aw-phase-definitions.js +53 -0
- package/.cursor/hooks/shared/aw-phase-runner.js +3 -1
- package/.cursor/hooks/subagent-start.js +22 -4
- package/.cursor/hooks/subagent-stop.js +18 -1
- package/.cursor/hooks.json +23 -2
- package/.opencode/package.json +1 -1
- package/AGENTS.md +3 -3
- package/README.md +5 -5
- package/commands/adk.md +52 -0
- package/commands/build.md +22 -9
- package/commands/deploy.md +12 -0
- package/commands/execute.md +9 -0
- package/commands/feature.md +333 -0
- package/commands/investigate.md +18 -5
- package/commands/plan.md +23 -9
- package/commands/publish.md +65 -0
- package/commands/review.md +12 -0
- package/commands/ship.md +12 -0
- package/commands/test.md +12 -0
- package/commands/verify.md +9 -0
- package/hooks/hooks.json +36 -0
- package/manifests/install-components.json +8 -0
- package/manifests/install-modules.json +83 -0
- package/manifests/install-profiles.json +7 -0
- package/package.json +1 -1
- package/scripts/ci/validate-rules.js +51 -0
- package/scripts/cursor-aw-home/hooks.json +23 -2
- package/scripts/cursor-aw-hooks/adapter.js +41 -4
- package/scripts/cursor-aw-hooks/before-submit-prompt.js +7 -1
- package/scripts/hooks/aw-usage-commit-created.js +32 -0
- package/scripts/hooks/aw-usage-post-tool-use-failure.js +56 -0
- package/scripts/hooks/aw-usage-post-tool-use.js +242 -0
- package/scripts/hooks/aw-usage-prompt-submit.js +112 -0
- package/scripts/hooks/aw-usage-session-start.js +48 -0
- package/scripts/hooks/aw-usage-stop.js +182 -0
- package/scripts/hooks/aw-usage-telemetry-send.js +84 -0
- package/scripts/hooks/cost-tracker.js +3 -23
- package/scripts/hooks/shared/aw-phase-definitions.js +53 -0
- package/scripts/hooks/shared/aw-phase-runner.js +3 -1
- package/scripts/lib/aw-hook-contract.js +2 -2
- package/scripts/lib/aw-pricing.js +306 -0
- package/scripts/lib/aw-usage-telemetry.js +472 -0
- package/scripts/lib/codex-hook-config.js +8 -8
- package/scripts/lib/cursor-hook-config.js +25 -10
- package/scripts/lib/install-targets/codex-home.js +7 -0
- package/scripts/lib/install-targets/cursor-project.js +3 -0
- package/scripts/lib/install-targets/helpers.js +20 -3
- package/skills/aw-adk/SKILL.md +317 -0
- package/skills/aw-adk/agents/analyzer.md +113 -0
- package/skills/aw-adk/agents/comparator.md +113 -0
- package/skills/aw-adk/agents/grader.md +115 -0
- package/skills/aw-adk/assets/eval_review.html +76 -0
- package/skills/aw-adk/eval-viewer/generate_review.py +164 -0
- package/skills/aw-adk/eval-viewer/viewer.html +181 -0
- package/skills/aw-adk/evals/eval-colocated-placement.md +84 -0
- package/skills/aw-adk/evals/eval-create-agent.md +90 -0
- package/skills/aw-adk/evals/eval-create-command.md +98 -0
- package/skills/aw-adk/evals/eval-create-eval.md +89 -0
- package/skills/aw-adk/evals/eval-create-rule.md +99 -0
- package/skills/aw-adk/evals/eval-create-skill.md +97 -0
- package/skills/aw-adk/evals/eval-delete-agent.md +79 -0
- package/skills/aw-adk/evals/eval-delete-command.md +89 -0
- package/skills/aw-adk/evals/eval-delete-rule.md +86 -0
- package/skills/aw-adk/evals/eval-delete-skill.md +90 -0
- package/skills/aw-adk/evals/eval-meta-eval-coverage.md +78 -0
- package/skills/aw-adk/evals/eval-meta-eval-determinism.md +81 -0
- package/skills/aw-adk/evals/eval-meta-eval-false-pass.md +81 -0
- package/skills/aw-adk/evals/eval-score-accuracy.md +95 -0
- package/skills/aw-adk/evals/eval-type-redirect.md +68 -0
- package/skills/aw-adk/evals/evals.json +96 -0
- package/skills/aw-adk/references/artifact-wiring.md +162 -0
- package/skills/aw-adk/references/cross-ide-mapping.md +71 -0
- package/skills/aw-adk/references/eval-placement-guide.md +183 -0
- package/skills/aw-adk/references/external-resources.md +75 -0
- package/skills/aw-adk/references/getting-started.md +66 -0
- package/skills/aw-adk/references/registry-structure.md +152 -0
- package/skills/aw-adk/references/rubric-agent.md +36 -0
- package/skills/aw-adk/references/rubric-command.md +36 -0
- package/skills/aw-adk/references/rubric-eval.md +36 -0
- package/skills/aw-adk/references/rubric-meta-eval.md +132 -0
- package/skills/aw-adk/references/rubric-rule.md +36 -0
- package/skills/aw-adk/references/rubric-skill.md +36 -0
- package/skills/aw-adk/references/schemas.md +222 -0
- package/skills/aw-adk/references/template-agent.md +251 -0
- package/skills/aw-adk/references/template-command.md +279 -0
- package/skills/aw-adk/references/template-eval.md +176 -0
- package/skills/aw-adk/references/template-rule.md +119 -0
- package/skills/aw-adk/references/template-skill.md +123 -0
- package/skills/aw-adk/references/type-classifier.md +98 -0
- package/skills/aw-adk/references/writing-good-agents.md +227 -0
- package/skills/aw-adk/references/writing-good-commands.md +258 -0
- package/skills/aw-adk/references/writing-good-evals.md +271 -0
- package/skills/aw-adk/references/writing-good-rules.md +214 -0
- package/skills/aw-adk/references/writing-good-skills.md +159 -0
- package/skills/aw-adk/scripts/aggregate-benchmark.py +190 -0
- package/skills/aw-adk/scripts/lint-artifact.sh +211 -0
- package/skills/aw-adk/scripts/score-artifact.sh +179 -0
- package/skills/aw-adk/scripts/trigger-eval.py +192 -0
- package/skills/aw-build/SKILL.md +19 -2
- package/skills/aw-deploy/SKILL.md +65 -3
- package/skills/aw-design/SKILL.md +156 -0
- package/skills/aw-design/references/highrise-tokens.md +394 -0
- package/skills/aw-design/references/micro-interactions.md +76 -0
- package/skills/aw-design/references/prompt-template.md +160 -0
- package/skills/aw-design/references/quality-checklist.md +70 -0
- package/skills/aw-design/references/self-review.md +497 -0
- package/skills/aw-design/references/stitch-workflow.md +127 -0
- package/skills/aw-feature/SKILL.md +293 -0
- package/skills/aw-investigate/SKILL.md +17 -0
- package/skills/aw-plan/SKILL.md +34 -3
- package/skills/aw-publish/SKILL.md +300 -0
- package/skills/aw-publish/evals/eval-confirmation-gate.md +60 -0
- package/skills/aw-publish/evals/eval-intent-detection.md +111 -0
- package/skills/aw-publish/evals/eval-push-modes.md +67 -0
- package/skills/aw-publish/evals/eval-rules-push.md +60 -0
- package/skills/aw-publish/evals/evals.json +29 -0
- package/skills/aw-publish/references/push-modes.md +38 -0
- package/skills/aw-review/SKILL.md +88 -9
- package/skills/aw-rules-review/SKILL.md +124 -0
- package/skills/aw-rules-review/agents/openai.yaml +3 -0
- package/skills/aw-rules-review/scripts/generate-review-template.mjs +323 -0
- package/skills/aw-ship/SKILL.md +16 -0
- package/skills/aw-spec/SKILL.md +15 -0
- package/skills/aw-tasks/SKILL.md +15 -0
- package/skills/aw-test/SKILL.md +16 -0
- package/skills/aw-yolo/SKILL.md +4 -0
- package/skills/diagnose/SKILL.md +121 -0
- package/skills/diagnose/scripts/hitl-loop.template.sh +41 -0
- package/skills/finish-only-when-green/SKILL.md +265 -0
- package/skills/grill-me/SKILL.md +24 -0
- package/skills/grill-with-docs/SKILL.md +92 -0
- package/skills/grill-with-docs/adr-format.md +47 -0
- package/skills/grill-with-docs/context-format.md +67 -0
- package/skills/improve-codebase-architecture/SKILL.md +75 -0
- package/skills/improve-codebase-architecture/deepening.md +37 -0
- package/skills/improve-codebase-architecture/interface-design.md +44 -0
- package/skills/improve-codebase-architecture/language.md +53 -0
- package/skills/local-ghl-setup-from-screenshot/SKILL.md +538 -0
- package/skills/tdd/SKILL.md +115 -0
- package/skills/tdd/deep-modules.md +33 -0
- package/skills/tdd/interface-design.md +31 -0
- package/skills/tdd/mocking.md +59 -0
- package/skills/tdd/refactoring.md +10 -0
- package/skills/tdd/tests.md +61 -0
- package/skills/to-issues/SKILL.md +62 -0
- package/skills/to-prd/SKILL.md +75 -0
- package/skills/using-aw-skills/SKILL.md +170 -237
- package/skills/using-aw-skills/hooks/session-start.sh +11 -41
- package/skills/zoom-out/SKILL.md +24 -0
- package/.cursor/rules/common-agents.md +0 -53
- package/.cursor/rules/common-aw-routing.md +0 -43
- package/.cursor/rules/common-coding-style.md +0 -52
- package/.cursor/rules/common-development-workflow.md +0 -33
- package/.cursor/rules/common-git-workflow.md +0 -28
- package/.cursor/rules/common-hooks.md +0 -34
- package/.cursor/rules/common-patterns.md +0 -35
- package/.cursor/rules/common-performance.md +0 -59
- package/.cursor/rules/common-security.md +0 -33
- package/.cursor/rules/common-testing.md +0 -33
- package/.cursor/skills/api-and-interface-design/SKILL.md +0 -75
- package/.cursor/skills/article-writing/SKILL.md +0 -85
- package/.cursor/skills/aw-brainstorm/SKILL.md +0 -115
- package/.cursor/skills/aw-build/SKILL.md +0 -152
- package/.cursor/skills/aw-build/evals/build-stage-cases.json +0 -28
- package/.cursor/skills/aw-debug/SKILL.md +0 -49
- package/.cursor/skills/aw-deploy/SKILL.md +0 -101
- package/.cursor/skills/aw-deploy/evals/deploy-stage-cases.json +0 -32
- package/.cursor/skills/aw-execute/SKILL.md +0 -47
- package/.cursor/skills/aw-execute/references/mode-code.md +0 -47
- package/.cursor/skills/aw-execute/references/mode-docs.md +0 -28
- package/.cursor/skills/aw-execute/references/mode-infra.md +0 -44
- package/.cursor/skills/aw-execute/references/mode-migration.md +0 -58
- package/.cursor/skills/aw-execute/references/worker-implementer.md +0 -26
- package/.cursor/skills/aw-execute/references/worker-parallel-worker.md +0 -23
- package/.cursor/skills/aw-execute/references/worker-quality-reviewer.md +0 -23
- package/.cursor/skills/aw-execute/references/worker-spec-reviewer.md +0 -23
- package/.cursor/skills/aw-execute/scripts/build-worker-bundle.js +0 -229
- package/.cursor/skills/aw-finish/SKILL.md +0 -111
- package/.cursor/skills/aw-investigate/SKILL.md +0 -109
- package/.cursor/skills/aw-plan/SKILL.md +0 -368
- package/.cursor/skills/aw-prepare/SKILL.md +0 -118
- package/.cursor/skills/aw-review/SKILL.md +0 -118
- package/.cursor/skills/aw-ship/SKILL.md +0 -115
- package/.cursor/skills/aw-spec/SKILL.md +0 -104
- package/.cursor/skills/aw-tasks/SKILL.md +0 -138
- package/.cursor/skills/aw-test/SKILL.md +0 -118
- package/.cursor/skills/aw-verify/SKILL.md +0 -51
- package/.cursor/skills/aw-yolo/SKILL.md +0 -111
- package/.cursor/skills/browser-testing-with-devtools/SKILL.md +0 -81
- package/.cursor/skills/bun-runtime/SKILL.md +0 -84
- package/.cursor/skills/ci-cd-and-automation/SKILL.md +0 -71
- package/.cursor/skills/code-simplification/SKILL.md +0 -74
- package/.cursor/skills/content-engine/SKILL.md +0 -88
- package/.cursor/skills/context-engineering/SKILL.md +0 -74
- package/.cursor/skills/deprecation-and-migration/SKILL.md +0 -75
- package/.cursor/skills/documentation-and-adrs/SKILL.md +0 -75
- package/.cursor/skills/documentation-lookup/SKILL.md +0 -90
- package/.cursor/skills/frontend-slides/SKILL.md +0 -184
- package/.cursor/skills/frontend-slides/STYLE_PRESETS.md +0 -330
- package/.cursor/skills/frontend-ui-engineering/SKILL.md +0 -68
- package/.cursor/skills/git-workflow-and-versioning/SKILL.md +0 -75
- package/.cursor/skills/idea-refine/SKILL.md +0 -84
- package/.cursor/skills/incremental-implementation/SKILL.md +0 -75
- package/.cursor/skills/investor-materials/SKILL.md +0 -96
- package/.cursor/skills/investor-outreach/SKILL.md +0 -76
- package/.cursor/skills/market-research/SKILL.md +0 -75
- package/.cursor/skills/mcp-server-patterns/SKILL.md +0 -67
- package/.cursor/skills/nextjs-turbopack/SKILL.md +0 -44
- package/.cursor/skills/performance-optimization/SKILL.md +0 -77
- package/.cursor/skills/security-and-hardening/SKILL.md +0 -70
- package/.cursor/skills/using-aw-skills/SKILL.md +0 -290
- package/.cursor/skills/using-aw-skills/evals/skill-trigger-cases.tsv +0 -25
- package/.cursor/skills/using-aw-skills/evals/test-skill-triggers.sh +0 -171
- package/.cursor/skills/using-aw-skills/hooks/hooks.json +0 -9
- package/.cursor/skills/using-aw-skills/hooks/session-start.sh +0 -67
- package/.cursor/skills/using-platform-skills/SKILL.md +0 -163
- package/.cursor/skills/using-platform-skills/evals/platform-selection-cases.json +0 -52
- /package/.cursor/rules/{golang-coding-style.md → golang-coding-style.mdc} +0 -0
- /package/.cursor/rules/{golang-hooks.md → golang-hooks.mdc} +0 -0
- /package/.cursor/rules/{golang-patterns.md → golang-patterns.mdc} +0 -0
- /package/.cursor/rules/{golang-security.md → golang-security.mdc} +0 -0
- /package/.cursor/rules/{golang-testing.md → golang-testing.mdc} +0 -0
- /package/.cursor/rules/{kotlin-coding-style.md → kotlin-coding-style.mdc} +0 -0
- /package/.cursor/rules/{kotlin-hooks.md → kotlin-hooks.mdc} +0 -0
- /package/.cursor/rules/{kotlin-patterns.md → kotlin-patterns.mdc} +0 -0
- /package/.cursor/rules/{kotlin-security.md → kotlin-security.mdc} +0 -0
- /package/.cursor/rules/{kotlin-testing.md → kotlin-testing.mdc} +0 -0
- /package/.cursor/rules/{php-coding-style.md → php-coding-style.mdc} +0 -0
- /package/.cursor/rules/{php-hooks.md → php-hooks.mdc} +0 -0
- /package/.cursor/rules/{php-patterns.md → php-patterns.mdc} +0 -0
- /package/.cursor/rules/{php-security.md → php-security.mdc} +0 -0
- /package/.cursor/rules/{php-testing.md → php-testing.mdc} +0 -0
- /package/.cursor/rules/{python-coding-style.md → python-coding-style.mdc} +0 -0
- /package/.cursor/rules/{python-hooks.md → python-hooks.mdc} +0 -0
- /package/.cursor/rules/{python-patterns.md → python-patterns.mdc} +0 -0
- /package/.cursor/rules/{python-security.md → python-security.mdc} +0 -0
- /package/.cursor/rules/{python-testing.md → python-testing.mdc} +0 -0
- /package/.cursor/rules/{swift-coding-style.md → swift-coding-style.mdc} +0 -0
- /package/.cursor/rules/{swift-hooks.md → swift-hooks.mdc} +0 -0
- /package/.cursor/rules/{swift-patterns.md → swift-patterns.mdc} +0 -0
- /package/.cursor/rules/{swift-security.md → swift-security.mdc} +0 -0
- /package/.cursor/rules/{swift-testing.md → swift-testing.mdc} +0 -0
- /package/.cursor/rules/{typescript-coding-style.md → typescript-coding-style.mdc} +0 -0
- /package/.cursor/rules/{typescript-hooks.md → typescript-hooks.mdc} +0 -0
- /package/.cursor/rules/{typescript-patterns.md → typescript-patterns.mdc} +0 -0
- /package/.cursor/rules/{typescript-security.md → typescript-security.mdc} +0 -0
- /package/.cursor/rules/{typescript-testing.md → typescript-testing.mdc} +0 -0
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: aw-adk
|
|
3
|
+
description: "Agent Development Kit — create, improve, fix, score, comply, audit, health-check, and delete any CASRE artifact (Command, Agent, Skill, Rule, Eval) in the AW registry. Use this skill whenever the user wants to author, scaffold, score, audit, improve, or fix registry artifacts. Also triggers on: 'ADK', 'developer kit', 'create an agent/skill/command/rule/eval', 'score my skill', 'audit all agents', 'make this better', 'fix lint errors'."
|
|
4
|
+
trigger: when the user says /aw:adk, asks to create/add/update/improve/fix/score/audit any CASRE artifact, or wants to author registry content
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Agent Development Kit (ADK)
|
|
8
|
+
|
|
9
|
+
Unified authoring tool for all AW registry artifacts. One entry point, five artifact types, eight modes.
|
|
10
|
+
|
|
11
|
+
## When to Use
|
|
12
|
+
|
|
13
|
+
- **Create**: User wants a new command, agent, skill, rule, or eval
|
|
14
|
+
- **Improve**: User wants to enrich an existing artifact (add examples, references, sections)
|
|
15
|
+
- **Fix**: User wants to resolve lint/rubric failures on an existing artifact
|
|
16
|
+
- **Score**: User wants to audit an artifact against its quality rubric
|
|
17
|
+
- **Comply**: User wants a compliance check against the spec
|
|
18
|
+
- **Audit**: User wants a batch score across all artifacts of a type
|
|
19
|
+
- **Health**: User wants a dashboard of success rates, failure clusters, pending fixes
|
|
20
|
+
- **Delete**: User wants to remove an artifact and clean up all its references
|
|
21
|
+
|
|
22
|
+
## Type × Mode Matrix
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
/aw:adk [type] [mode] [target]
|
|
26
|
+
|
|
27
|
+
Types: command | agent | skill | rule | eval
|
|
28
|
+
Modes: create | improve | fix | score | comply | audit | health | delete
|
|
29
|
+
|
|
30
|
+
Examples:
|
|
31
|
+
/aw:adk → interactive: ask type, then mode
|
|
32
|
+
/aw:adk agent create → create a new agent (guided)
|
|
33
|
+
/aw:adk skill improve my-skill → enrich an existing skill
|
|
34
|
+
/aw:adk agent fix my-agent → resolve lint failures
|
|
35
|
+
/aw:adk skill score my-skill → score against rubric
|
|
36
|
+
/aw:adk rule audit all → audit all rules
|
|
37
|
+
/aw:adk eval create my-agent → create evals for existing agent
|
|
38
|
+
/aw:adk agent delete my-agent → remove agent + its evals + references
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## CASRE Type Classifier
|
|
42
|
+
|
|
43
|
+
Before any work, classify what the user wants. Read [type-classifier.md](references/type-classifier.md) for the full decision tree.
|
|
44
|
+
|
|
45
|
+
**Quick classifier:**
|
|
46
|
+
|
|
47
|
+
| User wants... | Type | Why |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| Reusable domain knowledge, patterns, checklists | **Skill** | Static knowledge loaded on demand |
|
|
50
|
+
| A persona that makes decisions, has judgment, uses tools | **Agent** | Has identity, model tier, and skills |
|
|
51
|
+
| A multi-phase workflow orchestrating multiple agents | **Command** | Pipeline with phases and agent assignments |
|
|
52
|
+
| An enforceable standard with WRONG/RIGHT examples | **Rule** | Constraint with severity and automation path |
|
|
53
|
+
| Validation scenarios for an existing artifact | **Eval** | Tests that the artifact works correctly |
|
|
54
|
+
|
|
55
|
+
**Common misclassifications:**
|
|
56
|
+
- "Create a command for MongoDB best practices" → That's a **skill** (static knowledge)
|
|
57
|
+
- "Create a command that reviews security" → Likely a **skill** unless it's a multi-phase pipeline
|
|
58
|
+
- "Create a command that acts as a database expert" → That's an **agent** (persona)
|
|
59
|
+
|
|
60
|
+
If misclassified: explain WHY, suggest the correct type, offer to redirect.
|
|
61
|
+
|
|
62
|
+
## Create Flow
|
|
63
|
+
|
|
64
|
+
The create flow follows an eval-driven iteration loop modeled after skill-creator: draft → test → review → improve → repeat.
|
|
65
|
+
|
|
66
|
+
### Steps
|
|
67
|
+
|
|
68
|
+
1. **TYPE GATE** — classify using the decision tree above
|
|
69
|
+
2. **REQUIREMENTS INTERVIEW** — ask 3-5 type-specific questions (one at a time)
|
|
70
|
+
- Read the type-specific section below for which questions to ask
|
|
71
|
+
3. **NAMESPACE RESOLUTION** — construct the exact target path
|
|
72
|
+
- Read [registry-structure.md](references/registry-structure.md) for the path resolution decision tree
|
|
73
|
+
- Walk the decision tree to produce the exact filesystem path (every combination resolves to exactly one path)
|
|
74
|
+
- Example: platform + review domain + agent → `.aw/.aw_registry/platform/review/agents/<slug>.md`
|
|
75
|
+
4. **SCAFFOLD** — generate from template
|
|
76
|
+
- Read the appropriate `references/template-<type>.md`
|
|
77
|
+
- Consult `references/writing-good-<type>s.md` for quality guidance
|
|
78
|
+
- To reference existing artifacts in the same domain, construct their path the same way (e.g., to see existing agents in platform/data: `ls .aw/.aw_registry/platform/data/agents/`). The registry structure is deterministic — use direct paths, not broad searches.
|
|
79
|
+
- **No phantom dependencies.** Every name you put in frontmatter or body is a real pointer — if the target doesn't exist, the artifact breaks at runtime. Before finalizing any artifact, verify its dependencies actually exist. If something doesn't exist yet, either create it first or remove the reference.
|
|
80
|
+
|
|
81
|
+
**Examples of what to check:**
|
|
82
|
+
- Creating an **agent** with `skills: [revex-reselling-redis-patterns]` → run `ls .aw/.aw_registry/revex/reselling/skills/redis-patterns/SKILL.md`. If it doesn't exist, create the skill first or drop it from the list.
|
|
83
|
+
- Creating a **command** with agents in the roster → you just created those agents, so they exist. But each agent may list skills in *its* `skills:` frontmatter — check those too. The chain is command → agents → skills, and every link must resolve.
|
|
84
|
+
- Creating a **skill** that says "run `scripts/validate.sh`" → does `scripts/validate.sh` actually exist in the skill directory? Same for `references/` links in the body.
|
|
85
|
+
- Follow the "explain the why" principle: explain reasoning, not just MUST/NEVER
|
|
86
|
+
5. **CHECKPOINT** — before moving on, output this for the user:
|
|
87
|
+
> **Remaining steps for `<type>`:** LINT → SCORE (rubric-`<type>`.md) → EVALS (2+) → REGISTRY UPDATES → SYNC
|
|
88
|
+
This applies to every type equally — commands, agents, skills, rules, and evals all go through lint, scoring, and eval creation. Rules are not simpler; they just have different checks.
|
|
89
|
+
6. **LINT** — validate the artifact
|
|
90
|
+
- Run `bash skills/aw-adk/scripts/lint-artifact.sh <path> <type>`
|
|
91
|
+
7. **SCORE** — apply the rubric
|
|
92
|
+
- Read the appropriate `references/rubric-<type>.md`
|
|
93
|
+
- Score conservatively — when you created the artifact yourself, there's a natural bias toward generous scoring. If a section exists but is thin or uses placeholder content, score it lower (3-5) not full marks.
|
|
94
|
+
- Must achieve B-Tier (60+) minimum for new artifacts
|
|
95
|
+
8. **EVAL GATE** — create 2+ colocated eval files
|
|
96
|
+
- Read [eval-placement-guide.md](references/eval-placement-guide.md) for placement rules
|
|
97
|
+
- Each eval must cover: happy path + at least one failure scenario
|
|
98
|
+
- **Eval prompts must be self-contained.** Include all context inline (interview answers, config values, expected behavior) so the eval can run non-interactively in any AI tool (Claude Code, Cursor, Codex, etc.). Never write an eval prompt that requires the runner to answer follow-up questions.
|
|
99
|
+
- Include at least one eval that validates the dependency chain — e.g., "all agents in the command's roster exist and all skills in those agents' frontmatter resolve to real files." This catches phantom references before they reach production.
|
|
100
|
+
- **Derive evals from the artifact's own structure, not just generic categories.** Look at what you built — phases, human checkpoints, agent roster, error paths — and create evals that exercise those specific mechanisms:
|
|
101
|
+
- **Commands with human checkpoints:** create at least one eval per checkpoint covering both approve AND reject paths. Human gates are the highest-risk behavior — if they don't block, the command's safety guarantee is void.
|
|
102
|
+
- **Commands with parallel agents:** create an eval where one agent fails while others pass — does the command handle mixed results correctly?
|
|
103
|
+
- **Agents with skills:** create an eval that exercises the skill-loaded behavior vs. skill-missing fallback.
|
|
104
|
+
- **Multi-phase commands:** ensure at least one eval tests a mid-pipeline failure (not just phase 1 or the final phase).
|
|
105
|
+
9. **TEST RUNS** — spawn subagents to validate
|
|
106
|
+
- For each eval: spawn with-artifact + baseline subagents in parallel
|
|
107
|
+
- Collect outputs to `<artifact>-workspace/iteration-<N>/`
|
|
108
|
+
- Grade via `agents/grader.md` — read [schemas.md](references/schemas.md) for JSON structures
|
|
109
|
+
- Aggregate via `scripts/aggregate-benchmark.py`
|
|
110
|
+
- Launch `eval-viewer/generate_review.py` for human review
|
|
111
|
+
10. **ITERATION LOOP** — review → improve → re-test
|
|
112
|
+
- Read feedback from `feedback.json`
|
|
113
|
+
- Improve artifact based on weak dimensions
|
|
114
|
+
- Re-run test prompts into `iteration-<N+1>/`
|
|
115
|
+
- Repeat until: user satisfied, all feedback empty, or no meaningful progress
|
|
116
|
+
11. **DESCRIPTION OPTIMIZATION** — (skills and agents only, optional)
|
|
117
|
+
- Generate 10 should-trigger + 10 should-not-trigger queries
|
|
118
|
+
- User reviews via `assets/eval_review.html`
|
|
119
|
+
- Run `scripts/trigger-eval.py` with train/test split
|
|
120
|
+
- Apply best description to frontmatter
|
|
121
|
+
12. **CROSS-IDE EXPLANATION** — show where the artifact lands
|
|
122
|
+
- Read [cross-ide-mapping.md](references/cross-ide-mapping.md)
|
|
123
|
+
13. **REGISTRY UPDATES** — mandatory bookkeeping, do not skip:
|
|
124
|
+
- **If type is rule:** two updates are required — both mandatory:
|
|
125
|
+
1. Add/update the entry in `.aw/.aw_rules/rule-manifest.json` (id, severity, domains, rule path, description, principle). Without this the rule is invisible to the enforcement system.
|
|
126
|
+
2. Add a bullet point to `.aw/.aw_rules/platform/<domain>/AGENTS.md` in the appropriate section (Always, Never, or Prefer). This is the file the session-start hook reads at runtime — if the rule isn't listed here, it will never be enforced. Match the format of existing bullets: `- <rule description>. [MUST/SHOULD/MAY]` with a reference link at the bottom.
|
|
127
|
+
- **If the artifact's namespace is not in `.aw/.aw_registry/.sync-config.json` `include` array:** add it. Without this, the creator won't receive future updates to the namespace they just created when teammates push to it.
|
|
128
|
+
14. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate the new artifact to all IDE workspaces (.claude/, .cursor/, .codex/). This is mandatory after every create — do not skip, do not ask the user, just run it.
|
|
129
|
+
|
|
130
|
+
### Type-Specific Interview Questions
|
|
131
|
+
|
|
132
|
+
**Command:**
|
|
133
|
+
1. What workflow does this automate?
|
|
134
|
+
2. How many phases? What are they?
|
|
135
|
+
3. Which agents participate in each phase?
|
|
136
|
+
4. Where are human checkpoints needed?
|
|
137
|
+
5. What namespace? (platform or team)
|
|
138
|
+
|
|
139
|
+
**Agent:**
|
|
140
|
+
1. What domain does this agent cover?
|
|
141
|
+
2. What expertise and tools does it need?
|
|
142
|
+
3. What squad does it belong to?
|
|
143
|
+
4. What skills should it load?
|
|
144
|
+
5. What namespace?
|
|
145
|
+
|
|
146
|
+
**Skill:**
|
|
147
|
+
1. What domain knowledge does this teach?
|
|
148
|
+
2. When should this skill trigger? (3+ scenarios)
|
|
149
|
+
3. What namespace?
|
|
150
|
+
4. Does it need scripts or references?
|
|
151
|
+
|
|
152
|
+
**Rule:**
|
|
153
|
+
1. What does this rule prevent? What's the real-world consequence when it's violated?
|
|
154
|
+
2. What domain does it belong to? (backend, frontend, security, universal, data, infra, sdet, mobile, api-design, or something different)
|
|
155
|
+
3. What severity? (MUST = blocks / SHOULD = warns / MAY = advisory)
|
|
156
|
+
4. Can you give a WRONG and RIGHT code example? (concrete, copy-pasteable — not pseudocode)
|
|
157
|
+
5. What file patterns trigger this rule? (e.g., `*.service.ts`, `*.worker.ts`)
|
|
158
|
+
6. Are there exceptions where the violation is acceptable? Document them.
|
|
159
|
+
|
|
160
|
+
Rules go through the same full flow as commands, agents and skills: SCAFFOLD → CHECKPOINT → LINT → SCORE (`rubric-rule.md`) → EVALS (2+) → REGISTRY UPDATES (manifest + AGENTS.md bullet) → SYNC. None of these steps are optional.
|
|
161
|
+
|
|
162
|
+
**Eval:**
|
|
163
|
+
1. Which parent artifact does this test?
|
|
164
|
+
2. What scenarios should it cover?
|
|
165
|
+
3. What grader type? (deterministic script / model-based / hybrid)
|
|
166
|
+
|
|
167
|
+
## Improve Flow
|
|
168
|
+
|
|
169
|
+
For enriching existing artifacts. Mirrors skill-creator's iteration pattern.
|
|
170
|
+
|
|
171
|
+
1. **LOCATE** — construct the artifact path from name + type + namespace using [registry-structure.md](references/registry-structure.md). For example, to find skill `my-skill` in platform/data: `.aw/.aw_registry/platform/data/skills/my-skill/SKILL.md`. If the name is ambiguous, `ls` the type directory to list candidates.
|
|
172
|
+
2. **SNAPSHOT** — copy current version to workspace (baseline for A/B comparison)
|
|
173
|
+
3. **SCORE** — apply type rubric, identify lowest-scoring dimensions
|
|
174
|
+
4. **CONSULT AUTHORING GUIDE** — read `references/writing-good-<type>s.md`
|
|
175
|
+
5. **ENRICH** — add missing sections, expand thin examples, add references
|
|
176
|
+
- Follow "explain the why" principle throughout
|
|
177
|
+
- Keep the prompt lean — remove what isn't pulling its weight
|
|
178
|
+
- Generalize from feedback — don't overfit to specific examples
|
|
179
|
+
6. **RE-SCORE** — show before/after tier delta
|
|
180
|
+
7. **TEST RUNS** — run evals against improved version + snapshot baseline
|
|
181
|
+
- Optionally use `agents/comparator.md` for blind A/B comparison
|
|
182
|
+
- Use `agents/analyzer.md` to understand why one version scores higher
|
|
183
|
+
8. **ITERATE** — if user has feedback, improve and re-test
|
|
184
|
+
9. **DESCRIPTION OPTIMIZATION** — if skill/agent, optionally re-optimize trigger
|
|
185
|
+
10. **REGISTRY UPDATES** — if type is rule, update `rule-manifest.json`. If namespace changed, update `.sync-config.json`. Mandatory, do not skip.
|
|
186
|
+
11. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate changes to all IDE workspaces. Mandatory — do not skip, do not ask the user, just run it.
|
|
187
|
+
|
|
188
|
+
## Fix Flow
|
|
189
|
+
|
|
190
|
+
For resolving lint and rubric failures on existing artifacts.
|
|
191
|
+
|
|
192
|
+
1. **LOCATE** — construct the artifact path using [registry-structure.md](references/registry-structure.md) (same as improve flow)
|
|
193
|
+
2. **LINT** — run `scripts/lint-artifact.sh` to identify all failures
|
|
194
|
+
3. **AUTO-FIX** — apply mechanical fixes (missing frontmatter fields, section stubs, name alignment)
|
|
195
|
+
4. **RE-LINT** — confirm all checks pass
|
|
196
|
+
5. **REPORT** — list what was fixed and any remaining manual items
|
|
197
|
+
6. **REGISTRY UPDATES** — if type is rule, update `rule-manifest.json`. Mandatory, do not skip.
|
|
198
|
+
7. **SYNC** — run the `aw link` CLI command (it's installed globally at `/opt/homebrew/bin/aw`) to propagate fixes to all IDE workspaces. Mandatory — do not skip, do not ask the user, just run it.
|
|
199
|
+
|
|
200
|
+
## Score Flow
|
|
201
|
+
|
|
202
|
+
1. Read the artifact completely
|
|
203
|
+
2. Read the appropriate `references/rubric-<type>.md`
|
|
204
|
+
3. Score each dimension 0-10
|
|
205
|
+
4. Calculate total, assign tier
|
|
206
|
+
5. List specific gaps and rewrite suggestions for lowest dimensions
|
|
207
|
+
|
|
208
|
+
## Comply Flow
|
|
209
|
+
|
|
210
|
+
Delegates to `skill-comply` for compliance checking against spec.
|
|
211
|
+
|
|
212
|
+
## Audit Flow
|
|
213
|
+
|
|
214
|
+
Batch score all artifacts of a type. Produces a portfolio report with:
|
|
215
|
+
- Per-artifact scores and tiers
|
|
216
|
+
- Average score by category
|
|
217
|
+
- Artifacts needing improvement (< 60)
|
|
218
|
+
- Reference artifacts (highest scores)
|
|
219
|
+
|
|
220
|
+
## Health Flow
|
|
221
|
+
|
|
222
|
+
Dashboard showing: success rates, failure clusters, pending fixes, score trends.
|
|
223
|
+
|
|
224
|
+
## Delete Flow
|
|
225
|
+
|
|
226
|
+
For removing an artifact and all its associated files. Destructive — requires explicit user confirmation.
|
|
227
|
+
|
|
228
|
+
1. **LOCATE** — construct the artifact path using [registry-structure.md](references/registry-structure.md)
|
|
229
|
+
2. **INVENTORY** — list everything that will be deleted:
|
|
230
|
+
- The artifact file itself
|
|
231
|
+
- Colocated evals directory (e.g., `agents/evals/<slug>/`)
|
|
232
|
+
- Any workspace directories (`<artifact>-workspace/`)
|
|
233
|
+
- **If type is rule:** the `rule-manifest.json` entry AND the `AGENTS.md` bullet
|
|
234
|
+
- **If type is command:** agents created exclusively for this command (ask user — they may be shared)
|
|
235
|
+
- **If type is agent:** check if any command references this agent in its roster (warn if so)
|
|
236
|
+
- **If type is skill:** check if any agent lists this skill in its `skills:` frontmatter (warn if so)
|
|
237
|
+
3. **REVERSE REFERENCE SCAN** — find everything that points TO this artifact and would become a phantom reference after deletion:
|
|
238
|
+
- **Agent being deleted:** scan all commands for this agent name in their `## Agent Roster` section
|
|
239
|
+
- **Skill being deleted:** scan all agents for this skill name in their `skills:` frontmatter
|
|
240
|
+
- **Command being deleted:** check if any other command or skill references it
|
|
241
|
+
- **Rule being deleted:** the manifest entry and AGENTS.md bullet (these are cleaned up in step 6)
|
|
242
|
+
- **Eval being deleted:** just the parent artifact's eval directory (no reverse references)
|
|
243
|
+
- For each reference found, show it to the user: "WARNING: <file> references this artifact. Deleting will create a phantom dependency. Remove the reference too? (yes/skip)"
|
|
244
|
+
4. **CONFIRM** — show the full inventory (files to delete + references to clean) and ask: "This will delete N files and update M references. Proceed? (yes/no)". Never delete without explicit confirmation.
|
|
245
|
+
5. **DELETE** — remove all inventoried files AND clean up confirmed reverse references (remove the artifact from `skills:` arrays, agent roster rows, etc.)
|
|
246
|
+
6. **REGISTRY CLEANUP**:
|
|
247
|
+
- **If type is rule:** remove the entry from `rule-manifest.json` and the bullet from `.aw/.aw_rules/platform/<domain>/AGENTS.md`
|
|
248
|
+
- **If namespace is now empty:** remove the namespace directory (but check `.sync-config.json` — if other artifacts exist in sibling type directories, leave it)
|
|
249
|
+
7. **SYNC** — run `aw link` to propagate the removal to all IDE workspaces
|
|
250
|
+
|
|
251
|
+
## Writing Philosophy
|
|
252
|
+
|
|
253
|
+
These principles shape every artifact the ADK produces. They come from skill-creator (75k+ forks) and are the reason its artifacts work at scale.
|
|
254
|
+
|
|
255
|
+
1. **Explain the why** — If you find yourself writing ALWAYS or NEVER in caps, stop. Explain the reasoning instead. LLMs are smart; give them understanding, not just compliance rules. A model that understands *why* will handle edge cases better than one following rigid directives.
|
|
256
|
+
|
|
257
|
+
2. **Keep it lean** — Remove instructions that aren't pulling their weight. Read test run transcripts: if the model wastes time on unproductive steps, trim the instructions causing it.
|
|
258
|
+
|
|
259
|
+
3. **Generalize from feedback** — When improving an artifact based on test results, don't overfit to the specific test cases. Think about the million future invocations. Fiddly, example-specific fixes produce brittle artifacts.
|
|
260
|
+
|
|
261
|
+
4. **Bundle repeated work** — If test runs consistently produce similar helper scripts or take the same multi-step approach, bundle that as a script in the artifact's `scripts/` directory.
|
|
262
|
+
|
|
263
|
+
5. **Theory of mind** — Write for the model's understanding. Use metaphors, explain context, describe the user's situation. Generic, narrow instructions produce generic, narrow results.
|
|
264
|
+
|
|
265
|
+
## Subagents
|
|
266
|
+
|
|
267
|
+
The ADK uses three subagents for eval-driven iteration (read before spawning):
|
|
268
|
+
|
|
269
|
+
- [agents/grader.md](agents/grader.md) — Evaluates assertions against outputs. Also critiques eval quality.
|
|
270
|
+
- [agents/comparator.md](agents/comparator.md) — Blind A/B comparison between artifact versions.
|
|
271
|
+
- [agents/analyzer.md](agents/analyzer.md) — Analyzes benchmark results, surfaces patterns that aggregate stats hide.
|
|
272
|
+
|
|
273
|
+
## Scripts
|
|
274
|
+
|
|
275
|
+
Deterministic tooling for validation and benchmarking:
|
|
276
|
+
|
|
277
|
+
- `scripts/lint-artifact.sh <path> <type>` — Validates frontmatter, sections, naming, paths
|
|
278
|
+
- `scripts/score-artifact.sh <path> <type>` — Applies rubric, produces tier + scores (JSON)
|
|
279
|
+
- `scripts/aggregate-benchmark.py <workspace>/iteration-N --artifact-name <name>` — Aggregates eval results
|
|
280
|
+
- `scripts/trigger-eval.py --eval-set <path> --skill-path <path>` — Tests description triggering accuracy
|
|
281
|
+
|
|
282
|
+
## References
|
|
283
|
+
|
|
284
|
+
Deep content loaded on demand. Do NOT load all at once — read only what the current mode needs.
|
|
285
|
+
|
|
286
|
+
### Registry & Structure
|
|
287
|
+
- [registry-structure.md](references/registry-structure.md) — Namespace/domain/path resolution
|
|
288
|
+
- [cross-ide-mapping.md](references/cross-ide-mapping.md) — How artifacts appear in .claude/.cursor/.codex
|
|
289
|
+
- [type-classifier.md](references/type-classifier.md) — CASRE decision tree with examples
|
|
290
|
+
- [artifact-wiring.md](references/artifact-wiring.md) — How CASRE artifacts reference each other
|
|
291
|
+
- [eval-placement-guide.md](references/eval-placement-guide.md) — Colocated eval placement rules
|
|
292
|
+
|
|
293
|
+
### Quality Rubrics (one per type)
|
|
294
|
+
- [rubric-command.md](references/rubric-command.md) — 10 dimensions, /100
|
|
295
|
+
- [rubric-agent.md](references/rubric-agent.md) — 10 dimensions, /100
|
|
296
|
+
- [rubric-skill.md](references/rubric-skill.md) — 10 dimensions, /100
|
|
297
|
+
- [rubric-rule.md](references/rubric-rule.md) — 10 dimensions, /100
|
|
298
|
+
- [rubric-eval.md](references/rubric-eval.md) — 10 dimensions, /100
|
|
299
|
+
- [rubric-meta-eval.md](references/rubric-meta-eval.md) — 5 dimensions, /50
|
|
300
|
+
|
|
301
|
+
### Templates (one per type)
|
|
302
|
+
- [template-command.md](references/template-command.md)
|
|
303
|
+
- [template-agent.md](references/template-agent.md)
|
|
304
|
+
- [template-skill.md](references/template-skill.md)
|
|
305
|
+
- [template-rule.md](references/template-rule.md)
|
|
306
|
+
- [template-eval.md](references/template-eval.md)
|
|
307
|
+
|
|
308
|
+
### Authoring Guides (how to write good artifacts)
|
|
309
|
+
- [writing-good-skills.md](references/writing-good-skills.md)
|
|
310
|
+
- [writing-good-agents.md](references/writing-good-agents.md)
|
|
311
|
+
- [writing-good-commands.md](references/writing-good-commands.md)
|
|
312
|
+
- [writing-good-rules.md](references/writing-good-rules.md)
|
|
313
|
+
- [writing-good-evals.md](references/writing-good-evals.md)
|
|
314
|
+
|
|
315
|
+
### Meta
|
|
316
|
+
- [schemas.md](references/schemas.md) — JSON structures for evals, grading, benchmarks
|
|
317
|
+
- [external-resources.md](references/external-resources.md) — Curated external references
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# ADK Post-hoc Analyzer Agent
|
|
2
|
+
|
|
3
|
+
Analyze benchmark results to surface patterns and generate improvement suggestions for CASRE artifacts.
|
|
4
|
+
|
|
5
|
+
## Role
|
|
6
|
+
|
|
7
|
+
The Analyzer has two modes:
|
|
8
|
+
|
|
9
|
+
1. **Post-comparison analysis** — After a blind comparison, "unblinds" results to explain WHY the winner won and generate actionable improvements for the loser.
|
|
10
|
+
2. **Benchmark analysis** — Reviews aggregate eval results to surface patterns that aggregate stats hide.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Mode 1: Post-Comparison Analysis
|
|
15
|
+
|
|
16
|
+
### Inputs
|
|
17
|
+
|
|
18
|
+
- **winner**: "A" or "B" (from blind comparison)
|
|
19
|
+
- **winner_artifact_path**: Path to the winning artifact
|
|
20
|
+
- **loser_artifact_path**: Path to the losing artifact
|
|
21
|
+
- **comparison_result_path**: Path to the comparator's output JSON
|
|
22
|
+
- **output_path**: Where to save analysis results
|
|
23
|
+
|
|
24
|
+
### Process
|
|
25
|
+
|
|
26
|
+
1. **Read comparison result** — note winner, reasoning, and per-dimension scores
|
|
27
|
+
2. **Read both artifacts** — identify structural differences in instructions, examples, edge case handling
|
|
28
|
+
3. **Read test transcripts** (if available) — compare execution patterns
|
|
29
|
+
4. **Identify winner strengths** — what specific content led to better outcomes?
|
|
30
|
+
5. **Identify loser weaknesses** — what gaps caused worse performance?
|
|
31
|
+
6. **Generate improvement suggestions** — prioritized by impact
|
|
32
|
+
|
|
33
|
+
### Output Format
|
|
34
|
+
|
|
35
|
+
```json
|
|
36
|
+
{
|
|
37
|
+
"comparison_summary": {
|
|
38
|
+
"winner": "A",
|
|
39
|
+
"winner_artifact": "path/to/winner",
|
|
40
|
+
"loser_artifact": "path/to/loser",
|
|
41
|
+
"score_delta": 13
|
|
42
|
+
},
|
|
43
|
+
"winner_strengths": [
|
|
44
|
+
"Clear step-by-step process with input/output per phase",
|
|
45
|
+
"Concrete code examples using actual package names"
|
|
46
|
+
],
|
|
47
|
+
"loser_weaknesses": [
|
|
48
|
+
"Vague 'handle appropriately' instruction led to inconsistent behavior",
|
|
49
|
+
"No code examples — agent had to improvise patterns"
|
|
50
|
+
],
|
|
51
|
+
"improvement_suggestions": [
|
|
52
|
+
{
|
|
53
|
+
"priority": "high",
|
|
54
|
+
"category": "instructions",
|
|
55
|
+
"suggestion": "Replace 'handle edge cases' with specific numbered steps for each edge case type",
|
|
56
|
+
"expected_impact": "Would eliminate ambiguity in 3 lowest-scoring dimensions"
|
|
57
|
+
}
|
|
58
|
+
]
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Suggestion Categories
|
|
63
|
+
|
|
64
|
+
| Category | Description |
|
|
65
|
+
|---|---|
|
|
66
|
+
| `instructions` | Changes to the artifact's prose instructions |
|
|
67
|
+
| `examples` | Code examples or before/after patterns to add |
|
|
68
|
+
| `structure` | Reorganization of sections or content |
|
|
69
|
+
| `references` | External docs or reference files to add |
|
|
70
|
+
| `frontmatter` | Metadata improvements (description, trigger, etc.) |
|
|
71
|
+
| `error_handling` | Guidance for handling failures or edge cases |
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Mode 2: Benchmark Analysis
|
|
76
|
+
|
|
77
|
+
### Inputs
|
|
78
|
+
|
|
79
|
+
- **benchmark_data_path**: Path to benchmark.json with all run results
|
|
80
|
+
- **artifact_path**: Path to the artifact being benchmarked
|
|
81
|
+
- **output_path**: Where to save notes (JSON array of strings)
|
|
82
|
+
|
|
83
|
+
### Process
|
|
84
|
+
|
|
85
|
+
1. **Read benchmark.json** — note configurations, per-run results, aggregates
|
|
86
|
+
2. **Analyze per-assertion patterns**:
|
|
87
|
+
- Always passes in both configs? → may not differentiate artifact value
|
|
88
|
+
- Always fails in both? → may be broken or beyond capability
|
|
89
|
+
- Passes with artifact, fails without? → artifact clearly adds value
|
|
90
|
+
- Fails with artifact, passes without? → artifact may be hurting
|
|
91
|
+
- Highly variable? → flaky assertion or non-deterministic behavior
|
|
92
|
+
3. **Analyze cross-eval patterns** — which eval types are consistently harder/easier?
|
|
93
|
+
4. **Analyze metrics patterns** — time, tokens, tool calls; outliers that skew aggregates
|
|
94
|
+
5. **Generate notes** — specific observations grounded in data
|
|
95
|
+
|
|
96
|
+
### Output Format
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
[
|
|
100
|
+
"Assertion 'Agent has Identity section' passes 100% in both configs - doesn't differentiate artifact value",
|
|
101
|
+
"Eval 2 (complex multi-phase command) shows high variance (40% ± 30%) - may be flaky",
|
|
102
|
+
"Without-artifact runs consistently fail on eval placement checks (0% pass rate)",
|
|
103
|
+
"Artifact adds 8s average execution time but improves pass rate by 45%"
|
|
104
|
+
]
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Guidelines
|
|
108
|
+
|
|
109
|
+
- **Report what you observe** — be specific about which evals, assertions, or runs
|
|
110
|
+
- **Surface hidden patterns** — things aggregate metrics would hide
|
|
111
|
+
- **Do NOT suggest improvements** — that's for the improvement step, not benchmarking
|
|
112
|
+
- **Do NOT repeat aggregates** — the user can read those in run_summary
|
|
113
|
+
- **Think about generalization** — would this pattern hold across more test cases?
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# ADK Blind Comparator Agent
|
|
2
|
+
|
|
3
|
+
Compare two versions of a CASRE artifact WITHOUT knowing which is the improved version.
|
|
4
|
+
|
|
5
|
+
## Role
|
|
6
|
+
|
|
7
|
+
The Blind Comparator judges which artifact version better accomplishes its purpose. You receive two artifacts labeled A and B, but you do NOT know which is the original and which is improved. This prevents bias toward the "new" version.
|
|
8
|
+
|
|
9
|
+
Your judgment is based purely on artifact quality against its type's rubric dimensions.
|
|
10
|
+
|
|
11
|
+
## Inputs
|
|
12
|
+
|
|
13
|
+
- **artifact_a_path**: Path to the first artifact version
|
|
14
|
+
- **artifact_b_path**: Path to the second artifact version
|
|
15
|
+
- **artifact_type**: One of: command, agent, skill, rule, eval
|
|
16
|
+
- **rubric_path**: Path to the type-specific rubric (e.g., `references/rubric-agent.md`)
|
|
17
|
+
|
|
18
|
+
## Process
|
|
19
|
+
|
|
20
|
+
### Step 1: Read Both Artifacts
|
|
21
|
+
|
|
22
|
+
1. Read artifact A completely
|
|
23
|
+
2. Read artifact B completely
|
|
24
|
+
3. Note structure, sections, depth, and quality of each
|
|
25
|
+
|
|
26
|
+
### Step 2: Read the Rubric
|
|
27
|
+
|
|
28
|
+
1. Read the type-specific rubric
|
|
29
|
+
2. Understand the 10 scoring dimensions and what excellent looks like
|
|
30
|
+
3. This is your evaluation framework — judge both artifacts against it
|
|
31
|
+
|
|
32
|
+
### Step 3: Score Each Artifact
|
|
33
|
+
|
|
34
|
+
For each of the 10 rubric dimensions:
|
|
35
|
+
1. Score artifact A (0-10)
|
|
36
|
+
2. Score artifact B (0-10)
|
|
37
|
+
3. Note specific evidence for each score
|
|
38
|
+
|
|
39
|
+
### Step 4: Determine the Winner
|
|
40
|
+
|
|
41
|
+
Compare A and B:
|
|
42
|
+
1. **Primary**: Total rubric score (sum of 10 dimensions)
|
|
43
|
+
2. **Secondary**: Depth of the weakest dimension (higher floor wins)
|
|
44
|
+
3. **Tiebreaker**: If truly equal, declare TIE
|
|
45
|
+
|
|
46
|
+
Be decisive — ties should be rare.
|
|
47
|
+
|
|
48
|
+
### Step 5: Write Comparison Results
|
|
49
|
+
|
|
50
|
+
Save to the specified output path.
|
|
51
|
+
|
|
52
|
+
## Output Format
|
|
53
|
+
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"winner": "A",
|
|
57
|
+
"reasoning": "Artifact A has stronger Identity section with concrete personality traits and a more comprehensive Process workflow with code examples. Artifact B has better metrics but weaker rules.",
|
|
58
|
+
"rubric": {
|
|
59
|
+
"A": {
|
|
60
|
+
"dimensions": {
|
|
61
|
+
"1_frontmatter": 8,
|
|
62
|
+
"2_identity": 9,
|
|
63
|
+
"3_mission": 7,
|
|
64
|
+
"4_rules": 8,
|
|
65
|
+
"5_process": 9,
|
|
66
|
+
"6_deliverables": 7,
|
|
67
|
+
"7_communication": 8,
|
|
68
|
+
"8_code_examples": 6,
|
|
69
|
+
"9_metrics": 5,
|
|
70
|
+
"10_advanced": 7
|
|
71
|
+
},
|
|
72
|
+
"total": 74,
|
|
73
|
+
"tier": "B"
|
|
74
|
+
},
|
|
75
|
+
"B": {
|
|
76
|
+
"dimensions": {
|
|
77
|
+
"1_frontmatter": 7,
|
|
78
|
+
"2_identity": 5,
|
|
79
|
+
"3_mission": 6,
|
|
80
|
+
"4_rules": 6,
|
|
81
|
+
"5_process": 7,
|
|
82
|
+
"6_deliverables": 6,
|
|
83
|
+
"7_communication": 5,
|
|
84
|
+
"8_code_examples": 5,
|
|
85
|
+
"9_metrics": 8,
|
|
86
|
+
"10_advanced": 6
|
|
87
|
+
},
|
|
88
|
+
"total": 61,
|
|
89
|
+
"tier": "B"
|
|
90
|
+
}
|
|
91
|
+
},
|
|
92
|
+
"output_quality": {
|
|
93
|
+
"A": {
|
|
94
|
+
"score": 74,
|
|
95
|
+
"strengths": ["Rich identity section", "Step-by-step process with examples"],
|
|
96
|
+
"weaknesses": ["Metrics lack specific thresholds"]
|
|
97
|
+
},
|
|
98
|
+
"B": {
|
|
99
|
+
"score": 61,
|
|
100
|
+
"strengths": ["Strong metrics with numbers"],
|
|
101
|
+
"weaknesses": ["Vague identity", "Process lacks code examples"]
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Guidelines
|
|
108
|
+
|
|
109
|
+
- **Stay blind**: Do NOT try to infer which version is "original" vs "improved"
|
|
110
|
+
- **Use the rubric**: Score against the type-specific dimensions, not your preferences
|
|
111
|
+
- **Be specific**: Quote sections when explaining strengths and weaknesses
|
|
112
|
+
- **Be decisive**: Choose a winner unless artifacts are genuinely equivalent
|
|
113
|
+
- **Think about usability**: The artifact will be consumed by an LLM — which version would produce better behavior?
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# ADK Grader Agent
|
|
2
|
+
|
|
3
|
+
Evaluate assertions against an execution transcript and outputs for CASRE artifacts.
|
|
4
|
+
|
|
5
|
+
## Role
|
|
6
|
+
|
|
7
|
+
The Grader reviews a transcript and output files from an ADK create/improve run, then determines whether each assertion passes or fails. Beyond grading, you also critique the evals themselves — a passing grade on a weak assertion creates false confidence, which is worse than no eval at all.
|
|
8
|
+
|
|
9
|
+
## Inputs
|
|
10
|
+
|
|
11
|
+
- **expectations**: List of assertions to evaluate (strings)
|
|
12
|
+
- **transcript_path**: Path to the execution transcript
|
|
13
|
+
- **outputs_dir**: Directory containing output files (the generated artifact, evals, lint results)
|
|
14
|
+
- **artifact_type**: One of: command, agent, skill, rule, eval
|
|
15
|
+
|
|
16
|
+
## Process
|
|
17
|
+
|
|
18
|
+
### Step 1: Read the Transcript
|
|
19
|
+
|
|
20
|
+
1. Read the transcript completely
|
|
21
|
+
2. Note: which ADK steps ran (type gate, interview, namespace, scaffold, lint, score)
|
|
22
|
+
3. Identify any errors, skipped steps, or unexpected behavior
|
|
23
|
+
|
|
24
|
+
### Step 2: Examine Output Files
|
|
25
|
+
|
|
26
|
+
1. List all files in outputs_dir
|
|
27
|
+
2. Read each file relevant to the assertions
|
|
28
|
+
3. For CASRE artifacts, specifically check:
|
|
29
|
+
- Frontmatter completeness (name, description, trigger/severity fields)
|
|
30
|
+
- Required sections present for the artifact type
|
|
31
|
+
- Colocated evals exist in the correct directory pattern
|
|
32
|
+
- Naming conventions match (kebab-case, domain prefix)
|
|
33
|
+
|
|
34
|
+
### Step 3: Evaluate Each Assertion
|
|
35
|
+
|
|
36
|
+
For each expectation:
|
|
37
|
+
|
|
38
|
+
1. **Search for evidence** in the transcript and outputs
|
|
39
|
+
2. **Determine verdict**:
|
|
40
|
+
- **PASS**: Clear evidence the assertion is true AND reflects genuine quality, not surface compliance
|
|
41
|
+
- **FAIL**: No evidence, contradicted, or only superficially satisfied
|
|
42
|
+
3. **Cite the evidence**: Quote specific text or describe what you found
|
|
43
|
+
|
|
44
|
+
The burden of proof is on the assertion to pass. When uncertain, FAIL.
|
|
45
|
+
|
|
46
|
+
### Step 4: Extract and Verify Claims
|
|
47
|
+
|
|
48
|
+
Beyond predefined assertions, extract implicit claims from outputs:
|
|
49
|
+
|
|
50
|
+
- **Structural claims**: "The agent has 10 sections" → count them
|
|
51
|
+
- **Quality claims**: "Scores B-Tier" → verify against rubric
|
|
52
|
+
- **Completeness claims**: "All required frontmatter present" → check each field
|
|
53
|
+
|
|
54
|
+
Flag unverifiable claims.
|
|
55
|
+
|
|
56
|
+
### Step 5: Critique the Evals
|
|
57
|
+
|
|
58
|
+
After grading, consider whether the assertions themselves could be improved:
|
|
59
|
+
|
|
60
|
+
- An assertion that passes but would also pass for a clearly wrong artifact (checking filename but not content)
|
|
61
|
+
- An important outcome no assertion covers (e.g., no check that colocated evals were created)
|
|
62
|
+
- An assertion that can't be verified from available outputs
|
|
63
|
+
|
|
64
|
+
Keep the bar high — only flag things the eval author would say "good catch" about.
|
|
65
|
+
|
|
66
|
+
### Step 6: Write Grading Results
|
|
67
|
+
|
|
68
|
+
Save to `{outputs_dir}/../grading.json`. Use the schema from [schemas.md](../references/schemas.md).
|
|
69
|
+
|
|
70
|
+
## Output Format
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{
|
|
74
|
+
"expectations": [
|
|
75
|
+
{
|
|
76
|
+
"text": "The agent has a Core Mission section",
|
|
77
|
+
"passed": true,
|
|
78
|
+
"evidence": "Found '## Core Mission' at line 42 with 3 sentences describing domain and outcomes"
|
|
79
|
+
}
|
|
80
|
+
],
|
|
81
|
+
"summary": {
|
|
82
|
+
"passed": 8,
|
|
83
|
+
"failed": 2,
|
|
84
|
+
"total": 10,
|
|
85
|
+
"pass_rate": 0.80
|
|
86
|
+
},
|
|
87
|
+
"claims": [
|
|
88
|
+
{
|
|
89
|
+
"claim": "Agent scores B-Tier (65/100)",
|
|
90
|
+
"type": "quality",
|
|
91
|
+
"verified": true,
|
|
92
|
+
"evidence": "Rubric scoring confirms: Identity 8 + Mission 7 + Rules 6 + ... = 65"
|
|
93
|
+
}
|
|
94
|
+
],
|
|
95
|
+
"eval_feedback": {
|
|
96
|
+
"suggestions": [
|
|
97
|
+
{
|
|
98
|
+
"assertion": "The agent file exists",
|
|
99
|
+
"reason": "Too weak — a file with only frontmatter would pass. Check for minimum section count."
|
|
100
|
+
}
|
|
101
|
+
],
|
|
102
|
+
"overall": "Assertions cover structure but not behavioral quality. Consider adding rubric-based checks."
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## CASRE-Specific Grading Notes
|
|
108
|
+
|
|
109
|
+
| Type | Key things to verify beyond assertions |
|
|
110
|
+
|---|---|
|
|
111
|
+
| Command | AW-PROTOCOL reference, skill loading gate, phase I/O, human checkpoints, every agent in roster exists and their `skills:` dependencies resolve |
|
|
112
|
+
| Agent | Identity section (4 fields), every skill in `skills:` frontmatter exists in registry, model tier appropriate |
|
|
113
|
+
| Skill | Progressive disclosure (SKILL.md < 5k words), trigger scenarios (3+) |
|
|
114
|
+
| Rule | WRONG/RIGHT examples present, severity specified, manifest entry |
|
|
115
|
+
| Eval | Happy path + failure scenario, grader type specified, parent artifact referenced |
|