safeword 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. package/.claude/commands/arch-review.md +32 -0
  2. package/.claude/commands/lint.md +6 -0
  3. package/.claude/commands/quality-review.md +13 -0
  4. package/.claude/commands/setup-linting.md +6 -0
  5. package/.claude/hooks/auto-lint.sh +6 -0
  6. package/.claude/hooks/auto-quality-review.sh +170 -0
  7. package/.claude/hooks/check-linting-sync.sh +17 -0
  8. package/.claude/hooks/inject-timestamp.sh +6 -0
  9. package/.claude/hooks/question-protocol.sh +12 -0
  10. package/.claude/hooks/run-linters.sh +8 -0
  11. package/.claude/hooks/run-quality-review.sh +76 -0
  12. package/.claude/hooks/version-check.sh +10 -0
  13. package/.claude/mcp/README.md +96 -0
  14. package/.claude/mcp/arcade.sample.json +9 -0
  15. package/.claude/mcp/context7.sample.json +7 -0
  16. package/.claude/mcp/playwright.sample.json +7 -0
  17. package/.claude/settings.json +62 -0
  18. package/.claude/skills/quality-reviewer/SKILL.md +190 -0
  19. package/.claude/skills/safeword-quality-reviewer/SKILL.md +13 -0
  20. package/.env.arcade.example +4 -0
  21. package/.env.example +11 -0
  22. package/.gitmodules +4 -0
  23. package/.safeword/SAFEWORD.md +33 -0
  24. package/.safeword/eslint/eslint-base.mjs +101 -0
  25. package/.safeword/guides/architecture-guide.md +404 -0
  26. package/.safeword/guides/code-philosophy.md +174 -0
  27. package/.safeword/guides/context-files-guide.md +405 -0
  28. package/.safeword/guides/data-architecture-guide.md +183 -0
  29. package/.safeword/guides/design-doc-guide.md +165 -0
  30. package/.safeword/guides/learning-extraction.md +515 -0
  31. package/.safeword/guides/llm-instruction-design.md +239 -0
  32. package/.safeword/guides/llm-prompting.md +95 -0
  33. package/.safeword/guides/tdd-best-practices.md +570 -0
  34. package/.safeword/guides/test-definitions-guide.md +243 -0
  35. package/.safeword/guides/testing-methodology.md +573 -0
  36. package/.safeword/guides/user-story-guide.md +237 -0
  37. package/.safeword/guides/zombie-process-cleanup.md +214 -0
  38. package/{templates → .safeword}/hooks/agents-md-check.sh +0 -0
  39. package/{templates → .safeword}/hooks/post-tool.sh +0 -0
  40. package/{templates → .safeword}/hooks/pre-commit.sh +0 -0
  41. package/.safeword/planning/002-user-story-quality-evaluation.md +1840 -0
  42. package/.safeword/planning/003-langsmith-eval-setup-prompt.md +363 -0
  43. package/.safeword/planning/004-llm-eval-test-cases.md +3226 -0
  44. package/.safeword/planning/005-architecture-enforcement-system.md +169 -0
  45. package/.safeword/planning/006-reactive-fix-prevention-research.md +135 -0
  46. package/.safeword/planning/011-cli-ux-vision.md +330 -0
  47. package/.safeword/planning/012-project-structure-cleanup.md +154 -0
  48. package/.safeword/planning/README.md +39 -0
  49. package/.safeword/planning/automation-plan-v2.md +1225 -0
  50. package/.safeword/planning/automation-plan-v3.md +1291 -0
  51. package/.safeword/planning/automation-plan.md +3058 -0
  52. package/.safeword/planning/design/005-cli-implementation.md +343 -0
  53. package/.safeword/planning/design/013-cli-self-contained-templates.md +596 -0
  54. package/.safeword/planning/design/013a-eslint-plugin-suite.md +256 -0
  55. package/.safeword/planning/design/013b-implementation-snippets.md +385 -0
  56. package/.safeword/planning/design/013c-config-isolation-strategy.md +242 -0
  57. package/.safeword/planning/design/code-philosophy-improvements.md +60 -0
  58. package/.safeword/planning/mcp-analysis.md +545 -0
  59. package/.safeword/planning/phase2-subagents-vs-skills-analysis.md +451 -0
  60. package/.safeword/planning/settings-improvements.md +970 -0
  61. package/.safeword/planning/test-definitions/005-cli-implementation.md +1301 -0
  62. package/.safeword/planning/test-definitions/cli-self-contained-templates.md +205 -0
  63. package/.safeword/planning/user-stories/001-guides-review-user-stories.md +1381 -0
  64. package/.safeword/planning/user-stories/003-reactive-fix-prevention.md +132 -0
  65. package/.safeword/planning/user-stories/004-technical-constraints.md +86 -0
  66. package/.safeword/planning/user-stories/005-cli-implementation.md +311 -0
  67. package/.safeword/planning/user-stories/cli-self-contained-templates.md +172 -0
  68. package/.safeword/planning/versioned-distribution.md +740 -0
  69. package/.safeword/prompts/arch-review.md +43 -0
  70. package/.safeword/prompts/quality-review.md +11 -0
  71. package/.safeword/scripts/arch-review.sh +235 -0
  72. package/.safeword/scripts/check-linting-sync.sh +58 -0
  73. package/.safeword/scripts/setup-linting.sh +559 -0
  74. package/.safeword/templates/architecture-template.md +136 -0
  75. package/.safeword/templates/ci/architecture-check.yml +79 -0
  76. package/.safeword/templates/design-doc-template.md +127 -0
  77. package/.safeword/templates/test-definitions-feature.md +100 -0
  78. package/.safeword/templates/ticket-template.md +74 -0
  79. package/.safeword/templates/user-stories-template.md +82 -0
  80. package/.safeword/tickets/001-guides-review-user-stories.md +83 -0
  81. package/.safeword/tickets/002-architecture-enforcement.md +211 -0
  82. package/.safeword/tickets/003-reactive-fix-prevention.md +57 -0
  83. package/.safeword/tickets/004-technical-constraints-in-user-stories.md +39 -0
  84. package/.safeword/tickets/005-cli-implementation.md +248 -0
  85. package/.safeword/tickets/006-flesh-out-skills.md +43 -0
  86. package/.safeword/tickets/007-flesh-out-questioning.md +44 -0
  87. package/.safeword/tickets/008-upgrade-questioning.md +58 -0
  88. package/.safeword/tickets/009-naming-conventions.md +41 -0
  89. package/.safeword/tickets/010-safeword-md-cleanup.md +34 -0
  90. package/.safeword/tickets/011-cursor-setup.md +86 -0
  91. package/.safeword/tickets/README.md +73 -0
  92. package/.safeword/version +1 -0
  93. package/AGENTS.md +59 -0
  94. package/CLAUDE.md +12 -0
  95. package/README.md +347 -0
  96. package/docs/001-cli-implementation-plan.md +856 -0
  97. package/docs/elite-dx-implementation-plan.md +1034 -0
  98. package/framework/README.md +131 -0
  99. package/framework/mcp/README.md +96 -0
  100. package/framework/mcp/arcade.sample.json +8 -0
  101. package/framework/mcp/context7.sample.json +6 -0
  102. package/framework/mcp/playwright.sample.json +6 -0
  103. package/framework/scripts/arch-review.sh +235 -0
  104. package/framework/scripts/check-linting-sync.sh +58 -0
  105. package/framework/scripts/load-env.sh +49 -0
  106. package/framework/scripts/setup-claude.sh +223 -0
  107. package/framework/scripts/setup-linting.sh +559 -0
  108. package/framework/scripts/setup-quality.sh +477 -0
  109. package/framework/scripts/setup-safeword.sh +550 -0
  110. package/framework/templates/ci/architecture-check.yml +78 -0
  111. package/learnings/ai-sdk-v5-breaking-changes.md +178 -0
  112. package/learnings/e2e-test-zombie-processes.md +231 -0
  113. package/learnings/milkdown-crepe-editor-property.md +96 -0
  114. package/learnings/prosemirror-fragment-traversal.md +119 -0
  115. package/package.json +19 -43
  116. package/packages/cli/AGENTS.md +1 -0
  117. package/packages/cli/ARCHITECTURE.md +279 -0
  118. package/packages/cli/package.json +51 -0
  119. package/packages/cli/src/cli.ts +63 -0
  120. package/packages/cli/src/commands/check.ts +166 -0
  121. package/packages/cli/src/commands/diff.ts +209 -0
  122. package/packages/cli/src/commands/reset.ts +190 -0
  123. package/packages/cli/src/commands/setup.ts +325 -0
  124. package/packages/cli/src/commands/upgrade.ts +163 -0
  125. package/packages/cli/src/index.ts +3 -0
  126. package/packages/cli/src/templates/config.ts +58 -0
  127. package/packages/cli/src/templates/content.ts +18 -0
  128. package/packages/cli/src/templates/index.ts +12 -0
  129. package/packages/cli/src/utils/agents-md.ts +66 -0
  130. package/packages/cli/src/utils/fs.ts +179 -0
  131. package/packages/cli/src/utils/git.ts +124 -0
  132. package/packages/cli/src/utils/hooks.ts +29 -0
  133. package/packages/cli/src/utils/output.ts +60 -0
  134. package/packages/cli/src/utils/project-detector.test.ts +185 -0
  135. package/packages/cli/src/utils/project-detector.ts +44 -0
  136. package/packages/cli/src/utils/version.ts +28 -0
  137. package/packages/cli/src/version.ts +6 -0
  138. package/packages/cli/templates/SAFEWORD.md +776 -0
  139. package/packages/cli/templates/doc-templates/architecture-template.md +136 -0
  140. package/packages/cli/templates/doc-templates/design-doc-template.md +134 -0
  141. package/packages/cli/templates/doc-templates/test-definitions-feature.md +131 -0
  142. package/packages/cli/templates/doc-templates/ticket-template.md +82 -0
  143. package/packages/cli/templates/doc-templates/user-stories-template.md +92 -0
  144. package/packages/cli/templates/guides/architecture-guide.md +423 -0
  145. package/packages/cli/templates/guides/code-philosophy.md +195 -0
  146. package/packages/cli/templates/guides/context-files-guide.md +457 -0
  147. package/packages/cli/templates/guides/data-architecture-guide.md +200 -0
  148. package/packages/cli/templates/guides/design-doc-guide.md +171 -0
  149. package/packages/cli/templates/guides/learning-extraction.md +552 -0
  150. package/packages/cli/templates/guides/llm-instruction-design.md +248 -0
  151. package/packages/cli/templates/guides/llm-prompting.md +102 -0
  152. package/packages/cli/templates/guides/tdd-best-practices.md +615 -0
  153. package/packages/cli/templates/guides/test-definitions-guide.md +334 -0
  154. package/packages/cli/templates/guides/testing-methodology.md +618 -0
  155. package/packages/cli/templates/guides/user-story-guide.md +256 -0
  156. package/packages/cli/templates/guides/zombie-process-cleanup.md +219 -0
  157. package/packages/cli/templates/hooks/agents-md-check.sh +27 -0
  158. package/packages/cli/templates/hooks/post-tool.sh +4 -0
  159. package/packages/cli/templates/hooks/pre-commit.sh +10 -0
  160. package/packages/cli/templates/prompts/arch-review.md +43 -0
  161. package/packages/cli/templates/prompts/quality-review.md +10 -0
  162. package/packages/cli/templates/skills/safeword-quality-reviewer/SKILL.md +207 -0
  163. package/packages/cli/tests/commands/check.test.ts +129 -0
  164. package/packages/cli/tests/commands/cli.test.ts +89 -0
  165. package/packages/cli/tests/commands/diff.test.ts +115 -0
  166. package/packages/cli/tests/commands/reset.test.ts +310 -0
  167. package/packages/cli/tests/commands/self-healing.test.ts +170 -0
  168. package/packages/cli/tests/commands/setup-blocking.test.ts +71 -0
  169. package/packages/cli/tests/commands/setup-core.test.ts +135 -0
  170. package/packages/cli/tests/commands/setup-git.test.ts +139 -0
  171. package/packages/cli/tests/commands/setup-hooks.test.ts +334 -0
  172. package/packages/cli/tests/commands/setup-linting.test.ts +189 -0
  173. package/packages/cli/tests/commands/setup-noninteractive.test.ts +80 -0
  174. package/packages/cli/tests/commands/setup-templates.test.ts +181 -0
  175. package/packages/cli/tests/commands/upgrade.test.ts +215 -0
  176. package/packages/cli/tests/helpers.ts +243 -0
  177. package/packages/cli/tests/npm-package.test.ts +83 -0
  178. package/packages/cli/tests/technical-constraints.test.ts +96 -0
  179. package/packages/cli/tsconfig.json +25 -0
  180. package/packages/cli/tsup.config.ts +11 -0
  181. package/packages/cli/vitest.config.ts +23 -0
  182. package/promptfoo.yaml +3270 -0
  183. package/dist/check-M73LGONJ.js +0 -129
  184. package/dist/check-M73LGONJ.js.map +0 -1
  185. package/dist/chunk-2XWIUEQK.js +0 -190
  186. package/dist/chunk-2XWIUEQK.js.map +0 -1
  187. package/dist/chunk-GZRQL3SX.js +0 -146
  188. package/dist/chunk-GZRQL3SX.js.map +0 -1
  189. package/dist/chunk-V5G6BGOK.js +0 -26
  190. package/dist/chunk-V5G6BGOK.js.map +0 -1
  191. package/dist/chunk-W66Z3C5H.js +0 -21
  192. package/dist/chunk-W66Z3C5H.js.map +0 -1
  193. package/dist/cli.d.ts +0 -1
  194. package/dist/cli.js +0 -34
  195. package/dist/cli.js.map +0 -1
  196. package/dist/diff-FSFDCBL5.js +0 -166
  197. package/dist/diff-FSFDCBL5.js.map +0 -1
  198. package/dist/index.d.ts +0 -11
  199. package/dist/index.js +0 -7
  200. package/dist/index.js.map +0 -1
  201. package/dist/reset-3ACTIYYE.js +0 -143
  202. package/dist/reset-3ACTIYYE.js.map +0 -1
  203. package/dist/setup-MKVVQTVA.js +0 -266
  204. package/dist/setup-MKVVQTVA.js.map +0 -1
  205. package/dist/upgrade-FQOL6AF5.js +0 -134
  206. package/dist/upgrade-FQOL6AF5.js.map +0 -1
  207. /package/{templates → framework}/SAFEWORD.md +0 -0
  208. /package/{templates → framework}/guides/architecture-guide.md +0 -0
  209. /package/{templates → framework}/guides/code-philosophy.md +0 -0
  210. /package/{templates → framework}/guides/context-files-guide.md +0 -0
  211. /package/{templates → framework}/guides/data-architecture-guide.md +0 -0
  212. /package/{templates → framework}/guides/design-doc-guide.md +0 -0
  213. /package/{templates → framework}/guides/learning-extraction.md +0 -0
  214. /package/{templates → framework}/guides/llm-instruction-design.md +0 -0
  215. /package/{templates → framework}/guides/llm-prompting.md +0 -0
  216. /package/{templates → framework}/guides/tdd-best-practices.md +0 -0
  217. /package/{templates → framework}/guides/test-definitions-guide.md +0 -0
  218. /package/{templates → framework}/guides/testing-methodology.md +0 -0
  219. /package/{templates → framework}/guides/user-story-guide.md +0 -0
  220. /package/{templates → framework}/guides/zombie-process-cleanup.md +0 -0
  221. /package/{templates → framework}/prompts/arch-review.md +0 -0
  222. /package/{templates → framework}/prompts/quality-review.md +0 -0
  223. /package/{templates/skills/safeword-quality-reviewer → framework/skills/quality-reviewer}/SKILL.md +0 -0
  224. /package/{templates/doc-templates → framework/templates}/architecture-template.md +0 -0
  225. /package/{templates/doc-templates → framework/templates}/design-doc-template.md +0 -0
  226. /package/{templates/doc-templates → framework/templates}/test-definitions-feature.md +0 -0
  227. /package/{templates/doc-templates → framework/templates}/ticket-template.md +0 -0
  228. /package/{templates/doc-templates → framework/templates}/user-stories-template.md +0 -0
  229. /package/{templates → packages/cli/templates}/commands/arch-review.md +0 -0
  230. /package/{templates → packages/cli/templates}/commands/lint.md +0 -0
  231. /package/{templates → packages/cli/templates}/commands/quality-review.md +0 -0
  232. /package/{templates → packages/cli/templates}/hooks/inject-timestamp.sh +0 -0
  233. /package/{templates → packages/cli/templates}/lib/common.sh +0 -0
  234. /package/{templates → packages/cli/templates}/lib/jq-fallback.sh +0 -0
  235. /package/{templates → packages/cli/templates}/markdownlint.jsonc +0 -0
package/promptfoo.yaml ADDED
@@ -0,0 +1,3270 @@
1
+ # Promptfoo eval config for SAFEWORD guide quality
2
+ # Run with: npm run eval
3
+ # View results: npm run eval:view
4
+
5
+ description: 'SAFEWORD Guide Quality Evals'
6
+
7
+ providers:
8
+ - id: anthropic:messages:claude-haiku-4-20250514
9
+ config:
10
+ temperature: 0
11
+
12
+ prompts:
13
+ - |
14
+ You are an AI coding assistant following the SAFEWORD framework.
15
+
16
+ <context>
17
+ {{context}}
18
+ </context>
19
+
20
+ <user_request>
21
+ {{input}}
22
+ </user_request>
23
+
24
+ Respond following the guidelines in the context.
25
+
26
+ tests:
27
+ # =============================================================================
28
+ # ARCHITECTURE GUIDE (13 tests)
29
+ # =============================================================================
30
+
31
+ - description: 'arch-001: Create comprehensive architecture doc'
32
+ vars:
33
+ input: 'Create an architecture doc for a new React + Supabase project'
34
+ context: |
35
+ file_path: framework/guides/architecture-guide.md (excerpt)
36
+ ---
37
+ ## Required Sections
38
+
39
+ 1. Header (Version, Status)
40
+ 2. Table of Contents
41
+ 3. Overview
42
+ 4. Data Principles
43
+ 5. Data Model
44
+ 6. Components
45
+ 7. Data Flows
46
+ 8. Key Decisions (What/Why/Trade-off/Alternatives)
47
+ 9. Best Practices
48
+ 10. Migration Strategy
49
+ 11. Code References
50
+ assert:
51
+ - type: llm-rubric
52
+ value: |
53
+ The response should create an architecture doc with most required sections.
54
+ EXCELLENT: All 11 sections with What/Why/Trade-off in decisions
55
+ ACCEPTABLE: 9+ sections present
56
+ POOR: <9 sections or missing Key Decisions
57
+
58
+ - description: 'arch-002: Choose Architecture Doc for tech decision'
59
+ vars:
60
+ input: 'I need to document our decision to use PostgreSQL instead of MongoDB'
61
+ context: |
62
+ file_path: framework/SAFEWORD.md (excerpt)
63
+ ---
64
+ ## When to Update Architecture Docs
65
+
66
+ **Update ARCHITECTURE.md when:**
67
+ - Making technology choices (state management, database, frameworks)
68
+ - Designing data models or schemas
69
+ - Establishing project-wide patterns/conventions
70
+
71
+ **Use Design Doc instead when:**
72
+ - Implementing a specific feature
73
+ - Feature-specific technical decisions
74
+
75
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
76
+ assert:
77
+ - type: llm-rubric
78
+ value: |
79
+ The response should recommend creating or updating an Architecture Doc (not a Design Doc).
80
+ EXCELLENT: Correctly identifies Architecture Doc AND explains why (database choice affects whole project)
81
+ ACCEPTABLE: Correctly identifies Architecture Doc
82
+ POOR: Suggests Design Doc or is unclear
83
+
84
+ - description: 'arch-003: Choose Design Doc for feature'
85
+ vars:
86
+ input: 'I need to document how the user profile feature will work'
87
+ context: |
88
+ file_path: framework/SAFEWORD.md (excerpt)
89
+ ---
90
+ ## When to Update Architecture Docs
91
+
92
+ **Update ARCHITECTURE.md when:**
93
+ - Making technology choices (state management, database, frameworks)
94
+ - Designing data models or schemas
95
+ - Establishing project-wide patterns/conventions
96
+
97
+ **Use Design Doc instead when:**
98
+ - Implementing a specific feature
99
+ - Feature-specific technical decisions
100
+
101
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
102
+ assert:
103
+ - type: llm-rubric
104
+ value: |
105
+ The response should recommend creating a Design Doc (not Architecture Doc).
106
+ EXCELLENT: Correctly identifies Design Doc AND explains why (single feature)
107
+ ACCEPTABLE: Correctly identifies Design Doc
108
+ POOR: Suggests Architecture Doc
109
+
110
+ - description: 'arch-004: Document Why, not just What'
111
+ vars:
112
+ input: 'Document our decision to use Redis for caching'
113
+ context: |
114
+ file_path: framework/guides/architecture-guide.md (excerpt)
115
+ ---
116
+ ## Key Decisions Format
117
+
118
+ Every decision must include:
119
+ - **What**: The decision made
120
+ - **Why**: Rationale with specifics (numbers, metrics)
121
+ - **Trade-off**: What we gave up
122
+ - **Alternatives Considered**: Other options evaluated
123
+ assert:
124
+ - type: llm-rubric
125
+ value: |
126
+ The response should include What, Why, Trade-off, and Alternatives.
127
+ EXCELLENT: All 4 fields with specifics (numbers, metrics, concrete alternatives)
128
+ ACCEPTABLE: What/Why/Trade-off present
129
+ POOR: Missing Why or Trade-off
130
+
131
+ - description: 'arch-005: Apply tie-breaker for multi-feature'
132
+ vars:
133
+ input: 'I need to document adding a caching layer that will be used by multiple features'
134
+ context: |
135
+ file_path: framework/SAFEWORD.md (excerpt)
136
+ ---
137
+ **Tie-breaking rule:** If decision affects 2+ features → Architecture doc. If feature-specific only → Design doc.
138
+ assert:
139
+ - type: llm-rubric
140
+ value: |
141
+ The response should choose Architecture Doc (affects 2+ features).
142
+ EXCELLENT: Architecture Doc + cites tie-breaking rule (affects 2+ features)
143
+ ACCEPTABLE: Architecture Doc
144
+ POOR: Design Doc
145
+
146
+ - description: 'arch-006: Include code references'
147
+ vars:
148
+ input: 'Document the authentication flow architecture, including where the code lives'
149
+ context: |
150
+ file_path: framework/guides/architecture-guide.md (excerpt)
151
+ ---
152
+ ## Code References
153
+
154
+ Include paths to implementation:
155
+ - File paths with line ranges when helpful
156
+ - Function/class names
157
+ - Keep references current when code changes
158
+ assert:
159
+ - type: llm-rubric
160
+ value: |
161
+ The response should include code references with file paths.
162
+ EXCELLENT: 2+ code references with file:line format or function names
163
+ ACCEPTABLE: At least 1 file path reference
164
+ POOR: No code references
165
+
166
+ - description: 'arch-007: Consolidate ADRs'
167
+ vars:
168
+ input: 'Our project has 50 ADR files in docs/adr/. What should we do?'
169
+ context: |
170
+ file_path: framework/guides/architecture-guide.md (excerpt)
171
+ ---
172
+ ## Single Architecture Doc
173
+
174
+ Consolidate scattered ADRs into one ARCHITECTURE.md:
175
+ 1. Create ARCHITECTURE.md at project root
176
+ 2. Consolidate active decisions
177
+ 3. Archive old ADRs
178
+ assert:
179
+ - type: llm-rubric
180
+ value: |
181
+ The response should recommend consolidating into single ARCHITECTURE.md.
182
+ EXCELLENT: Recommends consolidation + provides migration steps
183
+ ACCEPTABLE: Recommends consolidation
184
+ POOR: Suggests keeping separate ADRs
185
+
186
+ - description: 'arch-008: Include versioning'
187
+ vars:
188
+ input: 'Create architecture doc header for a new project'
189
+ context: |
190
+ file_path: framework/guides/architecture-guide.md (excerpt)
191
+ ---
192
+ ## Header Format
193
+
194
+ Status values: Design | Production | Proposed | Deprecated
195
+ Version: Major.Minor (bump major for breaking changes)
196
+ assert:
197
+ - type: llm-rubric
198
+ value: |
199
+ The response should include Version and Status in header.
200
+ EXCELLENT: Version + Status using valid values
201
+ ACCEPTABLE: Version and Status present
202
+ POOR: Missing Version or Status
203
+
204
+ - description: 'arch-009: Check for user stories before implementing'
205
+ vars:
206
+ input: 'Implement user authentication for my app'
207
+ context: |
208
+ file_path: framework/SAFEWORD.md (excerpt)
209
+ ---
210
+ ## Feature Development Workflow
211
+
212
+ 1. User Stories - Check if they exist, create if not
213
+ 2. Test Definitions - Check if they exist, create if not
214
+ 3. Design Doc (complex features only)
215
+ 4. Follow STRICT TDD Workflow
216
+ assert:
217
+ - type: llm-rubric
218
+ value: |
219
+ The response should check for user stories/test definitions before implementation.
220
+ EXCELLENT: Checks for user stories + test definitions + offers to create if missing
221
+ ACCEPTABLE: Mentions TDD workflow
222
+ POOR: Jumps straight to implementation
223
+
224
+ - description: 'arch-010: Suggest updating architecture doc after tech change'
225
+ vars:
226
+ input: 'I just added PostgreSQL to our project that was using SQLite'
227
+ context: |
228
+ file_path: framework/SAFEWORD.md (excerpt)
229
+ ---
230
+ ## When to Update Architecture Docs
231
+
232
+ **Update ARCHITECTURE.md when:**
233
+ - Making technology choices (state management, database, frameworks)
234
+ assert:
235
+ - type: llm-rubric
236
+ value: |
237
+ The response should suggest updating architecture doc.
238
+ EXCELLENT: Recommends architecture doc update + explains why (tech choice)
239
+ ACCEPTABLE: Mentions documenting the change
240
+ POOR: No mention of architecture doc
241
+
242
+ - description: "arch-011: Don't suggest update for bug fix"
243
+ vars:
244
+ input: 'I just fixed a bug in the login form validation'
245
+ context: |
246
+ file_path: framework/SAFEWORD.md (excerpt)
247
+ ---
248
+ ## When to Update Architecture Docs
249
+
250
+ **Update ARCHITECTURE.md when:**
251
+ - Making technology choices
252
+ - Designing data models
253
+ - Establishing project-wide patterns
254
+
255
+ **NOT for:** Bug fixes, minor refactors, feature tweaks
256
+ assert:
257
+ - type: llm-rubric
258
+ value: |
259
+ The response should NOT suggest updating architecture doc.
260
+ EXCELLENT: No mention of architecture doc (bug fix doesn't warrant it)
261
+ ACCEPTABLE: Asks if it's architectural, then correctly says no
262
+ POOR: Suggests updating architecture doc
263
+
264
+ - description: 'arch-012: Catch missing rationale anti-pattern'
265
+ vars:
266
+ input: |
267
+ Review this architecture doc section:
268
+ ### State Management
269
+ **What**: Using Zustand for global state
270
+ context: |
271
+ file_path: framework/guides/architecture-guide.md (excerpt)
272
+ ---
273
+ ## Common Mistakes
274
+
275
+ - Missing "Why" in decisions
276
+ - No trade-offs documented
277
+ - Vague rationale without specifics
278
+ assert:
279
+ - type: llm-rubric
280
+ value: |
281
+ The response should identify missing "Why" and "Trade-off".
282
+ EXCELLENT: Identifies missing Why/Trade-off + suggests adding rationale with specifics
283
+ ACCEPTABLE: Notes decision is incomplete
284
+ POOR: Says doc looks fine
285
+
286
+ - description: 'arch-013: Create file in correct location'
287
+ vars:
288
+ input: 'Create a design doc for the payment flow feature'
289
+ context: |
290
+ file_path: framework/SAFEWORD.md (excerpt)
291
+ ---
292
+ ## Planning Documentation Location
293
+
294
+ - Design docs → `.agents/planning/design/`
295
+ assert:
296
+ - type: llm-rubric
297
+ value: |
298
+ The response should create file in .agents/planning/design/.
299
+ EXCELLENT: Creates in `.agents/planning/design/` + follows naming convention
300
+ ACCEPTABLE: Creates in a planning/design directory
301
+ POOR: Creates at root or wrong location
302
+
303
+ - description: 'arch-014: Layer definitions for new project'
304
+ vars:
305
+ input: "I'm starting a new TypeScript project. How should I organize my code into layers?"
306
+ context: |
307
+ file_path: framework/guides/architecture-guide.md (excerpt)
308
+ ---
309
+ ### Layer Definitions
310
+
311
+ | Layer | Directory | Responsibility |
312
+ |-------|-----------|----------------|
313
+ | app | `src/app/` | UI, routing, composition |
314
+ | domain | `src/domain/` | Business rules, pure logic |
315
+ | infra | `src/infra/` | IO, APIs, DB, external SDKs |
316
+ | shared | `src/shared/` | Utilities usable by all layers |
317
+ assert:
318
+ - type: llm-rubric
319
+ value: |
320
+ The response should explain the 4-layer structure.
321
+ EXCELLENT: Lists all 4 layers (app, domain, infra, shared) with directories and responsibilities
322
+ ACCEPTABLE: Lists layers with general descriptions
323
+ POOR: Vague advice or missing layers
324
+
325
+ - description: 'arch-015: Dependency rules - forbidden import'
326
+ vars:
327
+ input: 'Can my domain layer import from the app layer?'
328
+ context: |
329
+ file_path: framework/guides/architecture-guide.md (excerpt)
330
+ ---
331
+ ### Allowed Dependencies
332
+
333
+ | From | To | Allowed | Rationale |
334
+ |------|-----|---------|-----------|
335
+ | domain | app | ❌ | Domain must be framework-agnostic |
336
+ | domain | infra | ❌ | Domain contains pure logic only |
337
+ | domain | shared | ✅ | Utilities available everywhere |
338
+ assert:
339
+ - type: llm-rubric
340
+ value: |
341
+ The response should identify this as a forbidden import.
342
+ EXCELLENT: No - domain cannot import from app, explains rationale (framework-agnostic)
343
+ ACCEPTABLE: Says it's not allowed
344
+ POOR: Says it's allowed or gives ambiguous answer
345
+
346
+ - description: 'arch-016: Edge case - brownfield adoption'
347
+ vars:
348
+ input: 'I have an existing codebase with lots of boundary violations. How do I adopt layer boundaries without breaking everything?'
349
+ context: |
350
+ file_path: framework/guides/architecture-guide.md (excerpt)
351
+ ---
352
+ ### Edge Cases
353
+
354
+ | Scenario | Solution |
355
+ |----------|----------|
356
+ | Brownfield adoption | Start with warnings-only mode, fix violations incrementally, then enforce |
357
+ assert:
358
+ - type: llm-rubric
359
+ value: |
360
+ The response should describe incremental adoption.
361
+ EXCELLENT: 3-step path: warnings-only → fix incrementally → enforce; mentions not breaking existing code
362
+ ACCEPTABLE: Suggests gradual adoption approach
363
+ POOR: Suggests immediate enforcement or ignores existing violations
364
+
365
+ - description: 'arch-017: ESLint boundaries setup'
366
+ vars:
367
+ input: 'How do I set up eslint-plugin-boundaries to enforce my layer rules?'
368
+ context: |
369
+ file_path: framework/guides/architecture-guide.md (excerpt)
370
+ ---
371
+ ### Enforcement with eslint-plugin-boundaries
372
+
373
+ **Setup:**
374
+ 1. Install: `npm install --save-dev eslint-plugin-boundaries`
375
+ 2. Add to `eslint.config.mjs` with boundaries/element-types rules
376
+ 3. Define layers in `ARCHITECTURE.md`
377
+ 4. Errors appear in IDE + CI automatically
378
+ assert:
379
+ - type: llm-rubric
380
+ value: |
381
+ The response should explain ESLint boundaries setup.
382
+ EXCELLENT: Lists install command, config example with element-types rules, mentions IDE + CI integration
383
+ ACCEPTABLE: Provides basic setup steps
384
+ POOR: Vague or missing key configuration
385
+
386
+ - description: 'arch-018: LLM arch review - detect god module'
387
+ vars:
388
+ input: 'Review this file for architectural issues: UserService.ts with 800 lines and 15 dependencies'
389
+ context: |
390
+ file_path: framework/prompts/arch-review.md (excerpt)
391
+ ---
392
+ ## Check for:
393
+
394
+ 1. **Misplaced logic** - Business rules in wrong layer?
395
+ 2. **God module** - Too many responsibilities (>10 dependents or >500 lines)?
396
+ 3. **Leaky abstraction** - Implementation details exposed to callers?
397
+ 4. **Tight coupling** - Changes would cascade unnecessarily?
398
+ 5. **Boundary violation** - Import from disallowed layer?
399
+
400
+ ## Response Format
401
+
402
+ Return JSON with verdict: "clean" | "minor" | "refactor_needed"
403
+ assert:
404
+ - type: llm-rubric
405
+ value: |
406
+ The response should identify god module issue.
407
+ EXCELLENT: Identifies god module (>500 lines, >10 deps), returns refactor_needed verdict with fix suggestion
408
+ ACCEPTABLE: Notes the file is too large
409
+ POOR: Says it's clean or misses the issue
410
+
411
+ - description: 'arch-019: Pre-commit hook behavior'
412
+ vars:
413
+ input: 'What happens when I try to commit code with a boundary violation?'
414
+ context: |
415
+ file_path: framework/scripts/setup-safeword.sh (excerpt)
416
+ ---
417
+ # Pre-commit hook runs:
418
+ # 1. ESLint on staged files (--max-warnings 0 || exit 1)
419
+ # 2. arch-review.sh on staged files
420
+ # - refactor_needed verdict → exit 1 (blocked)
421
+ # - minor verdict → exit 0 (allowed with warning)
422
+ # - clean verdict → exit 0 (allowed)
423
+ assert:
424
+ - type: llm-rubric
425
+ value: |
426
+ The response should explain commit blocking behavior.
427
+ EXCELLENT: Commit blocked by ESLint errors OR refactor_needed; minor issues warn but allow
428
+ ACCEPTABLE: Explains that violations block commit
429
+ POOR: Says commit always succeeds or always fails
430
+
431
+ - description: 'arch-020: CI architecture check workflow'
432
+ vars:
433
+ input: 'How do I set up CI to check for architecture violations in PRs?'
434
+ context: |
435
+ file_path: framework/templates/ci/architecture-check.yml (excerpt)
436
+ ---
437
+ # Architecture Check CI Workflow
438
+ # Steps:
439
+ # 1. Type check (tsc --noEmit)
440
+ # 2. ESLint (includes eslint-plugin-boundaries)
441
+ # 3. LLM architecture review (optional, non-blocking)
442
+ #
443
+ # Usage:
444
+ # 1. Copy to .github/workflows/architecture-check.yml
445
+ # 2. Set ANTHROPIC_API_KEY secret (optional, for LLM review)
446
+ assert:
447
+ - type: llm-rubric
448
+ value: |
449
+ The response should explain CI setup for architecture checks.
450
+ EXCELLENT: Copy template to .github/workflows/, runs tsc + eslint (blocking) + LLM (non-blocking)
451
+ ACCEPTABLE: Mentions copying workflow file and running checks
452
+ POOR: Vague or missing key steps
453
+
454
+ - description: 'arch-021: Architecture template sections'
455
+ vars:
456
+ input: "I'm starting a new project. What sections should my ARCHITECTURE.md have?"
457
+ context: |
458
+ file_path: framework/templates/architecture-template.md (excerpt)
459
+ ---
460
+ ## Table of Contents
461
+
462
+ - [Overview](#overview)
463
+ - [Layers & Boundaries](#layers--boundaries)
464
+ - [Data Model](#data-model)
465
+ - [Key Decisions](#key-decisions)
466
+ - [Best Practices](#best-practices)
467
+ - [Migration Strategy](#migration-strategy)
468
+ assert:
469
+ - type: llm-rubric
470
+ value: |
471
+ The response should list required ARCHITECTURE.md sections.
472
+ EXCELLENT: Lists all 6 sections (Overview, Layers, Data Model, Decisions, Best Practices, Migration)
473
+ ACCEPTABLE: Lists most sections
474
+ POOR: Missing key sections or vague
475
+
476
+ # =============================================================================
477
+ # CODE PHILOSOPHY (14 tests)
478
+ # =============================================================================
479
+
480
+ - description: 'phil-001: JSON summary at end of response'
481
+ vars:
482
+ input: 'Fix the typo in README.md'
483
+ context: |
484
+ file_path: framework/guides/code-philosophy.md (excerpt)
485
+ ---
486
+ ## Response Format
487
+ At the end of EVERY response, include a JSON summary:
488
+ ```json
489
+ {"proposedChanges": boolean, "madeChanges": boolean, "askedQuestion": boolean}
490
+ ```
491
+ assert:
492
+ - type: llm-rubric
493
+ value: |
494
+ The response should end with JSON containing madeChanges: true.
495
+ EXCELLENT: Valid JSON with madeChanges: true, other fields accurate
496
+ ACCEPTABLE: JSON present with correct madeChanges value
497
+ POOR: Missing JSON or madeChanges: false when edit was made
498
+
499
+ - description: 'phil-002: Avoid bloat - push back on unnecessary feature'
500
+ vars:
501
+ input: 'Can you add a configuration file system so users can customize the button colors?'
502
+ context: |
503
+ file_path: framework/guides/code-philosophy.md (excerpt)
504
+ ---
505
+ ## Code Philosophy
506
+ - **AVOID BLOAT** - Simple, focused solutions over complex ones
507
+
508
+ **Bloat examples (avoid these):**
509
+ | ❌ Bloat | ✅ Instead |
510
+ |----------|-----------|
511
+ | Config file for 2 options | Hardcode or simple params |
512
+ | "Future-proofing" unused code paths | Delete, add when needed |
513
+
514
+ **When to push back:** If a feature request would add >50 lines for a "nice to have", ask: "Is this essential now, or can we add it later?"
515
+ assert:
516
+ - type: llm-rubric
517
+ value: |
518
+ The response should push back or question the necessity of a config system for button colors.
519
+ EXCELLENT: Questions necessity, suggests simpler alternative (CSS variables, props), asks if essential now
520
+ ACCEPTABLE: Expresses concern about complexity or suggests simpler approach
521
+ POOR: Immediately implements config file system without questioning
522
+
523
+ - description: 'phil-003: Self-documenting code with clear naming'
524
+ vars:
525
+ input: 'Write a function that calculates the total price with tax'
526
+ context: |
527
+ file_path: framework/guides/code-philosophy.md (excerpt)
528
+ ---
529
+ ## Code Philosophy
530
+ - **Self-documenting code** - Minimal inline comments, clear naming and structure
531
+
532
+ **Naming examples:**
533
+ | ❌ Bad | ✅ Good |
534
+ |--------|---------|
535
+ | `calcTot` | `calculateTotalWithTax` |
536
+ | `d`, `tmp`, `data` | `userProfile`, `pendingOrders` |
537
+ assert:
538
+ - type: llm-rubric
539
+ value: |
540
+ The response should use descriptive function name, no unnecessary comments.
541
+ EXCELLENT: Descriptive name (e.g., `calculateTotalWithTax`) + no comments on obvious code
542
+ ACCEPTABLE: Descriptive name with some redundant comments
543
+ POOR: Abbreviated name (e.g., `calcTot`) or comments explaining obvious code
544
+
545
+ - description: 'phil-004: Explicit error handling'
546
+ vars:
547
+ input: "Here's my code: try { await saveUser(data) } catch (e) { console.log(e) }. Is this okay?"
548
+ context: |
549
+ file_path: framework/guides/code-philosophy.md (excerpt)
550
+ ---
551
+ ## Code Philosophy
552
+ - **Explicit error handling** - NEVER suppress or swallow errors silently
553
+
554
+ **Error handling examples:**
555
+ | ❌ Bad | ✅ Good |
556
+ |--------|---------|
557
+ | `catch (e) {}` (swallowed) | `catch (e) { throw new Error(\`Failed to read ${filePath}: ${e.message}\`) }` |
558
+ | `catch (e) { console.log(e) }` | `catch (e) { logger.error('Payment failed', { userId, amount, error: e }) }` |
559
+ assert:
560
+ - type: llm-rubric
561
+ value: |
562
+ The response should identify the error handling as inadequate.
563
+ EXCELLENT: Identifies console.log as insufficient, suggests proper logging with context or re-throwing
564
+ ACCEPTABLE: Says error handling needs improvement
565
+ POOR: Says the code is fine or doesn't address error handling
566
+
567
+ - description: 'phil-005: Verify documentation before using API'
568
+ vars:
569
+ input: 'Add a feature using the latest React Server Components API'
570
+ context: |
571
+ file_path: framework/guides/code-philosophy.md (excerpt)
572
+ ---
573
+ ## Documentation Verification (CRITICAL)
574
+ - **Always look up current documentation** for libraries, tools, and frameworks
575
+ - **NEVER assume features exist** - Training data is at least 1 year old
576
+
577
+ **How to verify:**
578
+ 1. Check `package.json` for installed version
579
+ 2. Use Context7 MCP or official docs for current API
580
+ assert:
581
+ - type: llm-rubric
582
+ value: |
583
+ The response should verify React version or look up current docs.
584
+ EXCELLENT: Checks package.json for React version OR uses Context7/docs lookup
585
+ ACCEPTABLE: Mentions need to verify version
586
+ POOR: Assumes API exists without verification
587
+
588
+ - description: 'phil-006: TDD workflow - test first'
589
+ vars:
590
+ input: 'Add a function that validates email addresses'
591
+ context: |
592
+ file_path: framework/guides/code-philosophy.md (excerpt)
593
+ ---
594
+ ## Testing Philosophy
595
+
596
+ **Test-Driven Development (TDD):**
597
+ - Write tests BEFORE implementing features (RED → GREEN → REFACTOR)
598
+ - Tests define expected behavior, code makes them pass
599
+ assert:
600
+ - type: llm-rubric
601
+ value: |
602
+ The response should write failing test first, then implement.
603
+ EXCELLENT: Writes test first, runs it (RED), then implements (GREEN)
604
+ ACCEPTABLE: Mentions TDD approach, writes test
605
+ POOR: Implements function without writing test first
606
+
607
+ - description: 'phil-007: Self-testing before completion'
608
+ vars:
609
+ input: 'Fix the login button bug'
610
+ context: |
611
+ file_path: framework/guides/code-philosophy.md (excerpt)
612
+ ---
613
+ ## Testing Philosophy
614
+
615
+ **Always test what you build** - Run tests yourself before completion. Don't ask the user to verify.
616
+ assert:
617
+ - type: llm-rubric
618
+ value: |
619
+ The response should run tests and report results, not ask user to verify.
620
+ EXCELLENT: Runs tests, reports "Tests pass ✓", doesn't ask user to verify
621
+ ACCEPTABLE: Mentions running tests
622
+ POOR: Asks user to test or verify the fix
623
+
624
+ - description: 'phil-008: Debug logging hygiene'
625
+ vars:
626
+ input: 'Debug why this test is failing'
627
+ context: |
628
+ file_path: framework/guides/code-philosophy.md (excerpt)
629
+ ---
630
+ ## Debugging & Troubleshooting
631
+
632
+ **Debug Logging:**
633
+ - When debugging, log **actual vs expected** values
634
+ - Remove debug logging after fixing
635
+ assert:
636
+ - type: llm-rubric
637
+ value: |
638
+ The response should add logs showing actual vs expected, remove after fix.
639
+ EXCELLENT: Logs actual vs expected values, removes debug logs after fix
640
+ ACCEPTABLE: Logs something useful for debugging
641
+ POOR: Leaves debug logs in code after fix
642
+
643
+ - description: 'phil-009: Cross-platform paths'
644
+ vars:
645
+ input: 'Create a function that builds a file path from directory and filename'
646
+ context: |
647
+ file_path: framework/guides/code-philosophy.md (excerpt)
648
+ ---
649
+ ## Cross-Platform Development
650
+ - Never assume Unix-style paths (`/`) - handle both `/` and `\`
651
+
652
+ ```javascript
653
+ // ❌ Bad: dir + '/' + filename
654
+ // ✅ Good: path.join(dir, filename)
655
+ ```
656
+ assert:
657
+ - type: llm-rubric
658
+ value: |
659
+ The response should use path.join() or equivalent, not string concatenation.
660
+ EXCELLENT: Uses path.join() or path.resolve(), no hardcoded separators
661
+ ACCEPTABLE: Mentions cross-platform concerns
662
+ POOR: Uses string concat with hardcoded '/' or '\'
663
+
664
+ - description: 'phil-010: Follow best practices'
665
+ vars:
666
+ input: 'Create a React component for a dropdown menu'
667
+ context: |
668
+ file_path: framework/guides/code-philosophy.md (excerpt)
669
+ ---
670
+ ## Best Practices (Always Apply)
671
+ - **Tool-specific best practices** - Use libraries/frameworks as intended
672
+ - **UX best practices** - Prioritize user experience
673
+ assert:
674
+ - type: llm-rubric
675
+ value: |
676
+ The response should follow React conventions (hooks, controlled components).
677
+ EXCELLENT: Follows React best practices + mentions why (controlled vs uncontrolled)
678
+ ACCEPTABLE: Follows conventions without explicit mention
679
+ POOR: Ignores React conventions (e.g., direct DOM manipulation)
680
+
681
+ - description: 'phil-011: Self-review before completion'
682
+ vars:
683
+ input: "I've implemented the feature"
684
+ context: |
685
+ file_path: framework/guides/code-philosophy.md (excerpt)
686
+ ---
687
+ ## Self-Review Checklist
688
+ Before completing any work, verify:
689
+ - ✓ Is it correct? Will it actually work?
690
+ - ✓ Is it elegant? Does it avoid bloat?
691
+ - ✓ Does it follow best practices?
692
+ - ✓ Are you using the right docs/versions?
693
+ - ✓ Have you tested the user-facing functionality?
694
+ assert:
695
+ - type: llm-rubric
696
+ value: |
697
+ The response should run self-review checklist before declaring done.
698
+ EXCELLENT: Explicitly runs through checklist items, mentions test results
699
+ ACCEPTABLE: Mentions verification before completion
700
+ POOR: Declares done without any self-review
701
+
702
+ - description: 'phil-012: Question-asking protocol'
703
+ vars:
704
+ input: 'How should I structure the database schema?'
705
+ context: |
706
+ file_path: framework/guides/code-philosophy.md (excerpt)
707
+ ---
708
+ ## Asking Questions
709
+ - Only ask questions when you genuinely can't find the answer
710
+ - **When asking, show what you tried:** "I checked X and Y but couldn't determine Z. What's your preference?"
711
+ assert:
712
+ - type: llm-rubric
713
+ value: |
714
+ The response should ask after showing research attempt, focus on domain preferences.
715
+ EXCELLENT: Shows what was researched + asks domain-specific question
716
+ ACCEPTABLE: Asks relevant question about domain preferences
717
+ POOR: Asks without showing any research attempt
718
+
719
+ - description: 'phil-013: Tooling currency'
720
+ vars:
721
+ input: "I'm about to start a new project. Should I update my CLI tools?"
722
+ context: |
723
+ file_path: framework/guides/code-philosophy.md (excerpt)
724
+ ---
725
+ ## Tools & CLIs
726
+
727
+ **Keep these updated** (check before starting new projects):
728
+ - GitHub CLI (`gh`)
729
+ - AWS CLI
730
+ - Railway CLI
731
+ - PostHog CLI
732
+
733
+ **Update workflow:**
734
+ 1. Check current version: `gh --version`, `aws --version`, etc.
735
+ 2. Check for updates
736
+ 3. Review changelog for breaking changes before major version updates
737
+ 4. If breaking changes affect your workflow, pin to current version
738
+ assert:
739
+ - type: llm-rubric
740
+ value: |
741
+ The response should recommend checking/updating critical CLIs with workflow.
742
+ EXCELLENT: Recommends checking versions, lists critical CLIs, mentions breaking changes review, version pinning
743
+ ACCEPTABLE: Suggests updating tools before starting
744
+ POOR: Ignores tooling currency or says "no need to update"
745
+
746
+ - description: 'phil-014: Git workflow - atomic commits'
747
+ vars:
748
+ input: 'Fix the login bug and add a new feature (two separate tasks)'
749
+ context: |
750
+ file_path: framework/guides/code-philosophy.md (excerpt)
751
+ ---
752
+ ## Git Workflow
753
+ - Commit often to checkpoint progress
754
+ - Make atomic commits (one logical change per commit)
755
+
756
+ ```
757
+ # ❌ Bad: "misc fixes"
758
+ # ✅ Good: "fix: login button not responding to clicks"
759
+ ```
760
+ assert:
761
+ - type: llm-rubric
762
+ value: |
763
+ The response should make separate commits for each task.
764
+ EXCELLENT: Separate atomic commits with descriptive messages for each task
765
+ ACCEPTABLE: Commits with clear messages
766
+ POOR: Single commit for unrelated changes or vague message like "misc fixes"
767
+
768
+ # =============================================================================
769
+ # TESTING METHODOLOGY (13 tests)
770
+ # =============================================================================
771
+
772
+ - description: 'test-001: Choose fastest effective test type'
773
+ vars:
774
+ input: 'I need to test a pure function that calculates tax. What test type should I use?'
775
+ context: |
776
+ file_path: framework/guides/testing-methodology.md (excerpt)
777
+ ---
778
+ ## Testing Principles
779
+
780
+ **Optimization rule:** Test with the fastest test type that can catch the bug.
781
+
782
+ ### Test Speed Hierarchy (Fast → Slow)
783
+ ```
784
+ Unit (milliseconds) ← Pure functions, no I/O
785
+ Integration (seconds) ← Components with dependencies
786
+ E2E (10+ seconds) ← Full user flows
787
+ ```
788
+
789
+ **Decision tree:**
790
+ 1. Pure function with no I/O? → Unit test
791
+ 2. Component with database/API? → Integration test
792
+ 3. Full user flow? → E2E test
793
+ assert:
794
+ - type: llm-rubric
795
+ value: |
796
+ The response should recommend unit tests for a pure function.
797
+ EXCELLENT: Recommends unit test AND explains why (pure function, fastest, no I/O)
798
+ ACCEPTABLE: Recommends unit test
799
+ POOR: Recommends integration or E2E test
800
+
801
+ - description: 'test-002: Component vs flow testing'
802
+ vars:
803
+ input: 'I want to test a React header component. Should I use E2E or integration tests?'
804
+ context: |
805
+ file_path: framework/guides/testing-methodology.md (excerpt)
806
+ ---
807
+ ## Test Type Selection
808
+
809
+ - **Integration test**: Single component behavior, interactions
810
+ - **E2E test**: Multi-page flows, critical user journeys
811
+ assert:
812
+ - type: llm-rubric
813
+ value: |
814
+ The response should recommend integration test for component.
815
+ EXCELLENT: Integration test for component behavior, E2E only for multi-page flows
816
+ ACCEPTABLE: Distinguishes component vs flow
817
+ POOR: Suggests E2E for single component
818
+
819
+ - description: 'test-003: Identify inverted test pyramid'
820
+ vars:
821
+ input: 'I have 50 E2E tests and 20 integration tests. Is this a good ratio?'
822
+ context: |
823
+ file_path: framework/guides/testing-methodology.md (excerpt)
824
+ ---
825
+ ## Test Distribution
826
+
827
+ **Red flag:** More E2E than integration/unit tests = slow feedback loop
828
+
829
+ **Target:** Most tests should be fast (unit/integration)
830
+ assert:
831
+ - type: llm-rubric
832
+ value: |
833
+ The response should identify the red flag of an inverted test pyramid.
834
+ EXCELLENT: Red flag - more E2E than integration is too slow, suggests adding integration tests
835
+ ACCEPTABLE: Notes ratio concern
836
+ POOR: Accepts inverted ratio
837
+
838
+ - description: 'test-004: TDD RED phase - test must fail first'
839
+ vars:
840
+ input: "I wrote a test and it's passing. Should I implement the code now?"
841
+ context: |
842
+ file_path: framework/guides/testing-methodology.md (excerpt)
843
+ ---
844
+ ## TDD Phases
845
+
846
+ **RED:** Write failing test first
847
+ - Test MUST fail before implementation
848
+ - Verify failure message is meaningful
849
+ assert:
850
+ - type: llm-rubric
851
+ value: |
852
+ The response should identify the TDD violation - the test must fail first.
853
+ EXCELLENT: RED phase violation - test must fail first, verify failure before implementation
854
+ ACCEPTABLE: Notes test should fail first
855
+ POOR: Accepts passing test before implementation
856
+
857
+ - description: 'test-005: Decision tree for AI quality testing'
858
+ vars:
859
+ input: 'I need to test narrative quality from my AI. What test type should I use?'
860
+ context: |
861
+ file_path: framework/guides/testing-methodology.md (excerpt)
862
+ ---
863
+ ## Test Type Decision Tree
864
+
865
+ 1. Testing AI content quality? → LLM Evaluation
866
+ 2. Pure function? → Unit test
867
+ 3. Component with dependencies? → Integration test
868
+ 4. Full user flow? → E2E test
869
+ assert:
870
+ - type: llm-rubric
871
+ value: |
872
+ The response should use the decision tree and select LLM Evaluation.
873
+ EXCELLENT: Question 1 → AI content quality → LLM Evaluation
874
+ ACCEPTABLE: Selects LLM Eval
875
+ POOR: Suggests unit or E2E for AI quality
876
+
877
+ - description: 'test-006: CSS bug requires E2E'
878
+ vars:
879
+ input: 'I have a CSS layout bug. What test type should I use?'
880
+ context: |
881
+ file_path: framework/guides/testing-methodology.md (excerpt)
882
+ ---
883
+ ## Bug-to-Test Mapping
884
+
885
+ | Bug Type | Test Type |
886
+ |----------|-----------|
887
+ | CSS/Layout | E2E (requires real browser) |
888
+ | Business logic | Unit |
889
+ | API integration | Integration |
890
+ assert:
891
+ - type: llm-rubric
892
+ value: |
893
+ The response should map CSS to E2E.
894
+ EXCELLENT: E2E (requires real browser for CSS), references lookup table
895
+ ACCEPTABLE: Selects E2E
896
+ POOR: Suggests unit test for CSS
897
+
898
+ - description: 'test-007: E2E port isolation'
899
+ vars:
900
+ input: 'My E2E tests keep failing because they conflict with my dev server. How do I fix this?'
901
+ context: |
902
+ file_path: framework/guides/testing-methodology.md (excerpt)
903
+ ---
904
+ ## E2E Dev/Test Server Isolation
905
+
906
+ - Dev server: stable port (e.g., 3000)
907
+ - Test server: devPort + 1000 (e.g., 4000)
908
+ - Configure Playwright with isolated port
909
+ assert:
910
+ - type: llm-rubric
911
+ value: |
912
+ The response should suggest port isolation.
913
+ EXCELLENT: Dev on stable port, tests on devPort+1000, Playwright config with isolated port
914
+ ACCEPTABLE: Suggests separate ports
915
+ POOR: No isolation guidance
916
+
917
+ - description: 'test-008: LLM-as-judge for creative outputs'
918
+ vars:
919
+ input: "Should I use keyword matching to test if my AI response has a 'collaborative tone'?"
920
+ context: |
921
+ file_path: framework/guides/testing-methodology.md (excerpt)
922
+ ---
923
+ ## LLM Evaluations
924
+
925
+ For creative/qualitative outputs, use LLM-as-judge with rubric:
926
+ - EXCELLENT: [criteria]
927
+ - ACCEPTABLE: [criteria]
928
+ - POOR: [criteria]
929
+
930
+ **Avoid:** Brittle keyword matching for creative content
931
+ assert:
932
+ - type: llm-rubric
933
+ value: |
934
+ The response should recommend LLM-as-judge.
935
+ EXCELLENT: LLM-as-judge with rubric, avoid brittle keywords for creative outputs
936
+ ACCEPTABLE: Suggests rubric-based evaluation
937
+ POOR: Accepts keyword matching
938
+
939
+ - description: 'test-009: Cost controls for evals'
940
+ vars:
941
+ input: 'My LLM evals are getting expensive. How can I reduce costs?'
942
+ context: |
943
+ file_path: framework/guides/testing-methodology.md (excerpt)
944
+ ---
945
+ ## Cost Controls for Evals
946
+
947
+ - Cache static prompts
948
+ - Batch scenarios
949
+ - Schedule full evals (PR/weekly, not every commit)
950
+ assert:
951
+ - type: llm-rubric
952
+ value: |
953
+ The response should provide cost reduction strategies.
954
+ EXCELLENT: Cache static prompts, batch scenarios, schedule full evals (PR/weekly)
955
+ ACCEPTABLE: Mentions caching
956
+ POOR: No cost guidance
957
+
958
+ - description: 'test-010: Coverage goals'
959
+ vars:
960
+ input: 'What should I aim for in test coverage?'
961
+ context: |
962
+ file_path: framework/guides/testing-methodology.md (excerpt)
963
+ ---
964
+ ## Coverage Goals
965
+
966
+ - Unit: 80%+ for pure functions
967
+ - E2E: Critical multi-page flows
968
+ - "Critical" = user-facing, revenue-impacting, or data-integrity
969
+ assert:
970
+ - type: llm-rubric
971
+ value: |
972
+ The response should provide coverage guidance.
973
+ EXCELLENT: Unit 80%+ for pure functions, E2E for critical multi-page flows, defines "critical"
974
+ ACCEPTABLE: Provides coverage targets
975
+ POOR: Generic "100% coverage" advice
976
+
977
+ - description: 'test-011: Flaky test diagnosis'
978
+ vars:
979
+ input: 'My tests keep failing randomly. What should I check?'
980
+ context: |
981
+ file_path: framework/guides/testing-methodology.md (excerpt)
982
+ ---
983
+ ## Test Quality Practices
984
+
985
+ **Flakiness causes:**
986
+ - Async issues (use polling, not arbitrary timeouts)
987
+ - Test interdependence (each test should be independent)
988
+ - Shared state (use AAA pattern: Arrange-Act-Assert)
989
+ assert:
990
+ - type: llm-rubric
991
+ value: |
992
+ The response should identify flakiness causes.
993
+ EXCELLENT: Check async (polling vs arbitrary timeouts), independent tests, AAA pattern
994
+ ACCEPTABLE: Mentions async issues
995
+ POOR: Suggests skipping flaky tests
996
+
997
+ - description: 'test-012: CI/CD testing cadence'
998
+ vars:
999
+ input: 'When should I run different test types in CI?'
1000
+ context: |
1001
+ file_path: framework/guides/testing-methodology.md (excerpt)
1002
+ ---
1003
+ ## CI/CD Testing Cadence
1004
+
1005
+ - Unit + integration: every commit
1006
+ - E2E: on PR
1007
+ - LLM evals: scheduled (weekly or per-PR)
1008
+ assert:
1009
+ - type: llm-rubric
1010
+ value: |
1011
+ The response should provide CI cadence.
1012
+ EXCELLENT: Unit+integration every commit, E2E on PR, evals on schedule
1013
+ ACCEPTABLE: Distinguishes cadence by test type
1014
+ POOR: Run all tests on every commit
1015
+
1016
+ - description: 'test-014: Refuse to skip failing test'
1017
+ vars:
1018
+ input: 'The test is failing. Can I add .skip() to it so I can continue working?'
1019
+ context: |
1020
+ file_path: framework/guides/testing-methodology.md (excerpt)
1021
+ ---
1022
+ ## Test Integrity (CRITICAL)
1023
+
1024
+ **NEVER modify, skip, or delete tests without explicit human approval.**
1025
+
1026
+ Tests are the specification. When a test fails, the implementation is wrong—not the test.
1027
+
1028
+ ### Forbidden Actions (Require Approval)
1029
+ | Action | Why It's Forbidden |
1030
+ |--------|-------------------|
1031
+ | Adding `.skip()`, `.only()`, `xit()`, `.todo()` | Makes failures invisible |
1032
+ | Deleting tests you can't get passing | Removes coverage for edge cases |
1033
+
1034
+ ### What To Do Instead
1035
+ 1. **Test fails?** → Fix the implementation, not the test
1036
+ 2. **Test seems wrong?** → Explain why and ask before changing
1037
+ assert:
1038
+ - type: llm-rubric
1039
+ value: |
1040
+ The response should refuse to add .skip() and explain why.
1041
+ EXCELLENT: Refuses, explains it's forbidden (hides bugs), offers to fix implementation instead
1042
+ ACCEPTABLE: Says no to skipping, suggests fixing the code
1043
+ POOR: Agrees to add .skip() or doesn't address the concern
1044
+
1045
+ # =============================================================================
1046
+ # ZOMBIE PROCESS CLEANUP (7 tests)
1047
+ # =============================================================================
1048
+
1049
+ - description: 'zombie-001: Port-based cleanup for EADDRINUSE'
1050
+ vars:
1051
+ input: "I'm getting 'Error: listen EADDRINUSE: address already in use :::3000'. How do I fix it?"
1052
+ context: |
1053
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1054
+ ---
1055
+ ## Port-Based Cleanup (Safest for Multi-Project)
1056
+
1057
+ **When to use:** `EADDRINUSE`, `address already in use`, dev server won't start
1058
+
1059
+ **Port convention:** Dev and test instances use different ports:
1060
+ - **Dev port**: Project's configured port (e.g., 3000)
1061
+ - **Test port**: Dev port + 1000 (e.g., 4000)
1062
+
1063
+ **Decision rule:** If unsure which cleanup method to use → port-based first (safest), then project script, then tmux.
1064
+
1065
+ ```bash
1066
+ # Kill both dev server AND test server ports
1067
+ lsof -ti:3000 -ti:4000 | xargs kill -9 2>/dev/null
1068
+ ```
1069
+ assert:
1070
+ - type: llm-rubric
1071
+ value: |
1072
+ The response should provide port-based cleanup commands.
1073
+ EXCELLENT: `lsof -ti:3000 -ti:4000 | xargs kill -9` (both dev and test ports), explains why port-based is safe
1074
+ ACCEPTABLE: Provides kill command for at least dev port
1075
+ POOR: Suggests `killall node` or restarting computer
1076
+
1077
+ - description: 'zombie-002: Create cleanup script'
1078
+ vars:
1079
+ input: 'I need to clean up processes frequently. Should I create a script?'
1080
+ context: |
1081
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1082
+ ---
1083
+ ## Project-Specific Cleanup Script
1084
+
1085
+ Create `scripts/cleanup.sh`:
1086
+ ```bash
1087
+ DEV_PORT=3000
1088
+ TEST_PORT=$((DEV_PORT + 1000))
1089
+ PROJECT_DIR="$(pwd)"
1090
+
1091
+ lsof -ti:$DEV_PORT -ti:$TEST_PORT | xargs kill -9 2>/dev/null
1092
+ ```
1093
+ assert:
1094
+ - type: llm-rubric
1095
+ value: |
1096
+ The response should recommend a cleanup script.
1097
+ EXCELLENT: Yes, create scripts/cleanup.sh with DEV_PORT, TEST_PORT (dev+1000), and PROJECT_DIR variables
1098
+ ACCEPTABLE: Suggests creating script
1099
+ POOR: No script guidance
1100
+
1101
+ - description: 'zombie-003: Unique port assignment'
1102
+ vars:
1103
+ input: "I'm working on multiple projects. How do I avoid port conflicts?"
1104
+ context: |
1105
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1106
+ ---
1107
+ ## Best Practices
1108
+
1109
+ 1. **Assign unique ports** - Set `PORT=3000` in one project, `PORT=3001` in another
1110
+ assert:
1111
+ - type: llm-rubric
1112
+ value: |
1113
+ The response should recommend unique ports.
1114
+ EXCELLENT: Assign unique PORT per project (3000, 3001), document in README/env
1115
+ ACCEPTABLE: Suggests unique ports
1116
+ POOR: No port guidance
1117
+
1118
+ - description: 'zombie-004: tmux isolation'
1119
+ vars:
1120
+ input: 'Is there a way to isolate terminal sessions per project?'
1121
+ context: |
1122
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1123
+ ---
1124
+ ## Alternative: tmux/Screen Sessions
1125
+
1126
+ ```bash
1127
+ tmux new -s project-name
1128
+ tmux kill-session -t project-name
1129
+ ```
1130
+
1131
+ **Pros:** Complete isolation, one command kills everything
1132
+ **Cons:** Requires learning tmux
1133
+ assert:
1134
+ - type: llm-rubric
1135
+ value: |
1136
+ The response should suggest tmux/screen.
1137
+ EXCELLENT: Named tmux session per project, one command kills session, notes learning curve
1138
+ ACCEPTABLE: Suggests terminal isolation
1139
+ POOR: No isolation guidance
1140
+
1141
+ - description: 'zombie-005: Debugging zombie processes'
1142
+ vars:
1143
+ input: 'How do I find which processes are stuck?'
1144
+ context: |
1145
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1146
+ ---
1147
+ ## Debugging Zombie Processes
1148
+
1149
+ ### Find What's Using a Port
1150
+ ```bash
1151
+ lsof -i:3000
1152
+ ```
1153
+
1154
+ ### Find Processes by Project Directory
1155
+ ```bash
1156
+ ps aux | grep "/Users/alex/projects/my-project"
1157
+ ```
1158
+ assert:
1159
+ - type: llm-rubric
1160
+ value: |
1161
+ The response should provide debugging commands.
1162
+ EXCELLENT: Find by port, by process type, by project dir with $(pwd) pattern
1163
+ ACCEPTABLE: Provides find commands
1164
+ POOR: Generic advice
1165
+
1166
+ - description: 'zombie-006: Best practices'
1167
+ vars:
1168
+ input: 'What are the best practices for avoiding cross-project process kills?'
1169
+ context: |
1170
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1171
+ ---
1172
+ ## Best Practices
1173
+
1174
+ 1. **Assign unique ports** - Set `PORT=3000` in one project, `PORT=3001` in another
1175
+ 2. **Use port-based cleanup first** - Simplest and safest
1176
+ 3. **Create project cleanup scripts** - Reusable, documented
1177
+ 4. **Never `killall node`** - Too broad when working on multiple projects
1178
+ 5. **Clean up before starting** - Run cleanup script before `npm run dev`
1179
+ assert:
1180
+ - type: llm-rubric
1181
+ value: |
1182
+ The response should provide best practices.
1183
+ EXCELLENT: Unique ports, port-based cleanup first, cleanup scripts, clean before start
1184
+ ACCEPTABLE: Lists some practices
1185
+ POOR: No best practices
1186
+
1187
+ - description: 'zombie-007: Quick reference'
1188
+ vars:
1189
+ input: 'Give me a quick reference for safe cleanup commands.'
1190
+ context: |
1191
+ file_path: framework/guides/zombie-process-cleanup.md (excerpt)
1192
+ ---
1193
+ ## Quick Reference
1194
+
1195
+ | Situation | Command |
1196
+ |-----------|---------|
1197
+ | Kill dev + test servers | `lsof -ti:$DEV_PORT -ti:$TEST_PORT \| xargs kill -9` |
1198
+ | Kill Playwright (this project) | `pkill -f "playwright.*$(pwd)"` |
1199
+ | Kill all for this project | `./scripts/cleanup.sh` |
1200
+
1201
+ ## What NOT to Do
1202
+
1203
+ ❌ `killall node` (kills all projects)
1204
+ ❌ `pkill -9 node` (kills all projects)
1205
+ assert:
1206
+ - type: llm-rubric
1207
+ value: |
1208
+ The response should provide a quick reference.
1209
+ EXCELLENT: Kill by both dev+test ports, kill playwright for project, warn against global kills
1210
+ ACCEPTABLE: Provides commands
1211
+ POOR: Suggests dangerous global kills
1212
+
1213
+ # =============================================================================
1214
+ # USER STORY GUIDE (10 tests)
1215
+ # =============================================================================
1216
+
1217
+ - description: 'story-001: Use standard template'
1218
+ vars:
1219
+ input: 'I need to create user stories for a new feature. Where do I start?'
1220
+ context: |
1221
+ file_path: framework/guides/user-story-guide.md (excerpt)
1222
+ ---
1223
+ ## Template Location
1224
+
1225
+ Use `user-stories-template.md` from `.safeword/templates/`
1226
+
1227
+ ## Workflow
1228
+ 1. Fill in feature name
1229
+ 2. Create numbered stories
1230
+ 3. Add acceptance criteria (1-5 per story)
1231
+ 4. Include out-of-scope section
1232
+ assert:
1233
+ - type: llm-rubric
1234
+ value: |
1235
+ The response should point to the template and workflow.
1236
+ EXCELLENT: Points to template, lists workflow steps
1237
+ ACCEPTABLE: Points to template
1238
+ POOR: No template reference
1239
+
1240
+ - description: 'story-002: Include tracking metadata'
1241
+ vars:
1242
+ input: 'What metadata should I include in my user stories?'
1243
+ context: |
1244
+ file_path: framework/guides/user-story-guide.md (excerpt)
1245
+ ---
1246
+ ## Tracking Metadata
1247
+
1248
+ - Status (✅/❌)
1249
+ - Test file references
1250
+ - Completion percentage
1251
+ - Phase tracking
1252
+ - Next steps
1253
+ assert:
1254
+ - type: llm-rubric
1255
+ value: |
1256
+ The response should list required metadata.
1257
+ EXCELLENT: Status, test file refs, completion %, phase tracking, next steps
1258
+ ACCEPTABLE: Lists most metadata
1259
+ POOR: No metadata guidance
1260
+
1261
+ - description: 'story-003: INVEST validation'
1262
+ vars:
1263
+ input: "Is this a good user story? 'As a user, I want the system to be fast'"
1264
+ context: |
1265
+ file_path: framework/guides/user-story-guide.md (excerpt)
1266
+ ---
1267
+ ## INVEST Validation
1268
+
1269
+ Every story must pass INVEST:
1270
+
1271
+ | Criterion | Question | Red Flag |
1272
+ |-----------|----------|----------|
1273
+ | Independent | Can it be built alone? | "After X is done..." |
1274
+ | Negotiable | Is scope flexible? | Rigid technical specs |
1275
+ | Valuable | Does user care? | Pure refactoring |
1276
+ | Estimable | Can we size it? | "Make it fast" |
1277
+ | Small | 1-3 days work? | Epic-sized |
1278
+ | Testable | Can we verify done? | "Improve UX" |
1279
+
1280
+ **Red flag phrases:** "fast", "better", "improved", "enhanced" without metrics
1281
+ assert:
1282
+ - type: llm-rubric
1283
+ value: |
1284
+ The response should identify the story as failing INVEST criteria.
1285
+ EXCELLENT: Identifies failures (not Estimable - "fast" is vague, not Testable - no metric), suggests improvement
1286
+ ACCEPTABLE: Says story needs work, mentions vagueness
1287
+ POOR: Says the story is fine
1288
+
1289
+ - description: 'story-004: Good acceptance criteria'
1290
+ vars:
1291
+ input: "My acceptance criterion says 'Campaign switching works'. Is this good?"
1292
+ context: |
1293
+ file_path: framework/guides/user-story-guide.md (excerpt)
1294
+ ---
1295
+ ## Acceptance Criteria
1296
+
1297
+ **BAD:** "Campaign switching works" (too vague)
1298
+ **GOOD:** "Response time <200ms when switching campaigns"
1299
+ assert:
1300
+ - type: llm-rubric
1301
+ value: |
1302
+ The response should identify vague AC.
1303
+ EXCELLENT: Identifies as BAD (too vague), suggests specific measurable AC
1304
+ ACCEPTABLE: Notes it's too vague
1305
+ POOR: Accepts vague AC
1306
+
1307
+ - description: 'story-005: Size guidelines - split large story'
1308
+ vars:
1309
+ input: 'I have a user story with 8 acceptance criteria and touches 3 different user personas. Is this okay?'
1310
+ context: |
1311
+ file_path: framework/guides/user-story-guide.md (excerpt)
1312
+ ---
1313
+ ## Size Guidelines
1314
+
1315
+ | Indicator | Small (Good) | Medium (Consider Split) | Large (Must Split) |
1316
+ |-----------|--------------|------------------------|-------------------|
1317
+ | Acceptance Criteria | 3-5 | 6-8 | 9+ |
1318
+ | Personas Affected | 1 | 2 | 3+ |
1319
+ | Estimated Days | 1-3 | 4-5 | 6+ |
1320
+
1321
+ **Decision rule:** When borderline, err on the side of splitting.
1322
+ assert:
1323
+ - type: llm-rubric
1324
+ value: |
1325
+ The response should recommend splitting the story.
1326
+ EXCELLENT: Recommends splitting, cites both indicators (8 AC = Medium/Large, 3 personas = Large), suggests how to split
1327
+ ACCEPTABLE: Recommends splitting or expresses concern about size
1328
+ POOR: Says the story size is fine
1329
+
1330
+ - description: 'story-006: Good story example'
1331
+ vars:
1332
+ input: 'Can you show me what a good user story looks like?'
1333
+ context: |
1334
+ file_path: framework/guides/user-story-guide.md (excerpt)
1335
+ ---
1336
+ ## Good Story Example
1337
+
1338
+ **As a** campaign manager
1339
+ **I want** to switch between campaigns with keyboard shortcuts
1340
+ **So that** I can work faster without using the mouse
1341
+
1342
+ **Acceptance Criteria:**
1343
+ - [ ] Cmd+1/2/3 switches to campaign 1/2/3
1344
+ - [ ] Response time <200ms
1345
+ - [ ] Visual feedback on switch
1346
+
1347
+ **Out of Scope:**
1348
+ - Customizable shortcuts (future)
1349
+ assert:
1350
+ - type: llm-rubric
1351
+ value: |
1352
+        The response should provide a concrete example.
1353
+ EXCELLENT: Shows complete example with As a/I want/So that, 1-5 specific AC, out-of-scope
1354
+ ACCEPTABLE: Shows basic structure
1355
+ POOR: Vague or incomplete example
1356
+
1357
+ - description: 'story-007: Conversation not contract'
1358
+ vars:
1359
+ input: 'Should I include all implementation details in my user story?'
1360
+ context: |
1361
+ file_path: framework/guides/user-story-guide.md (excerpt)
1362
+ ---
1363
+ ## Conversation, Not Contract
1364
+
1365
+ Stories are conversation starters, not rigid specs.
1366
+ - Avoid implementation details
1367
+ - Link to mockups/designs instead
1368
+ - Keep focus on user value
1369
+ assert:
1370
+ - type: llm-rubric
1371
+ value: |
1372
+ The response should advise against implementation details.
1373
+ EXCELLENT: No - stories are conversation starters, avoid implementation details, link to mockups
1374
+ ACCEPTABLE: Advises against implementation details
1375
+ POOR: Suggests including implementation details
1376
+
1377
+ - description: 'story-008: LLM-optimized wording'
1378
+ vars:
1379
+ input: 'How do I write user stories that AI agents can follow?'
1380
+ context: |
1381
+ file_path: framework/guides/user-story-guide.md (excerpt)
1382
+ ---
1383
+ ## LLM-Optimized Wording
1384
+
1385
+ - Specific concrete language
1386
+ - Numbers over vague words
1387
+ - Explicit definitions
1388
+ - Examples over abstract rules
1389
+ assert:
1390
+ - type: llm-rubric
1391
+ value: |
1392
+ The response should provide LLM optimization guidance.
1393
+ EXCELLENT: Specific concrete language, numbers, explicit definitions, examples over rules
1394
+ ACCEPTABLE: Mentions clarity principles
1395
+ POOR: No LLM-specific guidance
1396
+
1397
+ - description: 'story-009: Token efficiency'
1398
+ vars:
1399
+ input: 'How long should my user story template be?'
1400
+ context: |
1401
+ file_path: framework/guides/user-story-guide.md (excerpt)
1402
+ ---
1403
+ ## Token Efficiency
1404
+
1405
+ Keep stories lean (~9 lines) to minimize prompting cost.
1406
+ assert:
1407
+ - type: llm-rubric
1408
+ value: |
1409
+ The response should provide size guidance.
1410
+ EXCELLENT: Keep lean (~9 lines), minimize overhead for prompting cost
1411
+ ACCEPTABLE: Suggests keeping it concise
1412
+ POOR: No size guidance
1413
+
1414
+ - description: 'story-010: Technical task vs user story'
1415
+ vars:
1416
+ input: "I want to write a user story: 'As a developer, I want to refactor the database layer'"
1417
+ context: |
1418
+ file_path: framework/guides/user-story-guide.md (excerpt)
1419
+ ---
1420
+ ## Technical Tasks vs User Stories
1421
+
1422
+ User stories must deliver user value.
1423
+
1424
+ **NOT a user story:**
1425
+ - "As a developer, I want to refactor..."
1426
+ - "As a developer, I want to upgrade..."
1427
+
1428
+ **Instead:** Create a spike or technical task.
1429
+ assert:
1430
+ - type: llm-rubric
1431
+ value: |
1432
+        The response should identify this as a technical task.
1433
+ EXCELLENT: This is a technical task/spike, not a user story - no user value
1434
+ ACCEPTABLE: Notes it lacks user value
1435
+ POOR: Accepts technical task as user story
1436
+
1437
+ - description: 'story-011: Technical constraints in user stories'
1438
+ vars:
1439
+ input: 'Create user stories for a new payment feature'
1440
+ context: |
1441
+ file_path: framework/templates/user-stories-template.md (excerpt)
1442
+ ---
1443
+ ## Technical Constraints
1444
+
1445
+ _Non-functional requirements that inform test definitions. Delete sections that don't apply._
1446
+
1447
+ ### Performance
1448
+ - [ ] [e.g., Response time < 200ms at P95]
1449
+
1450
+ ### Security
1451
+ - [ ] [e.g., All inputs validated/sanitized]
1452
+
1453
+ ### Compatibility
1454
+ - [ ] [e.g., Chrome 100+, Safari 16+]
1455
+
1456
+ ### Data
1457
+ - [ ] [e.g., GDPR: user data deletable within 72h]
1458
+
1459
+ ### Dependencies
1460
+ - [ ] [e.g., Must use existing AuthService]
1461
+
1462
+ ### Infrastructure
1463
+ - [ ] [e.g., Memory usage < 512MB]
1464
+ assert:
1465
+ - type: llm-rubric
1466
+ value: |
1467
+        The response should include a Technical Constraints section.
1468
+ EXCELLENT: Includes Technical Constraints with specific, testable constraints in relevant categories (Performance, Security, etc.), deletes unused categories
1469
+ ACCEPTABLE: Includes Technical Constraints section with some constraints
1470
+ POOR: Missing Technical Constraints section or only vague constraints
1471
+
1472
+ - description: 'story-012: Constraint guidance - good vs bad'
1473
+ vars:
1474
+ input: "I'm adding a constraint 'Should be fast'. Is this good?"
1475
+ context: |
1476
+ file_path: framework/guides/user-story-guide.md (excerpt)
1477
+ ---
1478
+ ### ✅ GOOD Constraints (Specific, Testable)
1479
+
1480
+ - [ ] API response < 200ms at P95 under 100 concurrent users
1481
+ - [ ] Initial page load < 3s on simulated 3G
1482
+
1483
+ ### ❌ BAD Constraints (Vague, Untestable)
1484
+
1485
+ - [ ] Should be fast ← How fast? Under what conditions?
1486
+ - [ ] Good performance ← Not measurable
1487
+ assert:
1488
+ - type: llm-rubric
1489
+ value: |
1490
+ The response should identify this as a BAD constraint.
1491
+ EXCELLENT: Identifies as BAD (vague, untestable), suggests specific alternative like "< 200ms at P95"
1492
+ ACCEPTABLE: Notes it's too vague, suggests adding metrics
1493
+ POOR: Accepts "should be fast" as valid constraint
1494
+
1495
+ - description: 'story-013: Workflow prompts for missing constraints'
1496
+ vars:
1497
+ input: "I have user stories but they're missing Technical Constraints. What should I do?"
1498
+ context: |
1499
+ file_path: framework/SAFEWORD.md (excerpt)
1500
+ ---
1501
+ **Edge cases:**
1502
+
1503
+ - User stories exist but test definitions don't → Create test definitions before implementation
1504
+ - User stories missing Technical Constraints → Add constraints before test definitions
1505
+ - Test definitions exist but user stories don't → Ask if user stories needed
1506
+ assert:
1507
+ - type: llm-rubric
1508
+ value: |
1509
+ The response should follow the edge case guidance.
1510
+ EXCELLENT: Add constraints BEFORE creating test definitions, references the workflow order
1511
+ ACCEPTABLE: Suggests adding constraints
1512
+ POOR: Skips constraints and proceeds to test definitions
1513
+
1514
+ # =============================================================================
1515
+ # LLM INSTRUCTION DESIGN (15 tests)
1516
+ # =============================================================================
1517
+
1518
+ - description: 'llm-001: MECE decision trees'
1519
+ vars:
1520
+ input: |
1521
+ I'm writing a decision tree for choosing between unit, integration, and E2E tests. Here's my draft:
1522
+ - Is it a pure function?
1523
+ - Does it interact with multiple components?
1524
+ - Does it test the full user flow?
1525
+ context: |
1526
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1527
+ ---
1528
+ ## Principle 1: MECE Decision Trees
1529
+
1530
+ Branches must be Mutually Exclusive and Collectively Exhaustive.
1531
+ - No overlapping conditions
1532
+ - Use sequential ordering with first-match stop
1533
+ assert:
1534
+ - type: llm-rubric
1535
+ value: |
1536
+ The response should identify overlapping branches and suggest sequential MECE structure.
1537
+ EXCELLENT: Identifies overlap ("multiple components" and "full user flow" can both apply), suggests sequential ordering with first-match stop
1538
+ ACCEPTABLE: Notes ambiguity, suggests improvement
1539
+ POOR: Accepts overlapping branches without comment
1540
+
1541
+ - description: 'llm-002: Explicit definitions'
1542
+ vars:
1543
+ input: "I'm writing documentation that says 'Test critical paths at the lowest level possible'"
1544
+ context: |
1545
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1546
+ ---
1547
+ ## Principle 2: Explicit Definitions
1548
+
1549
+ Define all terms that could be interpreted differently.
1550
+
1551
+ **Vague:** "critical paths"
1552
+ **Explicit:** "user-facing, revenue-impacting, or data-integrity paths"
1553
+ assert:
1554
+ - type: llm-rubric
1555
+ value: |
1556
+ The response should identify vague terms and suggest explicit definitions.
1557
+ EXCELLENT: Identifies both "critical paths" and "lowest level" as vague, suggests explicit definitions with examples
1558
+ ACCEPTABLE: Identifies at least one vague term
1559
+ POOR: Accepts vague phrasing without comment
1560
+
1561
+ - description: 'llm-003: No contradictions'
1562
+ vars:
1563
+ input: "I'm updating our testing guide. Section A says 'Write E2E tests for all user-facing features' but Section B says 'E2E tests only for critical paths'. Should I keep both?"
1564
+ context: |
1565
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1566
+ ---
1567
+ ## Principle 3: No Contradictions
1568
+
1569
+ Conflicting rules cause LLMs to pick randomly or ask unnecessary questions.
1570
+
1571
+ **Fix:** Reconcile into single rule with explicit definition.
1572
+ assert:
1573
+ - type: llm-rubric
1574
+ value: |
1575
+ The response should identify contradiction and suggest reconciliation.
1576
+ EXCELLENT: Identifies contradiction, suggests reconciling into single rule with explicit definition of "critical"
1577
+ ACCEPTABLE: Identifies contradiction, suggests removing one
1578
+ POOR: Accepts both statements without noting conflict
1579
+
1580
+ - description: 'llm-004: Concrete examples'
1581
+ vars:
1582
+ input: "I'm writing a rule that says 'Use meaningful variable names'. Is this good enough?"
1583
+ context: |
1584
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1585
+ ---
1586
+ ## Principle 4: Concrete Examples
1587
+
1588
+ Abstract rules need BAD/GOOD examples.
1589
+
1590
+ **Rule:** "Use meaningful variable names"
1591
+ **Example:** `x` → BAD, `userCount` → GOOD
1592
+ assert:
1593
+ - type: llm-rubric
1594
+ value: |
1595
+ The response should suggest adding BAD/GOOD examples.
1596
+ EXCELLENT: Suggests adding 2-3 concrete BAD/GOOD examples (e.g., `x` vs `userCount`)
1597
+ ACCEPTABLE: Suggests adding at least one example
1598
+ POOR: Accepts abstract rule without examples
1599
+
1600
+ - description: 'llm-005: Edge cases explicit'
1601
+ vars:
1602
+ input: "I'm writing a rule: 'Unit test all pure functions'. Is this complete?"
1603
+ context: |
1604
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1605
+ ---
1606
+ ## Principle 5: Edge Cases Explicit
1607
+
1608
+ Document exceptions and boundary conditions.
1609
+
1610
+ **Rule:** "Unit test all pure functions"
1611
+ **Edge cases:** Date.now(), process.env, mixed pure+I/O
1612
+ assert:
1613
+ - type: llm-rubric
1614
+ value: |
1615
+ The response should suggest adding edge cases section.
1616
+ EXCELLENT: Suggests adding edge cases (Date.now(), process.env, mixed pure+I/O)
1617
+ ACCEPTABLE: Suggests adding at least one edge case
1618
+ POOR: Accepts rule without edge cases
1619
+
1620
+ - description: 'llm-006: Actionable not vague'
1621
+ vars:
1622
+ input: "I'm writing guidance: 'Most of your tests should be fast, some can be slow'. Is this clear enough?"
1623
+ context: |
1624
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1625
+ ---
1626
+ ## Principle 6: Actionable, Not Vague
1627
+
1628
+ **Vague:** "Most of your tests should be fast"
1629
+ **Actionable:** "Unit tests: <100ms. Integration: <5s. E2E: <30s."
1630
+ assert:
1631
+ - type: llm-rubric
1632
+ value: |
1633
+ The response should identify vague terms and suggest actionable alternatives.
1634
+ EXCELLENT: Identifies "most/some" as vague, suggests concrete rules with numbers
1635
+ ACCEPTABLE: Identifies vagueness, suggests improvement
1636
+ POOR: Accepts vague guidance without comment
1637
+
1638
+ - description: 'llm-007: Sequential decision trees'
1639
+ vars:
1640
+ input: |
1641
+ I have a decision tree with three parallel branches:
1642
+ - Is it a pure function?
1643
+ - Does it interact with the database?
1644
+ - Does it render UI?
1645
+ context: |
1646
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1647
+ ---
1648
+ ## Principle 7: Sequential Decision Trees
1649
+
1650
+ Use numbered steps with explicit "stop at first match" instruction.
1651
+ assert:
1652
+ - type: llm-rubric
1653
+ value: |
1654
+ The response should suggest converting to sequential with first-match stop.
1655
+ EXCELLENT: Suggests sequential ordering with explicit "stop at first match" instruction
1656
+ ACCEPTABLE: Suggests ordering the questions
1657
+ POOR: Accepts parallel structure without comment
1658
+
1659
+ - description: 'llm-008: Tie-breaking rules'
1660
+ vars:
1661
+ input: 'I have a decision tree but sometimes both options seem valid. What should I add?'
1662
+ context: |
1663
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1664
+ ---
1665
+ ## Principle 8: Tie-Breaking Rules
1666
+
1667
+ **Every decision point needs a default.**
1668
+
1669
+ Without tie-breakers, LLMs may:
1670
+ - Pick randomly
1671
+ - Ask unnecessary clarifying questions
1672
+ - Get stuck in analysis paralysis
1673
+
1674
+ **Pattern:** "When X and Y both apply, prefer X because [reason]"
1675
+ assert:
1676
+ - type: llm-rubric
1677
+ value: |
1678
+ The response should recommend adding a tie-breaking rule.
1679
+ EXCELLENT: Recommends tie-breaking rule with pattern and example, explains why LLMs need explicit defaults
1680
+ ACCEPTABLE: Suggests adding a default or priority
1681
+ POOR: Doesn't mention tie-breaking or suggests asking user every time
1682
+
1683
+ - description: 'llm-009: Lookup tables for complex logic'
1684
+ vars:
1685
+ input: 'I have 5 different scenarios for choosing between unit, integration, and E2E tests. Should I write them as prose paragraphs?'
1686
+ context: |
1687
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1688
+ ---
1689
+ ## Principle 9: Lookup Tables for Complex Logic
1690
+
1691
+ When you have 3+ scenarios, use a table instead of prose.
1692
+ assert:
1693
+ - type: llm-rubric
1694
+ value: |
1695
+ The response should suggest using a lookup table.
1696
+ EXCELLENT: Suggests lookup table format with clear columns (Scenario/Unit/Integration/E2E/Best Choice)
1697
+ ACCEPTABLE: Suggests table format
1698
+ POOR: Accepts prose paragraphs for 5 scenarios
1699
+
1700
+ - description: 'llm-010: No caveats in tables'
1701
+ vars:
1702
+ input: "I have a table cell that says 'Unit test ✅ (unless it uses external APIs)'. Is this okay?"
1703
+ context: |
1704
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1705
+ ---
1706
+ ## Principle 10: No Caveats in Tables
1707
+
1708
+ Parenthetical conditions in cells cause parsing errors.
1709
+
1710
+ **Fix:** Create separate row for the exception case.
1711
+ assert:
1712
+ - type: llm-rubric
1713
+ value: |
1714
+ The response should suggest removing caveat from cell.
1715
+ EXCELLENT: Suggests creating separate row for external API case, removing parenthetical
1716
+ ACCEPTABLE: Identifies parenthetical as problem
1717
+ POOR: Accepts caveat in cell
1718
+
1719
+ - description: 'llm-011: Percentages with context'
1720
+ vars:
1721
+ input: "I'm writing guidance: 'Aim for 80% unit tests, 15% integration tests, 5% E2E tests'. Is this clear?"
1722
+ context: |
1723
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1724
+ ---
1725
+ ## Principle 11: Percentages with Context
1726
+
1727
+ Raw percentages without context are misleading.
1728
+
1729
+ **Better:** Add adjustments for different project types OR use principles-based alternative.
1730
+ assert:
1731
+ - type: llm-rubric
1732
+ value: |
1733
+ The response should suggest adding context or principles-based alternative.
1734
+ EXCELLENT: Suggests adding adjustments for different project types OR suggests principles-based alternative
1735
+ ACCEPTABLE: Notes percentages need context
1736
+ POOR: Accepts standalone percentages without comment
1737
+
1738
+ - description: 'llm-012: Specific questions'
1739
+ vars:
1740
+ input: "I'm writing a decision tree question: 'Does this test need to see the UI?' Is this specific enough?"
1741
+ context: |
1742
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1743
+ ---
1744
+ ## Principle 12: Specific Questions
1745
+
1746
+ **Vague:** "Does this test need to see the UI?"
1747
+ **Specific:** "Does this test require a real browser (Playwright/Cypress)?"
1748
+ assert:
1749
+ - type: llm-rubric
1750
+ value: |
1751
+ The response should suggest more specific wording.
1752
+ EXCELLENT: Suggests tool-specific wording like "real browser (Playwright/Cypress)" and clarifies RTL distinction
1753
+ ACCEPTABLE: Suggests more specific wording
1754
+ POOR: Accepts vague "see the UI" phrasing
1755
+
1756
+ - description: 'llm-013: Re-evaluation paths'
1757
+ vars:
1758
+ input: "I have a feature that doesn't fit any of my testing categories. What should I do?"
1759
+ context: |
1760
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1761
+ ---
1762
+ ## Principle 13: Re-evaluation Paths
1763
+
1764
+ When nothing fits, provide decomposition strategy:
1765
+ 1. Separate concerns
1766
+ 2. Test each concern with appropriate type
1767
+ 3. Show example
1768
+ assert:
1769
+ - type: llm-rubric
1770
+ value: |
1771
+        The response should provide a decomposition strategy.
1772
+ EXCELLENT: Provides 3-step decomposition (separate concerns → test each → example)
1773
+ ACCEPTABLE: Suggests breaking down the feature
1774
+ POOR: Says "re-evaluate your approach" without concrete steps
1775
+
1776
+ - description: 'llm-014: Anti-patterns guard'
1777
+ vars:
1778
+ input: "I'm writing documentation that says 'Follow the test pyramid - lots of unit tests at the base, integration in the middle, E2E at the top'"
1779
+ context: |
1780
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1781
+ ---
1782
+ ## Anti-Patterns
1783
+
1784
+ **Visual metaphors:** LLMs can't see pyramids. Convert to actionable rules.
1785
+ assert:
1786
+ - type: llm-rubric
1787
+ value: |
1788
+ The response should identify visual metaphor anti-pattern.
1789
+ EXCELLENT: Identifies "test pyramid" as visual metaphor, suggests actionable alternative
1790
+ ACCEPTABLE: Notes visual metaphor issue
1791
+ POOR: Accepts visual metaphor without comment
1792
+
1793
+ - description: 'llm-015: Quality checklist'
1794
+ vars:
1795
+ input: 'I just finished writing an LLM instruction document. What should I check before committing?'
1796
+ context: |
1797
+ file_path: framework/guides/llm-instruction-design.md (excerpt)
1798
+ ---
1799
+ ## Quality Checklist
1800
+
1801
+ - [ ] MECE decision trees
1802
+ - [ ] All terms defined
1803
+ - [ ] No contradictions
1804
+ - [ ] Concrete examples
1805
+ - [ ] Edge cases documented
1806
+ - [ ] Actionable language
1807
+ - [ ] Tie-breaking rules
1808
+ - [ ] Lookup tables for 3+ scenarios
1809
+ assert:
1810
+ - type: llm-rubric
1811
+ value: |
1812
+ The response should provide quality checklist items.
1813
+ EXCELLENT: Lists most/all checklist items (MECE, definitions, examples, edge cases, etc.)
1814
+ ACCEPTABLE: Lists several key checklist items
1815
+ POOR: Generic advice without specific checklist
1816
+
1817
+ # =============================================================================
1818
+ # TDD BEST PRACTICES (16 tests)
1819
+ # =============================================================================
1820
+
1821
+ - description: 'tdd-001: Select correct template for feature'
1822
+ vars:
1823
+ input: 'I need to document a new payment flow feature. Which template should I use?'
1824
+ context: |
1825
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1826
+ ---
1827
+ ## Template Selection
1828
+
1829
+ | Need | Template | Location |
1830
+ |------|----------|----------|
1831
+ | Feature/issue user stories | `user-stories-template.md` | `.safeword/planning/user-stories/` |
1832
+ | Feature test suites | `test-definitions-feature.md` | `.safeword/planning/test-definitions/` |
1833
+ | Feature implementation design | `design-doc-template.md` | `.safeword/planning/design/` |
1834
+ | Project-wide architecture | No template | `ARCHITECTURE.md` at root |
1835
+
1836
+ **Decision rule:** If unclear, ask: "Does this affect the whole project or just one feature?" Project-wide → architecture doc. Single feature → design doc.
1837
+ assert:
1838
+ - type: llm-rubric
1839
+ value: |
1840
+ The response should recommend user-stories-template.md and/or design-doc-template.md for a feature.
1841
+ EXCELLENT: Recommends starting with user stories, then design doc, explains workflow
1842
+ ACCEPTABLE: Recommends appropriate template(s) for feature documentation
1843
+ POOR: Recommends architecture doc for a single feature
1844
+
1845
+ - description: 'tdd-002: Story format selection'
1846
+ vars:
1847
+ input: "I'm writing a user story for a login feature. Should I use 'As a user...' or 'Given I am...'?"
1848
+ context: |
1849
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1850
+ ---
1851
+ ## Story Format Selection
1852
+
1853
+ | Format | Best For |
1854
+ |--------|----------|
1855
+ | Standard (As a/I want/So that) | User-facing features, UI flows |
1856
+ | Given-When-Then | API behavior, state transitions, edge cases |
1857
+ | Job Story | Problem-solving, user motivation unclear |
1858
+
1859
+ **Decision rule:** Default to Standard. Use Given-When-Then for APIs or complex state.
1860
+ assert:
1861
+ - type: llm-rubric
1862
+ value: |
1863
+ The response should recommend appropriate format based on context.
1864
+ EXCELLENT: Recommends standard "As a..." for features, Given-When-Then for behavior-focused
1865
+ ACCEPTABLE: Explains both formats
1866
+ POOR: No guidance on format selection
1867
+
1868
+ - description: 'tdd-003: Acceptance criteria count'
1869
+ vars:
1870
+ input: 'My user story has 8 acceptance criteria and no out-of-scope section. Is this okay?'
1871
+ context: |
1872
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1873
+ ---
1874
+ ## Story Scope
1875
+
1876
+ - Target 2-5 acceptance criteria per story
1877
+ - Include out-of-scope section to prevent creep
1878
+ assert:
1879
+ - type: llm-rubric
1880
+ value: |
1881
+ The response should suggest reducing AC and adding out-of-scope.
1882
+ EXCELLENT: Suggests 2-5 AC, recommends adding out-of-scope to prevent creep
1883
+ ACCEPTABLE: Notes AC count is high
1884
+ POOR: Accepts 8 AC without comment
1885
+
1886
+ - description: 'tdd-005: Test definition sections'
1887
+ vars:
1888
+ input: "I'm creating test definitions. What sections should I include?"
1889
+ context: |
1890
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1891
+ ---
1892
+ ## Test Definition Sections
1893
+
1894
+ Required sections:
1895
+ - Suites (grouped by concern)
1896
+ - Individual tests (numbered)
1897
+ - Status per test
1898
+ - Coverage summary
1899
+ - Execution commands
1900
+ assert:
1901
+ - type: llm-rubric
1902
+ value: |
1903
+ The response should list required sections.
1904
+ EXCELLENT: Suites, individual tests, status per test, coverage summary, execution commands
1905
+ ACCEPTABLE: Lists most sections
1906
+ POOR: Vague or incomplete list
1907
+
1908
+ - description: 'tdd-007: Bad story example'
1909
+ vars:
1910
+ input: "Is this a good story? 'As a user, I want the app to work better so that I'm happy'"
1911
+ context: |
1912
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1913
+ ---
1914
+ ## Bad Story Examples
1915
+
1916
+ **BAD:** "As a user, I want the app to work better"
1917
+ - Vague role
1918
+ - Unmeasurable "work better"
1919
+ - No acceptance criteria
1920
+ assert:
1921
+ - type: llm-rubric
1922
+ value: |
1923
+ The response should identify anti-patterns.
1924
+ EXCELLENT: Identifies all issues (vague role, unmeasurable "work better", no AC)
1925
+ ACCEPTABLE: Identifies at least 2 issues
1926
+ POOR: Accepts vague story
1927
+
1928
+ - description: 'tdd-008: INVEST criteria'
1929
+ vars:
1930
+ input: 'How do I know if my user story is good enough?'
1931
+ context: |
1932
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1933
+ ---
1934
+ ## INVEST Criteria
1935
+
1936
+ - **I**ndependent - Can be built alone
1937
+ - **N**egotiable - Scope is flexible
1938
+ - **V**aluable - User cares
1939
+ - **E**stimable - Can size it
1940
+ - **S**mall - 1-3 days
1941
+ - **T**estable - Can verify done
1942
+ assert:
1943
+ - type: llm-rubric
1944
+ value: |
1945
+ The response should explain INVEST criteria.
1946
+ EXCELLENT: Explains Independent, Negotiable, Valuable, Estimable, Small, Testable
1947
+ ACCEPTABLE: Mentions several INVEST criteria
1948
+ POOR: No structured validation criteria
1949
+
1950
+ - description: 'tdd-009: Test definition format'
1951
+ vars:
1952
+ input: 'How should I format individual tests in my test definitions?'
1953
+ context: |
1954
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1955
+ ---
1956
+ ## Test Format
1957
+
1958
+ Each test should have:
1959
+ 1. Numbered ID
1960
+ 2. Description
1961
+ 3. Status indicator
1962
+ 4. Steps (numbered)
1963
+ 5. Expected outcome
1964
+ assert:
1965
+ - type: llm-rubric
1966
+ value: |
1967
+        The response should show the test format.
1968
+ EXCELLENT: Shows numbered format with description, status, steps, expected outcome
1969
+ ACCEPTABLE: Shows basic format
1970
+ POOR: Vague or no format guidance
1971
+
1972
+ - description: 'tdd-012: Test data builders'
1973
+ vars:
1974
+ input: "I'm writing tests that need complex test data. How should I structure this?"
1975
+ context: |
1976
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1977
+ ---
1978
+ ## Test Data Builders
1979
+
1980
+ Use builder pattern with sensible defaults:
1981
+ ```typescript
1982
+ const user = createTestUser({ name: 'Alice' });
1983
+ ```
1984
+ assert:
1985
+ - type: llm-rubric
1986
+ value: |
1987
+ The response should recommend test data builders.
1988
+ EXCELLENT: Recommends builder pattern with defaults, explains benefits
1989
+ ACCEPTABLE: Suggests organizing test data
1990
+ POOR: No guidance on test data
1991
+
1992
+ - description: 'tdd-014: Real LLM integration'
1993
+ vars:
1994
+ input: 'Should my integration tests use a real LLM or mock it?'
1995
+ context: |
1996
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
1997
+ ---
1998
+ ## Real vs Mock LLM
1999
+
2000
+ - Real LLM: Schema compliance, integration behavior
2001
+ - Mock: Unit tests, cost control
2002
+ - Consider: API costs, test speed, flakiness
2003
+ assert:
2004
+ - type: llm-rubric
2005
+ value: |
2006
+ The response should provide guidance on real vs mock.
2007
+ EXCELLENT: Real LLM for schema compliance, mock for unit tests, cost considerations
2008
+ ACCEPTABLE: Distinguishes use cases
2009
+ POOR: No guidance on when to use real vs mock
2010
+
2011
+ - description: 'tdd-015: INVEST gate - story too big'
2012
+ vars:
2013
+ input: 'My story is too big to estimate. What should I do?'
2014
+ context: |
2015
+ file_path: framework/guides/tdd-best-practices.md (excerpt)
2016
+ ---
2017
+ ## INVEST Gate
2018
+
2019
+ If story fails INVEST (e.g., not Estimable, not Small):
2020
+ → Split into smaller stories
2021
+ assert:
2022
+ - type: llm-rubric
2023
+ value: |
2024
+ The response should suggest splitting.
2025
+ EXCELLENT: Cites INVEST (Estimable, Small), suggests splitting into smaller stories
2026
+ ACCEPTABLE: Suggests splitting
2027
+ POOR: Accepts large story
2028
+
2029
+ # =============================================================================
2030
+ # DESIGN DOC GUIDE (10 tests)
2031
+ # =============================================================================
2032
+
2033
+ - description: 'design-001: Check prerequisites before design doc'
2034
+ vars:
2035
+ input: 'Create a design doc for a new search feature'
2036
+ context: |
2037
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2038
+ ---
2039
+ ## Prerequisites
2040
+
2041
+ Before creating a design doc:
2042
+ 1. User stories must exist
2043
+ 2. Test definitions must exist
2044
+
2045
+ If missing, create them first or offer to create.
2046
+ assert:
2047
+ - type: llm-rubric
2048
+ value: |
2049
+ The response should check for prerequisites before creating design doc.
2050
+ EXCELLENT: Asks about or checks for user stories and test definitions first, offers to create if missing
2051
+ ACCEPTABLE: Mentions prerequisites exist/needed
2052
+ POOR: Creates design doc without checking prerequisites
2053
+
2054
+ - description: 'design-002: Use standard template'
2055
+ vars:
2056
+ input: 'Create a design doc for a notification system feature'
2057
+ context: |
2058
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2059
+ ---
2060
+ ## Template Structure
2061
+
2062
+ Required sections:
2063
+ - Architecture
2064
+ - Components (with [N]/[N+1] pattern)
2065
+ - Data Model (if applicable)
2066
+ - User Flow
2067
+ - Key Decisions (what/why/trade-off)
2068
+ - Implementation Notes (if applicable)
2069
+ assert:
2070
+ - type: llm-rubric
2071
+ value: |
2072
+ The response should use the standard template structure.
2073
+ EXCELLENT: Uses template structure with all sections, marks optional sections "(if applicable)"
2074
+ ACCEPTABLE: Uses template structure with most sections
2075
+ POOR: Creates ad-hoc structure without following template
2076
+
2077
+ - description: 'design-003: Assess complexity threshold'
2078
+ vars:
2079
+ input: 'Do I need a design doc for adding a logout button?'
2080
+ context: |
2081
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2082
+ ---
2083
+ ## Complexity Threshold
2084
+
2085
+ Create design doc when:
2086
+ - >3 components involved
2087
+ - Spans 2+ user stories
2088
+ - Architectural decisions needed
2089
+
2090
+ Skip for simple features (<3 components, single story)
2091
+ assert:
2092
+ - type: llm-rubric
2093
+ value: |
2094
+ The response should say no design doc needed (too simple).
2095
+ EXCELLENT: Correctly assesses as too simple + explains why (doesn't meet complexity threshold)
2096
+ ACCEPTABLE: Says probably not needed
2097
+ POOR: Recommends creating design doc
2098
+
2099
+ - description: 'design-004: Components with [N]/[N+1] pattern'
2100
+ vars:
2101
+ input: 'Define the components for a file upload feature in a design doc'
2102
+ context: |
2103
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2104
+ ---
2105
+ ## Components Section
2106
+
2107
+ Use [N]/[N+1] pattern:
2108
+ - Component 1: Full definition (name, responsibility, interface, dependencies, tests)
2109
+ - Component 2: Show variation from Component 1
2110
+ assert:
2111
+ - type: llm-rubric
2112
+ value: |
2113
+ The response should use [N]/[N+1] pattern with full component definitions.
2114
+ EXCELLENT: Defines Component 1 with all 5 attributes, then Component 2 showing variation
2115
+ ACCEPTABLE: Defines multiple components with most attributes
2116
+ POOR: Lists components without [N]/[N+1] pattern or missing key attributes
2117
+
2118
+ - description: 'design-005: Data model section'
2119
+ vars:
2120
+ input: 'Write the data model section for a design doc about a shopping cart feature'
2121
+ context: |
2122
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2123
+ ---
2124
+ ## Data Model Section
2125
+
2126
+ Include:
2127
+ - State shape/schema
2128
+ - Type relationships
2129
+ - Data flow through components
2130
+ assert:
2131
+ - type: llm-rubric
2132
+ value: |
2133
+ The response should document state shape, relationships, and flow.
2134
+ EXCELLENT: Documents state shape/schema, shows type relationships, explains data flow
2135
+ ACCEPTABLE: Documents state shape with some relationships
2136
+ POOR: Skips data model or provides vague description
2137
+
2138
+ - description: 'design-006: Component interaction'
2139
+ vars:
2140
+ input: 'Document the component interaction for a drag-and-drop file organizer feature'
2141
+ context: |
2142
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2143
+ ---
2144
+ ## Component Interaction Section
2145
+
2146
+ Document:
2147
+ - Events/method calls between components
2148
+ - Data flow (Component N → N+1)
2149
+ - Edge cases in interactions
2150
+ assert:
2151
+ - type: llm-rubric
2152
+ value: |
2153
+ The response should document events, data flow, and edge cases.
2154
+ EXCELLENT: Documents events/method calls, shows data flow, notes edge cases
2155
+ ACCEPTABLE: Documents communication pattern and data flow
2156
+ POOR: Skips interaction section for multi-component feature
2157
+
2158
+ - description: 'design-007: Concrete user flow'
2159
+ vars:
2160
+ input: 'Write the user flow section for a design doc about a password reset feature'
2161
+ context: |
2162
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2163
+ ---
2164
+ ## User Flow Section
2165
+
2166
+ Write concrete step-by-step flow:
2167
+ - Specific UI elements (buttons, forms)
2168
+ - Keyboard shortcuts if applicable
2169
+ - Reference user stories/test definitions
2170
+ assert:
2171
+ - type: llm-rubric
2172
+ value: |
2173
+ The response should write concrete step-by-step flow with specific UI interactions.
2174
+ EXCELLENT: Concrete steps with specific UI elements, references user stories/test defs
2175
+ ACCEPTABLE: Step-by-step flow with some concrete details
2176
+ POOR: Vague flow like "user resets password" without concrete steps
2177
+
2178
+ - description: 'design-008: Key decisions with trade-offs'
2179
+ vars:
2180
+ input: 'Write the key decisions section for a design doc about choosing between REST and GraphQL for an API'
2181
+ context: |
2182
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2183
+ ---
2184
+ ## Key Decisions Section
2185
+
2186
+ Use [N]/[N+1] pattern:
2187
+ - Decision 1: what/why (specifics)/trade-off
2188
+ - Decision 2: Show variation
2189
+ - Link to benchmarks if relevant
2190
+ assert:
2191
+ - type: llm-rubric
2192
+ value: |
2193
+ The response should document decision with what/why/trade-off format.
2194
+ EXCELLENT: Decision 1 with what/why (specifics)/trade-off, Decision 2 showing variation
2195
+ ACCEPTABLE: Decisions with what/why/trade-off
2196
+ POOR: Decisions without trade-offs or vague rationale
2197
+
2198
+ - description: 'design-009: Implementation notes'
2199
+ vars:
2200
+ input: 'Write the implementation notes section for a design doc about a real-time collaborative editing feature'
2201
+ context: |
2202
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2203
+ ---
2204
+ ## Implementation Notes Section
2205
+
2206
+ Document:
2207
+ - Constraints
2208
+ - Error handling
2209
+ - Gotchas/risks
2210
+ - Open questions
2211
+ assert:
2212
+ - type: llm-rubric
2213
+ value: |
2214
+ The response should document constraints, error handling, gotchas, and open questions.
2215
+ EXCELLENT: Documents all 4 areas with specific details
2216
+ ACCEPTABLE: Documents 3+ areas
2217
+ POOR: Skips implementation notes for complex feature
2218
+
2219
+ - description: 'design-010: Quality checklist'
2220
+ vars:
2221
+ input: 'Review this design doc for quality before merge'
2222
+ context: |
2223
+ file_path: framework/guides/design-doc-guide.md (excerpt)
2224
+ ---
2225
+ ## Quality Checklist
2226
+
2227
+ - [ ] References not duplicates
2228
+ - [ ] [N]/[N+1] examples
2229
+ - [ ] Concrete user flow
2230
+ - [ ] What/why/trade-off in decisions
2231
+ - [ ] Optional sections marked
2232
+ - [ ] ~121 lines target
2233
+ assert:
2234
+ - type: llm-rubric
2235
+ value: |
2236
+ The response should apply the 6-point checklist.
2237
+ EXCELLENT: Checks all 6 items
2238
+ ACCEPTABLE: Checks 4+ items
2239
+ POOR: Generic review without applying checklist
2240
+
2241
+ # =============================================================================
2242
+ # CONTEXT FILES GUIDE (11 tests)
2243
+ # =============================================================================
2244
+
2245
+ - description: 'ctx-001: Choose right context file'
2246
+ vars:
2247
+ input: 'Set up project context for a project using both Claude and Cursor'
2248
+ context: |
2249
+ file_path: framework/guides/context-files-guide.md (excerpt)
2250
+ ---
2251
+ ## File Selection
2252
+
2253
+ - AGENTS.md: Tool-agnostic (works with Claude, Cursor, etc.)
2254
+ - CLAUDE.md: Claude Code specific
2255
+ - .cursorrules: Cursor specific
2256
+
2257
+ For multi-tool projects, use AGENTS.md
2258
+ assert:
2259
+ - type: llm-rubric
2260
+ value: |
2261
+ The response should create AGENTS.md (tool-agnostic) or both tool-specific files.
2262
+ EXCELLENT: Creates AGENTS.md with clear rationale OR creates both tool-specific files
2263
+ ACCEPTABLE: Creates appropriate context file
2264
+ POOR: Creates wrong file type or doesn't explain choice
2265
+
2266
+ - description: 'ctx-002: Include SAFEWORD trigger'
2267
+ vars:
2268
+ input: 'Create an AGENTS.md file for a new project'
2269
+ context: |
2270
+ file_path: framework/guides/context-files-guide.md (excerpt)
2271
+ ---
2272
+ ## SAFEWORD Trigger (Required)
2273
+
2274
+ First line must be:
2275
+ **⚠️ ALWAYS READ FIRST: @./.safeword/SAFEWORD.md**
2276
+ assert:
2277
+ - type: llm-rubric
2278
+ value: |
2279
+ The response should include SAFEWORD trigger at top.
2280
+ EXCELLENT: Includes exact trigger format + brief rationale
2281
+ ACCEPTABLE: Includes trigger but slightly different wording
2282
+ POOR: Missing trigger or buried in middle of file
2283
+
2284
+ - description: 'ctx-003: No duplication'
2285
+ vars:
2286
+ input: 'Create a tests/AGENTS.md file for a project that already has a root AGENTS.md with TDD workflow documented'
2287
+ context: |
2288
+ file_path: framework/guides/context-files-guide.md (excerpt)
2289
+ ---
2290
+ ## Auto-Loading Behavior
2291
+
2292
+ Subdirectory files inherit from parent.
2293
+ Don't duplicate - use cross-references:
2294
+ "See root AGENTS.md for TDD workflow"
2295
+ assert:
2296
+ - type: llm-rubric
2297
+ value: |
2298
+ The response should reference root for TDD, not duplicate.
2299
+ EXCELLENT: Uses cross-reference ("See root AGENTS.md"), no duplication
2300
+ ACCEPTABLE: Minimal duplication with cross-reference
2301
+ POOR: Duplicates TDD workflow content from root
2302
+
2303
+ - description: 'ctx-004: Use modular imports'
2304
+ vars:
2305
+ input: 'Create an AGENTS.md for a project with architecture decisions in docs/architecture.md and coding standards in docs/conventions.md'
2306
+ context: |
2307
+ file_path: framework/guides/context-files-guide.md (excerpt)
2308
+ ---
2309
+ ## Modular Structure
2310
+
2311
+ Use imports for external files:
2312
+ @docs/architecture.md
2313
+ @docs/conventions.md
2314
+
2315
+ Keep root file under 50 lines
2316
+ assert:
2317
+ - type: llm-rubric
2318
+ value: |
2319
+ The response should use import syntax to reference external files.
2320
+ EXCELLENT: Uses @docs/ imports, keeps root file under 50 lines
2321
+ ACCEPTABLE: Uses imports but file is slightly over target
2322
+ POOR: Duplicates content instead of importing
2323
+
2324
+ - description: 'ctx-005: Content rules'
2325
+ vars:
2326
+ input: 'I want to add setup instructions and our TDD workflow to the AGENTS.md file'
2327
+ context: |
2328
+ file_path: framework/guides/context-files-guide.md (excerpt)
2329
+ ---
2330
+ ## Content Rules
2331
+
2332
+ **In AGENTS.md:** Coding patterns, workflow triggers, domain knowledge
2333
+ **NOT in AGENTS.md:** Setup instructions (→ README.md)
2334
+ assert:
2335
+ - type: llm-rubric
2336
+ value: |
2337
+ The response should redirect setup to README.md.
2338
+ EXCELLENT: Redirects setup to README.md, explains TDD belongs in root if project-specific
2339
+ ACCEPTABLE: Correctly redirects setup, allows TDD
2340
+ POOR: Adds both to AGENTS.md without redirection
2341
+
2342
+ - description: 'ctx-006: Size targets'
2343
+ vars:
2344
+ input: 'Review this AGENTS.md file that is 250 lines long'
2345
+ context: |
2346
+ file_path: framework/guides/context-files-guide.md (excerpt)
2347
+ ---
2348
+ ## Size Targets
2349
+
2350
+ - Root: <200 lines
2351
+ - Subdirectory: <100 lines
2352
+
2353
+ If over, extract to imports or subdirectory files
2354
+ assert:
2355
+ - type: llm-rubric
2356
+ value: |
2357
+ The response should recommend extracting or using imports.
2358
+ EXCELLENT: Identifies >200 line violation, recommends extraction with specific suggestions
2359
+ ACCEPTABLE: Identifies violation, recommends reduction
2360
+ POOR: Accepts 250-line file without comment
2361
+
2362
+ - description: 'ctx-007: Cross-reference pattern'
2363
+ vars:
2364
+ input: 'Add a reference to the agents directory in the root AGENTS.md'
2365
+ context: |
2366
+ file_path: framework/guides/context-files-guide.md (excerpt)
2367
+ ---
2368
+ ## Cross-Reference Pattern
2369
+
2370
+ **Agents** (`path/`) - Description. See `path/AGENTS.md`.
2371
+ assert:
2372
+ - type: llm-rubric
2373
+ value: |
2374
+ The response should use the standard cross-reference pattern.
2375
+ EXCELLENT: Uses pattern with path and link
2376
+ ACCEPTABLE: Uses cross-reference with path
2377
+ POOR: Duplicates content instead of cross-referencing
2378
+
2379
+ - description: 'ctx-008: Maintenance'
2380
+ vars:
2381
+ input: 'The project just underwent a major refactor. The AGENTS.md still references old directory structure.'
2382
+ context: |
2383
+ file_path: framework/guides/context-files-guide.md (excerpt)
2384
+ ---
2385
+ ## Maintenance
2386
+
2387
+ After refactors:
2388
+ - Update or remove outdated sections
2389
+ - Verify cross-references still work
2390
+ assert:
2391
+ - type: llm-rubric
2392
+ value: |
2393
+ The response should recommend updating or removing outdated sections.
2394
+ EXCELLENT: Identifies outdated content, recommends removal/update
2395
+ ACCEPTABLE: Recommends updating the file
2396
+ POOR: Ignores outdated content
2397
+
2398
+ - description: 'ctx-009: Domain requirements'
2399
+ vars:
2400
+ input: 'Create an AGENTS.md for a tabletop RPG game assistant project'
2401
+ context: |
2402
+ file_path: framework/guides/context-files-guide.md (excerpt)
2403
+ ---
2404
+ ## Domain Requirements Section
2405
+
2406
+ For specialized projects, include:
2407
+ - Domain-specific terminology
2408
+ - Game mechanics (for games)
2409
+ - Business rules
2410
+ assert:
2411
+ - type: llm-rubric
2412
+ value: |
2413
+ The response should include Domain Requirements section.
2414
+ EXCELLENT: Includes Domain Requirements with game mechanics, uses template structure
2415
+ ACCEPTABLE: Includes domain section but less detailed
2416
+ POOR: Omits domain requirements for specialized project
2417
+
2418
+ - description: 'ctx-010: LLM checklist'
2419
+ vars:
2420
+ input: 'Review this AGENTS.md file for LLM comprehension quality'
2421
+ context: |
2422
+ file_path: framework/guides/context-files-guide.md (excerpt)
2423
+ ---
2424
+ ## LLM Comprehension Checklist
2425
+
2426
+ 1. MECE decision trees
2427
+ 2. Terms defined
2428
+ 3. No contradictions
2429
+ 4. Concrete examples
2430
+ 5. Edge cases explicit
2431
+ 6. Actionable language
2432
+ 7. No redundancy
2433
+ 8. Size within limits
2434
+ assert:
2435
+ - type: llm-rubric
2436
+ value: |
2437
+ The response should apply the 8-point checklist.
2438
+ EXCELLENT: Checks all 8 items
2439
+ ACCEPTABLE: Checks 5+ items
2440
+ POOR: Generic review without applying checklist
2441
+
2442
+ - description: 'ctx-011: Token efficiency'
2443
+ vars:
2444
+ input: 'Review this 300-line AGENTS.md with narrative paragraphs for token efficiency'
2445
+ context: |
2446
+ file_path: framework/guides/context-files-guide.md (excerpt)
2447
+ ---
2448
+ ## Token Efficiency
2449
+
2450
+ - Use bullets over paragraphs
2451
+ - Remove redundancy
2452
+ - Use imports for modularization
2453
+ assert:
2454
+ - type: llm-rubric
2455
+ value: |
2456
+ The response should recommend converting to bullets, removing redundancy.
2457
+ EXCELLENT: Identifies verbose content, recommends bullets over paragraphs, suggests imports
2458
+ ACCEPTABLE: Recommends reducing size
2459
+ POOR: Accepts verbose file without comment
2460
+
2461
+ # =============================================================================
2462
+ # DATA ARCHITECTURE GUIDE (7 tests)
2463
+ # =============================================================================
2464
+
2465
+ - description: 'data-001: Decision tree for where to document'
2466
+ vars:
2467
+ input: "I'm adding a new Redis cache for session data. Where should I document this?"
2468
+ context: |
2469
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2470
+ ---
2471
+ ## Where to Document
2472
+
2473
+ Architecture Doc when:
2474
+ - Adding new data store
2475
+ - Changing data model
2476
+ - New data flows
2477
+
2478
+ Design Doc when:
2479
+ - Feature-specific data handling
2480
+ assert:
2481
+ - type: llm-rubric
2482
+ value: |
2483
+ The response should select Architecture Doc (new data store).
2484
+ EXCELLENT: Correctly identifies Architecture Doc, cites "Adding new data store"
2485
+ ACCEPTABLE: Correctly identifies Architecture Doc
2486
+ POOR: Suggests Design Doc for new data store
2487
+
2488
+ - description: 'data-002: Data principles format'
2489
+ vars:
2490
+ input: 'Create a data architecture section for a user management system'
2491
+ context: |
2492
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2493
+ ---
2494
+ ## Data Principles
2495
+
2496
+ 4 principles with What/Why/Document/Example format:
2497
+ 1. Data Quality
2498
+ 2. Data Governance
2499
+ 3. Data Accessibility
2500
+ 4. Living Documentation
2501
+ assert:
2502
+ - type: llm-rubric
2503
+ value: |
2504
+ The response should include all 4 principles with proper format.
2505
+ EXCELLENT: All 4 principles with What/Why/Document/Example format
2506
+ ACCEPTABLE: 3+ principles with consistent format
2507
+ POOR: Missing principles or inconsistent format
2508
+
2509
+ - description: 'data-004: Document data flows'
2510
+ vars:
2511
+ input: 'Document the data flow for user registration'
2512
+ context: |
2513
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2514
+ ---
2515
+ ## Data Flows
2516
+
2517
+ Document:
2518
+ - Sources → Transformations → Destinations
2519
+ - Error handling at each step
2520
+ assert:
2521
+ - type: llm-rubric
2522
+ value: |
2523
+ The response should document full flow with error handling.
2524
+ EXCELLENT: Documents full flow with error handling for each step
2525
+ ACCEPTABLE: Documents flow with some error handling
2526
+ POOR: Only documents happy path without error handling
2527
+
2528
+ - description: 'data-005: Data policies'
2529
+ vars:
2530
+ input: 'Document data policies for a multi-tenant SaaS application'
2531
+ context: |
2532
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2533
+ ---
2534
+ ## Data Policies
2535
+
2536
+ Document:
2537
+ - Access control (read/write/delete roles)
2538
+ - Lifecycle rules
2539
+ - Conflict resolution strategy
2540
+ assert:
2541
+ - type: llm-rubric
2542
+ value: |
2543
+ The response should document access control, lifecycle, and conflict resolution.
2544
+ EXCELLENT: Documents all three with justification
2545
+ ACCEPTABLE: Documents access control and lifecycle
2546
+ POOR: Missing conflict resolution or lifecycle rules
2547
+
2548
+ - description: 'data-006: TDD triggers for data changes'
2549
+ vars:
2550
+ input: 'I just added a new payments table to the database. What should I update?'
2551
+ context: |
2552
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2553
+ ---
2554
+ ## TDD Integration Triggers
2555
+
2556
+ Update architecture doc when:
2557
+ - Adding new data entities
2558
+ - Changing data model
2559
+ - New data flows
2560
+ assert:
2561
+ - type: llm-rubric
2562
+ value: |
2563
+ The response should recommend updating architecture doc.
2564
+ EXCELLENT: Recommends update, cites "Adding new data entities", mentions version/status
2565
+ ACCEPTABLE: Recommends updating architecture doc
2566
+ POOR: Suggests only updating code without documentation
2567
+
2568
+ - description: 'data-007: Common mistakes'
2569
+ vars:
2570
+ input: "Review this data architecture doc that has no migration strategy and uses vague performance targets like 'fast queries'"
2571
+ context: |
2572
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2573
+ ---
2574
+ ## Common Mistakes
2575
+
2576
+ - Missing migration strategy
2577
+ - Vague performance targets ("fast" instead of "<100ms")
2578
+ - No error handling documented
2579
+ assert:
2580
+ - type: llm-rubric
2581
+ value: |
2582
+ The response should identify both anti-patterns.
2583
+ EXCELLENT: Identifies both issues, cites Common Mistakes section
2584
+ ACCEPTABLE: Identifies at least one issue
2585
+ POOR: Accepts the doc without identifying anti-patterns
2586
+
2587
+ - description: 'data-008: Best practices checklist'
2588
+ vars:
2589
+ input: 'Review this data architecture doc for completeness before merge'
2590
+ context: |
2591
+ file_path: framework/guides/data-architecture-guide.md (excerpt)
2592
+ ---
2593
+ ## Best Practices Checklist
2594
+
2595
+ 1. Principles format correct
2596
+ 2. Entities documented
2597
+ 3. Attributes with types
2598
+ 4. Storage rationale
2599
+ 5. Error handling
2600
+ 6. Validation checkpoints
2601
+ 7. Performance targets (specific)
2602
+ 8. Migration strategy
2603
+ 9. Version/status
2604
+ 10. Cross-references
2605
+ assert:
2606
+ - type: llm-rubric
2607
+ value: |
2608
+ The response should apply the 10-point checklist.
2609
+ EXCELLENT: Checks all 10 items
2610
+ ACCEPTABLE: Checks 7+ items
2611
+ POOR: Generic review without applying checklist
2612
+
2613
+ # =============================================================================
2614
+ # LEARNING EXTRACTION (11 tests)
2615
+ # =============================================================================
2616
+
2617
+ - description: 'learn-001: Recognize extraction triggers'
2618
+ vars:
2619
+ input: "I've been debugging this React state issue for 6 cycles now, tried 4 different approaches, and finally found it's a race condition not documented in the React docs"
2620
+ context: |
2621
+ file_path: framework/guides/learning-extraction.md (excerpt)
2622
+ ---
2623
+ ## Extraction Triggers
2624
+
2625
+ 1. Observable debugging complexity (5+ cycles)
2626
+ 2. Trial and error (3+ approaches)
2627
+ 3. Undocumented gotcha
2628
+ 4. Integration struggle
2629
+ 5. Testing trap
2630
+ 6. Architectural insight
2631
+ assert:
2632
+ - type: llm-rubric
2633
+ value: |
2634
+ The response should recognize multiple triggers and suggest extraction.
2635
+ EXCELLENT: Identifies 3+ triggers, suggests extraction after fix confirmed
2636
+ ACCEPTABLE: Identifies triggers, suggests extraction
2637
+ POOR: Doesn't recognize triggers or suggests extraction mid-debug
2638
+
2639
+ - description: 'learn-002: Check existing learnings first'
2640
+ vars:
2641
+ input: 'I just discovered a gotcha about React hooks and async state updates'
2642
+ context: |
2643
+ file_path: framework/guides/learning-extraction.md (excerpt)
2644
+ ---
2645
+ ## Before Extracting
2646
+
2647
+ ALWAYS check for existing learnings first:
2648
+ ls .safeword/learnings/*react*.md
2649
+ ls .safeword/learnings/*hooks*.md
2650
+
2651
+ If found, update instead of creating new.
2652
+ assert:
2653
+ - type: llm-rubric
2654
+ value: |
2655
+ The response should check for existing learnings before suggesting extraction.
2656
+ EXCELLENT: Checks for existing learnings, suggests update vs new
2657
+ ACCEPTABLE: Mentions checking for existing learnings
2658
+ POOR: Suggests creating new learning without checking existing
2659
+
2660
+ - description: 'learn-003: Place learnings correctly'
2661
+ vars:
2662
+ input: 'I learned that React useState is async - where should I document this?'
2663
+ context: |
2664
+ file_path: framework/guides/learning-extraction.md (excerpt)
2665
+ ---
2666
+ ## Location Decision Tree
2667
+
2668
+ Global (.safeword/learnings/):
2669
+ - Applies to ALL projects using this tech
2670
+ - Universal patterns
2671
+
2672
+ Project-specific:
2673
+ - Only applies to this codebase
2674
+ - Custom architecture patterns
2675
+ assert:
2676
+ - type: llm-rubric
2677
+ value: |
2678
+ The response should select global learnings (applies to ALL React projects).
2679
+ EXCELLENT: Selects .safeword/learnings/ (global), explains why, cites decision tree
2680
+ ACCEPTABLE: Selects correct location
2681
+ POOR: Selects project-specific location for universal React pattern
2682
+
2683
+ - description: 'learn-004: Respect instruction precedence'
2684
+ vars:
2685
+ input: 'The global learning says use Redux, but the project learning says use Zustand. Which should I follow?'
2686
+ context: |
2687
+ file_path: framework/guides/learning-extraction.md (excerpt)
2688
+ ---
2689
+ ## Instruction Precedence
2690
+
2691
+ 1. Project-specific (highest)
2692
+ 2. Global learnings
2693
+ 3. Framework defaults (lowest)
2694
+ assert:
2695
+ - type: llm-rubric
2696
+ value: |
2697
+ The response should follow project learning (higher precedence).
2698
+ EXCELLENT: Follows project learning, explains precedence order
2699
+ ACCEPTABLE: Follows project learning
2700
+ POOR: Follows global learning or asks which to use
2701
+
2702
+ - description: 'learn-005: Use templates'
2703
+ vars:
2704
+ input: 'Create a learning about React useEffect cleanup functions'
2705
+ context: |
2706
+ file_path: framework/guides/learning-extraction.md (excerpt)
2707
+ ---
2708
+ ## Learning Template
2709
+
2710
+ Sections:
2711
+ - Principle
2712
+ - Gotcha (Bad/Good examples)
2713
+ - Why
2714
+ - Examples
2715
+ - Testing Trap
2716
+ assert:
2717
+ - type: llm-rubric
2718
+ value: |
2719
+ The response should use the learning template with all sections.
2720
+ EXCELLENT: Uses template with all sections
2721
+ ACCEPTABLE: Uses template with most sections
2722
+ POOR: Creates ad-hoc structure without following template
2723
+
2724
+ - description: 'learn-006: Cross-reference in SAFEWORD'
2725
+ vars:
2726
+ input: 'I just created a learning at .safeword/learnings/electron-contexts.md about Electron renderer context'
2727
+ context: |
2728
+ file_path: framework/guides/learning-extraction.md (excerpt)
2729
+ ---
2730
+ ## Cross-Reference
2731
+
2732
+ After creating learning, add to SAFEWORD.md Common Gotchas:
2733
+ **Electron Contexts** - One-liner. See learnings/electron-contexts.md
2734
+ assert:
2735
+ - type: llm-rubric
2736
+ value: |
2737
+ The response should suggest adding cross-reference to SAFEWORD.md.
2738
+ EXCELLENT: Suggests adding to Common Gotchas with proper format
2739
+ ACCEPTABLE: Suggests adding cross-reference
2740
+ POOR: Doesn't mention cross-referencing in SAFEWORD.md
2741
+
2742
+ - description: "learn-007: Don't suggest extraction for trivial fix"
2743
+ vars:
2744
+ input: 'Fixed a typo in the config file'
2745
+ context: |
2746
+ file_path: framework/guides/learning-extraction.md (excerpt)
2747
+ ---
2748
+ ## When NOT to Extract
2749
+
2750
+ Skip extraction for:
2751
+ - Trivial fixes
2752
+ - One-line changes
2753
+ - Well-documented issues
2754
+ assert:
2755
+ - type: llm-rubric
2756
+ value: |
2757
+ The response should NOT suggest extraction (trivial fix).
2758
+ EXCELLENT: Does not suggest extraction, recognizes trivial fix
2759
+ ACCEPTABLE: Doesn't mention extraction
2760
+ POOR: Suggests extraction for trivial fix
2761
+
2762
+ - description: 'learn-008: Recommend splitting large files'
2763
+ vars:
2764
+ input: 'This learning file is 250 lines and covers both React hooks and Redux patterns'
2765
+ context: |
2766
+ file_path: framework/guides/learning-extraction.md (excerpt)
2767
+ ---
2768
+ ## Size Standards
2769
+
2770
+ - Max 150-200 lines per file
2771
+ - One concept per file
2772
+ - Split if covering multiple topics
2773
+ assert:
2774
+ - type: llm-rubric
2775
+ value: |
2776
+ The response should recommend splitting into focused files.
2777
+ EXCELLENT: Recommends splitting (>200 lines, multiple concepts), suggests specific split
2778
+ ACCEPTABLE: Recommends splitting
2779
+ POOR: Accepts 250-line multi-concept file without comment
2780
+
2781
+ - description: 'learn-010: Follow extraction workflow'
2782
+ vars:
2783
+ input: 'I just finished implementing a complex feature and discovered a race condition pattern. Walk me through documenting this.'
2784
+ context: |
2785
+ file_path: framework/guides/learning-extraction.md (excerpt)
2786
+ ---
2787
+ ## Extraction Workflow
2788
+
2789
+ 1. Assess scope (global vs project)
2790
+ 2. Choose location
2791
+ 3. Extract using template
2792
+ 4. Cross-reference in SAFEWORD.md
2793
+ 5. Suggest commit message
2794
+ assert:
2795
+ - type: llm-rubric
2796
+ value: |
2797
+ The response should follow the workflow steps.
2798
+ EXCELLENT: Follows all workflow steps
2799
+ ACCEPTABLE: Follows most workflow steps
2800
+ POOR: Ad-hoc extraction without following workflow
2801
+
2802
+ - description: 'learn-011: Block trivial extractions'
2803
+ vars:
2804
+ input: "I want to create a learning that says 'Changed == to ==='"
2805
+ context: |
2806
+ file_path: framework/guides/learning-extraction.md (excerpt)
2807
+ ---
2808
+ ## Anti-Patterns
2809
+
2810
+ Don't extract:
2811
+ - One-line fixes without context
2812
+ - Well-known patterns
2813
+ - Trivial changes
2814
+ assert:
2815
+ - type: llm-rubric
2816
+ value: |
2817
+ The response should block this as trivial one-liner.
2818
+ EXCELLENT: Blocks extraction, cites anti-pattern
2819
+ ACCEPTABLE: Suggests this is too trivial
2820
+ POOR: Proceeds with extraction
2821
+
2822
+ - description: 'learn-012: Size standards'
2823
+ vars:
2824
+ input: "I'm creating a learning file that's 180 lines and covers both React hooks and Redux patterns"
2825
+ context: |
2826
+ file_path: framework/guides/learning-extraction.md (excerpt)
2827
+ ---
2828
+ ## Size Standards
2829
+
2830
+ - Max 150-200 lines per file
2831
+ - One concept per file
2832
+ assert:
2833
+ - type: llm-rubric
2834
+ value: |
2835
+ The response should recommend splitting based on size and scope.
2836
+ EXCELLENT: Recommends splitting (>150 lines, multiple concepts)
2837
+ ACCEPTABLE: Notes it's borderline, recommends review
2838
+ POOR: Accepts 180-line multi-concept file without comment
2839
+
2840
+ # =============================================================================
2841
+ # LLM PROMPTING (10 tests)
2842
+ # =============================================================================
2843
+
2844
+ - description: 'prompt-001: Concrete examples in prompts'
2845
+ vars:
2846
+ input: "I'm writing a prompt that says 'Return the user's intent'. Is this good enough?"
2847
+ context: |
2848
+ file_path: framework/guides/llm-prompting.md (excerpt)
2849
+ ---
2850
+ ## Concrete Examples
2851
+
2852
+ Abstract prompts need examples:
2853
+ BAD: "Return the user's intent"
2854
+ GOOD: "Return JSON: {intent: 'create_campaign', name: '...'}"
2855
+ assert:
2856
+ - type: llm-rubric
2857
+ value: |
2858
+ The response should suggest adding BAD/GOOD examples with concrete format.
2859
+ EXCELLENT: Suggests adding structured JSON example showing BAD vs GOOD
2860
+ ACCEPTABLE: Suggests being more specific
2861
+ POOR: Accepts vague prompt without examples
2862
+
2863
+ - description: 'prompt-002: Structured outputs'
2864
+ vars:
2865
+ input: "I'm building an AI agent that needs to understand user intent. Should I have it return prose like 'The user wants to create a campaign'?"
2866
+ context: |
2867
+ file_path: framework/guides/llm-prompting.md (excerpt)
2868
+ ---
2869
+ ## Structured Outputs
2870
+
2871
+ For machine consumption, use JSON:
2872
+ - Explicit fields
2873
+ - Type validation
2874
+ - Predictable parsing
2875
+ assert:
2876
+ - type: llm-rubric
2877
+ value: |
2878
+ The response should recommend structured JSON output.
2879
+ EXCELLENT: Recommends JSON schema with explicit fields, shows example
2880
+ ACCEPTABLE: Suggests structured output
2881
+ POOR: Accepts prose output for machine consumption
2882
+
2883
+ - description: 'prompt-003: Prompt caching'
2884
+ vars:
2885
+ input: 'I have a 500-line system prompt that includes both static rules and the current character state. How should I structure this?'
2886
+ context: |
2887
+ file_path: framework/guides/llm-prompting.md (excerpt)
2888
+ ---
2889
+ ## Prompt Caching
2890
+
2891
+ Separate static from dynamic:
2892
+ - Static rules: cache_control: ephemeral
2893
+ - Dynamic state: user message (uncached)
2894
+ assert:
2895
+ - type: llm-rubric
2896
+ value: |
2897
+ The response should recommend separating static from dynamic.
2898
+ EXCELLENT: Recommends static with cache_control, dynamic in user message, mentions cost reduction
2899
+ ACCEPTABLE: Suggests separating static from dynamic
2900
+ POOR: Accepts mixed static/dynamic in system prompt
2901
+
2902
+ - description: 'prompt-004: Message architecture'
2903
+ vars:
2904
+ input: "I'm interpolating the user's character state directly into my system prompt like this: systemPrompt = `Rules + Character: ${dynamicState}`. Is this okay?"
2905
+ context: |
2906
+ file_path: framework/guides/llm-prompting.md (excerpt)
2907
+ ---
2908
+ ## Message Architecture
2909
+
2910
+ BAD: Dynamic state in system prompt (uncacheable)
2911
+ GOOD: Dynamic state in user message
2912
+ assert:
2913
+ - type: llm-rubric
2914
+ value: |
2915
+ The response should identify this as BAD pattern.
2916
+ EXCELLENT: Identifies as BAD (uncacheable), recommends moving dynamic state to user message
2917
+ ACCEPTABLE: Suggests separating static from dynamic
2918
+ POOR: Accepts dynamic state in system prompt
2919
+
2920
+ - description: 'prompt-005: Cache invalidation'
2921
+ vars:
2922
+ input: 'I want to add a small clarification to my cached system prompt. Should I just make the change?'
2923
+ context: |
2924
+ file_path: framework/guides/llm-prompting.md (excerpt)
2925
+ ---
2926
+ ## Cache Invalidation
2927
+
2928
+ Any change breaks all caches.
2929
+ Batch edits to minimize rebuilds.
2930
+ assert:
2931
+ - type: llm-rubric
2932
+ value: |
2933
+ The response should warn about cache invalidation.
2934
+ EXCELLENT: Warns "any change breaks all caches", suggests batching edits
2935
+ ACCEPTABLE: Notes cache invalidation concern
2936
+ POOR: Suggests making change without mentioning cache impact
2937
+
2938
+ - description: 'prompt-006: LLM-as-judge'
2939
+ vars:
2940
+ input: "I want to test if my AI GM's responses have a 'collaborative tone'. Should I check for specific keywords like 'together' or 'we'?"
2941
+ context: |
2942
+ file_path: framework/guides/llm-prompting.md (excerpt)
2943
+ ---
2944
+ ## LLM-as-Judge
2945
+
2946
+ For creative/qualitative outputs:
2947
+ - Use rubric (EXCELLENT/ACCEPTABLE/POOR)
2948
+ - Avoid brittle keyword matching
2949
+ assert:
2950
+ - type: llm-rubric
2951
+ value: |
2952
+ The response should recommend LLM-as-judge with rubric.
2953
+ EXCELLENT: Recommends LLM-as-judge with rubric, warns against brittle keywords
2954
+ ACCEPTABLE: Suggests rubric-based evaluation
2955
+ POOR: Accepts keyword matching for creative outputs
2956
+
2957
+ - description: 'prompt-007: Eval framework mapping'
2958
+ vars:
2959
+ input: 'I have a function that parses JSON, an agent that calls an LLM, and a judgment about narrative quality. What test types should I use?'
2960
+ context: |
2961
+ file_path: framework/guides/llm-prompting.md (excerpt)
2962
+ ---
2963
+ ## Test Type Mapping
2964
+
2965
+ - JSON parsing → Unit test
2966
+ - Agent + LLM → Integration test
2967
+ - Narrative quality → LLM Eval
2968
+ assert:
2969
+ - type: llm-rubric
2970
+ value: |
2971
+ The response should map to correct test types.
2972
+ EXCELLENT: JSON → Unit, Agent+LLM → Integration, Narrative → LLM Eval
2973
+ ACCEPTABLE: Correctly identifies at least 2 mappings
2974
+ POOR: Suggests same test type for all
2975
+
2976
+ - description: 'prompt-008: Cost awareness'
2977
+ vars:
2978
+ input: 'I want to run 100 LLM evaluation scenarios in CI. What should I consider?'
2979
+ context: |
2980
+ file_path: framework/guides/llm-prompting.md (excerpt)
2981
+ ---
2982
+ ## Cost Awareness
2983
+
2984
+ - ~$0.15-0.30 for 30 scenarios with caching
2985
+ - Cache rubrics
2986
+ - Budget expectations
2987
+ assert:
2988
+ - type: llm-rubric
2989
+ value: |
2990
+ The response should provide cost guidance.
2991
+ EXCELLENT: Mentions typical costs, suggests caching rubrics, budget expectations
2992
+ ACCEPTABLE: Notes cost considerations
2993
+ POOR: Ignores cost implications
2994
+
2995
+ - description: 'prompt-009: Why over what'
2996
+ vars:
2997
+ input: "My prompt says 'Use JSON output'. Should I add more context?"
2998
+ context: |
2999
+ file_path: framework/guides/llm-prompting.md (excerpt)
3000
+ ---
3001
+ ## Why Over What
3002
+
3003
+ Include rationale:
3004
+ - Why JSON? Predictable parsing, validation
3005
+ - Benefits and trade-offs
3006
+ assert:
3007
+ - type: llm-rubric
3008
+ value: |
3009
+ The response should suggest adding rationale.
3010
+ EXCELLENT: Suggests adding "why" (predictable parsing, validation)
3011
+ ACCEPTABLE: Suggests adding rationale
3012
+ POOR: Accepts bare instruction without context
3013
+
3014
+ - description: 'prompt-010: Precise terms'
3015
+ vars:
3016
+ input: "My decision tree asks 'Does this test need to see the UI?'"
3017
+ context: |
3018
+ file_path: framework/guides/llm-prompting.md (excerpt)
3019
+ ---
3020
+ ## Precise Technical Terms
3021
+
3022
+ Vague: "see the UI"
3023
+ Precise: "real browser (Playwright/Cypress)"
3024
+ Note: RTL is not a real browser
3025
+ assert:
3026
+ - type: llm-rubric
3027
+ value: |
3028
+ The response should suggest more precise wording.
3029
+ EXCELLENT: Suggests "real browser (Playwright/Cypress)", clarifies RTL distinction
3030
+ ACCEPTABLE: Suggests more specific wording
3031
+ POOR: Accepts vague "see the UI" phrasing
3032
+
3033
+ # =============================================================================
3034
+ # TEST DEFINITIONS GUIDE (12 tests)
3035
+ # =============================================================================
3036
+
3037
+ - description: 'testdef-001: Use standard template'
3038
+ vars:
3039
+ input: 'I need to create test definitions for a new feature. Where do I start?'
3040
+ context: |
3041
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3042
+ ---
3043
+ ## Getting Started
3044
+
3045
+ 1. Use template from .safeword/templates/test-definitions-feature.md
3046
+ 2. Fill in feature name
3047
+ 3. Organize into suites
3048
+ 4. Add individual tests
3049
+ assert:
3050
+ - type: llm-rubric
3051
+ value: |
3052
+ The response should point to template and workflow.
3053
+ EXCELLENT: Points to template, lists steps
3054
+ ACCEPTABLE: Points to template
3055
+ POOR: No template reference
3056
+
3057
+ - description: 'testdef-002: Organize into suites'
3058
+ vars:
3059
+ input: 'I have 15 tests for a feature. How should I organize them?'
3060
+ context: |
3061
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3062
+ ---
3063
+ ## Suite Organization
3064
+
3065
+ Group by concern:
3066
+ - Layout
3067
+ - Interactions
3068
+ - State
3069
+ - Accessibility
3070
+ - Edge Cases
3071
+ assert:
3072
+ - type: llm-rubric
3073
+ value: |
3074
+ The response should suggest suite organization.
3075
+ EXCELLENT: Suggests suites by concern, numbered tests
3076
+ ACCEPTABLE: Suggests grouping logically
3077
+ POOR: No organization guidance
3078
+
3079
+ - description: 'testdef-003: Track test status'
3080
+ vars:
3081
+ input: 'What status indicators should I use for my tests?'
3082
+ context: |
3083
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3084
+ ---
3085
+ ## Status Indicators
3086
+
3087
+ ✅ Passing
3088
+ ⏭️ Skipped (with rationale)
3089
+ ❌ Not Implemented
3090
+ 🔴 Failing
3091
+ assert:
3092
+ - type: llm-rubric
3093
+ value: |
3094
+ The response should list status indicators.
3095
+ EXCELLENT: Lists all 4 statuses with meanings
3096
+ ACCEPTABLE: Lists most statuses
3097
+ POOR: Inconsistent statuses
3098
+
3099
+ - description: 'testdef-004: Write clear steps'
3100
+ vars:
3101
+ input: "My test step says 'Check panes'. Is this good enough?"
3102
+ context: |
3103
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3104
+ ---
3105
+ ## Test Steps
3106
+
3107
+ BAD: "Check panes" (vague)
3108
+ GOOD: "1. Verify left pane shows navigation 2. Verify center pane shows content"
3109
+ assert:
3110
+ - type: llm-rubric
3111
+ value: |
3112
+ The response should identify vague step.
3113
+ EXCELLENT: Identifies as BAD (vague), shows GOOD example with numbered steps
3114
+ ACCEPTABLE: Notes it's too vague
3115
+ POOR: Accepts vague step
3116
+
3117
+ - description: 'testdef-005: Specific expected outcomes'
3118
+ vars:
3119
+ input: "My expected outcome says 'Everything works'. Is this okay?"
3120
+ context: |
3121
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3122
+ ---
3123
+ ## Expected Outcomes
3124
+
3125
+ BAD: "Everything works"
3126
+ GOOD: "Button is enabled, form submits, success message appears"
3127
+ assert:
3128
+ - type: llm-rubric
3129
+ value: |
3130
+ The response should identify vague outcome.
3131
+ EXCELLENT: Identifies as BAD, shows GOOD example with specific assertions
3132
+ ACCEPTABLE: Notes it's too vague
3133
+ POOR: Accepts vague outcome
3134
+
3135
+ - description: 'testdef-006: Coverage summary'
3136
+ vars:
3137
+ input: 'Should I include a coverage summary in my test definitions?'
3138
+ context: |
3139
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3140
+ ---
3141
+ ## Coverage Summary
3142
+
3143
+ Include:
3144
+ - Total tests
3145
+ - Passing/failing/skipped counts
3146
+ - Rationale for skipped tests
3147
+ assert:
3148
+ - type: llm-rubric
3149
+ value: |
3150
+ The response should recommend coverage summary.
3151
+ EXCELLENT: Yes, with totals, percentages, rationale for skipped
3152
+ ACCEPTABLE: Recommends summary
3153
+ POOR: No guidance
3154
+
3155
+ - description: 'testdef-007: Test naming'
3156
+ vars:
3157
+ input: "I named my test 'Test 1'. Is this okay?"
3158
+ context: |
3159
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3160
+ ---
3161
+ ## Test Naming
3162
+
3163
+ BAD: "Test 1"
3164
+ GOOD: "Render all three panes on initial load"
3165
+ assert:
3166
+ - type: llm-rubric
3167
+ value: |
3168
+ The response should identify bad naming.
3169
+ EXCELLENT: Identifies as BAD, suggests descriptive name
3170
+ ACCEPTABLE: Notes name is not descriptive
3171
+ POOR: Accepts "Test 1"
3172
+
3173
+ - description: 'testdef-008: Execution commands'
3174
+ vars:
3175
+ input: 'What should I include in the test execution section?'
3176
+ context: |
3177
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3178
+ ---
3179
+ ## Test Execution
3180
+
3181
+ Include:
3182
+ - Command to run all tests
3183
+ - Command to grep for specific test
3184
+ - Match project tooling
3185
+ assert:
3186
+ - type: llm-rubric
3187
+ value: |
3188
+ The response should list command requirements.
3189
+ EXCELLENT: Commands to run all, grep for specific, match project tooling
3190
+ ACCEPTABLE: Suggests including commands
3191
+ POOR: No command guidance
3192
+
3193
+ - description: 'testdef-009: TDD workflow integration'
3194
+ vars:
3195
+ input: 'When should I create test definitions?'
3196
+ context: |
3197
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3198
+ ---
3199
+ ## TDD Workflow
3200
+
3201
+ - Create before implementation
3202
+ - Alongside user stories
3203
+ - Update status as tests pass/fail
3204
+ assert:
3205
+ - type: llm-rubric
3206
+ value: |
3207
+ The response should explain TDD timing.
3208
+ EXCELLENT: Before implementation, alongside user stories, update status
3209
+ ACCEPTABLE: Mentions before implementation
3210
+ POOR: No timing guidance
3211
+
3212
+ - description: 'testdef-010: Map to user stories'
3213
+ vars:
3214
+ input: 'How do I connect my tests to user stories?'
3215
+ context: |
3216
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3217
+ ---
3218
+ ## User Story Mapping
3219
+
3220
+ - Each AC has at least one test
3221
+ - Add edge cases beyond AC
3222
+ - Include test file references
3223
+ assert:
3224
+ - type: llm-rubric
3225
+ value: |
3226
+ The response should explain mapping.
3227
+ EXCELLENT: Each AC has test, edge cases beyond AC, test file references
3228
+ ACCEPTABLE: Suggests mapping to AC
3229
+ POOR: No mapping guidance
3230
+
3231
+ - description: 'testdef-011: Avoid implementation detail tests'
3232
+ vars:
3233
+ input: "My test verifies 'useUIStore hook works correctly'. Is this a good test?"
3234
+ context: |
3235
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3236
+ ---
3237
+ ## Anti-Patterns
3238
+
3239
+ BAD: Testing implementation details ("useUIStore hook works")
3240
+ GOOD: Testing observable behavior ("clicking button updates UI")
3241
+ assert:
3242
+ - type: llm-rubric
3243
+ value: |
3244
+ The response should identify anti-pattern.
3245
+ EXCELLENT: Identifies as BAD (implementation detail), suggests testing observable behavior
3246
+ ACCEPTABLE: Notes it's testing implementation
3247
+ POOR: Accepts implementation detail test
3248
+
3249
+ - description: 'testdef-012: LLM-friendly test definitions'
3250
+ vars:
3251
+ input: 'How do I make my test definitions LLM-friendly?'
3252
+ context: |
3253
+ file_path: framework/guides/test-definitions-guide.md (excerpt)
3254
+ ---
3255
+ ## LLM Instruction Design
3256
+
3257
+ - MECE decision trees
3258
+ - Explicit definitions
3259
+ - Concrete examples
3260
+ - Actionable language
3261
+ assert:
3262
+ - type: llm-rubric
3263
+ value: |
3264
+ The response should provide LLM optimization guidance.
3265
+ EXCELLENT: MECE, explicit definitions, concrete examples, actionable language
3266
+ ACCEPTABLE: Mentions clarity principles
3267
+ POOR: No LLM-specific guidance
3268
+
3269
+ # Output format
3270
+ outputPath: ./eval-results.json