aiblueprint-cli 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/claude-code-config/scripts/.claude/commands/fix-on-my-computer.md +87 -0
  2. package/claude-code-config/scripts/command-validator/CLAUDE.md +112 -0
  3. package/claude-code-config/scripts/command-validator/src/__tests__/validator.test.ts +62 -111
  4. package/claude-code-config/scripts/command-validator/src/cli.ts +5 -3
  5. package/claude-code-config/scripts/command-validator/src/lib/security-rules.ts +3 -4
  6. package/claude-code-config/scripts/command-validator/src/lib/types.ts +1 -0
  7. package/claude-code-config/scripts/command-validator/src/lib/validator.ts +47 -317
  8. package/claude-code-config/scripts/statusline/CLAUDE.md +29 -7
  9. package/claude-code-config/scripts/statusline/README.md +89 -1
  10. package/claude-code-config/scripts/statusline/defaults.json +75 -0
  11. package/claude-code-config/scripts/statusline/src/index.ts +101 -24
  12. package/claude-code-config/scripts/statusline/src/lib/config-types.ts +100 -0
  13. package/claude-code-config/scripts/statusline/src/lib/config.ts +21 -0
  14. package/claude-code-config/scripts/statusline/src/lib/context.ts +32 -11
  15. package/claude-code-config/scripts/statusline/src/lib/formatters.ts +360 -22
  16. package/claude-code-config/scripts/statusline/src/lib/git.ts +100 -0
  17. package/claude-code-config/scripts/statusline/src/lib/render-pure.ts +177 -0
  18. package/claude-code-config/scripts/statusline/src/lib/types.ts +11 -0
  19. package/claude-code-config/scripts/statusline/statusline.config.json +93 -0
  20. package/claude-code-config/skills/claude-memory/SKILL.md +689 -0
  21. package/claude-code-config/skills/claude-memory/references/comprehensive-example.md +175 -0
  22. package/claude-code-config/skills/claude-memory/references/project-patterns.md +334 -0
  23. package/claude-code-config/skills/claude-memory/references/prompting-techniques.md +411 -0
  24. package/claude-code-config/skills/claude-memory/references/section-templates.md +347 -0
  25. package/claude-code-config/skills/create-slash-commands/SKILL.md +1110 -0
  26. package/claude-code-config/skills/create-slash-commands/references/arguments.md +273 -0
  27. package/claude-code-config/skills/create-slash-commands/references/patterns.md +947 -0
  28. package/claude-code-config/skills/create-slash-commands/references/prompt-examples.md +656 -0
  29. package/claude-code-config/skills/create-slash-commands/references/tool-restrictions.md +389 -0
  30. package/claude-code-config/skills/create-subagents/SKILL.md +425 -0
  31. package/claude-code-config/skills/create-subagents/references/context-management.md +567 -0
  32. package/claude-code-config/skills/create-subagents/references/debugging-agents.md +714 -0
  33. package/claude-code-config/skills/create-subagents/references/error-handling-and-recovery.md +502 -0
  34. package/claude-code-config/skills/create-subagents/references/evaluation-and-testing.md +374 -0
  35. package/claude-code-config/skills/create-subagents/references/orchestration-patterns.md +591 -0
  36. package/claude-code-config/skills/create-subagents/references/subagents.md +599 -0
  37. package/claude-code-config/skills/create-subagents/references/writing-subagent-prompts.md +513 -0
  38. package/dist/cli.js +20 -3
  39. package/package.json +1 -1
  40. package/claude-code-config/commands/apex.md +0 -109
  41. package/claude-code-config/commands/tasks/run-task.md +0 -220
  42. package/claude-code-config/commands/utils/watch-ci.md +0 -47
  43. package/claude-code-config/scripts/command-validator/biome.json +0 -29
  44. package/claude-code-config/scripts/command-validator/bun.lockb +0 -0
  45. package/claude-code-config/scripts/command-validator/package.json +0 -27
  46. package/claude-code-config/scripts/command-validator/vitest.config.ts +0 -7
  47. package/claude-code-config/scripts/hook-post-file.ts +0 -162
  48. package/claude-code-config/scripts/statusline/biome.json +0 -34
  49. package/claude-code-config/scripts/statusline/bun.lockb +0 -0
  50. package/claude-code-config/scripts/statusline/fixtures/test-input.json +0 -25
  51. package/claude-code-config/scripts/statusline/package.json +0 -19
  52. package/claude-code-config/scripts/statusline/statusline.config.ts +0 -25
  53. package/claude-code-config/scripts/statusline/test.ts +0 -20
  54. package/claude-code-config/scripts/validate-command.js +0 -712
  55. package/claude-code-config/scripts/validate-command.readme.md +0 -283
package/claude-code-config/skills/create-subagents/references/evaluation-and-testing.md
@@ -0,0 +1,374 @@
# Evaluation and Testing for Subagents

<evaluation_framework>


<task_completion>
**Primary metric**: Proportion of tasks completed correctly and satisfactorily.

Measure:
- Did the subagent complete the requested task?
- Did it produce the expected output?
- Would a human consider the task "done"?

**Testing approach**: Create test cases with known expected outcomes, invoke the subagent, and compare results.
</task_completion>

<tool_correctness>
**Secondary metric**: Whether the subagent calls the correct tools for a given task.

Measure:
- Are tool selections appropriate for the task?
- Does it use tools efficiently (not calling unnecessary tools)?
- Does it use tools in the correct sequence?

**Testing approach**: Review tool call patterns in execution logs.
</tool_correctness>

<output_quality>
**Quality metric**: Assess the quality of subagent-generated outputs.

Measure:
- Accuracy of analysis
- Completeness of coverage
- Clarity of communication
- Adherence to specified format

**Testing approach**: Human review or LLM-as-judge evaluation.
</output_quality>

<robustness>
**Resilience metric**: How well the subagent handles failures and edge cases.

Measure:
- Graceful handling of missing files
- Recovery from tool failures
- Appropriate responses to unexpected inputs
- Boundary condition handling

**Testing approach**: Inject failures (missing files, malformed data) and verify responses.
</robustness>

<efficiency>
**Performance metrics**: Response time and resource usage.

Measure:
- Token usage (cost)
- Latency (response time)
- Number of tool calls

**Testing approach**: Monitor metrics across multiple invocations and track trends.
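
As one way to track these metrics, here is a minimal TypeScript sketch; the `InvocationMetrics` shape and `summarize` helper are illustrative, not part of this package:

```ts
// Hypothetical per-invocation record; adapt field names to your own logging setup.
interface InvocationMetrics {
  subagent: string;
  inputTokens: number;
  outputTokens: number;
  latencyMs: number;
  toolCalls: number;
}

// Aggregate efficiency metrics across multiple invocations to spot trends.
function summarize(runs: InvocationMetrics[]) {
  const avg = (xs: number[]) => xs.reduce((a, b) => a + b, 0) / Math.max(xs.length, 1);
  return {
    runs: runs.length,
    avgTokens: avg(runs.map((r) => r.inputTokens + r.outputTokens)),
    avgLatencyMs: avg(runs.map((r) => r.latencyMs)),
    avgToolCalls: avg(runs.map((r) => r.toolCalls)),
  };
}

console.log(summarize([
  { subagent: "code-reviewer", inputTokens: 1200, outputTokens: 400, latencyMs: 8200, toolCalls: 5 },
  { subagent: "code-reviewer", inputTokens: 1500, outputTokens: 350, latencyMs: 9100, toolCalls: 7 },
]));
```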
</efficiency>
</evaluation_framework>

<g_eval>


**G-Eval**: Use LLMs with chain-of-thought to evaluate outputs against ANY custom criteria defined in natural language.

<example>
**Custom criterion**: "Security review completeness"

```markdown
Evaluate the security review output on a 1-5 scale:

1. Missing critical vulnerability types
2. Covers basic vulnerabilities but misses some common patterns
3. Covers standard OWASP Top 10 vulnerabilities
4. Comprehensive coverage including framework-specific issues
5. Exceptional coverage including business logic vulnerabilities

Think step-by-step about which vulnerabilities were checked and which were missed.
```

**Implementation**: Pass subagent output and criteria to Claude, get structured evaluation.
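
A minimal sketch of that step, assuming the `@anthropic-ai/sdk` Messages API and a rubric like the one above; the "Score:" parsing convention is an assumption, not a fixed interface:

```ts
import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic(); // reads ANTHROPIC_API_KEY from the environment

// Ask a judge model to apply the 1-5 rubric to a subagent's review output.
async function gEval(rubric: string, reviewOutput: string): Promise<number> {
  const response = await anthropic.messages.create({
    model: "claude-sonnet-4-5", // any capable judge model
    max_tokens: 1024,
    messages: [
      {
        role: "user",
        content: `${rubric}\n\nReview output to evaluate:\n${reviewOutput}\n\nEnd your answer with a line "Score: <1-5>".`,
      },
    ],
  });
  const text = response.content
    .map((block) => (block.type === "text" ? block.text : ""))
    .join("");
  const match = text.match(/Score:\s*([1-5])/);
  return match ? Number(match[1]) : NaN;
}
```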
</example>

**When to use**: Complex quality metrics that can't be measured programmatically (thoroughness, insight quality, appropriateness of recommendations).
</g_eval>

<validation_strategies>


<offline_testing>
**Offline validation**: Test before deployment with synthetic scenarios.

**Process**:
1. Create representative test cases covering:
   - Happy path scenarios
   - Edge cases (boundary conditions, unusual inputs)
   - Error conditions (missing data, tool failures)
   - Adversarial inputs (malformed, malicious)
2. Invoke the subagent with each test case (see the harness sketch below)
3. Compare outputs to expected results
4. Document failures and iterate on the prompt

**Example test suite for code-reviewer subagent**:
```markdown
Test 1 (Happy path): Recent commit with SQL injection vulnerability
Expected: Identifies SQL injection, provides fix, rates as Critical

Test 2 (Edge case): No recent code changes
Expected: Confirms review completed, no issues found

Test 3 (Error condition): Git repository not initialized
Expected: Gracefully handles missing git, provides helpful message

Test 4 (Adversarial): Obfuscated code with hidden vulnerability
Expected: Identifies pattern despite obfuscation
```
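
A sketch of a harness for such a suite; the `Invoke` type stands in for however you actually launch the subagent (e.g. a headless CLI call) and is not something this package provides:

```ts
interface TestCase {
  label: string;
  task: string;
  mustMention: string[]; // simple expectation: substrings that should appear in the output
}

// Placeholder for the mechanism that runs the subagent and returns its output.
type Invoke = (subagent: string, task: string) => Promise<string>;

async function runSuite(subagent: string, cases: TestCase[], invoke: Invoke) {
  const failures: string[] = [];
  for (const c of cases) {
    const output = await invoke(subagent, c.task);
    const missing = c.mustMention.filter((s) => !output.toLowerCase().includes(s.toLowerCase()));
    if (missing.length > 0) failures.push(`${c.label}: missing ${missing.join(", ")}`);
  }
  return { passed: cases.length - failures.length, failed: failures.length, failures };
}

// Cases mirroring the example suite above:
const cases: TestCase[] = [
  { label: "Happy path", task: "Review the latest commit", mustMention: ["sql injection", "critical"] },
  { label: "Edge case", task: "Review recent changes (none exist)", mustMention: ["no issues"] },
];
```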
</offline_testing>

<simulation>
**Simulation testing**: Run the subagent in realistic but controlled environments.

**Use cases**:
- Testing against historical issues (can it find bugs that were previously fixed?)
- Benchmark datasets (SWE-bench for code agents)
- Controlled codebases with known vulnerabilities

**Benefit**: Higher confidence than synthetic tests, safer than production testing.
</simulation>

<online_monitoring>
**Production monitoring**: Track metrics during real usage.

**Key metrics**:
- Success rate (completed vs failed tasks)
- User satisfaction (explicit feedback)
- Retry rate (how often users reinvoke after a failure)
- Token usage trends (an increase can signal prompt issues)
- Error rates by error type

**Implementation**: Log all invocations with context, outcomes, and metrics. Review regularly for patterns.
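
One low-effort way to do this is an append-only JSONL log; a sketch, with an assumed record shape and file location:

```ts
import { appendFileSync } from "node:fs";

// One line per invocation; grep/jq-friendly for later review.
function logInvocation(entry: {
  subagent: string;
  task: string;
  outcome: "success" | "failure" | "retry";
  totalTokens: number;
  latencyMs: number;
  errorType?: string;
}) {
  appendFileSync(
    "subagent-invocations.jsonl",
    JSON.stringify({ timestamp: new Date().toISOString(), ...entry }) + "\n",
  );
}

logInvocation({
  subagent: "code-reviewer",
  task: "Review latest commit",
  outcome: "success",
  totalTokens: 1850,
  latencyMs: 9300,
});
```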
</online_monitoring>
</validation_strategies>

<evaluation_driven_development>


**Philosophy**: Integrate evaluation throughout the subagent lifecycle, not just at the validation stage.

<workflow>
1. **Initial creation**: Define success criteria before writing the prompt
2. **Development**: Test after each prompt iteration
3. **Pre-deployment**: Comprehensive offline testing
4. **Deployment**: Online monitoring with metrics collection
5. **Iteration**: Regular review of failures, update the prompt based on learnings
6. **Continuous**: Ongoing evaluation → feedback → refinement cycles
</workflow>

**Anti-pattern**: Writing a subagent, deploying it, and never measuring effectiveness or iterating.

**Best practice**: Treat subagent prompts as living documents that evolve based on real-world performance data.
</evaluation_driven_development>

<testing_checklist>


<before_deployment>
Before deploying a subagent, complete this validation:

**Basic functionality**:
- [ ] Invoke with a representative task, verify completion
- [ ] Check output format matches specification
- [ ] Verify workflow steps are followed in sequence
- [ ] Confirm constraints are respected

**Edge cases**:
- [ ] Test with missing/incomplete data
- [ ] Test with unusual but valid inputs
- [ ] Test with boundary conditions (empty files, large files, etc.)

**Error handling**:
- [ ] Test with unavailable tools (if tool access is restricted)
- [ ] Test with malformed inputs
- [ ] Verify graceful degradation when the ideal path fails

**Quality checks**:
- [ ] Human review of outputs for accuracy
- [ ] Verify no hallucinations or fabricated information
- [ ] Check output is actionable and useful

**Security**:
- [ ] Verify tool access follows least privilege
- [ ] Check for potential unsafe operations
- [ ] Ensure sensitive data handling is appropriate

**Documentation**:
- [ ] Description field clearly indicates when to use
- [ ] Role and focus areas are specific
- [ ] Workflow is complete and logical
</before_deployment>
</testing_checklist>

<synthetic_data>


<when_to_use>
Synthetic data generation is useful for:
- **Cold starts**: No real usage data yet
- **Edge cases**: Rare scenarios hard to capture from real data
- **Adversarial testing**: Security, robustness testing
- **Scenario coverage**: Systematic coverage of the input space
</when_to_use>

<generation_approaches>
**Persona-based generation**: Create test cases from different user personas.

```markdown
Persona: Junior developer
Task: "Fix the bug where the login page crashes"
Expected behavior: Subagent provides detailed debugging steps

Persona: Senior engineer
Task: "Investigate authentication flow security"
Expected behavior: Subagent performs deep security analysis
```

**Scenario simulation**: Generate variations of common scenarios.

```markdown
Scenario: SQL injection vulnerability review
Variations:
- Direct SQL concatenation
- ORM with raw queries
- Prepared statements (should pass)
- Stored procedures with dynamic SQL
```
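
A small sketch of expanding such a scenario/variation list into concrete synthetic prompts; the case shape and prompt template are illustrative:

```ts
interface SyntheticCase {
  scenario: string;
  variation: string;
  prompt: string;
}

// Expand a scenario and its variations into individual test prompts.
function expandScenario(scenario: string, variations: string[]): SyntheticCase[] {
  return variations.map((variation) => ({
    scenario,
    variation,
    prompt: `Review this code for security issues. It uses: ${variation}.`,
  }));
}

const syntheticCases = expandScenario("SQL injection vulnerability review", [
  "direct SQL concatenation",
  "an ORM with raw queries",
  "prepared statements (should pass)",
  "stored procedures with dynamic SQL",
]);

console.log(syntheticCases.length); // 4 synthetic cases, one per variation
```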
</generation_approaches>

<critical_limitation>
**Never rely exclusively on synthetic data.**

Maintain a validation set of real usage examples. Synthetic data can miss:
- Real-world complexity
- Actual user intent patterns
- Production environment constraints
- Emergent usage patterns

**Best practice**: 70% synthetic (for coverage), 30% real (for reality check).
</critical_limitation>
</synthetic_data>

<llm_as_judge>


<basic_pattern>
Use an LLM to evaluate subagent outputs when human review is impractical at scale.

**Example evaluation prompt**:
```markdown
You are evaluating a security code review performed by an AI subagent.

Review output:
{subagent_output}

Code that was reviewed:
{code}

Evaluate on these criteria:
1. Accuracy: Are identified vulnerabilities real? (Yes/Partial/No)
2. Completeness: Were obvious vulnerabilities missed? (None missed/Some missed/Many missed)
3. Actionability: Are fixes specific and implementable? (Very/Somewhat/Not really)

Provide:
- Overall grade (A/B/C/D/F)
- Specific issues with the review
- What a human reviewer would have done differently
```
</basic_pattern>

<comparison_pattern>
**Ground truth comparison**: When the correct answer is known.

```markdown
Expected vulnerabilities in test code:
1. SQL injection on line 42
2. XSS vulnerability on line 67
3. Missing authentication check on line 103

Subagent identified:
{subagent_findings}

Calculate:
- Precision: % of identified issues that are real
- Recall: % of real issues that were identified
- F1 score: Harmonic mean of precision and recall
```
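
The scoring itself is easy to script once findings are matched to ground truth (the matching predicate is the hard part and may need normalization or an LLM judge); for example:

```ts
// Score subagent findings against known ground-truth issues.
function score(
  expected: string[],
  found: string[],
  matches: (expectedIssue: string, finding: string) => boolean,
) {
  const truePositives = found.filter((f) => expected.some((e) => matches(e, f))).length;
  const precision = found.length ? truePositives / found.length : 0;
  const recall = expected.length ? truePositives / expected.length : 0;
  const f1 = precision + recall ? (2 * precision * recall) / (precision + recall) : 0;
  return { precision, recall, f1 };
}

const result = score(
  ["SQL injection line 42", "XSS line 67", "Missing auth check line 103"],
  ["SQL injection line 42", "Hardcoded secret line 12"],
  (e, f) => e === f, // naive exact match for illustration
);
console.log(result); // { precision: 0.5, recall: 0.33..., f1: 0.4 }
```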
</comparison_pattern>
</llm_as_judge>

<test_driven_development>


Anthropic guidance: "Test-driven development becomes even more powerful with agentic coding."

<approach>
**Before writing the subagent prompt**:
1. Define expected input/output pairs
2. Create test cases that the subagent must pass
3. Write initial prompt
4. Run tests, observe failures
5. Refine prompt based on failures
6. Repeat until all tests pass

**Example for test-writer subagent**:
```markdown
Test 1:
Input: Function that adds two numbers
Expected output: Test file with:
- Happy path (2 + 2 = 4)
- Edge cases (0 + 0, negative numbers)
- Type errors (string + number)

Test 2:
Input: Async function that fetches user data
Expected output: Test file with:
- Successful fetch
- Network error handling
- Invalid user ID handling
- Mocked HTTP calls (no real API calls)
```

**Invoke subagent → check if outputs match expectations → iterate on prompt.**
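
Those expectations can live in an ordinary test file. A sketch using Vitest, with a hypothetical `invokeSubagent` helper that you would wire up to however the subagent is actually run:

```ts
import { describe, expect, it } from "vitest";

// Hypothetical helper: runs the test-writer subagent and returns the generated test file.
declare function invokeSubagent(name: string, task: string): Promise<string>;

describe("test-writer subagent", () => {
  it("covers edge cases for a simple add function", async () => {
    const output = await invokeSubagent("test-writer", "Write tests for add(a, b)");
    expect(output).toMatch(/negative/i); // edge case: negative numbers
    expect(output).toContain("0");       // edge case: zero
  });

  it("mocks HTTP calls for async fetch functions", async () => {
    const output = await invokeSubagent("test-writer", "Write tests for fetchUser(id)");
    expect(output).toMatch(/mock/i);  // no real API calls
    expect(output).toMatch(/error/i); // network error handling
  });
});
```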
</approach>

**Benefit**: Clear acceptance criteria before development, objective measure of prompt quality.
</test_driven_development>

<anti_patterns>


<anti_pattern name="no_testing">
❌ Deploying subagents without any validation

**Risk**: Subagent fails on real tasks, wastes user time, damages trust.

**Fix**: Minimum viable testing = invoke with 3 representative tasks before deploying.
</anti_pattern>

<anti_pattern name="only_happy_path">
❌ Testing only ideal scenarios

**Risk**: Subagent fails on edge cases, error conditions, or unusual (but valid) inputs.

**Fix**: Test matrix covering happy path, edge cases, and error conditions.
</anti_pattern>

<anti_pattern name="no_metrics">
❌ No measurement of effectiveness

**Risk**: Can't tell if prompt changes improve or degrade performance.

**Fix**: Define at least one quantitative metric (task completion rate, output quality score).
</anti_pattern>

<anti_pattern name="test_once_deploy_forever">
❌ Testing once at creation, never revisiting

**Risk**: Subagent degrades over time as usage patterns shift, codebases change, or models update.

**Fix**: Periodic re-evaluation with current usage patterns and edge cases.
</anti_pattern>
</anti_patterns>