codingbuddy-rules 4.4.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/.ai-rules/adapters/antigravity.md +6 -6
  2. package/.ai-rules/adapters/claude-code.md +107 -4
  3. package/.ai-rules/adapters/codex.md +5 -5
  4. package/.ai-rules/adapters/cursor.md +2 -2
  5. package/.ai-rules/adapters/kiro.md +8 -8
  6. package/.ai-rules/adapters/opencode.md +7 -7
  7. package/.ai-rules/adapters/q.md +2 -2
  8. package/.ai-rules/agents/README.md +66 -16
  9. package/.ai-rules/agents/accessibility-specialist.json +2 -1
  10. package/.ai-rules/agents/act-mode.json +2 -1
  11. package/.ai-rules/agents/agent-architect.json +8 -7
  12. package/.ai-rules/agents/ai-ml-engineer.json +1 -0
  13. package/.ai-rules/agents/architecture-specialist.json +1 -0
  14. package/.ai-rules/agents/auto-mode.json +4 -2
  15. package/.ai-rules/agents/backend-developer.json +1 -0
  16. package/.ai-rules/agents/code-quality-specialist.json +1 -0
  17. package/.ai-rules/agents/code-reviewer.json +65 -64
  18. package/.ai-rules/agents/data-engineer.json +8 -7
  19. package/.ai-rules/agents/data-scientist.json +10 -9
  20. package/.ai-rules/agents/devops-engineer.json +1 -0
  21. package/.ai-rules/agents/documentation-specialist.json +1 -0
  22. package/.ai-rules/agents/eval-mode.json +20 -19
  23. package/.ai-rules/agents/event-architecture-specialist.json +1 -0
  24. package/.ai-rules/agents/frontend-developer.json +1 -0
  25. package/.ai-rules/agents/i18n-specialist.json +2 -1
  26. package/.ai-rules/agents/integration-specialist.json +1 -0
  27. package/.ai-rules/agents/migration-specialist.json +1 -0
  28. package/.ai-rules/agents/mobile-developer.json +8 -7
  29. package/.ai-rules/agents/observability-specialist.json +1 -0
  30. package/.ai-rules/agents/parallel-orchestrator.json +346 -0
  31. package/.ai-rules/agents/performance-specialist.json +1 -0
  32. package/.ai-rules/agents/plan-mode.json +3 -1
  33. package/.ai-rules/agents/plan-reviewer.json +208 -0
  34. package/.ai-rules/agents/platform-engineer.json +1 -0
  35. package/.ai-rules/agents/security-engineer.json +9 -8
  36. package/.ai-rules/agents/security-specialist.json +2 -1
  37. package/.ai-rules/agents/seo-specialist.json +1 -0
  38. package/.ai-rules/agents/software-engineer.json +1 -0
  39. package/.ai-rules/agents/solution-architect.json +11 -10
  40. package/.ai-rules/agents/systems-developer.json +9 -8
  41. package/.ai-rules/agents/technical-planner.json +11 -10
  42. package/.ai-rules/agents/test-engineer.json +7 -6
  43. package/.ai-rules/agents/test-strategy-specialist.json +1 -0
  44. package/.ai-rules/agents/tooling-engineer.json +4 -3
  45. package/.ai-rules/agents/ui-ux-designer.json +1 -0
  46. package/.ai-rules/keyword-modes.json +4 -4
  47. package/.ai-rules/rules/clarification-guide.md +14 -14
  48. package/.ai-rules/rules/core.md +90 -1
  49. package/.ai-rules/rules/parallel-execution.md +217 -0
  50. package/.ai-rules/skills/README.md +23 -1
  51. package/.ai-rules/skills/agent-design/SKILL.md +5 -0
  52. package/.ai-rules/skills/agent-design/examples/agent-template.json +58 -0
  53. package/.ai-rules/skills/agent-design/references/expertise-guidelines.md +112 -0
  54. package/.ai-rules/skills/agent-discussion/SKILL.md +199 -0
  55. package/.ai-rules/skills/agent-discussion-panel/SKILL.md +448 -0
  56. package/.ai-rules/skills/api-design/SKILL.md +5 -0
  57. package/.ai-rules/skills/api-design/examples/error-response.json +159 -0
  58. package/.ai-rules/skills/api-design/examples/openapi-template.yaml +393 -0
  59. package/.ai-rules/skills/build-fix/SKILL.md +234 -0
  60. package/.ai-rules/skills/code-explanation/SKILL.md +4 -0
  61. package/.ai-rules/skills/context-management/SKILL.md +1 -0
  62. package/.ai-rules/skills/cost-budget/SKILL.md +348 -0
  63. package/.ai-rules/skills/cross-repo-issues/SKILL.md +257 -0
  64. package/.ai-rules/skills/database-migration/SKILL.md +1 -0
  65. package/.ai-rules/skills/deepsearch/SKILL.md +214 -0
  66. package/.ai-rules/skills/deployment-checklist/SKILL.md +1 -0
  67. package/.ai-rules/skills/error-analysis/SKILL.md +1 -0
  68. package/.ai-rules/skills/finishing-a-development-branch/SKILL.md +281 -0
  69. package/.ai-rules/skills/frontend-design/SKILL.md +5 -0
  70. package/.ai-rules/skills/frontend-design/examples/component-template.tsx +203 -0
  71. package/.ai-rules/skills/frontend-design/references/css-patterns.md +243 -0
  72. package/.ai-rules/skills/git-master/SKILL.md +358 -0
  73. package/.ai-rules/skills/incident-response/SKILL.md +1 -0
  74. package/.ai-rules/skills/legacy-modernization/SKILL.md +1 -0
  75. package/.ai-rules/skills/mcp-builder/SKILL.md +7 -0
  76. package/.ai-rules/skills/mcp-builder/examples/resource-example.ts +233 -0
  77. package/.ai-rules/skills/mcp-builder/examples/tool-example.ts +203 -0
  78. package/.ai-rules/skills/mcp-builder/references/protocol-spec.md +215 -0
  79. package/.ai-rules/skills/performance-optimization/SKILL.md +3 -0
  80. package/.ai-rules/skills/plan-and-review/SKILL.md +115 -0
  81. package/.ai-rules/skills/pr-all-in-one/SKILL.md +15 -13
  82. package/.ai-rules/skills/pr-all-in-one/configuration-guide.md +7 -7
  83. package/.ai-rules/skills/pr-all-in-one/pr-templates.md +10 -10
  84. package/.ai-rules/skills/pr-review/SKILL.md +4 -0
  85. package/.ai-rules/skills/receiving-code-review/SKILL.md +347 -0
  86. package/.ai-rules/skills/refactoring/SKILL.md +1 -0
  87. package/.ai-rules/skills/requesting-code-review/SKILL.md +348 -0
  88. package/.ai-rules/skills/rule-authoring/SKILL.md +5 -0
  89. package/.ai-rules/skills/rule-authoring/examples/rule-template.md +142 -0
  90. package/.ai-rules/skills/rule-authoring/examples/trigger-patterns.md +126 -0
  91. package/.ai-rules/skills/security-audit/SKILL.md +4 -0
  92. package/.ai-rules/skills/skill-creator/SKILL.md +461 -0
  93. package/.ai-rules/skills/skill-creator/agents/analyzer.md +206 -0
  94. package/.ai-rules/skills/skill-creator/agents/comparator.md +167 -0
  95. package/.ai-rules/skills/skill-creator/agents/grader.md +152 -0
  96. package/.ai-rules/skills/skill-creator/assets/eval_review.html +289 -0
  97. package/.ai-rules/skills/skill-creator/assets/skill-template.md +43 -0
  98. package/.ai-rules/skills/skill-creator/eval-viewer/generate_review.py +496 -0
  99. package/.ai-rules/skills/skill-creator/references/frontmatter-guide.md +632 -0
  100. package/.ai-rules/skills/skill-creator/references/multi-tool-compat.md +480 -0
  101. package/.ai-rules/skills/skill-creator/references/schemas.md +784 -0
  102. package/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py +302 -0
  103. package/.ai-rules/skills/skill-creator/scripts/init_skill.sh +196 -0
  104. package/.ai-rules/skills/skill-creator/scripts/run_loop.py +327 -0
  105. package/.ai-rules/skills/systematic-debugging/SKILL.md +1 -0
  106. package/.ai-rules/skills/tech-debt/SKILL.md +1 -0
  107. package/.ai-rules/skills/test-coverage-gate/SKILL.md +303 -0
  108. package/.ai-rules/skills/tmux-master/SKILL.md +491 -0
  109. package/.ai-rules/skills/using-git-worktrees/SKILL.md +368 -0
  110. package/.ai-rules/skills/verification-before-completion/SKILL.md +234 -0
  111. package/.ai-rules/skills/widget-slot-architecture/SKILL.md +6 -0
  112. package/.ai-rules/skills/widget-slot-architecture/examples/parallel-route-setup.tsx +206 -0
  113. package/.ai-rules/skills/widget-slot-architecture/examples/widget-component.tsx +250 -0
  114. package/.ai-rules/skills/writing-plans/SKILL.md +78 -0
  115. package/bin/cli.js +178 -0
  116. package/lib/init/detect-stack.js +148 -0
  117. package/lib/init/generate-config.js +31 -0
  118. package/lib/init/index.js +86 -0
  119. package/lib/init/prompt.js +60 -0
  120. package/lib/init/scaffold.js +67 -0
  121. package/lib/init/suggest-agent.js +46 -0
  122. package/package.json +10 -2
@@ -0,0 +1,784 @@
1
+ # Skill Evaluation Schemas Reference
2
+
3
+ JSON schema definitions and workspace directory structure reference for the skill evaluation system.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Workspace Directory Structure](#workspace-directory-structure)
8
+ - [Schema Definitions](#schema-definitions)
9
+ - [1. evals.json](#1-evalsjson)
10
+ - [2. eval_metadata.json](#2-eval_metadatajson)
11
+ - [3. grading.json](#3-gradingjson)
12
+ - [4. timing.json](#4-timingjson)
13
+ - [5. feedback.json](#5-feedbackjson)
14
+ - [6. benchmark.json](#6-benchmarkjson)
15
+ - [7. trigger_eval.json](#7-trigger_evaljson)
16
+ - [Schema Relationships](#schema-relationships)
17
+ - [Validation](#validation)
18
+
19
+ ---
20
+
21
+ ## Workspace Directory Structure
22
+
23
+ The workspace directory structure used during skill evaluation is shown below. Each iteration represents a skill modification cycle, and each eval includes a `with_skill` (skill applied) and `without_skill` (baseline) comparison run.
24
+
25
+ ```
26
+ workspace/
27
+ ├── evals.json # Evaluation scenario definitions (shared across all iterations)
28
+ ├── trigger_eval.json # Trigger evaluation cases (benchmark mode)
29
+ └── iteration-N/ # Nth skill modification cycle
30
+ ├── eval-0/ # First evaluation (0-based)
31
+ │ ├── with_skill/ # Skill-applied run
32
+ │ │ ├── outputs/ # Generated output files
33
+ │ │ ├── eval_metadata.json # Evaluation metadata
34
+ │ │ ├── grading.json # Grading results
35
+ │ │ └── timing.json # Execution time measurements
36
+ │ └── without_skill/ # Baseline run (no skill applied)
37
+ │ ├── outputs/ # Generated output files
38
+ │ ├── eval_metadata.json # Evaluation metadata
39
+ │ ├── grading.json # Grading results
40
+ │ └── timing.json # Execution time measurements
41
+ ├── eval-1/ # Second evaluation
42
+ │ ├── with_skill/
43
+ │ │ └── ...
44
+ │ └── without_skill/
45
+ │ └── ...
46
+ ├── benchmark.json # Iteration benchmark aggregate results
47
+ ├── benchmark.md # Benchmark markdown report
48
+ └── feedback.json # User feedback
49
+ ```
50
+
51
+ **Directory naming conventions:**
52
+ - `iteration-N`: 1-based sequential number. A new iteration is created each time SKILL.md is modified
53
+ - `eval-N`: 0-based sequential number. Corresponds to each scenario in `evals.json`
54
+ - `with_skill/`: Evaluation run with the skill applied
55
+ - `without_skill/`: Baseline run without the skill (comparison target)
56
+ - `outputs/`: Files generated during each run (code, documents, etc.)
57
+
58
+ **File location rules:**
59
+ - `evals.json`, `trigger_eval.json`: Workspace root (iteration-independent)
60
+ - `eval_metadata.json`, `grading.json`, `timing.json`: Inside each eval's `with_skill/` or `without_skill/`
61
+ - `benchmark.json`, `benchmark.md`, `feedback.json`: Iteration root
62
+
63
+ ---
64
+
65
+ ## Schema Definitions
66
+
67
+ ### 1. evals.json
68
+
69
+ Defines the list of evaluation scenarios. Located at the workspace root and shared across all iterations.
70
+
71
+ **Schema:**
72
+
73
+ ```json
74
+ {
75
+ "$schema": "http://json-schema.org/draft-07/schema#",
76
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/evals.json",
77
+ "title": "Skill Evaluation Scenarios",
78
+ "description": "Skill evaluation scenario definitions",
79
+ "type": "object",
80
+ "required": ["skill_name", "evals"],
81
+ "properties": {
82
+ "skill_name": {
83
+ "type": "string",
84
+ "description": "Name of the skill being evaluated (kebab-case)",
85
+ "pattern": "^[a-z][a-z0-9-]*$"
86
+ },
87
+ "evals": {
88
+ "type": "array",
89
+ "description": "List of evaluation scenarios",
90
+ "minItems": 1,
91
+ "items": {
92
+ "type": "object",
93
+ "required": ["id", "prompt", "expected_output", "files"],
94
+ "properties": {
95
+ "id": {
96
+ "type": "integer",
97
+ "minimum": 1,
98
+ "description": "Scenario ID (1-based)"
99
+ },
100
+ "prompt": {
101
+ "type": "string",
102
+ "description": "User task prompt"
103
+ },
104
+ "expected_output": {
105
+ "type": "string",
106
+ "description": "Expected result description"
107
+ },
108
+ "files": {
109
+ "type": "array",
110
+ "items": { "type": "string" },
111
+ "description": "List of input file paths (empty array if none)"
112
+ }
113
+ }
114
+ }
115
+ }
116
+ }
117
+ }
118
+ ```
119
+
120
+ **Example:**
121
+
122
+ ```json
123
+ {
124
+ "skill_name": "test-driven-development",
125
+ "evals": [
126
+ {
127
+ "id": 1,
128
+ "prompt": "Add a function that validates email addresses",
129
+ "expected_output": "Test file created first with failing test, then implementation, then refactor",
130
+ "files": ["src/utils/validators.ts"]
131
+ },
132
+ {
133
+ "id": 2,
134
+ "prompt": "Fix the login timeout bug",
135
+ "expected_output": "Failing test reproducing the bug, then minimal fix, then cleanup",
136
+ "files": ["src/auth/login.ts", "src/auth/login.test.ts"]
137
+ }
138
+ ]
139
+ }
140
+ ```
141
+
142
+ ---
143
+
144
+ ### 2. eval_metadata.json
145
+
146
+ Records metadata for an individual evaluation run. Located in each `with_skill/` and `without_skill/` directory.
147
+
148
+ **Schema:**
149
+
150
+ ```json
151
+ {
152
+ "$schema": "http://json-schema.org/draft-07/schema#",
153
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/eval_metadata.json",
154
+ "title": "Evaluation Metadata",
155
+ "description": "Metadata for an individual evaluation run",
156
+ "type": "object",
157
+ "required": ["eval_id", "eval_name", "prompt", "assertions"],
158
+ "properties": {
159
+ "eval_id": {
160
+ "type": "integer",
161
+ "minimum": 0,
162
+ "description": "Evaluation ID (0-based, corresponds to evals.json id-1)"
163
+ },
164
+ "eval_name": {
165
+ "type": "string",
166
+ "description": "Descriptive name for the evaluation"
167
+ },
168
+ "prompt": {
169
+ "type": "string",
170
+ "description": "User task prompt (copied from evals.json)"
171
+ },
172
+ "assertions": {
173
+ "type": "array",
174
+ "description": "List of verification items",
175
+ "minItems": 1,
176
+ "items": {
177
+ "type": "object",
178
+ "required": ["name", "description"],
179
+ "properties": {
180
+ "name": {
181
+ "type": "string",
182
+ "description": "Verifiable item name"
183
+ },
184
+ "description": {
185
+ "type": "string",
186
+ "description": "Description of pass criteria"
187
+ }
188
+ }
189
+ }
190
+ }
191
+ }
192
+ }
193
+ ```
194
+
195
+ **Example:**
196
+
197
+ ```json
198
+ {
199
+ "eval_id": 0,
200
+ "eval_name": "Email Validation TDD Cycle",
201
+ "prompt": "Add a function that validates email addresses",
202
+ "assertions": [
203
+ {
204
+ "name": "test_file_created_first",
205
+ "description": "Test file was created before the implementation file"
206
+ },
207
+ {
208
+ "name": "test_initially_fails",
209
+ "description": "Test fails when run before implementation"
210
+ },
211
+ {
212
+ "name": "minimal_implementation",
213
+ "description": "Only minimal code to pass the test was written"
214
+ },
215
+ {
216
+ "name": "refactor_step_present",
217
+ "description": "A refactoring step was performed after GREEN"
218
+ }
219
+ ]
220
+ }
221
+ ```
222
+
223
+ ---
224
+
225
+ ### 3. grading.json
226
+
227
+ Records grading results for an evaluation run. Includes pass/fail judgment and supporting evidence for each assertion.
228
+
229
+ **Schema:**
230
+
231
+ ```json
232
+ {
233
+ "$schema": "http://json-schema.org/draft-07/schema#",
234
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/grading.json",
235
+ "title": "Grading Results",
236
+ "description": "Evaluation grading results",
237
+ "type": "object",
238
+ "required": ["expectations"],
239
+ "properties": {
240
+ "expectations": {
241
+ "type": "array",
242
+ "description": "Grading results per assertion",
243
+ "minItems": 1,
244
+ "items": {
245
+ "type": "object",
246
+ "required": ["text", "passed", "evidence"],
247
+ "properties": {
248
+ "text": {
249
+ "type": "string",
250
+ "description": "Assertion description (corresponds to assertion.description in eval_metadata.json)"
251
+ },
252
+ "passed": {
253
+ "type": "boolean",
254
+ "description": "Whether it passed"
255
+ },
256
+ "evidence": {
257
+ "type": "string",
258
+ "description": "Basis for judgment (specific evidence)"
259
+ }
260
+ }
261
+ }
262
+ }
263
+ }
264
+ }
265
+ ```
266
+
267
+ **Example:**
268
+
269
+ ```json
270
+ {
271
+ "expectations": [
272
+ {
273
+ "text": "Test file was created before the implementation file",
274
+ "passed": true,
275
+ "evidence": "validators.test.ts was created 2 minutes before validators.ts (confirmed via git log)"
276
+ },
277
+ {
278
+ "text": "Test fails when run before implementation",
279
+ "passed": true,
280
+ "evidence": "RED phase confirmed: 'Expected isValidEmail to be defined' error"
281
+ },
282
+ {
283
+ "text": "Only minimal code to pass the test was written",
284
+ "passed": false,
285
+ "evidence": "Initial implementation included unnecessary domain validation logic (exceeds assertion scope)"
286
+ }
287
+ ]
288
+ }
289
+ ```
290
+
291
+ **Pass rate calculation:**
292
+
293
+ ```
294
+ pass_rate = expectations.filter(e => e.passed).length / expectations.length
295
+ ```
296
+
297
+ ---
298
+
299
+ ### 4. timing.json
300
+
301
+ Records time and token usage for an evaluation run.
302
+
303
+ **Schema:**
304
+
305
+ ```json
306
+ {
307
+ "$schema": "http://json-schema.org/draft-07/schema#",
308
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/timing.json",
309
+ "title": "Timing Data",
310
+ "description": "Evaluation execution time and token usage",
311
+ "type": "object",
312
+ "required": ["total_tokens", "duration_ms", "total_duration_seconds"],
313
+ "properties": {
314
+ "total_tokens": {
315
+ "type": "integer",
316
+ "minimum": 0,
317
+ "description": "Total token usage (input + output)"
318
+ },
319
+ "duration_ms": {
320
+ "type": "integer",
321
+ "minimum": 0,
322
+ "description": "Execution time (milliseconds)"
323
+ },
324
+ "total_duration_seconds": {
325
+ "type": "number",
326
+ "minimum": 0,
327
+ "description": "Total execution time (seconds, with decimal)"
328
+ }
329
+ }
330
+ }
331
+ ```
332
+
333
+ **Example:**
334
+
335
+ ```json
336
+ {
337
+ "total_tokens": 45230,
338
+ "duration_ms": 32150,
339
+ "total_duration_seconds": 32.15
340
+ }
341
+ ```
342
+
343
+ **Used in benchmark comparison:**
344
+ - Compare timing.json between `with_skill` and `without_skill` to measure token/time overhead from skill application
345
+
346
+ ---
347
+
348
+ ### 5. feedback.json
349
+
350
+ Records user feedback on evaluations. Located at the iteration root and consolidates feedback across multiple eval runs.
351
+
352
+ **Schema:**
353
+
354
+ ```json
355
+ {
356
+ "$schema": "http://json-schema.org/draft-07/schema#",
357
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/feedback.json",
358
+ "title": "Evaluation Feedback",
359
+ "description": "User feedback on evaluations",
360
+ "type": "object",
361
+ "required": ["reviews", "status"],
362
+ "properties": {
363
+ "reviews": {
364
+ "type": "array",
365
+ "description": "List of feedback items",
366
+ "items": {
367
+ "type": "object",
368
+ "required": ["run_id", "feedback", "timestamp"],
369
+ "properties": {
370
+ "run_id": {
371
+ "type": "string",
372
+ "description": "Run identifier (e.g., eval-0-with_skill, eval-1-without_skill)",
373
+ "pattern": "^eval-[0-9]+-(?:with_skill|without_skill)$"
374
+ },
375
+ "feedback": {
376
+ "type": "string",
377
+ "description": "User feedback content"
378
+ },
379
+ "timestamp": {
380
+ "type": "string",
381
+ "format": "date-time",
382
+ "description": "Feedback creation time (ISO 8601)"
383
+ }
384
+ }
385
+ }
386
+ },
387
+ "status": {
388
+ "type": "string",
389
+ "enum": ["in_progress", "complete"],
390
+ "description": "Feedback collection status"
391
+ }
392
+ }
393
+ }
394
+ ```
395
+
396
+ **Example:**
397
+
398
+ ```json
399
+ {
400
+ "reviews": [
401
+ {
402
+ "run_id": "eval-0-with_skill",
403
+ "feedback": "TDD cycle was well followed, but error message verification was skipped in the RED phase",
404
+ "timestamp": "2026-03-21T14:30:00.000Z"
405
+ },
406
+ {
407
+ "run_id": "eval-0-without_skill",
408
+ "feedback": "When run without the skill, there was a tendency to write tests after implementation",
409
+ "timestamp": "2026-03-21T14:35:00.000Z"
410
+ }
411
+ ],
412
+ "status": "in_progress"
413
+ }
414
+ ```
415
+
416
+ **run_id format:**
417
+ - `eval-{eval_id}-with_skill`: Feedback for skill-applied run
418
+ - `eval-{eval_id}-without_skill`: Feedback for baseline run
419
+
420
+ ---
421
+
422
+ ### 6. benchmark.json
423
+
424
+ Records aggregate benchmark results per iteration. Aggregates `with_skill` vs `without_skill` comparison data across all evals.
425
+
426
+ **Schema:**
427
+
428
+ ```json
429
+ {
430
+ "$schema": "http://json-schema.org/draft-07/schema#",
431
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/benchmark.json",
432
+ "title": "Benchmark Results",
433
+ "description": "Iteration benchmark aggregate results",
434
+ "type": "object",
435
+ "required": ["skill_name", "iteration", "summary", "eval_results"],
436
+ "properties": {
437
+ "skill_name": {
438
+ "type": "string",
439
+ "description": "Name of the skill being evaluated",
440
+ "pattern": "^[a-z][a-z0-9-]*$"
441
+ },
442
+ "iteration": {
443
+ "type": "integer",
444
+ "minimum": 1,
445
+ "description": "Iteration number (1-based)"
446
+ },
447
+ "summary": {
448
+ "type": "object",
449
+ "description": "Aggregate summary statistics across all evals",
450
+ "required": ["pass_rate", "tokens", "duration_seconds"],
451
+ "properties": {
452
+ "pass_rate": {
453
+ "type": "object",
454
+ "required": ["mean", "stddev"],
455
+ "properties": {
456
+ "mean": {
457
+ "type": "number",
458
+ "minimum": 0,
459
+ "maximum": 1,
460
+ "description": "Mean pass rate (0.0~1.0)"
461
+ },
462
+ "stddev": {
463
+ "type": "number",
464
+ "minimum": 0,
465
+ "description": "Pass rate standard deviation"
466
+ }
467
+ }
468
+ },
469
+ "tokens": {
470
+ "type": "object",
471
+ "required": ["mean", "stddev"],
472
+ "properties": {
473
+ "mean": {
474
+ "type": "number",
475
+ "minimum": 0,
476
+ "description": "Mean token usage"
477
+ },
478
+ "stddev": {
479
+ "type": "number",
480
+ "minimum": 0,
481
+ "description": "Token standard deviation"
482
+ }
483
+ }
484
+ },
485
+ "duration_seconds": {
486
+ "type": "object",
487
+ "required": ["mean", "stddev"],
488
+ "properties": {
489
+ "mean": {
490
+ "type": "number",
491
+ "minimum": 0,
492
+ "description": "Mean execution time (seconds)"
493
+ },
494
+ "stddev": {
495
+ "type": "number",
496
+ "minimum": 0,
497
+ "description": "Time standard deviation"
498
+ }
499
+ }
500
+ }
501
+ }
502
+ },
503
+ "eval_results": {
504
+ "type": "array",
505
+ "description": "Individual eval comparison results",
506
+ "items": {
507
+ "type": "object",
508
+ "required": ["eval_id", "with_skill", "baseline"],
509
+ "properties": {
510
+ "eval_id": {
511
+ "type": "integer",
512
+ "minimum": 0,
513
+ "description": "Evaluation ID (0-based)"
514
+ },
515
+ "with_skill": {
516
+ "type": "object",
517
+ "required": ["pass_rate", "tokens", "duration"],
518
+ "description": "Skill-applied results",
519
+ "properties": {
520
+ "pass_rate": {
521
+ "type": "number",
522
+ "minimum": 0,
523
+ "maximum": 1,
524
+ "description": "Pass rate (0.0~1.0)"
525
+ },
526
+ "tokens": {
527
+ "type": "integer",
528
+ "minimum": 0,
529
+ "description": "Token usage"
530
+ },
531
+ "duration": {
532
+ "type": "number",
533
+ "minimum": 0,
534
+ "description": "Execution time (seconds)"
535
+ }
536
+ }
537
+ },
538
+ "baseline": {
539
+ "type": "object",
540
+ "required": ["pass_rate", "tokens", "duration"],
541
+ "description": "Baseline (no skill applied) results",
542
+ "properties": {
543
+ "pass_rate": {
544
+ "type": "number",
545
+ "minimum": 0,
546
+ "maximum": 1
547
+ },
548
+ "tokens": {
549
+ "type": "integer",
550
+ "minimum": 0
551
+ },
552
+ "duration": {
553
+ "type": "number",
554
+ "minimum": 0
555
+ }
556
+ }
557
+ }
558
+ }
559
+ }
560
+ }
561
+ }
562
+ }
563
+ ```
564
+
565
+ **Example:**
566
+
567
+ ```json
568
+ {
569
+ "skill_name": "test-driven-development",
570
+ "iteration": 1,
571
+ "summary": {
572
+ "pass_rate": { "mean": 0.85, "stddev": 0.12 },
573
+ "tokens": { "mean": 42000, "stddev": 5200 },
574
+ "duration_seconds": { "mean": 35.5, "stddev": 8.3 }
575
+ },
576
+ "eval_results": [
577
+ {
578
+ "eval_id": 0,
579
+ "with_skill": { "pass_rate": 0.75, "tokens": 45230, "duration": 32.15 },
580
+ "baseline": { "pass_rate": 0.50, "tokens": 38400, "duration": 28.90 }
581
+ },
582
+ {
583
+ "eval_id": 1,
584
+ "with_skill": { "pass_rate": 1.0, "tokens": 38770, "duration": 38.85 },
585
+ "baseline": { "pass_rate": 0.25, "tokens": 35200, "duration": 25.40 }
586
+ }
587
+ ]
588
+ }
589
+ ```
590
+
591
+ **Interpretation guide:**
592
+ - `with_skill.pass_rate > baseline.pass_rate`: Skill improves quality
593
+ - `with_skill.tokens > baseline.tokens`: Token overhead from skill application
594
+ - Lower `summary.pass_rate.stddev` means more consistent performance
595
+
596
+ ---
597
+
598
+ ### 7. trigger_eval.json
599
+
600
+ Defines test cases for evaluating skill recommendation triggers. Used in benchmark mode to measure the trigger accuracy of `recommend_skills`.
601
+
602
+ **Schema:**
603
+
604
+ ```json
605
+ {
606
+ "$schema": "http://json-schema.org/draft-07/schema#",
607
+ "$id": "https://codingbuddy.dev/schemas/skill-eval/trigger_eval.json",
608
+ "title": "Trigger Evaluation Cases",
609
+ "description": "Test cases for skill recommendation trigger accuracy",
610
+ "type": "array",
611
+ "minItems": 1,
612
+ "items": {
613
+ "type": "object",
614
+ "required": ["query", "should_trigger"],
615
+ "properties": {
616
+ "query": {
617
+ "type": "string",
618
+ "description": "User prompt (test input)"
619
+ },
620
+ "should_trigger": {
621
+ "type": "boolean",
622
+ "description": "Whether the skill should be recommended for this prompt"
623
+ }
624
+ }
625
+ }
626
+ }
627
+ ```
628
+
629
+ **Example:**
630
+
631
+ ```json
632
+ [
633
+ {
634
+ "query": "Add a new feature to validate user registration",
635
+ "should_trigger": true
636
+ },
637
+ {
638
+ "query": "Fix the null pointer exception in the payment module",
639
+ "should_trigger": true
640
+ },
641
+ {
642
+ "query": "What does this function do?",
643
+ "should_trigger": false
644
+ },
645
+ {
646
+ "query": "Deploy the application to production",
647
+ "should_trigger": false
648
+ }
649
+ ]
650
+ ```
651
+
652
+ **Usage:**
653
+ 1. Pass each `query` from `trigger_eval.json` to `recommend_skills`
654
+ 2. Check whether the target skill is included in the results
655
+ 3. Compare against `should_trigger` to calculate accuracy
656
+
657
+ **Accuracy metrics:**
658
+ - **Precision**: Of all queries for which the skill was recommended, the fraction where `should_trigger` is `true` (TP / (TP + FP))
659
+ - **Recall**: Of all queries where `should_trigger` is `true`, the fraction for which the skill was actually recommended (TP / (TP + FN))
660
+ - **F1 Score**: Harmonic mean of Precision and Recall
661
+
662
+ ---
663
+
664
+ ## Schema Relationships
665
+
666
+ ```
667
+ evals.json (workspace root)
668
+
669
+ │ eval.id → eval_id mapping
670
+
671
+ iteration-N/eval-{id}/
672
+ ├── with_skill/
673
+ │ ├── eval_metadata.json ◄── Prompt copied from evals.json; assertions for this eval
674
+ │ ├── grading.json ◄── Grading of assertions from eval_metadata.json
675
+ │ └── timing.json ── Independent measurement
676
+ └── without_skill/
677
+ ├── eval_metadata.json
678
+ ├── grading.json
679
+ └── timing.json
680
+
681
+ ▼ Aggregation
682
+ iteration-N/
683
+ ├── benchmark.json ◄── with_skill vs without_skill comparison across all evals
684
+ └── feedback.json ◄── User feedback (references evals via run_id)
685
+
686
+ trigger_eval.json (workspace root) ── recommend_skills accuracy measurement (independent)
687
+ ```
688
+
689
+ **Data flow:**
690
+ 1. Define evaluation scenarios in `evals.json`
691
+ 2. Run `with_skill/` and `without_skill/` for each eval
692
+ 3. Record the prompt and assertions for each run in `eval_metadata.json`
693
+ 4. Store per-assertion grading results in `grading.json`
694
+ 5. Measure tokens/time in `timing.json`
695
+ 6. Aggregate and compare all eval results in `benchmark.json`
696
+ 7. Collect user feedback in `feedback.json`
697
+ 8. Measure recommendation accuracy separately with `trigger_eval.json`
698
+
699
+ **ID mapping:**
700
+ - `evals.json` `id` (1-based) → `eval-{id-1}/` directory (0-based)
701
+ - `eval_metadata.json` `eval_id` (0-based) = eval number of the directory
702
+ - `feedback.json` `run_id` = `eval-{eval_id}-{with_skill|without_skill}`
703
+
704
+ ---
705
+
706
+ ## Validation
707
+
708
+ ### Validation with ajv CLI
709
+
710
+ ```bash
711
+ # Validate evals.json
712
+ npx ajv-cli@5.0.0 validate \
713
+ -s schemas/evals.schema.json \
714
+ -d workspace/evals.json \
715
+ --spec=draft7
716
+
717
+ # Validate all grading.json files across every iteration
718
+ for f in workspace/iteration-*/eval-*/*/grading.json; do
719
+ npx ajv-cli@5.0.0 validate \
720
+ -s schemas/grading.schema.json \
721
+ -d "$f" \
722
+ --spec=draft7
723
+ done
724
+
725
+ # Validate trigger_eval.json
726
+ npx ajv-cli@5.0.0 validate \
727
+ -s schemas/trigger_eval.schema.json \
728
+ -d workspace/trigger_eval.json \
729
+ --spec=draft7
730
+ ```
731
+
732
+ ### Programmatic Validation
733
+
734
+ ```typescript
735
+ import Ajv from 'ajv';
736
+ import addFormats from 'ajv-formats';
737
+
738
+ const ajv = new Ajv({ allErrors: true });
739
+ addFormats(ajv);
740
+
741
+ // Load schemas
742
+ const evalsSchema = require('./schemas/evals.schema.json');
743
+ const gradingSchema = require('./schemas/grading.schema.json');
744
+ const timingSchema = require('./schemas/timing.schema.json');
745
+
746
+ // Compile validation functions
747
+ const validateEvals = ajv.compile(evalsSchema);
748
+ const validateGrading = ajv.compile(gradingSchema);
749
+ const validateTiming = ajv.compile(timingSchema);
750
+
751
+ // Validate data
752
+ const evalsData = require('./workspace/evals.json');
753
+ if (!validateEvals(evalsData)) {
754
+ console.error('evals.json validation errors:', validateEvals.errors);
755
+ }
756
+ ```
757
+
758
+ ### Consistency Validation
759
+
760
+ Additional validation to check referential integrity between schemas:
761
+
762
+ ```typescript
763
+ // Verify evals.json id to eval directory mapping
764
+ function validateConsistency(workspacePath: string): string[] {
765
+ const errors: string[] = [];
766
+ const evals = require(`${workspacePath}/evals.json`);
767
+
768
+ for (const evalCase of evals.evals) {
769
+ const evalDir = `${workspacePath}/iteration-1/eval-${evalCase.id - 1}`;
770
+
771
+ // Check that with_skill/eval_metadata.json exists
772
+ if (!fs.existsSync(`${evalDir}/with_skill/eval_metadata.json`)) {
773
+ errors.push(`Missing: ${evalDir}/with_skill/eval_metadata.json`);
774
+ }
775
+
776
+ // Check that without_skill/eval_metadata.json exists
777
+ if (!fs.existsSync(`${evalDir}/without_skill/eval_metadata.json`)) {
778
+ errors.push(`Missing: ${evalDir}/without_skill/eval_metadata.json`);
779
+ }
780
+ }
781
+
782
+ return errors;
783
+ }
784
+ ```