codingbuddy-rules 4.5.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.ai-rules/adapters/antigravity.md +6 -6
- package/.ai-rules/adapters/claude-code.md +68 -4
- package/.ai-rules/adapters/codex.md +5 -5
- package/.ai-rules/adapters/cursor.md +2 -2
- package/.ai-rules/adapters/kiro.md +8 -8
- package/.ai-rules/adapters/opencode.md +7 -7
- package/.ai-rules/adapters/q.md +2 -2
- package/.ai-rules/agents/README.md +66 -16
- package/.ai-rules/agents/accessibility-specialist.json +2 -1
- package/.ai-rules/agents/act-mode.json +2 -1
- package/.ai-rules/agents/agent-architect.json +8 -7
- package/.ai-rules/agents/ai-ml-engineer.json +1 -0
- package/.ai-rules/agents/architecture-specialist.json +1 -0
- package/.ai-rules/agents/auto-mode.json +4 -2
- package/.ai-rules/agents/backend-developer.json +1 -0
- package/.ai-rules/agents/code-quality-specialist.json +1 -0
- package/.ai-rules/agents/code-reviewer.json +65 -64
- package/.ai-rules/agents/data-engineer.json +8 -7
- package/.ai-rules/agents/data-scientist.json +10 -9
- package/.ai-rules/agents/devops-engineer.json +1 -0
- package/.ai-rules/agents/documentation-specialist.json +1 -0
- package/.ai-rules/agents/eval-mode.json +20 -19
- package/.ai-rules/agents/event-architecture-specialist.json +1 -0
- package/.ai-rules/agents/frontend-developer.json +1 -0
- package/.ai-rules/agents/i18n-specialist.json +2 -1
- package/.ai-rules/agents/integration-specialist.json +1 -0
- package/.ai-rules/agents/migration-specialist.json +1 -0
- package/.ai-rules/agents/mobile-developer.json +8 -7
- package/.ai-rules/agents/observability-specialist.json +1 -0
- package/.ai-rules/agents/parallel-orchestrator.json +346 -0
- package/.ai-rules/agents/performance-specialist.json +1 -0
- package/.ai-rules/agents/plan-mode.json +3 -1
- package/.ai-rules/agents/plan-reviewer.json +208 -0
- package/.ai-rules/agents/platform-engineer.json +1 -0
- package/.ai-rules/agents/security-engineer.json +9 -8
- package/.ai-rules/agents/security-specialist.json +2 -1
- package/.ai-rules/agents/seo-specialist.json +1 -0
- package/.ai-rules/agents/software-engineer.json +1 -0
- package/.ai-rules/agents/solution-architect.json +11 -10
- package/.ai-rules/agents/systems-developer.json +9 -8
- package/.ai-rules/agents/technical-planner.json +11 -10
- package/.ai-rules/agents/test-engineer.json +7 -6
- package/.ai-rules/agents/test-strategy-specialist.json +1 -0
- package/.ai-rules/agents/tooling-engineer.json +4 -3
- package/.ai-rules/agents/ui-ux-designer.json +1 -0
- package/.ai-rules/keyword-modes.json +4 -4
- package/.ai-rules/rules/clarification-guide.md +14 -14
- package/.ai-rules/rules/core.md +73 -0
- package/.ai-rules/rules/parallel-execution.md +217 -0
- package/.ai-rules/skills/README.md +23 -1
- package/.ai-rules/skills/agent-design/SKILL.md +5 -0
- package/.ai-rules/skills/agent-design/examples/agent-template.json +58 -0
- package/.ai-rules/skills/agent-design/references/expertise-guidelines.md +112 -0
- package/.ai-rules/skills/agent-discussion/SKILL.md +199 -0
- package/.ai-rules/skills/agent-discussion-panel/SKILL.md +448 -0
- package/.ai-rules/skills/api-design/SKILL.md +5 -0
- package/.ai-rules/skills/api-design/examples/error-response.json +159 -0
- package/.ai-rules/skills/api-design/examples/openapi-template.yaml +393 -0
- package/.ai-rules/skills/build-fix/SKILL.md +234 -0
- package/.ai-rules/skills/code-explanation/SKILL.md +4 -0
- package/.ai-rules/skills/context-management/SKILL.md +1 -0
- package/.ai-rules/skills/cost-budget/SKILL.md +348 -0
- package/.ai-rules/skills/cross-repo-issues/SKILL.md +257 -0
- package/.ai-rules/skills/database-migration/SKILL.md +1 -0
- package/.ai-rules/skills/deepsearch/SKILL.md +214 -0
- package/.ai-rules/skills/deployment-checklist/SKILL.md +1 -0
- package/.ai-rules/skills/error-analysis/SKILL.md +1 -0
- package/.ai-rules/skills/finishing-a-development-branch/SKILL.md +281 -0
- package/.ai-rules/skills/frontend-design/SKILL.md +5 -0
- package/.ai-rules/skills/frontend-design/examples/component-template.tsx +203 -0
- package/.ai-rules/skills/frontend-design/references/css-patterns.md +243 -0
- package/.ai-rules/skills/git-master/SKILL.md +358 -0
- package/.ai-rules/skills/incident-response/SKILL.md +1 -0
- package/.ai-rules/skills/legacy-modernization/SKILL.md +1 -0
- package/.ai-rules/skills/mcp-builder/SKILL.md +7 -0
- package/.ai-rules/skills/mcp-builder/examples/resource-example.ts +233 -0
- package/.ai-rules/skills/mcp-builder/examples/tool-example.ts +203 -0
- package/.ai-rules/skills/mcp-builder/references/protocol-spec.md +215 -0
- package/.ai-rules/skills/performance-optimization/SKILL.md +3 -0
- package/.ai-rules/skills/plan-and-review/SKILL.md +115 -0
- package/.ai-rules/skills/pr-all-in-one/SKILL.md +15 -13
- package/.ai-rules/skills/pr-all-in-one/configuration-guide.md +7 -7
- package/.ai-rules/skills/pr-all-in-one/pr-templates.md +10 -10
- package/.ai-rules/skills/pr-review/SKILL.md +4 -0
- package/.ai-rules/skills/receiving-code-review/SKILL.md +347 -0
- package/.ai-rules/skills/refactoring/SKILL.md +1 -0
- package/.ai-rules/skills/requesting-code-review/SKILL.md +348 -0
- package/.ai-rules/skills/rule-authoring/SKILL.md +5 -0
- package/.ai-rules/skills/rule-authoring/examples/rule-template.md +142 -0
- package/.ai-rules/skills/rule-authoring/examples/trigger-patterns.md +126 -0
- package/.ai-rules/skills/security-audit/SKILL.md +4 -0
- package/.ai-rules/skills/skill-creator/SKILL.md +461 -0
- package/.ai-rules/skills/skill-creator/agents/analyzer.md +206 -0
- package/.ai-rules/skills/skill-creator/agents/comparator.md +167 -0
- package/.ai-rules/skills/skill-creator/agents/grader.md +152 -0
- package/.ai-rules/skills/skill-creator/assets/eval_review.html +289 -0
- package/.ai-rules/skills/skill-creator/assets/skill-template.md +43 -0
- package/.ai-rules/skills/skill-creator/eval-viewer/generate_review.py +496 -0
- package/.ai-rules/skills/skill-creator/references/frontmatter-guide.md +632 -0
- package/.ai-rules/skills/skill-creator/references/multi-tool-compat.md +480 -0
- package/.ai-rules/skills/skill-creator/references/schemas.md +784 -0
- package/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py +302 -0
- package/.ai-rules/skills/skill-creator/scripts/init_skill.sh +196 -0
- package/.ai-rules/skills/skill-creator/scripts/run_loop.py +327 -0
- package/.ai-rules/skills/systematic-debugging/SKILL.md +1 -0
- package/.ai-rules/skills/tech-debt/SKILL.md +1 -0
- package/.ai-rules/skills/test-coverage-gate/SKILL.md +303 -0
- package/.ai-rules/skills/tmux-master/SKILL.md +491 -0
- package/.ai-rules/skills/using-git-worktrees/SKILL.md +368 -0
- package/.ai-rules/skills/verification-before-completion/SKILL.md +234 -0
- package/.ai-rules/skills/widget-slot-architecture/SKILL.md +6 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/parallel-route-setup.tsx +206 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/widget-component.tsx +250 -0
- package/.ai-rules/skills/writing-plans/SKILL.md +78 -0
- package/bin/cli.js +178 -0
- package/lib/init/detect-stack.js +148 -0
- package/lib/init/generate-config.js +31 -0
- package/lib/init/index.js +86 -0
- package/lib/init/prompt.js +60 -0
- package/lib/init/scaffold.js +67 -0
- package/lib/init/suggest-agent.js +46 -0
- package/package.json +10 -2
|
@@ -0,0 +1,784 @@
|
|
|
1
|
+
# Skill Evaluation Schemas Reference
|
|
2
|
+
|
|
3
|
+
JSON schema definitions and workspace directory structure reference for the skill evaluation system.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
|
|
7
|
+
- [Workspace Directory Structure](#workspace-directory-structure)
|
|
8
|
+
- [Schema Definitions](#schema-definitions)
|
|
9
|
+
- [1. evals.json](#1-evalsjson)
|
|
10
|
+
- [2. eval_metadata.json](#2-eval_metadatajson)
|
|
11
|
+
- [3. grading.json](#3-gradingjson)
|
|
12
|
+
- [4. timing.json](#4-timingjson)
|
|
13
|
+
- [5. feedback.json](#5-feedbackjson)
|
|
14
|
+
- [6. benchmark.json](#6-benchmarkjson)
|
|
15
|
+
- [7. trigger_eval.json](#7-trigger_evaljson)
|
|
16
|
+
- [Schema Relationships](#schema-relationships)
|
|
17
|
+
- [Validation](#validation)
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Workspace Directory Structure
|
|
22
|
+
|
|
23
|
+
The workspace directory structure below is used during skill evaluation. Each iteration represents a skill modification cycle, and each eval includes a `with_skill` (skill applied) and `without_skill` (baseline) comparison run.
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
workspace/
|
|
27
|
+
├── evals.json # Evaluation scenario definitions (shared across all iterations)
|
|
28
|
+
├── trigger_eval.json # Trigger evaluation cases (benchmark mode)
|
|
29
|
+
└── iteration-N/ # Nth skill modification cycle
|
|
30
|
+
├── eval-0/ # First evaluation (0-based)
|
|
31
|
+
│ ├── with_skill/ # Skill-applied run
|
|
32
|
+
│ │ ├── outputs/ # Generated output files
|
|
33
|
+
│ │ ├── eval_metadata.json # Evaluation metadata
|
|
34
|
+
│ │ ├── grading.json # Grading results
|
|
35
|
+
│ │ └── timing.json # Execution time measurements
|
|
36
|
+
│ └── without_skill/ # Baseline run (no skill applied)
|
|
37
|
+
│ ├── outputs/ # Generated output files
|
|
38
|
+
│ ├── eval_metadata.json # Evaluation metadata
|
|
39
|
+
│ ├── grading.json # Grading results
|
|
40
|
+
│ └── timing.json # Execution time measurements
|
|
41
|
+
├── eval-1/ # Second evaluation
|
|
42
|
+
│ ├── with_skill/
|
|
43
|
+
│ │ └── ...
|
|
44
|
+
│ └── without_skill/
|
|
45
|
+
│ └── ...
|
|
46
|
+
├── benchmark.json # Iteration benchmark aggregate results
|
|
47
|
+
├── benchmark.md # Benchmark markdown report
|
|
48
|
+
└── feedback.json # User feedback
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Directory naming conventions:**
|
|
52
|
+
- `iteration-N`: 1-based sequential number. A new iteration is created each time SKILL.md is modified
|
|
53
|
+
- `eval-N`: 0-based sequential number. Corresponds to each scenario in `evals.json`
|
|
54
|
+
- `with_skill/`: Evaluation run with the skill applied
|
|
55
|
+
- `without_skill/`: Baseline run without the skill (comparison target)
|
|
56
|
+
- `outputs/`: Files generated during each run (code, documents, etc.)
|
|
57
|
+
|
|
58
|
+
**File location rules:**
|
|
59
|
+
- `evals.json`, `trigger_eval.json`: Workspace root (iteration-independent)
|
|
60
|
+
- `eval_metadata.json`, `grading.json`, `timing.json`: Inside each eval's `with_skill/` or `without_skill/`
|
|
61
|
+
- `benchmark.json`, `benchmark.md`, `feedback.json`: Iteration root
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Schema Definitions
|
|
66
|
+
|
|
67
|
+
### 1. evals.json
|
|
68
|
+
|
|
69
|
+
Defines the list of evaluation scenarios. Located at the workspace root and shared across all iterations.
|
|
70
|
+
|
|
71
|
+
**Schema:**
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
76
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/evals.json",
|
|
77
|
+
"title": "Skill Evaluation Scenarios",
|
|
78
|
+
"description": "Skill evaluation scenario definitions",
|
|
79
|
+
"type": "object",
|
|
80
|
+
"required": ["skill_name", "evals"],
|
|
81
|
+
"properties": {
|
|
82
|
+
"skill_name": {
|
|
83
|
+
"type": "string",
|
|
84
|
+
"description": "Name of the skill being evaluated (kebab-case)",
|
|
85
|
+
"pattern": "^[a-z][a-z0-9-]*$"
|
|
86
|
+
},
|
|
87
|
+
"evals": {
|
|
88
|
+
"type": "array",
|
|
89
|
+
"description": "List of evaluation scenarios",
|
|
90
|
+
"minItems": 1,
|
|
91
|
+
"items": {
|
|
92
|
+
"type": "object",
|
|
93
|
+
"required": ["id", "prompt", "expected_output", "files"],
|
|
94
|
+
"properties": {
|
|
95
|
+
"id": {
|
|
96
|
+
"type": "integer",
|
|
97
|
+
"minimum": 1,
|
|
98
|
+
"description": "Scenario ID (1-based)"
|
|
99
|
+
},
|
|
100
|
+
"prompt": {
|
|
101
|
+
"type": "string",
|
|
102
|
+
"description": "User task prompt"
|
|
103
|
+
},
|
|
104
|
+
"expected_output": {
|
|
105
|
+
"type": "string",
|
|
106
|
+
"description": "Expected result description"
|
|
107
|
+
},
|
|
108
|
+
"files": {
|
|
109
|
+
"type": "array",
|
|
110
|
+
"items": { "type": "string" },
|
|
111
|
+
"description": "List of input file paths (empty array if none)"
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**Example:**
|
|
121
|
+
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"skill_name": "test-driven-development",
|
|
125
|
+
"evals": [
|
|
126
|
+
{
|
|
127
|
+
"id": 1,
|
|
128
|
+
"prompt": "Add a function that validates email addresses",
|
|
129
|
+
"expected_output": "Test file created first with failing test, then implementation, then refactor",
|
|
130
|
+
"files": ["src/utils/validators.ts"]
|
|
131
|
+
},
|
|
132
|
+
{
|
|
133
|
+
"id": 2,
|
|
134
|
+
"prompt": "Fix the login timeout bug",
|
|
135
|
+
"expected_output": "Failing test reproducing the bug, then minimal fix, then cleanup",
|
|
136
|
+
"files": ["src/auth/login.ts", "src/auth/login.test.ts"]
|
|
137
|
+
}
|
|
138
|
+
]
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
### 2. eval_metadata.json
|
|
145
|
+
|
|
146
|
+
Records metadata for an individual evaluation run. Located in each `with_skill/` and `without_skill/` directory.
|
|
147
|
+
|
|
148
|
+
**Schema:**
|
|
149
|
+
|
|
150
|
+
```json
|
|
151
|
+
{
|
|
152
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
153
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/eval_metadata.json",
|
|
154
|
+
"title": "Evaluation Metadata",
|
|
155
|
+
"description": "Metadata for an individual evaluation run",
|
|
156
|
+
"type": "object",
|
|
157
|
+
"required": ["eval_id", "eval_name", "prompt", "assertions"],
|
|
158
|
+
"properties": {
|
|
159
|
+
"eval_id": {
|
|
160
|
+
"type": "integer",
|
|
161
|
+
"minimum": 0,
|
|
162
|
+
"description": "Evaluation ID (0-based, corresponds to evals.json id-1)"
|
|
163
|
+
},
|
|
164
|
+
"eval_name": {
|
|
165
|
+
"type": "string",
|
|
166
|
+
"description": "Descriptive name for the evaluation"
|
|
167
|
+
},
|
|
168
|
+
"prompt": {
|
|
169
|
+
"type": "string",
|
|
170
|
+
"description": "User task prompt (copied from evals.json)"
|
|
171
|
+
},
|
|
172
|
+
"assertions": {
|
|
173
|
+
"type": "array",
|
|
174
|
+
"description": "List of verification items",
|
|
175
|
+
"minItems": 1,
|
|
176
|
+
"items": {
|
|
177
|
+
"type": "object",
|
|
178
|
+
"required": ["name", "description"],
|
|
179
|
+
"properties": {
|
|
180
|
+
"name": {
|
|
181
|
+
"type": "string",
|
|
182
|
+
"description": "Verifiable item name"
|
|
183
|
+
},
|
|
184
|
+
"description": {
|
|
185
|
+
"type": "string",
|
|
186
|
+
"description": "Description of pass criteria"
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
**Example:**
|
|
196
|
+
|
|
197
|
+
```json
|
|
198
|
+
{
|
|
199
|
+
"eval_id": 0,
|
|
200
|
+
"eval_name": "Email Validation TDD Cycle",
|
|
201
|
+
"prompt": "Add a function that validates email addresses",
|
|
202
|
+
"assertions": [
|
|
203
|
+
{
|
|
204
|
+
"name": "test_file_created_first",
|
|
205
|
+
"description": "Test file was created before the implementation file"
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"name": "test_initially_fails",
|
|
209
|
+
"description": "Test fails when run before implementation"
|
|
210
|
+
},
|
|
211
|
+
{
|
|
212
|
+
"name": "minimal_implementation",
|
|
213
|
+
"description": "Only minimal code to pass the test was written"
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
"name": "refactor_step_present",
|
|
217
|
+
"description": "A refactoring step was performed after GREEN"
|
|
218
|
+
}
|
|
219
|
+
]
|
|
220
|
+
}
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
### 3. grading.json
|
|
226
|
+
|
|
227
|
+
Records grading results for an evaluation run. Includes pass/fail judgment and supporting evidence for each assertion.
|
|
228
|
+
|
|
229
|
+
**Schema:**
|
|
230
|
+
|
|
231
|
+
```json
|
|
232
|
+
{
|
|
233
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
234
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/grading.json",
|
|
235
|
+
"title": "Grading Results",
|
|
236
|
+
"description": "Evaluation grading results",
|
|
237
|
+
"type": "object",
|
|
238
|
+
"required": ["expectations"],
|
|
239
|
+
"properties": {
|
|
240
|
+
"expectations": {
|
|
241
|
+
"type": "array",
|
|
242
|
+
"description": "Grading results per assertion",
|
|
243
|
+
"minItems": 1,
|
|
244
|
+
"items": {
|
|
245
|
+
"type": "object",
|
|
246
|
+
"required": ["text", "passed", "evidence"],
|
|
247
|
+
"properties": {
|
|
248
|
+
"text": {
|
|
249
|
+
"type": "string",
|
|
250
|
+
"description": "Assertion description (corresponds to assertion.description in eval_metadata.json)"
|
|
251
|
+
},
|
|
252
|
+
"passed": {
|
|
253
|
+
"type": "boolean",
|
|
254
|
+
"description": "Whether it passed"
|
|
255
|
+
},
|
|
256
|
+
"evidence": {
|
|
257
|
+
"type": "string",
|
|
258
|
+
"description": "Basis for judgment (specific evidence)"
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**Example:**
|
|
268
|
+
|
|
269
|
+
```json
|
|
270
|
+
{
|
|
271
|
+
"expectations": [
|
|
272
|
+
{
|
|
273
|
+
"text": "Test file was created before the implementation file",
|
|
274
|
+
"passed": true,
|
|
275
|
+
"evidence": "validators.test.ts was created 2 minutes before validators.ts (confirmed via git log)"
|
|
276
|
+
},
|
|
277
|
+
{
|
|
278
|
+
"text": "Test fails when run before implementation",
|
|
279
|
+
"passed": true,
|
|
280
|
+
"evidence": "RED phase confirmed: 'Expected isValidEmail to be defined' error"
|
|
281
|
+
},
|
|
282
|
+
{
|
|
283
|
+
"text": "Only minimal code to pass the test was written",
|
|
284
|
+
"passed": false,
|
|
285
|
+
"evidence": "Initial implementation included unnecessary domain validation logic (exceeds assertion scope)"
|
|
286
|
+
}
|
|
287
|
+
]
|
|
288
|
+
}
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
**Pass rate calculation:**
|
|
292
|
+
|
|
293
|
+
```
|
|
294
|
+
pass_rate = expectations.filter(e => e.passed).length / expectations.length
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
---
|
|
298
|
+
|
|
299
|
+
### 4. timing.json
|
|
300
|
+
|
|
301
|
+
Records time and token usage for an evaluation run.
|
|
302
|
+
|
|
303
|
+
**Schema:**
|
|
304
|
+
|
|
305
|
+
```json
|
|
306
|
+
{
|
|
307
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
308
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/timing.json",
|
|
309
|
+
"title": "Timing Data",
|
|
310
|
+
"description": "Evaluation execution time and token usage",
|
|
311
|
+
"type": "object",
|
|
312
|
+
"required": ["total_tokens", "duration_ms", "total_duration_seconds"],
|
|
313
|
+
"properties": {
|
|
314
|
+
"total_tokens": {
|
|
315
|
+
"type": "integer",
|
|
316
|
+
"minimum": 0,
|
|
317
|
+
"description": "Total token usage (input + output)"
|
|
318
|
+
},
|
|
319
|
+
"duration_ms": {
|
|
320
|
+
"type": "integer",
|
|
321
|
+
"minimum": 0,
|
|
322
|
+
"description": "Execution time (milliseconds)"
|
|
323
|
+
},
|
|
324
|
+
"total_duration_seconds": {
|
|
325
|
+
"type": "number",
|
|
326
|
+
"minimum": 0,
|
|
327
|
+
"description": "Total execution time (seconds, with decimal)"
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
**Example:**
|
|
334
|
+
|
|
335
|
+
```json
|
|
336
|
+
{
|
|
337
|
+
"total_tokens": 45230,
|
|
338
|
+
"duration_ms": 32150,
|
|
339
|
+
"total_duration_seconds": 32.15
|
|
340
|
+
}
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
**Used in benchmark comparison:**
|
|
344
|
+
- Compare timing.json between `with_skill` and `without_skill` to measure token/time overhead from skill application
|
|
345
|
+
|
|
346
|
+
---
|
|
347
|
+
|
|
348
|
+
### 5. feedback.json
|
|
349
|
+
|
|
350
|
+
Records user feedback on evaluations. Located at the iteration root and consolidates feedback across multiple eval runs.
|
|
351
|
+
|
|
352
|
+
**Schema:**
|
|
353
|
+
|
|
354
|
+
```json
|
|
355
|
+
{
|
|
356
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
357
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/feedback.json",
|
|
358
|
+
"title": "Evaluation Feedback",
|
|
359
|
+
"description": "User feedback on evaluations",
|
|
360
|
+
"type": "object",
|
|
361
|
+
"required": ["reviews", "status"],
|
|
362
|
+
"properties": {
|
|
363
|
+
"reviews": {
|
|
364
|
+
"type": "array",
|
|
365
|
+
"description": "List of feedback items",
|
|
366
|
+
"items": {
|
|
367
|
+
"type": "object",
|
|
368
|
+
"required": ["run_id", "feedback", "timestamp"],
|
|
369
|
+
"properties": {
|
|
370
|
+
"run_id": {
|
|
371
|
+
"type": "string",
|
|
372
|
+
"description": "Run identifier (e.g., eval-0-with_skill, eval-1-without_skill)",
|
|
373
|
+
"pattern": "^eval-[0-9]+-(?:with_skill|without_skill)$"
|
|
374
|
+
},
|
|
375
|
+
"feedback": {
|
|
376
|
+
"type": "string",
|
|
377
|
+
"description": "User feedback content"
|
|
378
|
+
},
|
|
379
|
+
"timestamp": {
|
|
380
|
+
"type": "string",
|
|
381
|
+
"format": "date-time",
|
|
382
|
+
"description": "Feedback creation time (ISO 8601)"
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
},
|
|
387
|
+
"status": {
|
|
388
|
+
"type": "string",
|
|
389
|
+
"enum": ["in_progress", "complete"],
|
|
390
|
+
"description": "Feedback collection status"
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
}
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
**Example:**
|
|
397
|
+
|
|
398
|
+
```json
|
|
399
|
+
{
|
|
400
|
+
"reviews": [
|
|
401
|
+
{
|
|
402
|
+
"run_id": "eval-0-with_skill",
|
|
403
|
+
"feedback": "TDD cycle was well followed, but error message verification was skipped in the RED phase",
|
|
404
|
+
"timestamp": "2026-03-21T14:30:00.000Z"
|
|
405
|
+
},
|
|
406
|
+
{
|
|
407
|
+
"run_id": "eval-0-without_skill",
|
|
408
|
+
"feedback": "When run without the skill, there was a tendency to write tests after implementation",
|
|
409
|
+
"timestamp": "2026-03-21T14:35:00.000Z"
|
|
410
|
+
}
|
|
411
|
+
],
|
|
412
|
+
"status": "in_progress"
|
|
413
|
+
}
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
**run_id format:**
|
|
417
|
+
- `eval-{eval_id}-with_skill`: Feedback for skill-applied run
|
|
418
|
+
- `eval-{eval_id}-without_skill`: Feedback for baseline run
|
|
419
|
+
|
|
420
|
+
---
|
|
421
|
+
|
|
422
|
+
### 6. benchmark.json
|
|
423
|
+
|
|
424
|
+
Records aggregate benchmark results per iteration. Aggregates `with_skill` vs `without_skill` comparison data across all evals.
|
|
425
|
+
|
|
426
|
+
**Schema:**
|
|
427
|
+
|
|
428
|
+
```json
|
|
429
|
+
{
|
|
430
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
431
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/benchmark.json",
|
|
432
|
+
"title": "Benchmark Results",
|
|
433
|
+
"description": "Iteration benchmark aggregate results",
|
|
434
|
+
"type": "object",
|
|
435
|
+
"required": ["skill_name", "iteration", "summary", "eval_results"],
|
|
436
|
+
"properties": {
|
|
437
|
+
"skill_name": {
|
|
438
|
+
"type": "string",
|
|
439
|
+
"description": "Name of the skill being evaluated",
|
|
440
|
+
"pattern": "^[a-z][a-z0-9-]*$"
|
|
441
|
+
},
|
|
442
|
+
"iteration": {
|
|
443
|
+
"type": "integer",
|
|
444
|
+
"minimum": 1,
|
|
445
|
+
"description": "Iteration number (1-based)"
|
|
446
|
+
},
|
|
447
|
+
"summary": {
|
|
448
|
+
"type": "object",
|
|
449
|
+
"description": "Aggregate summary statistics across all evals",
|
|
450
|
+
"required": ["pass_rate", "tokens", "duration_seconds"],
|
|
451
|
+
"properties": {
|
|
452
|
+
"pass_rate": {
|
|
453
|
+
"type": "object",
|
|
454
|
+
"required": ["mean", "stddev"],
|
|
455
|
+
"properties": {
|
|
456
|
+
"mean": {
|
|
457
|
+
"type": "number",
|
|
458
|
+
"minimum": 0,
|
|
459
|
+
"maximum": 1,
|
|
460
|
+
"description": "Mean pass rate (0.0~1.0)"
|
|
461
|
+
},
|
|
462
|
+
"stddev": {
|
|
463
|
+
"type": "number",
|
|
464
|
+
"minimum": 0,
|
|
465
|
+
"description": "Pass rate standard deviation"
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
},
|
|
469
|
+
"tokens": {
|
|
470
|
+
"type": "object",
|
|
471
|
+
"required": ["mean", "stddev"],
|
|
472
|
+
"properties": {
|
|
473
|
+
"mean": {
|
|
474
|
+
"type": "number",
|
|
475
|
+
"minimum": 0,
|
|
476
|
+
"description": "Mean token usage"
|
|
477
|
+
},
|
|
478
|
+
"stddev": {
|
|
479
|
+
"type": "number",
|
|
480
|
+
"minimum": 0,
|
|
481
|
+
"description": "Token standard deviation"
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
},
|
|
485
|
+
"duration_seconds": {
|
|
486
|
+
"type": "object",
|
|
487
|
+
"required": ["mean", "stddev"],
|
|
488
|
+
"properties": {
|
|
489
|
+
"mean": {
|
|
490
|
+
"type": "number",
|
|
491
|
+
"minimum": 0,
|
|
492
|
+
"description": "Mean execution time (seconds)"
|
|
493
|
+
},
|
|
494
|
+
"stddev": {
|
|
495
|
+
"type": "number",
|
|
496
|
+
"minimum": 0,
|
|
497
|
+
"description": "Time standard deviation"
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
}
|
|
502
|
+
},
|
|
503
|
+
"eval_results": {
|
|
504
|
+
"type": "array",
|
|
505
|
+
"description": "Individual eval comparison results",
|
|
506
|
+
"items": {
|
|
507
|
+
"type": "object",
|
|
508
|
+
"required": ["eval_id", "with_skill", "baseline"],
|
|
509
|
+
"properties": {
|
|
510
|
+
"eval_id": {
|
|
511
|
+
"type": "integer",
|
|
512
|
+
"minimum": 0,
|
|
513
|
+
"description": "Evaluation ID (0-based)"
|
|
514
|
+
},
|
|
515
|
+
"with_skill": {
|
|
516
|
+
"type": "object",
|
|
517
|
+
"required": ["pass_rate", "tokens", "duration"],
|
|
518
|
+
"description": "Skill-applied results",
|
|
519
|
+
"properties": {
|
|
520
|
+
"pass_rate": {
|
|
521
|
+
"type": "number",
|
|
522
|
+
"minimum": 0,
|
|
523
|
+
"maximum": 1,
|
|
524
|
+
"description": "Pass rate (0.0~1.0)"
|
|
525
|
+
},
|
|
526
|
+
"tokens": {
|
|
527
|
+
"type": "integer",
|
|
528
|
+
"minimum": 0,
|
|
529
|
+
"description": "Token usage"
|
|
530
|
+
},
|
|
531
|
+
"duration": {
|
|
532
|
+
"type": "number",
|
|
533
|
+
"minimum": 0,
|
|
534
|
+
"description": "Execution time (seconds)"
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
},
|
|
538
|
+
"baseline": {
|
|
539
|
+
"type": "object",
|
|
540
|
+
"required": ["pass_rate", "tokens", "duration"],
|
|
541
|
+
"description": "Baseline (no skill applied) results",
|
|
542
|
+
"properties": {
|
|
543
|
+
"pass_rate": {
|
|
544
|
+
"type": "number",
|
|
545
|
+
"minimum": 0,
|
|
546
|
+
"maximum": 1
|
|
547
|
+
},
|
|
548
|
+
"tokens": {
|
|
549
|
+
"type": "integer",
|
|
550
|
+
"minimum": 0
|
|
551
|
+
},
|
|
552
|
+
"duration": {
|
|
553
|
+
"type": "number",
|
|
554
|
+
"minimum": 0
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
**Example:**
|
|
566
|
+
|
|
567
|
+
```json
|
|
568
|
+
{
|
|
569
|
+
"skill_name": "test-driven-development",
|
|
570
|
+
"iteration": 1,
|
|
571
|
+
"summary": {
|
|
572
|
+
"pass_rate": { "mean": 0.85, "stddev": 0.12 },
|
|
573
|
+
"tokens": { "mean": 42000, "stddev": 5200 },
|
|
574
|
+
"duration_seconds": { "mean": 35.5, "stddev": 8.3 }
|
|
575
|
+
},
|
|
576
|
+
"eval_results": [
|
|
577
|
+
{
|
|
578
|
+
"eval_id": 0,
|
|
579
|
+
"with_skill": { "pass_rate": 0.75, "tokens": 45230, "duration": 32.15 },
|
|
580
|
+
"baseline": { "pass_rate": 0.50, "tokens": 38400, "duration": 28.90 }
|
|
581
|
+
},
|
|
582
|
+
{
|
|
583
|
+
"eval_id": 1,
|
|
584
|
+
"with_skill": { "pass_rate": 1.0, "tokens": 38770, "duration": 38.85 },
|
|
585
|
+
"baseline": { "pass_rate": 0.25, "tokens": 35200, "duration": 25.40 }
|
|
586
|
+
}
|
|
587
|
+
]
|
|
588
|
+
}
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
**Interpretation guide:**
|
|
592
|
+
- `with_skill.pass_rate > baseline.pass_rate`: Skill improves quality
|
|
593
|
+
- `with_skill.tokens > baseline.tokens`: Token overhead from skill application
|
|
594
|
+
- Lower `summary.pass_rate.stddev` means more consistent performance
|
|
595
|
+
|
|
596
|
+
---
|
|
597
|
+
|
|
598
|
+
### 7. trigger_eval.json
|
|
599
|
+
|
|
600
|
+
Defines test cases for evaluating skill recommendation triggers. Used in benchmark mode to measure the trigger accuracy of `recommend_skills`.
|
|
601
|
+
|
|
602
|
+
**Schema:**
|
|
603
|
+
|
|
604
|
+
```json
|
|
605
|
+
{
|
|
606
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
607
|
+
"$id": "https://codingbuddy.dev/schemas/skill-eval/trigger_eval.json",
|
|
608
|
+
"title": "Trigger Evaluation Cases",
|
|
609
|
+
"description": "Test cases for skill recommendation trigger accuracy",
|
|
610
|
+
"type": "array",
|
|
611
|
+
"minItems": 1,
|
|
612
|
+
"items": {
|
|
613
|
+
"type": "object",
|
|
614
|
+
"required": ["query", "should_trigger"],
|
|
615
|
+
"properties": {
|
|
616
|
+
"query": {
|
|
617
|
+
"type": "string",
|
|
618
|
+
"description": "User prompt (test input)"
|
|
619
|
+
},
|
|
620
|
+
"should_trigger": {
|
|
621
|
+
"type": "boolean",
|
|
622
|
+
"description": "Whether the skill should be recommended for this prompt"
|
|
623
|
+
}
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
```
|
|
628
|
+
|
|
629
|
+
**Example:**
|
|
630
|
+
|
|
631
|
+
```json
|
|
632
|
+
[
|
|
633
|
+
{
|
|
634
|
+
"query": "Add a new feature to validate user registration",
|
|
635
|
+
"should_trigger": true
|
|
636
|
+
},
|
|
637
|
+
{
|
|
638
|
+
"query": "Fix the null pointer exception in the payment module",
|
|
639
|
+
"should_trigger": true
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
"query": "What does this function do?",
|
|
643
|
+
"should_trigger": false
|
|
644
|
+
},
|
|
645
|
+
{
|
|
646
|
+
"query": "Deploy the application to production",
|
|
647
|
+
"should_trigger": false
|
|
648
|
+
}
|
|
649
|
+
]
|
|
650
|
+
```
|
|
651
|
+
|
|
652
|
+
**Usage:**
|
|
653
|
+
1. Pass each `query` from `trigger_eval.json` to `recommend_skills`
|
|
654
|
+
2. Check whether the target skill is included in the results
|
|
655
|
+
3. Compare against `should_trigger` to calculate accuracy
|
|
656
|
+
|
|
657
|
+
**Accuracy metrics:**
|
|
658
|
+
- **Precision**: Of all queries for which the skill was recommended, the fraction where `should_trigger` is `true` (TP / (TP + FP))
|
|
659
|
+
- **Recall**: Of all queries where `should_trigger` is `true`, the fraction for which the skill was actually recommended (TP / (TP + FN))
|
|
660
|
+
- **F1 Score**: Harmonic mean of Precision and Recall
|
|
661
|
+
|
|
662
|
+
---
|
|
663
|
+
|
|
664
|
+
## Schema Relationships
|
|
665
|
+
|
|
666
|
+
```
|
|
667
|
+
evals.json (workspace root)
|
|
668
|
+
│
|
|
669
|
+
│ evals.json id (1-based) → eval_id (0-based, id - 1)
|
|
670
|
+
▼
|
|
671
|
+
iteration-N/eval-{eval_id}/
|
|
672
|
+
├── with_skill/
|
|
673
|
+
│ ├── eval_metadata.json ◄── Prompt copied from evals.json, plus the assertions to grade
|
|
674
|
+
│ ├── grading.json ◄── Grading of assertions from eval_metadata.json
|
|
675
|
+
│ └── timing.json ── Independent measurement
|
|
676
|
+
└── without_skill/
|
|
677
|
+
├── eval_metadata.json
|
|
678
|
+
├── grading.json
|
|
679
|
+
└── timing.json
|
|
680
|
+
│
|
|
681
|
+
▼ Aggregation
|
|
682
|
+
iteration-N/
|
|
683
|
+
├── benchmark.json ◄── with_skill vs without_skill comparison across all evals
|
|
684
|
+
└── feedback.json ◄── User feedback (references evals via run_id)
|
|
685
|
+
|
|
686
|
+
trigger_eval.json (workspace root) ── recommend_skills accuracy measurement (independent)
|
|
687
|
+
```
|
|
688
|
+
|
|
689
|
+
**Data flow:**
|
|
690
|
+
1. Define evaluation scenarios in `evals.json`
|
|
691
|
+
2. Run `with_skill/` and `without_skill/` for each eval
|
|
692
|
+
3. Record each run's eval ID, name, prompt, and assertions in `eval_metadata.json`
|
|
693
|
+
4. Store per-assertion grading results in `grading.json`
|
|
694
|
+
5. Measure tokens/time in `timing.json`
|
|
695
|
+
6. Aggregate and compare all eval results in `benchmark.json`
|
|
696
|
+
7. Collect user feedback in `feedback.json`
|
|
697
|
+
8. Measure recommendation accuracy separately with `trigger_eval.json`
|
|
698
|
+
|
|
699
|
+
**ID mapping:**
|
|
700
|
+
- `evals.json` `id` (1-based) → `eval-{id-1}/` directory (0-based)
|
|
701
|
+
- `eval_metadata.json` `eval_id` (0-based) = the `N` in the enclosing `eval-N/` directory name
|
|
702
|
+
- `feedback.json` `run_id` = `eval-{eval_id}-{with_skill|without_skill}`
|
|
703
|
+
|
|
704
|
+
---
|
|
705
|
+
|
|
706
|
+
## Validation
|
|
707
|
+
|
|
708
|
+
### Validation with ajv CLI
|
|
709
|
+
|
|
710
|
+
```bash
|
|
711
|
+
# Validate evals.json
|
|
712
|
+
npx ajv-cli@5.0.0 validate \
|
|
713
|
+
-s schemas/evals.schema.json \
|
|
714
|
+
-d workspace/evals.json \
|
|
715
|
+
--spec=draft7
|
|
716
|
+
|
|
717
|
+
# Validate every grading.json file across all iterations
|
|
718
|
+
for f in workspace/iteration-*/eval-*/*/grading.json; do
|
|
719
|
+
npx ajv-cli@5.0.0 validate \
|
|
720
|
+
-s schemas/grading.schema.json \
|
|
721
|
+
-d "$f" \
|
|
722
|
+
--spec=draft7
|
|
723
|
+
done
|
|
724
|
+
|
|
725
|
+
# Validate trigger_eval.json
|
|
726
|
+
npx ajv-cli@5.0.0 validate \
|
|
727
|
+
-s schemas/trigger_eval.schema.json \
|
|
728
|
+
-d workspace/trigger_eval.json \
|
|
729
|
+
--spec=draft7
|
|
730
|
+
```
|
|
731
|
+
|
|
732
|
+
### Programmatic Validation
|
|
733
|
+
|
|
734
|
+
```typescript
|
|
735
|
+
import { readFileSync } from 'node:fs';

import Ajv from 'ajv';
import type { AnySchema } from 'ajv';
import addFormats from 'ajv-formats';

/**
 * Reads and parses a JSON file.
 *
 * Uses `fs` instead of `require()` so the snippet also works as an ES module
 * (where `require` is not defined) and so relative paths are resolved against
 * the current working directory, like the `-s` / `-d` arguments of the
 * `ajv-cli` commands above.
 *
 * @param file - Path of the JSON file to load.
 * @returns The parsed value; callers must validate or narrow it before use.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readJson(file: string): unknown {
  return JSON.parse(readFileSync(file, 'utf8'));
}

// `allErrors: true` asks Ajv to collect every violation instead of stopping at the first.
const ajv = new Ajv({ allErrors: true });
// Registers `format` validators (e.g. the `date-time` format used by feedback.json).
addFormats(ajv);

// Load schemas (the cast is safe only because these files are authored as JSON Schema documents)
const evalsSchema = readJson('./schemas/evals.schema.json') as AnySchema;
const gradingSchema = readJson('./schemas/grading.schema.json') as AnySchema;
const timingSchema = readJson('./schemas/timing.schema.json') as AnySchema;

// Compile validation functions (compile once, reuse for every file of that kind)
const validateEvals = ajv.compile(evalsSchema);
const validateGrading = ajv.compile(gradingSchema);
const validateTiming = ajv.compile(timingSchema);

// Validate data; `validateGrading` / `validateTiming` are applied the same way
// to each run's grading.json / timing.json.
const evalsData = readJson('./workspace/evals.json');
if (!validateEvals(evalsData)) {
  console.error('evals.json validation errors:', validateEvals.errors);
}
|
|
755
|
+
}
|
|
756
|
+
```
|
|
757
|
+
|
|
758
|
+
### Consistency Validation
|
|
759
|
+
|
|
760
|
+
Additional validation to check referential integrity between schemas:
|
|
761
|
+
|
|
762
|
+
```typescript
|
|
763
|
+
import * as fs from 'node:fs';

/**
 * Verifies the mapping between `evals.json` ids and the eval directories on disk.
 *
 * `evals.json` ids are 1-based while `eval-N/` directories are 0-based, so the
 * scenario with id `k` is expected under `iteration-<iteration>/eval-<k - 1>/`.
 * Each eval directory must contain an `eval_metadata.json` for both the
 * `with_skill` and the `without_skill` run.
 *
 * @param workspacePath - Workspace root, i.e. the directory that holds `evals.json`.
 * @param iteration - 1-based iteration number to check. Defaults to `1`.
 * @returns One `Missing: <path>` message per absent file; empty when everything is present.
 * @throws If `evals.json` cannot be read or is not valid JSON.
 */
function validateConsistency(workspacePath: string, iteration = 1): string[] {
  const errors: string[] = [];

  // Read via `fs` rather than `require()`: `require` resolves relative paths
  // against the calling module (and bare paths against node_modules), which
  // would not match the paths checked with `fs.existsSync` below.
  // Assumes evals.json has already passed schema validation.
  const manifest = JSON.parse(
    fs.readFileSync(`${workspacePath}/evals.json`, 'utf8'),
  ) as { evals: Array<{ id: number }> };

  // The loop variable must not be called `eval`: that name is not a legal
  // binding in strict-mode code, which includes every ES module.
  for (const scenario of manifest.evals) {
    const evalDir = `${workspacePath}/iteration-${iteration}/eval-${scenario.id - 1}`;

    // Both the skill-applied run and the baseline run must have metadata.
    for (const run of ['with_skill', 'without_skill']) {
      const metadataPath = `${evalDir}/${run}/eval_metadata.json`;
      if (!fs.existsSync(metadataPath)) {
        errors.push(`Missing: ${metadataPath}`);
      }
    }
  }

  return errors;
}
|
|
784
|
+
```
|