codingbuddy-rules 4.4.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.ai-rules/adapters/antigravity.md +6 -6
- package/.ai-rules/adapters/claude-code.md +107 -4
- package/.ai-rules/adapters/codex.md +5 -5
- package/.ai-rules/adapters/cursor.md +2 -2
- package/.ai-rules/adapters/kiro.md +8 -8
- package/.ai-rules/adapters/opencode.md +7 -7
- package/.ai-rules/adapters/q.md +2 -2
- package/.ai-rules/agents/README.md +66 -16
- package/.ai-rules/agents/accessibility-specialist.json +2 -1
- package/.ai-rules/agents/act-mode.json +2 -1
- package/.ai-rules/agents/agent-architect.json +8 -7
- package/.ai-rules/agents/ai-ml-engineer.json +1 -0
- package/.ai-rules/agents/architecture-specialist.json +1 -0
- package/.ai-rules/agents/auto-mode.json +4 -2
- package/.ai-rules/agents/backend-developer.json +1 -0
- package/.ai-rules/agents/code-quality-specialist.json +1 -0
- package/.ai-rules/agents/code-reviewer.json +65 -64
- package/.ai-rules/agents/data-engineer.json +8 -7
- package/.ai-rules/agents/data-scientist.json +10 -9
- package/.ai-rules/agents/devops-engineer.json +1 -0
- package/.ai-rules/agents/documentation-specialist.json +1 -0
- package/.ai-rules/agents/eval-mode.json +20 -19
- package/.ai-rules/agents/event-architecture-specialist.json +1 -0
- package/.ai-rules/agents/frontend-developer.json +1 -0
- package/.ai-rules/agents/i18n-specialist.json +2 -1
- package/.ai-rules/agents/integration-specialist.json +1 -0
- package/.ai-rules/agents/migration-specialist.json +1 -0
- package/.ai-rules/agents/mobile-developer.json +8 -7
- package/.ai-rules/agents/observability-specialist.json +1 -0
- package/.ai-rules/agents/parallel-orchestrator.json +346 -0
- package/.ai-rules/agents/performance-specialist.json +1 -0
- package/.ai-rules/agents/plan-mode.json +3 -1
- package/.ai-rules/agents/plan-reviewer.json +208 -0
- package/.ai-rules/agents/platform-engineer.json +1 -0
- package/.ai-rules/agents/security-engineer.json +9 -8
- package/.ai-rules/agents/security-specialist.json +2 -1
- package/.ai-rules/agents/seo-specialist.json +1 -0
- package/.ai-rules/agents/software-engineer.json +1 -0
- package/.ai-rules/agents/solution-architect.json +11 -10
- package/.ai-rules/agents/systems-developer.json +9 -8
- package/.ai-rules/agents/technical-planner.json +11 -10
- package/.ai-rules/agents/test-engineer.json +7 -6
- package/.ai-rules/agents/test-strategy-specialist.json +1 -0
- package/.ai-rules/agents/tooling-engineer.json +4 -3
- package/.ai-rules/agents/ui-ux-designer.json +1 -0
- package/.ai-rules/keyword-modes.json +4 -4
- package/.ai-rules/rules/clarification-guide.md +14 -14
- package/.ai-rules/rules/core.md +90 -1
- package/.ai-rules/rules/parallel-execution.md +217 -0
- package/.ai-rules/skills/README.md +23 -1
- package/.ai-rules/skills/agent-design/SKILL.md +5 -0
- package/.ai-rules/skills/agent-design/examples/agent-template.json +58 -0
- package/.ai-rules/skills/agent-design/references/expertise-guidelines.md +112 -0
- package/.ai-rules/skills/agent-discussion/SKILL.md +199 -0
- package/.ai-rules/skills/agent-discussion-panel/SKILL.md +448 -0
- package/.ai-rules/skills/api-design/SKILL.md +5 -0
- package/.ai-rules/skills/api-design/examples/error-response.json +159 -0
- package/.ai-rules/skills/api-design/examples/openapi-template.yaml +393 -0
- package/.ai-rules/skills/build-fix/SKILL.md +234 -0
- package/.ai-rules/skills/code-explanation/SKILL.md +4 -0
- package/.ai-rules/skills/context-management/SKILL.md +1 -0
- package/.ai-rules/skills/cost-budget/SKILL.md +348 -0
- package/.ai-rules/skills/cross-repo-issues/SKILL.md +257 -0
- package/.ai-rules/skills/database-migration/SKILL.md +1 -0
- package/.ai-rules/skills/deepsearch/SKILL.md +214 -0
- package/.ai-rules/skills/deployment-checklist/SKILL.md +1 -0
- package/.ai-rules/skills/error-analysis/SKILL.md +1 -0
- package/.ai-rules/skills/finishing-a-development-branch/SKILL.md +281 -0
- package/.ai-rules/skills/frontend-design/SKILL.md +5 -0
- package/.ai-rules/skills/frontend-design/examples/component-template.tsx +203 -0
- package/.ai-rules/skills/frontend-design/references/css-patterns.md +243 -0
- package/.ai-rules/skills/git-master/SKILL.md +358 -0
- package/.ai-rules/skills/incident-response/SKILL.md +1 -0
- package/.ai-rules/skills/legacy-modernization/SKILL.md +1 -0
- package/.ai-rules/skills/mcp-builder/SKILL.md +7 -0
- package/.ai-rules/skills/mcp-builder/examples/resource-example.ts +233 -0
- package/.ai-rules/skills/mcp-builder/examples/tool-example.ts +203 -0
- package/.ai-rules/skills/mcp-builder/references/protocol-spec.md +215 -0
- package/.ai-rules/skills/performance-optimization/SKILL.md +3 -0
- package/.ai-rules/skills/plan-and-review/SKILL.md +115 -0
- package/.ai-rules/skills/pr-all-in-one/SKILL.md +15 -13
- package/.ai-rules/skills/pr-all-in-one/configuration-guide.md +7 -7
- package/.ai-rules/skills/pr-all-in-one/pr-templates.md +10 -10
- package/.ai-rules/skills/pr-review/SKILL.md +4 -0
- package/.ai-rules/skills/receiving-code-review/SKILL.md +347 -0
- package/.ai-rules/skills/refactoring/SKILL.md +1 -0
- package/.ai-rules/skills/requesting-code-review/SKILL.md +348 -0
- package/.ai-rules/skills/rule-authoring/SKILL.md +5 -0
- package/.ai-rules/skills/rule-authoring/examples/rule-template.md +142 -0
- package/.ai-rules/skills/rule-authoring/examples/trigger-patterns.md +126 -0
- package/.ai-rules/skills/security-audit/SKILL.md +4 -0
- package/.ai-rules/skills/skill-creator/SKILL.md +461 -0
- package/.ai-rules/skills/skill-creator/agents/analyzer.md +206 -0
- package/.ai-rules/skills/skill-creator/agents/comparator.md +167 -0
- package/.ai-rules/skills/skill-creator/agents/grader.md +152 -0
- package/.ai-rules/skills/skill-creator/assets/eval_review.html +289 -0
- package/.ai-rules/skills/skill-creator/assets/skill-template.md +43 -0
- package/.ai-rules/skills/skill-creator/eval-viewer/generate_review.py +496 -0
- package/.ai-rules/skills/skill-creator/references/frontmatter-guide.md +632 -0
- package/.ai-rules/skills/skill-creator/references/multi-tool-compat.md +480 -0
- package/.ai-rules/skills/skill-creator/references/schemas.md +784 -0
- package/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py +302 -0
- package/.ai-rules/skills/skill-creator/scripts/init_skill.sh +196 -0
- package/.ai-rules/skills/skill-creator/scripts/run_loop.py +327 -0
- package/.ai-rules/skills/systematic-debugging/SKILL.md +1 -0
- package/.ai-rules/skills/tech-debt/SKILL.md +1 -0
- package/.ai-rules/skills/test-coverage-gate/SKILL.md +303 -0
- package/.ai-rules/skills/tmux-master/SKILL.md +491 -0
- package/.ai-rules/skills/using-git-worktrees/SKILL.md +368 -0
- package/.ai-rules/skills/verification-before-completion/SKILL.md +234 -0
- package/.ai-rules/skills/widget-slot-architecture/SKILL.md +6 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/parallel-route-setup.tsx +206 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/widget-component.tsx +250 -0
- package/.ai-rules/skills/writing-plans/SKILL.md +78 -0
- package/bin/cli.js +178 -0
- package/lib/init/detect-stack.js +148 -0
- package/lib/init/generate-config.js +31 -0
- package/lib/init/index.js +86 -0
- package/lib/init/prompt.js +60 -0
- package/lib/init/scaffold.js +67 -0
- package/lib/init/suggest-agent.js +46 -0
- package/package.json +10 -2
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Comparator Agent
|
|
2
|
+
|
|
3
|
+
An agent that performs a blind comparison of outputs from two skill versions to determine preference.
|
|
4
|
+
|
|
5
|
+
## Role
|
|
6
|
+
|
|
7
|
+
You are a blind comparison judge. You compare the eval outputs of two skill versions (Version A, Version B) **without knowing which version is which** and determine which one is better. Inferring or guessing which is the "new version" is **strictly prohibited**.
|
|
8
|
+
|
|
9
|
+
## Iron Law
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Never infer which version is "newer."
|
|
13
|
+
Version A and Version B are equal candidates.
|
|
14
|
+
If there is no difference, declare TIE. Do not force a winner.
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Input
|
|
18
|
+
|
|
19
|
+
| Item | Description |
|
|
20
|
+
|------|-------------|
|
|
21
|
+
| **Version A output** | Eval execution results (files, logs, code) with skill version A applied |
|
|
22
|
+
| **Version B output** | Eval execution results (files, logs, code) with skill version B applied |
|
|
23
|
+
|
|
24
|
+
### Input Rules
|
|
25
|
+
|
|
26
|
+
- A and B must be outputs for the **same eval scenario**
|
|
27
|
+
- Version order (new/old) is randomly assigned — either A or B could be the new version
|
|
28
|
+
- No version metadata (iteration number, date, etc.) is provided to the comparator
|
|
29
|
+
|
|
30
|
+
## Output
|
|
31
|
+
|
|
32
|
+
Comparison result in JSON format:
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"preferred": "A" | "B" | "TIE",
|
|
37
|
+
"confidence": 0.0 ~ 1.0,
|
|
38
|
+
"reasoning": "Basis for judgment (citing specific differences)"
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Field Rules
|
|
43
|
+
|
|
44
|
+
| Field | Rule |
|
|
45
|
+
|-------|------|
|
|
46
|
+
| `preferred` | Only `"A"`, `"B"`, or `"TIE"` allowed. No other values permitted |
|
|
47
|
+
| `confidence` | 0.0 (no confidence) to 1.0 (fully confident). Two decimal places |
|
|
48
|
+
| `reasoning` | Specific evidence supporting the judgment. Cite differences from both outputs |
|
|
49
|
+
|
|
50
|
+
### Confidence Criteria
|
|
51
|
+
|
|
52
|
+
| Range | Meaning | Condition |
|
|
53
|
+
|-------|---------|-----------|
|
|
54
|
+
| 0.9 - 1.0 | Very high | Clear differences across multiple dimensions, no counterarguments |
|
|
55
|
+
| 0.7 - 0.89 | High | Differences in key dimensions, some dimensions equal |
|
|
56
|
+
| 0.5 - 0.69 | Moderate | Differences in only some dimensions, rest equal |
|
|
57
|
+
| 0.0 - 0.49 | Low | Minimal differences or mixed results across dimensions → Consider TIE |
|
|
58
|
+
|
|
59
|
+
## Process
|
|
60
|
+
|
|
61
|
+
### Step 1: Independent Evaluation
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Evaluate each version independently (without comparing):
|
|
65
|
+
|
|
66
|
+
Version A:
|
|
67
|
+
1. Check list of output files
|
|
68
|
+
2. Assess code quality (correctness, completeness, structure)
|
|
69
|
+
3. Assess workflow adherence (did it follow the process intended by the skill)
|
|
70
|
+
|
|
71
|
+
Version B:
|
|
72
|
+
1. Check list of output files
|
|
73
|
+
2. Assess code quality (correctness, completeness, structure)
|
|
74
|
+
3. Assess workflow adherence
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Step 2: Dimension-by-Dimension Comparison
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Compare A vs B across 5 dimensions:
|
|
81
|
+
|
|
82
|
+
1. Correctness:
|
|
83
|
+
Does the output accurately meet the requirements?
|
|
84
|
+
→ A is better / B is better / Equal
|
|
85
|
+
|
|
86
|
+
2. Completeness:
|
|
87
|
+
Were all required steps performed? Nothing missing?
|
|
88
|
+
→ A is better / B is better / Equal
|
|
89
|
+
|
|
90
|
+
3. Process Adherence:
|
|
91
|
+
Did it follow the workflow defined by the skill?
|
|
92
|
+
→ A is better / B is better / Equal
|
|
93
|
+
|
|
94
|
+
4. Code Quality:
|
|
95
|
+
Readability, structure, best practices adherence
|
|
96
|
+
→ A is better / B is better / Equal
|
|
97
|
+
|
|
98
|
+
5. Efficiency:
|
|
99
|
+
Concise without unnecessary steps or code?
|
|
100
|
+
→ A is better / B is better / Equal
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Step 3: Overall Judgment
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
Aggregate dimension-by-dimension results:
|
|
107
|
+
- Number of dimensions where A is superior
|
|
108
|
+
- Number of dimensions where B is superior
|
|
109
|
+
- Number of dimensions that are equal
|
|
110
|
+
|
|
111
|
+
Judgment rules:
|
|
112
|
+
- A superior > B superior → preferred: "A"
|
|
113
|
+
- B superior > A superior → preferred: "B"
|
|
114
|
+
- A superior = B superior → preferred: "TIE" (exception: a clear Correctness difference decides — see TIE Judgment Rules)
|
|
115
|
+
- All dimensions equal → preferred: "TIE"
|
|
116
|
+
|
|
117
|
+
Confidence calculation:
|
|
118
|
+
- Greater difference in number of superior dimensions → higher confidence
|
|
119
|
+
- Superior in all dimensions → 0.95
|
|
120
|
+
- Superior in 3/5 dimensions → 0.7
|
|
121
|
+
- Superior only in key dimensions (Correctness, Completeness) → 0.6
|
|
122
|
+
- Minimal differences → 0.3 (consider TIE)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Step 4: Writing the Reasoning
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
The reasoning must include:
|
|
129
|
+
|
|
130
|
+
1. Summary of dimension-by-dimension comparison results
|
|
131
|
+
2. Citation of decisive differences (file names, code lines, etc.)
|
|
132
|
+
3. If TIE: explanation of why neither version could be judged superior
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## TIE Judgment Rules
|
|
136
|
+
|
|
137
|
+
TIE is a **valid result**. Declare TIE in the following situations:
|
|
138
|
+
|
|
139
|
+
| Situation | TIE? |
|
|
140
|
+
|-----------|------|
|
|
141
|
+
| All 5 dimensions equal | TIE (confidence: 0.95) |
|
|
142
|
+
| A superior in some dimensions, B in others (balanced) | TIE (confidence: 0.3-0.5) |
|
|
143
|
+
| Differences are trivial with no practical impact | TIE (confidence: 0.6-0.8) |
|
|
144
|
+
| Only a slight difference in a single dimension | TIE (confidence: 0.5-0.7) |
|
|
145
|
+
|
|
146
|
+
**Not a TIE when:**
|
|
147
|
+
- One side is superior in 2+ dimensions with the rest equal → Select the superior side
|
|
148
|
+
- Clear difference in a key dimension (Correctness) → Select that side
|
|
149
|
+
|
|
150
|
+
## Red Flags — STOP
|
|
151
|
+
|
|
152
|
+
| Thought | Reality |
|
|
153
|
+
|---------|---------|
|
|
154
|
+
| "B is more sophisticated, so it must be the new version" | Version inference is prohibited. A/B order is random |
|
|
155
|
+
| "I need to pick one, so I'll go with A" | TIE is a valid result. Forced judgments are prohibited |
|
|
156
|
+
| "The previous analysis said B was the improved version" | This is a blind comparison. Using external information is prohibited |
|
|
157
|
+
| "The longer one is better" | Length ≠ quality. Judge by dimension-based criteria |
|
|
158
|
+
| "Both are mediocre, so whatever" | This is a relative comparison. Determine relative superiority, not absolute quality |
|
|
159
|
+
| "Correctness is equal but A wins in the rest" | Judge by dimension count. If key dimensions are equal, the remaining dimensions can decide |
|
|
160
|
+
|
|
161
|
+
## Constraints
|
|
162
|
+
|
|
163
|
+
- **Blind**: Cannot know and must not infer which version is new/old
|
|
164
|
+
- **Independent execution**: This agent does not depend on results from other agents
|
|
165
|
+
- **Deterministic**: Always produce the same judgment for the same A/B input
|
|
166
|
+
- **Schema compliance**: Output must be `{ preferred, confidence, reasoning }` JSON only. No additional fields
|
|
167
|
+
- **Bias prevention**: No position bias based on A/B labels. Judge by content only
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Grader Agent
|
|
2
|
+
|
|
3
|
+
An agent that objectively grades eval execution results against assertions.
|
|
4
|
+
|
|
5
|
+
## Role
|
|
6
|
+
|
|
7
|
+
You are a skill evaluation grader. You compare the outputs generated from an eval execution against the assertions in `eval_metadata.json` and determine whether each assertion passes or fails. You **exclude subjective judgment** and grade based on evidence only.
|
|
8
|
+
|
|
9
|
+
## Iron Law
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
If there is no evidence, it is a FAIL.
|
|
13
|
+
"It probably passed" is a FAIL.
|
|
14
|
+
If ambiguous, it is a FAIL.
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Input
|
|
18
|
+
|
|
19
|
+
| Item | Source | Description |
|
|
20
|
+
|------|--------|-------------|
|
|
21
|
+
| **eval output** | `iteration-N/eval-M/{with_skill\|without_skill}/outputs/` | Files, logs, and code generated by the AI |
|
|
22
|
+
| **assertions** | `iteration-N/eval-M/{with_skill\|without_skill}/eval_metadata.json` | `assertions[].name` + `assertions[].description` |
|
|
23
|
+
|
|
24
|
+
### eval_metadata.json Structure
|
|
25
|
+
|
|
26
|
+
```json
|
|
27
|
+
{
|
|
28
|
+
"eval_id": 0,
|
|
29
|
+
"eval_name": "Descriptive evaluation name",
|
|
30
|
+
"prompt": "User task prompt",
|
|
31
|
+
"assertions": [
|
|
32
|
+
{
|
|
33
|
+
"name": "assertion_identifier",
|
|
34
|
+
"description": "Description of pass criteria"
|
|
35
|
+
}
|
|
36
|
+
]
|
|
37
|
+
}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Output
|
|
41
|
+
|
|
42
|
+
`grading.json` — Must **exactly** conform to the schema below:
|
|
43
|
+
|
|
44
|
+
```json
|
|
45
|
+
{
|
|
46
|
+
"expectations": [
|
|
47
|
+
{
|
|
48
|
+
"text": "Same string as the assertion's description",
|
|
49
|
+
"passed": true | false,
|
|
50
|
+
"evidence": "Basis for judgment (specific evidence citing file names/lines/content)"
|
|
51
|
+
}
|
|
52
|
+
]
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Field Rules
|
|
57
|
+
|
|
58
|
+
| Field | Rule |
|
|
59
|
+
|-------|------|
|
|
60
|
+
| `text` | **Copy as-is** the `assertions[].description` value from `eval_metadata.json`. Do not modify |
|
|
61
|
+
| `passed` | Only `true` or `false` allowed. No partial/maybe |
|
|
62
|
+
| `evidence` | Specific evidence supporting the judgment. Cite file paths, code lines, timestamps, log messages, etc. |
|
|
63
|
+
|
|
64
|
+
### Mapping Rules
|
|
65
|
+
|
|
66
|
+
- The `expectations` array must correspond **1:1**, in the same order, with the `assertions` array
|
|
67
|
+
- If there are N items in `assertions`, there must be exactly N items in `expectations`
|
|
68
|
+
- Do not omit or add items
|
|
69
|
+
|
|
70
|
+
## Process
|
|
71
|
+
|
|
72
|
+
### Step 1: Read Input
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
1. Read eval_metadata.json to obtain the assertions list
|
|
76
|
+
2. Check the file list in the outputs/ directory
|
|
77
|
+
3. Read the contents of each output file
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Step 2: Collect Evidence per Assertion
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
For each assertion:
|
|
84
|
+
1. Precisely identify the pass criteria from assertion.description
|
|
85
|
+
2. Search the outputs for evidence that meets those criteria
|
|
86
|
+
3. If evidence is found, record it; if not, record "no evidence found"
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Step 3: Judgment
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
For each assertion:
|
|
93
|
+
- Evidence clearly meets the criteria → passed: true
|
|
94
|
+
- Evidence is insufficient or criteria not met → passed: false
|
|
95
|
+
- No evidence found → passed: false
|
|
96
|
+
- Judgment is ambiguous → passed: false (default is FAIL)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Step 4: Write grading.json
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
1. Construct expectations array (maintaining assertions order)
|
|
103
|
+
2. Verify JSON validity
|
|
104
|
+
3. Save as grading.json file
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Grading Criteria
|
|
108
|
+
|
|
109
|
+
### PASS Criteria
|
|
110
|
+
|
|
111
|
+
Evidence must satisfy **all** of the following for a PASS:
|
|
112
|
+
|
|
113
|
+
1. **Existence**: The relevant behavior/artifact exists in the output
|
|
114
|
+
2. **Accuracy**: Precisely meets what the assertion's description requires
|
|
115
|
+
3. **Completeness**: Fully satisfied, not partially
|
|
116
|
+
|
|
117
|
+
### FAIL Criteria
|
|
118
|
+
|
|
119
|
+
FAIL if **any** of the following apply:
|
|
120
|
+
|
|
121
|
+
1. No relevant evidence can be found in the output
|
|
122
|
+
2. Evidence exists but only partially meets the criteria
|
|
123
|
+
3. Evidence exists but achieves the goal in a different way than specified
|
|
124
|
+
4. Output contains only content unrelated to the assertion
|
|
125
|
+
5. Judgment can only be made subjectively (objective verification impossible)
|
|
126
|
+
|
|
127
|
+
### Evidence Writing Rules
|
|
128
|
+
|
|
129
|
+
| Situation | Good evidence | Bad evidence |
|
|
130
|
+
|-----------|---------------|--------------|
|
|
131
|
+
| File creation check | `"outputs/validators.test.ts file exists (23 lines)"` | `"A test file seems to exist"` |
|
|
132
|
+
| Order verification | `"git log: test.ts (14:30:01) → impl.ts (14:32:15), test created 2m14s earlier"` | `"Test was created first"` |
|
|
133
|
+
| Code pattern check | `"validators.ts:5 — function isValidEmail(email: string): boolean, minimal implementation"` | `"Simple code was written"` |
|
|
134
|
+
| Failure verification | `"test output: 'Expected isValidEmail to be defined' — ReferenceError occurred"` | `"Test failed"` |
|
|
135
|
+
|
|
136
|
+
## Red Flags — STOP
|
|
137
|
+
|
|
138
|
+
| Thought | Reality |
|
|
139
|
+
|---------|---------|
|
|
140
|
+
| "This is obviously a PASS" | Cite the evidence. If you cannot cite it, it is a FAIL |
|
|
141
|
+
| "It mostly works, so PASS" | Partial = FAIL. Only full satisfaction is a PASS |
|
|
142
|
+
| "The intention was good, so PASS" | Grade the result, not the intention |
|
|
143
|
+
| "This assertion is subjective, so I'll give it a PASS" | Objectively unverifiable = FAIL |
|
|
144
|
+
| "The output looks good, so everything PASS" | Grade each assertion individually |
|
|
145
|
+
| "Only one is FAIL but the overall impression is good" | Impression-based grading is prohibited. Independent judgment per assertion |
|
|
146
|
+
|
|
147
|
+
## Constraints
|
|
148
|
+
|
|
149
|
+
- **Independent execution**: This agent does not depend on results from other agents
|
|
150
|
+
- **Idempotency**: Always produce the same grading.json for the same input
|
|
151
|
+
- **Schema compliance**: grading.json must exactly follow the schema above. No additional fields
|
|
152
|
+
- **Assertion text preservation**: Use the assertion description as-is in the `text` field without modification
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en" data-theme="dark">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Eval Review — __SKILL_NAME_PLACEHOLDER__</title>
|
|
7
|
+
<style>
|
|
8
|
+
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
|
|
9
|
+
:root{
|
|
10
|
+
--bg:#0d1117;--surface:#161b22;--border:#30363d;--text:#e6edf3;--text-dim:#8b949e;
|
|
11
|
+
--trigger:#238636;--trigger-bg:rgba(35,134,54,.15);
|
|
12
|
+
--no-trigger:#da3633;--no-trigger-bg:rgba(218,54,51,.15);
|
|
13
|
+
--accent:#58a6ff;--accent-hover:#79c0ff;
|
|
14
|
+
--warn:#d29922;--radius:8px;--shadow:0 2px 8px rgba(0,0,0,.3);
|
|
15
|
+
--font-mono:'SF Mono','Cascadia Code','Fira Code',monospace;
|
|
16
|
+
--font-sans:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;
|
|
17
|
+
}
|
|
18
|
+
html{font-size:15px}
|
|
19
|
+
body{background:var(--bg);color:var(--text);font-family:var(--font-sans);line-height:1.6;padding:2rem;max-width:1200px;margin:0 auto}
|
|
20
|
+
h1{font-size:1.6rem;font-weight:700;margin-bottom:.25rem}
|
|
21
|
+
.subtitle{color:var(--text-dim);margin-bottom:1.5rem;font-size:.9rem}
|
|
22
|
+
.stats{display:flex;gap:1rem;margin-bottom:1.5rem;flex-wrap:wrap}
|
|
23
|
+
.stat{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:.75rem 1.25rem;min-width:120px}
|
|
24
|
+
.stat-label{font-size:.75rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:.05em}
|
|
25
|
+
.stat-value{font-size:1.4rem;font-weight:700;font-family:var(--font-mono)}
|
|
26
|
+
.stat-trigger .stat-value{color:var(--trigger)}
|
|
27
|
+
.stat-no-trigger .stat-value{color:var(--no-trigger)}
|
|
28
|
+
.stat-total .stat-value{color:var(--accent)}
|
|
29
|
+
|
|
30
|
+
.toolbar{display:flex;gap:.5rem;margin-bottom:1.5rem;flex-wrap:wrap;align-items:center}
|
|
31
|
+
.btn{background:var(--surface);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.45rem 1rem;cursor:pointer;font-size:.85rem;font-family:var(--font-sans);transition:border-color .15s,background .15s}
|
|
32
|
+
.btn:hover{border-color:var(--accent);background:#1c2129}
|
|
33
|
+
.btn-primary{background:var(--accent);color:#000;border-color:var(--accent);font-weight:600}
|
|
34
|
+
.btn-primary:hover{background:var(--accent-hover)}
|
|
35
|
+
.btn-danger{color:var(--no-trigger)}
|
|
36
|
+
.btn-danger:hover{background:var(--no-trigger-bg);border-color:var(--no-trigger)}
|
|
37
|
+
.filter-group{display:flex;gap:0}
|
|
38
|
+
.filter-btn{border-radius:0;margin-left:-1px}
|
|
39
|
+
.filter-btn:first-child{border-radius:6px 0 0 6px;margin-left:0}
|
|
40
|
+
.filter-btn:last-child{border-radius:0 6px 6px 0}
|
|
41
|
+
.filter-btn.active{background:var(--accent);color:#000;border-color:var(--accent)}
|
|
42
|
+
.spacer{flex:1}
|
|
43
|
+
|
|
44
|
+
.cards{display:grid;gap:1rem}
|
|
45
|
+
.card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.25rem;transition:border-color .15s;position:relative}
|
|
46
|
+
.card:hover{border-color:var(--text-dim)}
|
|
47
|
+
.card.trigger{border-left:3px solid var(--trigger)}
|
|
48
|
+
.card.no-trigger{border-left:3px solid var(--no-trigger)}
|
|
49
|
+
.card-header{display:flex;align-items:flex-start;justify-content:space-between;gap:1rem;margin-bottom:.75rem}
|
|
50
|
+
.card-query{font-size:1rem;font-weight:500;flex:1}
|
|
51
|
+
.card-badge{font-size:.7rem;font-weight:700;padding:.2rem .6rem;border-radius:4px;white-space:nowrap;text-transform:uppercase;letter-spacing:.04em}
|
|
52
|
+
.badge-trigger{background:var(--trigger-bg);color:var(--trigger)}
|
|
53
|
+
.badge-no-trigger{background:var(--no-trigger-bg);color:var(--no-trigger)}
|
|
54
|
+
.card-meta{display:flex;gap:1rem;color:var(--text-dim);font-size:.8rem;margin-bottom:.5rem;flex-wrap:wrap}
|
|
55
|
+
.card-meta span{display:flex;align-items:center;gap:.25rem}
|
|
56
|
+
.card-reason{color:var(--text-dim);font-size:.85rem;margin-top:.5rem;padding-top:.5rem;border-top:1px solid var(--border)}
|
|
57
|
+
.card-actions{display:flex;gap:.5rem;margin-top:.75rem;justify-content:flex-end}
|
|
58
|
+
.card-actions .btn{padding:.3rem .75rem;font-size:.8rem}
|
|
59
|
+
|
|
60
|
+
/* Modal */
|
|
61
|
+
.modal-overlay{display:none;position:fixed;inset:0;background:rgba(0,0,0,.6);z-index:100;align-items:center;justify-content:center}
|
|
62
|
+
.modal-overlay.open{display:flex}
|
|
63
|
+
.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.5rem;width:90%;max-width:560px;box-shadow:var(--shadow)}
|
|
64
|
+
.modal h2{font-size:1.1rem;margin-bottom:1rem}
|
|
65
|
+
.form-group{margin-bottom:1rem}
|
|
66
|
+
.form-group label{display:block;font-size:.85rem;color:var(--text-dim);margin-bottom:.3rem}
|
|
67
|
+
.form-group input,.form-group textarea,.form-group select{width:100%;background:var(--bg);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.5rem .75rem;font-size:.9rem;font-family:var(--font-sans)}
|
|
68
|
+
.form-group textarea{min-height:80px;resize:vertical}
|
|
69
|
+
.form-group select{cursor:pointer}
|
|
70
|
+
.modal-footer{display:flex;gap:.5rem;justify-content:flex-end;margin-top:1.25rem}
|
|
71
|
+
|
|
72
|
+
.empty{text-align:center;padding:3rem;color:var(--text-dim)}
|
|
73
|
+
.count-badge{font-size:.75rem;color:var(--text-dim);margin-left:.25rem}
|
|
74
|
+
</style>
|
|
75
|
+
</head>
|
|
76
|
+
<body>
|
|
77
|
+
|
|
78
|
+
<h1>Eval Review — <span id="skillName">__SKILL_NAME_PLACEHOLDER__</span></h1>
|
|
79
|
+
<p class="subtitle" id="skillDesc">__SKILL_DESCRIPTION_PLACEHOLDER__</p>
|
|
80
|
+
|
|
81
|
+
<div class="stats" id="stats"></div>
|
|
82
|
+
|
|
83
|
+
<div class="toolbar">
|
|
84
|
+
<div class="filter-group">
|
|
85
|
+
<button class="btn filter-btn active" data-filter="all">All <span class="count-badge" id="countAll"></span></button>
|
|
86
|
+
<button class="btn filter-btn" data-filter="trigger">Trigger <span class="count-badge" id="countTrigger"></span></button>
|
|
87
|
+
<button class="btn filter-btn" data-filter="no-trigger">No Trigger <span class="count-badge" id="countNoTrigger"></span></button>
|
|
88
|
+
</div>
|
|
89
|
+
<span class="spacer"></span>
|
|
90
|
+
<button class="btn" onclick="openModal('add')">+ Add Query</button>
|
|
91
|
+
<button class="btn btn-primary" onclick="downloadJSON()">Download eval_set.json</button>
|
|
92
|
+
</div>
|
|
93
|
+
|
|
94
|
+
<div class="cards" id="cardContainer"></div>
|
|
95
|
+
|
|
96
|
+
<!-- Add/Edit Modal -->
|
|
97
|
+
<div class="modal-overlay" id="modalOverlay">
|
|
98
|
+
<div class="modal">
|
|
99
|
+
<h2 id="modalTitle">Add Query</h2>
|
|
100
|
+
<div class="form-group">
|
|
101
|
+
<label for="inputQuery">User Query</label>
|
|
102
|
+
<textarea id="inputQuery" placeholder="Enter user prompt / query..."></textarea>
|
|
103
|
+
</div>
|
|
104
|
+
<div class="form-group">
|
|
105
|
+
<label for="inputTrigger">Should Trigger?</label>
|
|
106
|
+
<select id="inputTrigger">
|
|
107
|
+
<option value="true">Yes — should trigger this skill</option>
|
|
108
|
+
<option value="false">No — should NOT trigger this skill</option>
|
|
109
|
+
</select>
|
|
110
|
+
</div>
|
|
111
|
+
<div class="form-group">
|
|
112
|
+
<label for="inputReason">Reason (optional)</label>
|
|
113
|
+
<input type="text" id="inputReason" placeholder="Why this query should/shouldn't trigger">
|
|
114
|
+
</div>
|
|
115
|
+
<div class="form-group">
|
|
116
|
+
<label for="inputCategory">Category (optional)</label>
|
|
117
|
+
<input type="text" id="inputCategory" placeholder="e.g. happy-path, edge-case, adversarial">
|
|
118
|
+
</div>
|
|
119
|
+
<div class="modal-footer">
|
|
120
|
+
<button class="btn" onclick="closeModal()">Cancel</button>
|
|
121
|
+
<button class="btn btn-primary" onclick="saveEntry()">Save</button>
|
|
122
|
+
</div>
|
|
123
|
+
</div>
|
|
124
|
+
</div>
|
|
125
|
+
|
|
126
|
+
<script>
|
|
127
|
+
(function(){
|
|
128
|
+
"use strict";
|
|
129
|
+
|
|
130
|
+
// --- Data ---
|
|
131
|
+
// Eval data for this page. `__EVAL_DATA_PLACEHOLDER__` is a template token, not a real
// identifier; presumably a generator replaces it with a JSON literal before the page is
// opened — TODO confirm against the script that emits this file.
var rawData = __EVAL_DATA_PLACEHOLDER__;
|
|
132
|
+
// Working list of query entries. Accepts either a bare array or an object carrying
// the list under `queries`; anything else falls back to an empty list.
var evalSet = Array.isArray(rawData) ? rawData : (rawData && rawData.queries ? rawData.queries : []);
|
|
133
|
+
// Active card filter. renderCards() compares it against "trigger" and "no-trigger";
// any other value (including the initial "all") shows every entry.
var currentFilter = "all";
|
|
134
|
+
// NOTE(review): presumably the evalSet index of the entry being edited in the modal,
// with -1 meaning "adding a new entry" — its readers are outside this view; confirm.
var editIndex = -1;
|
|
135
|
+
|
|
136
|
+
// --- Render ---
|
|
137
|
+
/**
 * Refresh every data-driven region of the page.
 * Runs, in order: the stats tiles, the card list, the filter-button counters.
 */
function render(){
  var steps = [renderStats, renderCards, updateCounts];
  for(var i = 0; i < steps.length; i++){
    // Pull the function out first so it is invoked as a plain call.
    var step = steps[i];
    step();
  }
}
|
|
142
|
+
|
|
143
|
+
/**
 * Rebuild the three summary tiles (total / should trigger / should not)
 * inside the #stats element from the current contents of evalSet.
 */
function renderStats(){
  // Markup for a single tile; `kind` selects the stat-* colour class.
  function tile(kind, label, value){
    return '<div class="stat stat-'+kind+'"><div class="stat-label">'+label+'</div><div class="stat-value">'+value+'</div></div>';
  }

  var total = evalSet.length;
  var triggers = evalSet.reduce(function(count, q){
    return q.should_trigger ? count + 1 : count;
  }, 0);
  var noTriggers = total - triggers;

  var tiles = [
    tile("total", "Total", total),
    tile("trigger", "Should Trigger", triggers),
    tile("no-trigger", "Should Not", noTriggers)
  ];
  document.getElementById("stats").innerHTML = tiles.join("");
}
|
|
152
|
+
|
|
153
|
+
/**
 * Render the query cards into #cardContainer, honouring the active filter.
 *
 * Reads `evalSet` and `currentFilter` from the enclosing scope. When no entry
 * matches, an empty-state message is shown instead of cards.
 *
 * Every entry-supplied value (query, category, confidence, reason) goes through
 * esc() before being concatenated into markup, because the result is assigned
 * to innerHTML.
 */
function renderCards(){
  var container = document.getElementById("cardContainer");
  var filtered = evalSet.filter(function(q, i){
    // Remember the entry's position in the unfiltered list so the Edit/Delete
    // buttons still address the right entry when a filter is active.
    // NOTE(review): this writes `_index` onto the data objects themselves; confirm
    // the export path does not serialize it.
    q._index = i;
    if(currentFilter === "trigger") return q.should_trigger;
    if(currentFilter === "no-trigger") return !q.should_trigger;
    return true;
  });

  if(filtered.length === 0){
    container.innerHTML = '<div class="empty">No queries to display. Click "+ Add Query" to create one.</div>';
    return;
  }

  container.innerHTML = filtered.map(function(q){
    var cls = q.should_trigger ? "trigger" : "no-trigger";
    var badge = q.should_trigger
      ? '<span class="card-badge badge-trigger">Should Trigger</span>'
      : '<span class="card-badge badge-no-trigger">Should Not Trigger</span>';

    var meta = '';
    if(q.category) meta += '<span>Category: '+esc(q.category)+'</span>';
    // A confidence of 0 is a real value and must still be displayed.
    var showConfidence = q.confidence || q.confidence === 0;
    // Stringify first so numeric values survive esc(), then escape: confidence
    // comes from the data file and must not be trusted as markup.
    if(showConfidence) meta += '<span>Confidence: '+esc(String(q.confidence))+'</span>';

    var reason = q.reason ? '<div class="card-reason">'+esc(q.reason)+'</div>' : '';

    // q._index is always a number assigned above, so it is safe inside the handlers.
    return '<div class="card '+cls+'">'+
      '<div class="card-header"><div class="card-query">'+esc(q.query)+'</div>'+badge+'</div>'+
      (meta ? '<div class="card-meta">'+meta+'</div>' : '')+
      reason+
      '<div class="card-actions">'+
        '<button class="btn" onclick="openModal(\'edit\','+q._index+')">Edit</button>'+
        '<button class="btn btn-danger" onclick="deleteEntry('+q._index+')">Delete</button>'+
      '</div></div>';
  }).join("");
}
|
|
186
|
+
|
|
187
|
+
/**
 * Refresh the #countAll, #countTrigger and #countNoTrigger elements with the
 * total number of entries, the number with a truthy should_trigger, and the
 * difference between the two.
 */
function updateCounts(){
  function setText(id, value){
    document.getElementById(id).textContent = value;
  }

  var all = evalSet.length;
  var firing = 0;
  evalSet.forEach(function(item){
    if(item.should_trigger) firing += 1;
  });

  setText("countAll", all);
  setText("countTrigger", firing);
  setText("countNoTrigger", all - firing);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Escape a value for insertion into HTML markup.
 *
 * @param {*} s - value to escape; coerced with String().
 * @returns {string} the text with &, <, >, " and ' replaced by character
 *   references. null and undefined yield "". Other falsy values such as 0
 *   are kept ("0") instead of being blanked out.
 */
function esc(s){
  if(s === null || s === undefined) return "";
  // "&" must be replaced first so the entities produced below are not
  // themselves re-escaped.
  return String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
201
|
+
|
|
202
|
+
// --- Filters ---
|
|
203
|
+
// Make `button` the only .filter-btn carrying the "active" class, adopt its
// data-filter attribute as the current filter and redraw the cards.
function activateFilter(button){
  var all = document.querySelectorAll(".filter-btn");
  all.forEach(function(other){
    other.classList.remove("active");
  });
  button.classList.add("active");
  currentFilter = button.getAttribute("data-filter");
  renderCards();
}

document.querySelectorAll(".filter-btn").forEach(function(button){
  button.addEventListener("click", function(){
    activateFilter(button);
  });
});
|
|
211
|
+
|
|
212
|
+
// --- Modal ---
|
|
213
|
+
/**
 * Open the add/edit dialog.
 *
 * @param {string} mode - "edit" to edit an existing entry; anything else adds.
 * @param {number} [idx] - position in evalSet of the entry to edit.
 *
 * If mode is "edit" but idx does not address an existing entry, the dialog
 * opens in add mode (title, fields and editIndex all agree) instead of
 * throwing on the missing entry.
 */
window.openModal = function(mode, idx){
  var existing = (mode === "edit" && idx >= 0) ? evalSet[idx] : null;
  var editing = existing != null;

  // editIndex is what saveEntry consults to decide between replace and append.
  editIndex = editing ? idx : -1;
  document.getElementById("modalTitle").textContent = editing ? "Edit Query" : "Add Query";

  document.getElementById("inputQuery").value = editing ? (existing.query || "") : "";
  // New entries default to "true"; existing ones reflect their flag.
  document.getElementById("inputTrigger").value = (!editing || existing.should_trigger) ? "true" : "false";
  document.getElementById("inputReason").value = editing ? (existing.reason || "") : "";
  document.getElementById("inputCategory").value = editing ? (existing.category || "") : "";

  document.getElementById("modalOverlay").classList.add("open");
  document.getElementById("inputQuery").focus();
};
|
|
231
|
+
|
|
232
|
+
/**
 * Hide the dialog by removing the "open" class from #modalOverlay, then
 * reset editIndex to -1 so no entry is marked as being edited.
 */
window.closeModal = function(){
  var overlay = document.getElementById("modalOverlay");
  overlay.classList.remove("open");
  editIndex = -1;
};
|
|
236
|
+
|
|
237
|
+
/**
 * Store the dialog's contents into evalSet, then close the dialog and redraw.
 *
 * A blank query is rejected with an alert. When editIndex >= 0 the entry at
 * that position is replaced, otherwise a new entry is appended. On edit, the
 * previous entry's other fields (anything the form has no input for, such as
 * confidence) are carried over rather than discarded. Blank reason/category
 * inputs remove those keys from the entry.
 */
window.saveEntry = function(){
  function fieldValue(id){
    return document.getElementById(id).value.trim();
  }

  var query = fieldValue("inputQuery");
  if(!query){ alert("Query is required."); return; }

  // Start from a shallow copy of the entry being edited so fields the form
  // does not expose survive the edit.
  var entry = {};
  var previous = editIndex >= 0 ? evalSet[editIndex] : null;
  if(previous && typeof previous === "object"){
    Object.keys(previous).forEach(function(key){
      entry[key] = previous[key];
    });
  }

  entry.query = query;
  entry.should_trigger = document.getElementById("inputTrigger").value === "true";

  var reason = fieldValue("inputReason");
  var category = fieldValue("inputCategory");
  if(reason){ entry.reason = reason; } else { delete entry.reason; }
  if(category){ entry.category = category; } else { delete entry.category; }

  if(editIndex >= 0){
    evalSet[editIndex] = entry;
  } else {
    evalSet.push(entry);
  }
  closeModal();
  render();
};
|
|
257
|
+
|
|
258
|
+
/**
 * Ask for confirmation and, if given, remove the entry at position idx from
 * evalSet and redraw the page.
 *
 * @param {number} idx - position in evalSet of the entry to remove.
 */
window.deleteEntry = function(idx){
  var confirmed = confirm("Delete this query?");
  if(confirmed){
    evalSet.splice(idx, 1);
    render();
  }
};
|
|
263
|
+
|
|
264
|
+
// --- Download ---
|
|
265
|
+
/**
 * Offer the current eval set as a file named "eval_set.json".
 *
 * The file holds skill_name and description (text of #skillName and
 * #skillDesc) plus the queries, pretty-printed with two-space indentation.
 */
window.downloadJSON = function(){
  // Export shallow copies without any `_index` property: that is a
  // render-time position marker, not part of the eval data.
  var queries = evalSet.map(function(q){
    if(!q || typeof q !== "object") return q;
    var copy = {};
    Object.keys(q).forEach(function(key){
      if(key !== "_index") copy[key] = q[key];
    });
    return copy;
  });

  var out = {
    skill_name: document.getElementById("skillName").textContent,
    description: document.getElementById("skillDesc").textContent,
    queries: queries
  };
  var blob = new Blob([JSON.stringify(out, null, 2)], {type:"application/json"});
  var url = URL.createObjectURL(blob);

  var a = document.createElement("a");
  a.href = url;
  a.download = "eval_set.json";
  // Attach the link while clicking: some browsers ignore click() on an
  // anchor that is not part of the document.
  document.body.appendChild(a);
  a.click();
  document.body.removeChild(a);

  // Revoke later rather than immediately, so the browser has started reading
  // the blob before its URL is invalidated.
  setTimeout(function(){
    URL.revokeObjectURL(url);
  }, 1000);
};
|
|
278
|
+
|
|
279
|
+
// --- Keyboard ---
|
|
280
|
+
// Pressing Escape anywhere in the document closes the dialog.
document.addEventListener("keydown", function(evt){
  if(evt.key !== "Escape") return;
  closeModal();
});

// --- Init ---
// First paint of stats, cards and counts from the embedded data.
render();
|
|
286
|
+
})();
|
|
287
|
+
</script>
|
|
288
|
+
</body>
|
|
289
|
+
</html>
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: {{SKILL_NAME}}
|
|
3
|
+
description: TODO - describe when to use this skill with specific trigger phrases
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# {{SKILL_DISPLAY_NAME}}
|
|
7
|
+
|
|
8
|
+
## Overview
|
|
9
|
+
|
|
10
|
+
TODO - one sentence describing what this skill does.
|
|
11
|
+
|
|
12
|
+
**Core principle:** TODO - the single most important rule.
|
|
13
|
+
|
|
14
|
+
**Iron Law:**
|
|
15
|
+
```
|
|
16
|
+
TODO - the non-negotiable constraint
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## When to Use
|
|
20
|
+
|
|
21
|
+
- TODO - specific situation 1
|
|
22
|
+
- TODO - specific situation 2
|
|
23
|
+
|
|
24
|
+
**Use this ESPECIALLY when:**
|
|
25
|
+
- TODO
|
|
26
|
+
|
|
27
|
+
## When NOT to Use
|
|
28
|
+
|
|
29
|
+
- TODO - what this skill does NOT handle
|
|
30
|
+
|
|
31
|
+
## The Process
|
|
32
|
+
|
|
33
|
+
### Phase 1: TODO
|
|
34
|
+
|
|
35
|
+
TODO - step by step instructions
|
|
36
|
+
|
|
37
|
+
### Phase 2: TODO
|
|
38
|
+
|
|
39
|
+
TODO
|
|
40
|
+
|
|
41
|
+
## Additional Resources
|
|
42
|
+
|
|
43
|
+
- For detailed reference, see [references/TODO.md](references/TODO.md)
|