@intentsolutionsio/skill-creator 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +17 -0
- package/README.md +55 -0
- package/package.json +38 -0
- package/scripts/validate-skill.py +1132 -0
- package/skills/agent-creator/SKILL.md +305 -0
- package/skills/agent-creator/references/anthropic-agent-spec.md +89 -0
- package/skills/skill-creator/SKILL.md +267 -0
- package/skills/skill-creator/agents/analyzer.md +279 -0
- package/skills/skill-creator/agents/comparator.md +207 -0
- package/skills/skill-creator/agents/grader.md +228 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/advanced-eval-workflow.md +320 -0
- package/skills/skill-creator/references/anthropic-comparison.md +93 -0
- package/skills/skill-creator/references/ard-template.md +47 -0
- package/skills/skill-creator/references/creation-guide.md +305 -0
- package/skills/skill-creator/references/errors-template.md +27 -0
- package/skills/skill-creator/references/examples-template.md +40 -0
- package/skills/skill-creator/references/frontmatter-spec.md +531 -0
- package/skills/skill-creator/references/implementation-template.md +42 -0
- package/skills/skill-creator/references/output-patterns.md +193 -0
- package/skills/skill-creator/references/prd-template.md +55 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/references/source-of-truth.md +658 -0
- package/skills/skill-creator/references/validation-rules.md +528 -0
- package/skills/skill-creator/references/workflows.md +233 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +247 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +344 -0
- package/skills/skill-creator/scripts/run_loop.py +329 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/skills/skill-creator/scripts/validate-skill.py +87 -0
- package/skills/skill-creator/templates/agent-template.md +99 -0
- package/skills/skill-creator/templates/skill-template.md +122 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# Skill Output Patterns
|
|
2
|
+
|
|
3
|
+
Reference for structuring skill output. Source: Anthropic best practices.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Template Pattern
|
|
8
|
+
|
|
9
|
+
Generate output from predefined templates with variable substitution.
|
|
10
|
+
|
|
11
|
+
### Strict Template (Low Degrees of Freedom)
|
|
12
|
+
|
|
13
|
+
Use when output format must be exact (API payloads, config files, compliance docs).
|
|
14
|
+
|
|
15
|
+
````markdown
|
|
16
|
+
## Instructions
|
|
17
|
+
|
|
18
|
+
Generate output using this exact template:
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
{
|
|
22
|
+
"name": "{project_name}",
|
|
23
|
+
"version": "{version}",
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "{build_command}",
|
|
26
|
+
"test": "{test_command}"
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Replace placeholders with gathered values. Do not add extra fields.
|
|
32
|
+
````
|
|
33
|
+
|
|
34
|
+
### Flexible Template (Medium Degrees of Freedom)
|
|
35
|
+
|
|
36
|
+
Use when structure is defined but content is creative.
|
|
37
|
+
|
|
38
|
+
```markdown
|
|
39
|
+
## Instructions
|
|
40
|
+
|
|
41
|
+
Generate a report following this structure:
|
|
42
|
+
|
|
43
|
+
# {Title}
|
|
44
|
+
|
|
45
|
+
## Summary
|
|
46
|
+
{2-3 sentence executive summary}
|
|
47
|
+
|
|
48
|
+
## Findings
|
|
49
|
+
{Detailed findings - use tables, lists, or prose as appropriate}
|
|
50
|
+
|
|
51
|
+
## Recommendations
|
|
52
|
+
{Prioritized list of action items}
|
|
53
|
+
|
|
54
|
+
Adapt section depth to the complexity of findings.
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Examples Pattern
|
|
60
|
+
|
|
61
|
+
Provide input/output pairs that demonstrate expected behavior.
|
|
62
|
+
|
|
63
|
+
### Inline Examples
|
|
64
|
+
|
|
65
|
+
````markdown
|
|
66
|
+
## Examples
|
|
67
|
+
|
|
68
|
+
### Simple case
|
|
69
|
+
**Input**: `/skill-name auth.py`
|
|
70
|
+
**Output**:
|
|
71
|
+
```
|
|
72
|
+
auth.py: 3 issues found
|
|
73
|
+
Line 15: SQL injection risk in query builder
|
|
74
|
+
Line 42: Hardcoded credential detected
|
|
75
|
+
Line 89: Missing input validation
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Complex case
|
|
79
|
+
**Input**: `/skill-name --deep src/`
|
|
80
|
+
**Output**:
|
|
81
|
+
```
|
|
82
|
+
Deep scan: 12 files, 7 issues
|
|
83
|
+
CRITICAL (2): sql-injection, hardcoded-secret
|
|
84
|
+
WARNING (3): missing-validation, weak-hash, cors-wildcard
|
|
85
|
+
INFO (2): deprecated-api, unused-import
|
|
86
|
+
```
|
|
87
|
+
````
|
|
88
|
+
|
|
89
|
+
### When to Use
|
|
90
|
+
|
|
91
|
+
- Always include at least 1 example
|
|
92
|
+
- Show both simple and complex cases
|
|
93
|
+
- Show edge cases (empty input, errors)
|
|
94
|
+
- Examples teach Claude the expected output format better than rules
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Visual Output Pattern
|
|
99
|
+
|
|
100
|
+
Generate HTML artifacts for rich visual output.
|
|
101
|
+
|
|
102
|
+
````markdown
|
|
103
|
+
## Instructions
|
|
104
|
+
|
|
105
|
+
### Step 1: Gather Data
|
|
106
|
+
Collect the metrics and data points needed.
|
|
107
|
+
|
|
108
|
+
### Step 2: Generate HTML Report
|
|
109
|
+
Create a self-contained HTML file with:
|
|
110
|
+
- Inline CSS (no external dependencies)
|
|
111
|
+
- Responsive layout
|
|
112
|
+
- Data tables with sorting
|
|
113
|
+
- Charts if applicable (use inline SVG)
|
|
114
|
+
|
|
115
|
+
### Step 3: Write Output
|
|
116
|
+
Save to `{output_path}/report.html`
|
|
117
|
+
|
|
118
|
+
Example HTML structure:
|
|
119
|
+
```html
|
|
120
|
+
<!DOCTYPE html>
|
|
121
|
+
<html>
|
|
122
|
+
<head>
|
|
123
|
+
<style>
|
|
124
|
+
body { font-family: system-ui; max-width: 900px; margin: 0 auto; padding: 2rem; }
|
|
125
|
+
table { width: 100%; border-collapse: collapse; }
|
|
126
|
+
th, td { padding: 8px; border: 1px solid #ddd; text-align: left; }
|
|
127
|
+
.critical { color: #dc3545; }
|
|
128
|
+
.warning { color: #ffc107; }
|
|
129
|
+
.pass { color: #28a745; }
|
|
130
|
+
</style>
|
|
131
|
+
</head>
|
|
132
|
+
<body>
|
|
133
|
+
<h1>{Report Title}</h1>
|
|
134
|
+
<!-- Content here -->
|
|
135
|
+
</body>
|
|
136
|
+
</html>
|
|
137
|
+
```
|
|
138
|
+
````
|
|
139
|
+
|
|
140
|
+
### When to Use
|
|
141
|
+
|
|
142
|
+
- Dashboards and reports
|
|
143
|
+
- Documentation previews
|
|
144
|
+
- Visual diffs or comparisons
|
|
145
|
+
- Any output that benefits from formatting beyond plain text
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Structured Data Pattern
|
|
150
|
+
|
|
151
|
+
Output structured data (JSON, YAML, CSV) for programmatic consumption.
|
|
152
|
+
|
|
153
|
+
````markdown
|
|
154
|
+
## Output Format
|
|
155
|
+
|
|
156
|
+
Results are written as JSON to `{output_path}/results.json`:
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
{
|
|
160
|
+
"scan_date": "2025-01-15",
|
|
161
|
+
"files_scanned": 42,
|
|
162
|
+
"issues": [
|
|
163
|
+
{
|
|
164
|
+
"file": "auth.py",
|
|
165
|
+
"line": 15,
|
|
166
|
+
"severity": "critical",
|
|
167
|
+
"rule": "sql-injection",
|
|
168
|
+
"message": "Unsanitized input in SQL query"
|
|
169
|
+
}
|
|
170
|
+
],
|
|
171
|
+
"summary": {
|
|
172
|
+
"critical": 2,
|
|
173
|
+
"warning": 5,
|
|
174
|
+
"info": 3
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Additionally, a human-readable summary is printed to the conversation.
|
|
180
|
+
````
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Choosing Output Patterns
|
|
185
|
+
|
|
186
|
+
| If the output is... | Use |
|
|
187
|
+
|---------------------|-----|
|
|
188
|
+
| Exact format required | Strict Template |
|
|
189
|
+
| Structured but flexible | Flexible Template |
|
|
190
|
+
| Best shown by example | Examples Pattern |
|
|
191
|
+
| Rich/visual | Visual Output (HTML) |
|
|
192
|
+
| Machine-readable | Structured Data (JSON/YAML) |
|
|
193
|
+
| Multiple audiences | Combine: JSON file + conversation summary |
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# PRD Template
|
|
2
|
+
|
|
3
|
+
Standard Product Requirements Document for all marketplace skills. Every PRD.md MUST follow this exact structure. No sections added, no sections removed.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
```markdown
|
|
8
|
+
# PRD: {Skill Name}
|
|
9
|
+
|
|
10
|
+
**Version:** {version from SKILL.md frontmatter}
|
|
11
|
+
**Author:** {author from SKILL.md frontmatter}
|
|
12
|
+
**Status:** Active
|
|
13
|
+
**Marketplace:** [tonsofskills.com](https://tonsofskills.com) by [Intent Solutions](https://intentsolutions.io)
|
|
14
|
+
**Portfolio:** [jeremylongshore.com](https://jeremylongshore.com)
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## Problem Statement
|
|
19
|
+
|
|
20
|
+
{2-4 sentences. What problem does this skill solve? Why does the problem exist? What happens without this skill?}
|
|
21
|
+
|
|
22
|
+
## Target Users
|
|
23
|
+
|
|
24
|
+
| User | Context | Primary Need |
|
|
25
|
+
|------|---------|-------------|
|
|
26
|
+
| {role} | {when they encounter the problem} | {what they need from this skill} |
|
|
27
|
+
| {role} | {when they encounter the problem} | {what they need from this skill} |
|
|
28
|
+
|
|
29
|
+
## Success Criteria
|
|
30
|
+
|
|
31
|
+
1. {Measurable outcome — time, quality, or completeness metric}
|
|
32
|
+
2. {Measurable outcome}
|
|
33
|
+
3. {Measurable outcome}
|
|
34
|
+
|
|
35
|
+
## Functional Requirements
|
|
36
|
+
|
|
37
|
+
1. {What the skill must do — numbered, specific, testable}
|
|
38
|
+
2. {What the skill must do}
|
|
39
|
+
3. {What the skill must do}
|
|
40
|
+
|
|
41
|
+
## Non-Functional Requirements
|
|
42
|
+
|
|
43
|
+
- {Performance, security, compatibility, or operational constraint}
|
|
44
|
+
- {Constraint}
|
|
45
|
+
|
|
46
|
+
## Dependencies
|
|
47
|
+
|
|
48
|
+
- {What must be installed, configured, or available}
|
|
49
|
+
- {External services, APIs, credentials}
|
|
50
|
+
|
|
51
|
+
## Out of Scope
|
|
52
|
+
|
|
53
|
+
- {What this skill explicitly does NOT do}
|
|
54
|
+
- {Adjacent concerns handled by other skills}
|
|
55
|
+
```
|
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
# JSON Schemas
|
|
2
|
+
|
|
3
|
+
This document defines the JSON schemas used by skill-creator.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## evals.json
|
|
8
|
+
|
|
9
|
+
Defines the evals for a skill. Located at `evals/evals.json` within the skill directory.
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"skill_name": "example-skill",
|
|
14
|
+
"evals": [
|
|
15
|
+
{
|
|
16
|
+
"id": 1,
|
|
17
|
+
"prompt": "User's example prompt",
|
|
18
|
+
"expected_output": "Description of expected result",
|
|
19
|
+
"files": ["evals/files/sample1.pdf"],
|
|
20
|
+
"expectations": [
|
|
21
|
+
"The output includes X",
|
|
22
|
+
"The skill used script Y"
|
|
23
|
+
]
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Fields:**
|
|
30
|
+
- `skill_name`: Name matching the skill's frontmatter
|
|
31
|
+
- `evals[].id`: Unique integer identifier
|
|
32
|
+
- `evals[].prompt`: The task to execute
|
|
33
|
+
- `evals[].expected_output`: Human-readable description of success
|
|
34
|
+
- `evals[].files`: Optional list of input file paths (relative to skill root)
|
|
35
|
+
- `evals[].expectations`: List of verifiable statements
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## history.json
|
|
40
|
+
|
|
41
|
+
Tracks version progression in Improve mode. Located at workspace root.
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"started_at": "2026-01-15T10:30:00Z",
|
|
46
|
+
"skill_name": "pdf",
|
|
47
|
+
"current_best": "v2",
|
|
48
|
+
"iterations": [
|
|
49
|
+
{
|
|
50
|
+
"version": "v0",
|
|
51
|
+
"parent": null,
|
|
52
|
+
"expectation_pass_rate": 0.65,
|
|
53
|
+
"grading_result": "baseline",
|
|
54
|
+
"is_current_best": false
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
"version": "v1",
|
|
58
|
+
"parent": "v0",
|
|
59
|
+
"expectation_pass_rate": 0.75,
|
|
60
|
+
"grading_result": "won",
|
|
61
|
+
"is_current_best": false
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"version": "v2",
|
|
65
|
+
"parent": "v1",
|
|
66
|
+
"expectation_pass_rate": 0.85,
|
|
67
|
+
"grading_result": "won",
|
|
68
|
+
"is_current_best": true
|
|
69
|
+
}
|
|
70
|
+
]
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Fields:**
|
|
75
|
+
- `started_at`: ISO timestamp of when improvement started
|
|
76
|
+
- `skill_name`: Name of the skill being improved
|
|
77
|
+
- `current_best`: Version identifier of the best performer
|
|
78
|
+
- `iterations[].version`: Version identifier (v0, v1, ...)
|
|
79
|
+
- `iterations[].parent`: Parent version this was derived from
|
|
80
|
+
- `iterations[].expectation_pass_rate`: Pass rate from grading
|
|
81
|
+
- `iterations[].grading_result`: "baseline", "won", "lost", or "tie"
|
|
82
|
+
- `iterations[].is_current_best`: Whether this is the current best version
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## grading.json
|
|
87
|
+
|
|
88
|
+
Output from the grader agent. Located at `<run-dir>/grading.json`.
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"expectations": [
|
|
93
|
+
{
|
|
94
|
+
"text": "The output includes the name 'John Smith'",
|
|
95
|
+
"passed": true,
|
|
96
|
+
"evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
"text": "The spreadsheet has a SUM formula in cell B10",
|
|
100
|
+
"passed": false,
|
|
101
|
+
"evidence": "No spreadsheet was created. The output was a text file."
|
|
102
|
+
}
|
|
103
|
+
],
|
|
104
|
+
"summary": {
|
|
105
|
+
"passed": 2,
|
|
106
|
+
"failed": 1,
|
|
107
|
+
"total": 3,
|
|
108
|
+
"pass_rate": 0.67
|
|
109
|
+
},
|
|
110
|
+
"execution_metrics": {
|
|
111
|
+
"tool_calls": {
|
|
112
|
+
"Read": 5,
|
|
113
|
+
"Write": 2,
|
|
114
|
+
"Bash": 8
|
|
115
|
+
},
|
|
116
|
+
"total_tool_calls": 15,
|
|
117
|
+
"total_steps": 6,
|
|
118
|
+
"errors_encountered": 0,
|
|
119
|
+
"output_chars": 12450,
|
|
120
|
+
"transcript_chars": 3200
|
|
121
|
+
},
|
|
122
|
+
"timing": {
|
|
123
|
+
"executor_duration_seconds": 165.0,
|
|
124
|
+
"grader_duration_seconds": 26.0,
|
|
125
|
+
"total_duration_seconds": 191.0
|
|
126
|
+
},
|
|
127
|
+
"claims": [
|
|
128
|
+
{
|
|
129
|
+
"claim": "The form has 12 fillable fields",
|
|
130
|
+
"type": "factual",
|
|
131
|
+
"verified": true,
|
|
132
|
+
"evidence": "Counted 12 fields in field_info.json"
|
|
133
|
+
}
|
|
134
|
+
],
|
|
135
|
+
"user_notes_summary": {
|
|
136
|
+
"uncertainties": ["Used 2023 data, may be stale"],
|
|
137
|
+
"needs_review": [],
|
|
138
|
+
"workarounds": ["Fell back to text overlay for non-fillable fields"]
|
|
139
|
+
},
|
|
140
|
+
"eval_feedback": {
|
|
141
|
+
"suggestions": [
|
|
142
|
+
{
|
|
143
|
+
"assertion": "The output includes the name 'John Smith'",
|
|
144
|
+
"reason": "A hallucinated document that mentions the name would also pass"
|
|
145
|
+
}
|
|
146
|
+
],
|
|
147
|
+
"overall": "Assertions check presence but not correctness."
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**Fields:**
|
|
153
|
+
- `expectations[]`: Graded expectations with evidence
|
|
154
|
+
- `summary`: Aggregate pass/fail counts
|
|
155
|
+
- `execution_metrics`: Tool usage and output size (from executor's metrics.json)
|
|
156
|
+
- `timing`: Wall clock timing (from timing.json)
|
|
157
|
+
- `claims`: Extracted and verified claims from the output
|
|
158
|
+
- `user_notes_summary`: Issues flagged by the executor
|
|
159
|
+
- `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## metrics.json
|
|
164
|
+
|
|
165
|
+
Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
|
|
166
|
+
|
|
167
|
+
```json
|
|
168
|
+
{
|
|
169
|
+
"tool_calls": {
|
|
170
|
+
"Read": 5,
|
|
171
|
+
"Write": 2,
|
|
172
|
+
"Bash": 8,
|
|
173
|
+
"Edit": 1,
|
|
174
|
+
"Glob": 2,
|
|
175
|
+
"Grep": 0
|
|
176
|
+
},
|
|
177
|
+
"total_tool_calls": 18,
|
|
178
|
+
"total_steps": 6,
|
|
179
|
+
"files_created": ["filled_form.pdf", "field_values.json"],
|
|
180
|
+
"errors_encountered": 0,
|
|
181
|
+
"output_chars": 12450,
|
|
182
|
+
"transcript_chars": 3200
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**Fields:**
|
|
187
|
+
- `tool_calls`: Count per tool type
|
|
188
|
+
- `total_tool_calls`: Sum of all tool calls
|
|
189
|
+
- `total_steps`: Number of major execution steps
|
|
190
|
+
- `files_created`: List of output files created
|
|
191
|
+
- `errors_encountered`: Number of errors during execution
|
|
192
|
+
- `output_chars`: Total character count of output files
|
|
193
|
+
- `transcript_chars`: Character count of transcript
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## timing.json
|
|
198
|
+
|
|
199
|
+
Wall clock timing for a run. Located at `<run-dir>/timing.json`.
|
|
200
|
+
|
|
201
|
+
**How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact.
|
|
202
|
+
|
|
203
|
+
```json
|
|
204
|
+
{
|
|
205
|
+
"total_tokens": 84852,
|
|
206
|
+
"duration_ms": 23332,
|
|
207
|
+
"total_duration_seconds": 23.3,
|
|
208
|
+
"executor_start": "2026-01-15T10:30:00Z",
|
|
209
|
+
"executor_end": "2026-01-15T10:32:45Z",
|
|
210
|
+
"executor_duration_seconds": 165.0,
|
|
211
|
+
"grader_start": "2026-01-15T10:32:46Z",
|
|
212
|
+
"grader_end": "2026-01-15T10:33:12Z",
|
|
213
|
+
"grader_duration_seconds": 26.0
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## benchmark.json
|
|
220
|
+
|
|
221
|
+
Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
|
|
222
|
+
|
|
223
|
+
```json
|
|
224
|
+
{
|
|
225
|
+
"metadata": {
|
|
226
|
+
"skill_name": "pdf",
|
|
227
|
+
"skill_path": "/path/to/pdf",
|
|
228
|
+
"executor_model": "claude-sonnet-4-20250514",
|
|
229
|
+
"analyzer_model": "most-capable-model",
|
|
230
|
+
"timestamp": "2026-01-15T10:30:00Z",
|
|
231
|
+
"evals_run": [1, 2, 3],
|
|
232
|
+
"runs_per_configuration": 3
|
|
233
|
+
},
|
|
234
|
+
|
|
235
|
+
"runs": [
|
|
236
|
+
{
|
|
237
|
+
"eval_id": 1,
|
|
238
|
+
"eval_name": "Ocean",
|
|
239
|
+
"configuration": "with_skill",
|
|
240
|
+
"run_number": 1,
|
|
241
|
+
"result": {
|
|
242
|
+
"pass_rate": 0.85,
|
|
243
|
+
"passed": 6,
|
|
244
|
+
"failed": 1,
|
|
245
|
+
"total": 7,
|
|
246
|
+
"time_seconds": 42.5,
|
|
247
|
+
"tokens": 3800,
|
|
248
|
+
"tool_calls": 18,
|
|
249
|
+
"errors": 0
|
|
250
|
+
},
|
|
251
|
+
"expectations": [
|
|
252
|
+
{"text": "...", "passed": true, "evidence": "..."}
|
|
253
|
+
],
|
|
254
|
+
"notes": [
|
|
255
|
+
"Used 2023 data, may be stale",
|
|
256
|
+
"Fell back to text overlay for non-fillable fields"
|
|
257
|
+
]
|
|
258
|
+
}
|
|
259
|
+
],
|
|
260
|
+
|
|
261
|
+
"run_summary": {
|
|
262
|
+
"with_skill": {
|
|
263
|
+
"pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
|
|
264
|
+
"time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
|
|
265
|
+
"tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
|
|
266
|
+
},
|
|
267
|
+
"without_skill": {
|
|
268
|
+
"pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
|
|
269
|
+
"time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
|
|
270
|
+
"tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
|
|
271
|
+
},
|
|
272
|
+
"delta": {
|
|
273
|
+
"pass_rate": "+0.50",
|
|
274
|
+
"time_seconds": "+13.0",
|
|
275
|
+
"tokens": "+1700"
|
|
276
|
+
}
|
|
277
|
+
},
|
|
278
|
+
|
|
279
|
+
"notes": [
|
|
280
|
+
"Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
|
|
281
|
+
"Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent",
|
|
282
|
+
"Without-skill runs consistently fail on table extraction expectations",
|
|
283
|
+
"Skill adds 13s average execution time but improves pass rate by 50%"
|
|
284
|
+
]
|
|
285
|
+
}
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
**Fields:**
|
|
289
|
+
- `metadata`: Information about the benchmark run
|
|
290
|
+
- `skill_name`: Name of the skill
|
|
291
|
+
- `timestamp`: When the benchmark was run
|
|
292
|
+
- `evals_run`: List of eval names or IDs
|
|
293
|
+
- `runs_per_configuration`: Number of runs per config (e.g. 3)
|
|
294
|
+
- `runs[]`: Individual run results
|
|
295
|
+
- `eval_id`: Numeric eval identifier
|
|
296
|
+
- `eval_name`: Human-readable eval name (used as section header in the viewer)
|
|
297
|
+
- `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding)
|
|
298
|
+
- `run_number`: Integer run number (1, 2, 3...)
|
|
299
|
+
- `result`: Nested object with `pass_rate`, `passed`, `failed`, `total`, `time_seconds`, `tokens`, `tool_calls`, `errors`
|
|
300
|
+
- `run_summary`: Statistical aggregates per configuration
|
|
301
|
+
- `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean`, `stddev`, `min`, and `max` fields
|
|
302
|
+
- `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"`
|
|
303
|
+
- `notes`: Freeform observations from the analyzer
|
|
304
|
+
|
|
305
|
+
**Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually.
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## comparison.json
|
|
310
|
+
|
|
311
|
+
Output from the blind comparator. Located at `<grading-dir>/comparison-N.json`.
|
|
312
|
+
|
|
313
|
+
```json
|
|
314
|
+
{
|
|
315
|
+
"winner": "A",
|
|
316
|
+
"reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
|
|
317
|
+
"rubric": {
|
|
318
|
+
"A": {
|
|
319
|
+
"content": {
|
|
320
|
+
"correctness": 5,
|
|
321
|
+
"completeness": 5,
|
|
322
|
+
"accuracy": 4
|
|
323
|
+
},
|
|
324
|
+
"structure": {
|
|
325
|
+
"organization": 4,
|
|
326
|
+
"formatting": 5,
|
|
327
|
+
"usability": 4
|
|
328
|
+
},
|
|
329
|
+
"content_score": 4.7,
|
|
330
|
+
"structure_score": 4.3,
|
|
331
|
+
"overall_score": 9.0
|
|
332
|
+
},
|
|
333
|
+
"B": {
|
|
334
|
+
"content": {
|
|
335
|
+
"correctness": 3,
|
|
336
|
+
"completeness": 2,
|
|
337
|
+
"accuracy": 3
|
|
338
|
+
},
|
|
339
|
+
"structure": {
|
|
340
|
+
"organization": 3,
|
|
341
|
+
"formatting": 2,
|
|
342
|
+
"usability": 3
|
|
343
|
+
},
|
|
344
|
+
"content_score": 2.7,
|
|
345
|
+
"structure_score": 2.7,
|
|
346
|
+
"overall_score": 5.4
|
|
347
|
+
}
|
|
348
|
+
},
|
|
349
|
+
"output_quality": {
|
|
350
|
+
"A": {
|
|
351
|
+
"score": 9,
|
|
352
|
+
"strengths": ["Complete solution", "Well-formatted", "All fields present"],
|
|
353
|
+
"weaknesses": ["Minor style inconsistency in header"]
|
|
354
|
+
},
|
|
355
|
+
"B": {
|
|
356
|
+
"score": 5,
|
|
357
|
+
"strengths": ["Readable output", "Correct basic structure"],
|
|
358
|
+
"weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
|
|
359
|
+
}
|
|
360
|
+
},
|
|
361
|
+
"expectation_results": {
|
|
362
|
+
"A": {
|
|
363
|
+
"passed": 4,
|
|
364
|
+
"total": 5,
|
|
365
|
+
"pass_rate": 0.80,
|
|
366
|
+
"details": [
|
|
367
|
+
{"text": "Output includes name", "passed": true}
|
|
368
|
+
]
|
|
369
|
+
},
|
|
370
|
+
"B": {
|
|
371
|
+
"passed": 3,
|
|
372
|
+
"total": 5,
|
|
373
|
+
"pass_rate": 0.60,
|
|
374
|
+
"details": [
|
|
375
|
+
{"text": "Output includes name", "passed": true}
|
|
376
|
+
]
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
---
|
|
383
|
+
|
|
384
|
+
## analysis.json
|
|
385
|
+
|
|
386
|
+
Output from the post-hoc analyzer. Located at `<grading-dir>/analysis.json`.
|
|
387
|
+
|
|
388
|
+
```json
|
|
389
|
+
{
|
|
390
|
+
"comparison_summary": {
|
|
391
|
+
"winner": "A",
|
|
392
|
+
"winner_skill": "path/to/winner/skill",
|
|
393
|
+
"loser_skill": "path/to/loser/skill",
|
|
394
|
+
"comparator_reasoning": "Brief summary of why comparator chose winner"
|
|
395
|
+
},
|
|
396
|
+
"winner_strengths": [
|
|
397
|
+
"Clear step-by-step instructions for handling multi-page documents",
|
|
398
|
+
"Included validation script that caught formatting errors"
|
|
399
|
+
],
|
|
400
|
+
"loser_weaknesses": [
|
|
401
|
+
"Vague instruction 'process the document appropriately' led to inconsistent behavior",
|
|
402
|
+
"No script for validation, agent had to improvise"
|
|
403
|
+
],
|
|
404
|
+
"instruction_following": {
|
|
405
|
+
"winner": {
|
|
406
|
+
"score": 9,
|
|
407
|
+
"issues": ["Minor: skipped optional logging step"]
|
|
408
|
+
},
|
|
409
|
+
"loser": {
|
|
410
|
+
"score": 6,
|
|
411
|
+
"issues": [
|
|
412
|
+
"Did not use the skill's formatting template",
|
|
413
|
+
"Invented own approach instead of following step 3"
|
|
414
|
+
]
|
|
415
|
+
}
|
|
416
|
+
},
|
|
417
|
+
"improvement_suggestions": [
|
|
418
|
+
{
|
|
419
|
+
"priority": "high",
|
|
420
|
+
"category": "instructions",
|
|
421
|
+
"suggestion": "Replace 'process the document appropriately' with explicit steps",
|
|
422
|
+
"expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
|
|
423
|
+
}
|
|
424
|
+
],
|
|
425
|
+
"transcript_insights": {
|
|
426
|
+
"winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script",
|
|
427
|
+
"loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods"
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
```
|