@intentsolutionsio/skill-creator 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +17 -0
- package/README.md +55 -0
- package/package.json +38 -0
- package/scripts/validate-skill.py +1132 -0
- package/skills/agent-creator/SKILL.md +305 -0
- package/skills/agent-creator/references/anthropic-agent-spec.md +89 -0
- package/skills/skill-creator/SKILL.md +267 -0
- package/skills/skill-creator/agents/analyzer.md +279 -0
- package/skills/skill-creator/agents/comparator.md +207 -0
- package/skills/skill-creator/agents/grader.md +228 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/advanced-eval-workflow.md +320 -0
- package/skills/skill-creator/references/anthropic-comparison.md +93 -0
- package/skills/skill-creator/references/ard-template.md +47 -0
- package/skills/skill-creator/references/creation-guide.md +305 -0
- package/skills/skill-creator/references/errors-template.md +27 -0
- package/skills/skill-creator/references/examples-template.md +40 -0
- package/skills/skill-creator/references/frontmatter-spec.md +531 -0
- package/skills/skill-creator/references/implementation-template.md +42 -0
- package/skills/skill-creator/references/output-patterns.md +193 -0
- package/skills/skill-creator/references/prd-template.md +55 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/references/source-of-truth.md +658 -0
- package/skills/skill-creator/references/validation-rules.md +528 -0
- package/skills/skill-creator/references/workflows.md +233 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +247 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +344 -0
- package/skills/skill-creator/scripts/run_loop.py +329 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/skills/skill-creator/scripts/validate-skill.py +87 -0
- package/skills/skill-creator/templates/agent-template.md +99 -0
- package/skills/skill-creator/templates/skill-template.md +122 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# Skill Workflow Patterns
|
|
2
|
+
|
|
3
|
+
Reference for common workflow patterns in Claude Agent Skills.
|
|
4
|
+
Source: Anthropic best practices, official skill-creator patterns.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Sequential Workflow
|
|
9
|
+
|
|
10
|
+
Steps execute in a fixed order. This is the simplest pattern.
|
|
11
|
+
|
|
12
|
+
```markdown
|
|
13
|
+
## Instructions
|
|
14
|
+
|
|
15
|
+
### Step 1: Gather Input
|
|
16
|
+
Read the target file and extract relevant data.
|
|
17
|
+
|
|
18
|
+
### Step 2: Process
|
|
19
|
+
Transform the data according to the rules.
|
|
20
|
+
|
|
21
|
+
### Step 3: Output
|
|
22
|
+
Write the result to the specified location.
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
**Best for**: File conversion, data transformation, build scripts.
|
|
26
|
+
**Degrees of freedom**: Low (fixed steps, predictable output).
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Conditional Workflow
|
|
31
|
+
|
|
32
|
+
Branch execution based on input or context.
|
|
33
|
+
|
|
34
|
+
```markdown
|
|
35
|
+
## Instructions
|
|
36
|
+
|
|
37
|
+
### Step 1: Analyze Input
|
|
38
|
+
Determine the type of input:
|
|
39
|
+
- If markdown file: proceed to Step 2a
|
|
40
|
+
- If HTML file: proceed to Step 2b
|
|
41
|
+
- If unknown: ask user with AskUserQuestion
|
|
42
|
+
|
|
43
|
+
### Step 2a: Process Markdown
|
|
44
|
+
Convert markdown using pandoc conventions...
|
|
45
|
+
|
|
46
|
+
### Step 2b: Process HTML
|
|
47
|
+
Parse HTML and extract content...
|
|
48
|
+
|
|
49
|
+
### Step 3: Output
|
|
50
|
+
Write the converted result.
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Best for**: Skills handling multiple input types, multi-purpose tools.
|
|
54
|
+
**Degrees of freedom**: Medium (defined branches, flexible within each).
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Wizard-Style Workflow
|
|
59
|
+
|
|
60
|
+
Interactive multi-step gathering using AskUserQuestion.
|
|
61
|
+
|
|
62
|
+
```markdown
|
|
63
|
+
## Instructions
|
|
64
|
+
|
|
65
|
+
### Step 1: Gather Requirements
|
|
66
|
+
Ask the user:
|
|
67
|
+
1. What is the project name?
|
|
68
|
+
2. Which framework? (React / Vue / Svelte)
|
|
69
|
+
3. Include testing? (Yes / No)
|
|
70
|
+
|
|
71
|
+
Use AskUserQuestion for each decision point.
|
|
72
|
+
|
|
73
|
+
### Step 2: Generate Based on Choices
|
|
74
|
+
Based on responses, generate the appropriate scaffold.
|
|
75
|
+
|
|
76
|
+
### Step 3: Verify
|
|
77
|
+
Show the user what was created and confirm.
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Best for**: Complex setup, project scaffolding, configuration generation.
|
|
81
|
+
**Degrees of freedom**: Medium (user drives decisions).
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Plan-Validate-Execute
|
|
86
|
+
|
|
87
|
+
Verifiable intermediates with feedback loops. Anthropic's recommended pattern for high-stakes tasks.
|
|
88
|
+
|
|
89
|
+
```markdown
|
|
90
|
+
## Instructions
|
|
91
|
+
|
|
92
|
+
### Step 1: Plan
|
|
93
|
+
Analyze the current state and create an execution plan.
|
|
94
|
+
Show the plan to the user before proceeding.
|
|
95
|
+
|
|
96
|
+
### Step 2: Validate Plan
|
|
97
|
+
Check each planned step for:
|
|
98
|
+
- Prerequisites met
|
|
99
|
+
- No conflicts with existing state
|
|
100
|
+
- Reversibility if something goes wrong
|
|
101
|
+
|
|
102
|
+
### Step 3: Execute
|
|
103
|
+
Execute each step, verifying success before proceeding:
|
|
104
|
+
1. Execute step → verify → continue
|
|
105
|
+
2. If verification fails → rollback → report
|
|
106
|
+
|
|
107
|
+
### Step 4: Report
|
|
108
|
+
Summarize what was done, what succeeded, and any issues.
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Best for**: Deployment, migration, refactoring, any destructive operation.
|
|
112
|
+
**Degrees of freedom**: Low (strict verification at each stage).
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Feedback Loop Workflow
|
|
117
|
+
|
|
118
|
+
Iterative refinement until the quality threshold is met.
|
|
119
|
+
|
|
120
|
+
```markdown
|
|
121
|
+
## Instructions
|
|
122
|
+
|
|
123
|
+
### Step 1: Generate Initial Output
|
|
124
|
+
Create the first draft/version.
|
|
125
|
+
|
|
126
|
+
### Step 2: Evaluate Quality
|
|
127
|
+
Check against criteria:
|
|
128
|
+
- [ ] Criterion A met?
|
|
129
|
+
- [ ] Criterion B met?
|
|
130
|
+
- [ ] Criterion C met?
|
|
131
|
+
|
|
132
|
+
### Step 3: Iterate if Needed
|
|
133
|
+
If any criterion is not met:
|
|
134
|
+
1. Identify the gap
|
|
135
|
+
2. Refine the output
|
|
136
|
+
3. Return to Step 2
|
|
137
|
+
|
|
138
|
+
Run at most 3 iterations. If the output still does not pass, report the remaining issues.
|
|
139
|
+
|
|
140
|
+
### Step 4: Finalize
|
|
141
|
+
Output the final version with quality report.
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Best for**: Content generation, code quality, optimization tasks.
|
|
145
|
+
**Degrees of freedom**: Medium (defined criteria, flexible refinement).
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Search-Analyze-Report
|
|
150
|
+
|
|
151
|
+
Codebase exploration with structured analysis.
|
|
152
|
+
|
|
153
|
+
```markdown
|
|
154
|
+
## Instructions
|
|
155
|
+
|
|
156
|
+
### Step 1: Search
|
|
157
|
+
Use Glob and Grep to find relevant files:
|
|
158
|
+
- Pattern: `**/*.py`
|
|
159
|
+
- Search: `def.*deprecated`
|
|
160
|
+
|
|
161
|
+
### Step 2: Analyze
|
|
162
|
+
For each finding:
|
|
163
|
+
- Context (what file, what function)
|
|
164
|
+
- Severity (critical / warning / info)
|
|
165
|
+
- Recommended action
|
|
166
|
+
|
|
167
|
+
### Step 3: Report
|
|
168
|
+
Generate structured report:
|
|
169
|
+
|
|
170
|
+
| File | Line | Issue | Severity | Fix |
|
|
171
|
+
|------|------|-------|----------|-----|
|
|
172
|
+
| ... | ... | ... | ... | ... |
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Best for**: Code review, security audit, dependency analysis, codebase understanding.
|
|
176
|
+
**Degrees of freedom**: High (Claude decides what's relevant).
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Checklist Workflow
|
|
181
|
+
|
|
182
|
+
Copy-pasteable progress tracking for complex multi-step processes. Claude checks items off as it completes them.
|
|
183
|
+
|
|
184
|
+
```markdown
|
|
185
|
+
## Instructions
|
|
186
|
+
|
|
187
|
+
### Progress Checklist
|
|
188
|
+
- [ ] Step 1: Gather requirements
|
|
189
|
+
- [ ] Step 2: Validate inputs
|
|
190
|
+
- [ ] Step 3: Execute primary operation
|
|
191
|
+
- [ ] Step 4: Run verification checks
|
|
192
|
+
- [ ] Step 5: Generate report
|
|
193
|
+
|
|
194
|
+
### Step 1: Gather Requirements
|
|
195
|
+
{{GATHER_INSTRUCTIONS}}
|
|
196
|
+
Update checklist: mark Step 1 complete.
|
|
197
|
+
|
|
198
|
+
### Step 2: Validate Inputs
|
|
199
|
+
{{VALIDATION_INSTRUCTIONS}}
|
|
200
|
+
Update checklist: mark Step 2 complete.
|
|
201
|
+
|
|
202
|
+
### Step 3: Execute
|
|
203
|
+
{{EXECUTION_INSTRUCTIONS}}
|
|
204
|
+
Update checklist: mark Step 3 complete.
|
|
205
|
+
|
|
206
|
+
### Step 4: Verify
|
|
207
|
+
{{VERIFICATION_INSTRUCTIONS}}
|
|
208
|
+
Update checklist: mark Step 4 complete.
|
|
209
|
+
|
|
210
|
+
### Step 5: Report
|
|
211
|
+
Show completed checklist with summary of each step's outcome.
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
**Best for**: Multi-step processes where progress visibility matters (releases, migrations, audits).
|
|
215
|
+
**Degrees of freedom**: Low to Medium (defined steps, flexible execution within each).
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Choosing the Right Pattern
|
|
220
|
+
|
|
221
|
+
| If your skill... | Use this pattern |
|
|
222
|
+
|-----------------|-----------------|
|
|
223
|
+
| Does one thing in fixed order | Sequential |
|
|
224
|
+
| Handles different input types | Conditional |
|
|
225
|
+
| Needs user decisions | Wizard-Style |
|
|
226
|
+
| Does something risky/irreversible | Plan-Validate-Execute |
|
|
227
|
+
| Needs iterative quality | Feedback Loop |
|
|
228
|
+
| Explores and reports on code | Search-Analyze-Report |
|
|
229
|
+
| Needs visible progress tracking | Checklist |
|
|
230
|
+
| Combines multiple concerns | Compose patterns together |
|
|
231
|
+
|
|
232
|
+
Patterns can be composed. A deployment skill might use:
|
|
233
|
+
Wizard (gather config) → Plan-Validate-Execute (deploy) → Feedback Loop (health check).
|
|
File without changes
|
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Aggregate individual run results into benchmark summary statistics.
|
|
4
|
+
|
|
5
|
+
Reads grading.json files from run directories and produces:
|
|
6
|
+
- run_summary with mean, stddev, min, max for each metric
|
|
7
|
+
- delta between with_skill and without_skill configurations
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python aggregate_benchmark.py <benchmark_dir> [--skill-name NAME] [--skill-path PATH] [--output FILE]
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
|
|
14
|
+
|
|
15
|
+
The script supports two directory layouts:
|
|
16
|
+
|
|
17
|
+
Workspace layout (from skill-creator iterations):
|
|
18
|
+
<benchmark_dir>/
|
|
19
|
+
└── eval-N/
|
|
20
|
+
├── with_skill/
|
|
21
|
+
│ ├── run-1/grading.json
|
|
22
|
+
│ └── run-2/grading.json
|
|
23
|
+
└── without_skill/
|
|
24
|
+
├── run-1/grading.json
|
|
25
|
+
└── run-2/grading.json
|
|
26
|
+
|
|
27
|
+
Legacy layout (with runs/ subdirectory):
|
|
28
|
+
<benchmark_dir>/
|
|
29
|
+
└── runs/
|
|
30
|
+
└── eval-N/
|
|
31
|
+
├── with_skill/
|
|
32
|
+
│ └── run-1/grading.json
|
|
33
|
+
└── without_skill/
|
|
34
|
+
└── run-1/grading.json
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import argparse
|
|
38
|
+
import json
|
|
39
|
+
import math
|
|
40
|
+
import sys
|
|
41
|
+
from datetime import datetime, timezone
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def calculate_stats(values: list[float]) -> dict:
    """Summarize a sample as mean, sample standard deviation, min and max.

    The standard deviation uses the n - 1 denominator and is reported as
    0.0 when fewer than two values are supplied. An empty input yields
    0.0 for every field. All figures are rounded to four decimal places.
    """
    if not values:
        return dict.fromkeys(("mean", "stddev", "min", "max"), 0.0)

    count = len(values)
    average = sum(values) / count

    spread = 0.0
    if count > 1:
        squared_deviations = [(value - average) ** 2 for value in values]
        spread = math.sqrt(sum(squared_deviations) / (count - 1))

    unrounded = {
        "mean": average,
        "stddev": spread,
        "min": min(values),
        "max": max(values),
    }
    return {field: round(number, 4) for field, number in unrounded.items()}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Eval directories (``eval-*``) are looked up under ``<benchmark_dir>/runs``
    when that path exists, otherwise directly under ``benchmark_dir``. Every
    sub-directory of an eval directory that contains at least one ``run-*``
    directory is treated as a configuration.

    Args:
        benchmark_dir: Root directory of the benchmark.

    Returns:
        Dict keyed by config directory name (e.g. "with_skill"/"without_skill",
        or "new_skill"/"old_skill"), each containing a list of run result
        dicts with the keys eval_id, run_number, pass_rate, passed, failed,
        total, time_seconds, tokens, tool_calls, errors, expectations, notes.
        Returns an empty dict when no eval directories are found.

    Unreadable or malformed files are reported with a printed warning and
    skipped rather than aborting the whole aggregation.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
        # The glob also matches stray files such as "eval-notes.txt".
        if not eval_dir.is_dir():
            continue

        metadata_path = eval_dir / "eval_metadata.json"
        if metadata_path.exists():
            try:
                with open(metadata_path, encoding="utf-8") as mf:
                    metadata = json.load(mf)
            except (ValueError, OSError):
                # ValueError covers both invalid JSON and invalid UTF-8.
                metadata = None
            if isinstance(metadata, dict):
                eval_id = metadata.get("eval_id", eval_idx)
            else:
                eval_id = eval_idx
        else:
            try:
                eval_id = int(eval_dir.name.split("-")[1])
            except ValueError:
                eval_id = eval_idx

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.) and ignore
            # plain files that merely happen to be named run-*.
            run_dirs = sorted(p for p in config_dir.glob("run-*") if p.is_dir())
            if not run_dirs:
                continue
            config_results = results.setdefault(config_dir.name, [])

            for position, run_dir in enumerate(run_dirs, start=1):
                try:
                    run_number = int(run_dir.name.split("-")[1])
                except ValueError:
                    # Mirror the eval-id fallback instead of crashing on
                    # names like "run-final".
                    print(
                        f"Warning: non-numeric run directory name {run_dir.name!r}; "
                        f"using position {position} as run number"
                    )
                    run_number = position

                grading_file = run_dir / "grading.json"
                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                try:
                    with open(grading_file, encoding="utf-8") as f:
                        grading = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Warning: Invalid JSON in {grading_file}: {e}")
                    continue
                except (OSError, UnicodeDecodeError) as e:
                    print(f"Warning: Could not read {grading_file}: {e}")
                    continue
                if not isinstance(grading, dict):
                    print(f"Warning: {grading_file} does not contain a JSON object")
                    continue

                # "or {}" also covers sections that are present but null.
                summary = grading.get("summary") or {}
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = grading.get("timing") or {}
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    try:
                        with open(timing_file, encoding="utf-8") as tf:
                            timing_data = json.load(tf)
                    except (ValueError, OSError) as e:
                        # Timing is best-effort: report the problem, keep the run.
                        print(f"Warning: Could not read {timing_file}: {e}")
                    else:
                        if isinstance(timing_data, dict):
                            result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                            result["tokens"] = timing_data.get("total_tokens", 0)

                # Extract execution metrics if available
                metrics = grading.get("execution_metrics") or {}
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    # No token count from timing.json: fall back to output_chars.
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations") or []
                for exp in raw_expectations:
                    if not isinstance(exp, dict) or "text" not in exp or "passed" not in exp:
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Flatten the user_notes_summary categories into one list
                notes_summary = grading.get("user_notes_summary") or {}
                notes = []
                for category in ("uncertainties", "needs_review", "workarounds"):
                    notes.extend(notes_summary.get(category) or [])
                result["notes"] = notes

                config_results.append(result)

    return results
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def aggregate_results(results: dict) -> dict:
    """
    Collapse per-run results into per-configuration statistics.

    For every configuration in ``results`` the pass rate, duration and token
    count of its runs are summarized with ``calculate_stats``; a configuration
    without runs gets all-zero statistics. A final "delta" entry holds the
    signed difference of the means between the first and the second
    configuration (in the dict's own order), formatted as strings. When fewer
    than two configurations exist the missing side counts as zero.
    """
    def mean_of(stats_by_metric: dict, metric: str):
        return stats_by_metric.get(metric, {}).get("mean", 0)

    summary = {}
    for name, config_runs in results.items():
        if config_runs:
            summary[name] = {
                "pass_rate": calculate_stats([run["pass_rate"] for run in config_runs]),
                "time_seconds": calculate_stats([run["time_seconds"] for run in config_runs]),
                "tokens": calculate_stats([run.get("tokens", 0) for run in config_runs]),
            }
        else:
            summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0},
            }

    # The first config is the primary one, the second (if any) is the baseline.
    names = list(results)
    primary = summary[names[0]] if names else {}
    baseline = summary[names[1]] if len(names) >= 2 else {}

    pass_rate_gap = mean_of(primary, "pass_rate") - mean_of(baseline, "pass_rate")
    time_gap = mean_of(primary, "time_seconds") - mean_of(baseline, "time_seconds")
    token_gap = mean_of(primary, "tokens") - mean_of(baseline, "tokens")

    summary["delta"] = {
        "pass_rate": f"{pass_rate_gap:+.2f}",
        "time_seconds": f"{time_gap:+.1f}",
        "tokens": f"{token_gap:+.0f}",
    }
    return summary
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json content from run results.

    Args:
        benchmark_dir: Directory passed to ``load_run_results``.
        skill_name: Name stored in the metadata; a placeholder is used when empty.
        skill_path: Path stored in the metadata; a placeholder is used when empty.

    Returns:
        Dict with the keys "metadata", "runs" (one entry per loaded run),
        "run_summary" (output of ``aggregate_results``) and "notes" (empty,
        to be filled by the analyzer).
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json and count the runs that were
    # actually loaded for each (configuration, eval) pair along the way.
    runs = []
    run_counts: dict[tuple, int] = {}
    for config, config_results in results.items():
        for result in config_results:
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })
            cell = (config, result["eval_id"])
            run_counts[cell] = run_counts.get(cell, 0) + 1

    # Eval ids may be ints (from directory names) or arbitrary JSON values
    # (from eval_metadata.json); sort numbers first so mixed types cannot
    # raise TypeError.
    eval_ids = sorted(
        {r["eval_id"] for config_results in results.values() for r in config_results},
        key=lambda v: (0, v, "") if isinstance(v, (int, float)) else (1, 0, str(v)),
    )

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            # Derived from the loaded data instead of a fixed 3, so the
            # report does not misstate how many runs back the statistics.
            "runs_per_configuration": max(run_counts.values(), default=0)
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def generate_markdown(benchmark: dict) -> str:
    """Render the benchmark dict as the text of a human-readable benchmark.md.

    The report has a header taken from ``benchmark["metadata"]``, a summary
    table comparing the first two configurations found in
    ``benchmark["run_summary"]`` (placeholders are used for missing ones),
    and a bulleted "Notes" section when ``benchmark["notes"]`` is non-empty.
    """
    meta = benchmark["metadata"]
    summary = benchmark["run_summary"]

    # Every key except "delta" names a configuration.
    names = [key for key in summary if key != "delta"]
    first = names[0] if names else "config_a"
    second = names[1] if len(names) > 1 else "config_b"
    first_label = first.replace("_", " ").title()
    second_label = second.replace("_", " ").title()

    out = [f"# Skill Benchmark: {meta['skill_name']}", ""]
    out.append(f"**Model**: {meta['executor_model']}")
    out.append(f"**Date**: {meta['timestamp']}")
    evals_text = ", ".join(str(eval_id) for eval_id in meta["evals_run"])
    out.append(f"**Evals**: {evals_text} ({meta['runs_per_configuration']} runs each per configuration)")
    out += [
        "",
        "## Summary",
        "",
        f"| Metric | {first_label} | {second_label} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    left = summary.get(first, {})
    right = summary.get(second, {})
    deltas = summary.get("delta", {})

    # Pass rate row: means and deviations shown as whole percentages.
    left_pr = left.get("pass_rate", {})
    right_pr = right.get("pass_rate", {})
    left_pr_mean = left_pr.get("mean", 0) * 100
    left_pr_dev = left_pr.get("stddev", 0) * 100
    right_pr_mean = right_pr.get("mean", 0) * 100
    right_pr_dev = right_pr.get("stddev", 0) * 100
    pr_delta = deltas.get("pass_rate", "—")
    out.append(
        f"| Pass Rate | {left_pr_mean:.0f}% ± {left_pr_dev:.0f}% "
        f"| {right_pr_mean:.0f}% ± {right_pr_dev:.0f}% | {pr_delta} |"
    )

    # Time row: seconds with one decimal.
    left_time = left.get("time_seconds", {})
    right_time = right.get("time_seconds", {})
    left_time_mean = left_time.get("mean", 0)
    left_time_dev = left_time.get("stddev", 0)
    right_time_mean = right_time.get("mean", 0)
    right_time_dev = right_time.get("stddev", 0)
    time_delta = deltas.get("time_seconds", "—")
    out.append(
        f"| Time | {left_time_mean:.1f}s ± {left_time_dev:.1f}s "
        f"| {right_time_mean:.1f}s ± {right_time_dev:.1f}s | {time_delta}s |"
    )

    # Tokens row: whole numbers.
    left_tok = left.get("tokens", {})
    right_tok = right.get("tokens", {})
    left_tok_mean = left_tok.get("mean", 0)
    left_tok_dev = left_tok.get("stddev", 0)
    right_tok_mean = right_tok.get("mean", 0)
    right_tok_dev = right_tok.get("stddev", 0)
    tok_delta = deltas.get("tokens", "—")
    out.append(
        f"| Tokens | {left_tok_mean:.0f} ± {left_tok_dev:.0f} "
        f"| {right_tok_mean:.0f} ± {right_tok_dev:.0f} | {tok_delta} |"
    )

    notes = benchmark.get("notes")
    if notes:
        out += ["", "## Notes", ""]
        out += [f"- {note}" for note in notes]

    return "\n".join(out)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def main():
    """Command-line entry point.

    Parses the arguments, builds the benchmark with ``generate_benchmark``,
    writes it as JSON plus a Markdown rendering next to it, and prints a
    short pass-rate summary. Exits with status 1 when the benchmark
    directory does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    # is_dir() is also False for a missing path, so this rejects both a
    # non-existent location and a regular file.
    if not args.benchmark_dir.is_dir():
        print(f"Directory not found: {args.benchmark_dir}", file=sys.stderr)
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")
    if output_md == output_json:
        # --output already ends in ".md": keep the Markdown report from
        # overwriting the JSON file that was just written.
        output_md = output_json.with_name(output_json.name + ".md")

    # Write benchmark.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md — explicit UTF-8 because the report contains
    # non-ASCII characters that a locale default encoding may not support.
    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f"  {label}: {pr*100:.1f}% pass rate")
    print(f"  Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()
|