@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
package/commands/benchmark-control.md
@@ -0,0 +1,69 @@
---
description: Create control baseline for a scenario (shortcut for /benchmark control <agent>)
argument-hint: <agent> [--scenario <name>] [--runs N]
---

# Benchmark Control

<purpose>
Shortcut to run `/benchmark` with the `control` theme. Creates or extends a control baseline for comparing other personas against.

This is equivalent to running:
```
/benchmark control <agent> [--scenario <name>] [--runs N]
```

Default: 10 runs for statistically meaningful baseline data.
</purpose>

<critical-integrity-requirements>
**See `/benchmark` for full integrity requirements.**

Baselines are saved to `internal/results/baselines/{scenario}/{agent}/` with:
- Individual runs in `runs/*.json` with proof-of-work
- Summary statistics in `summary.yaml` (mean, std_dev, CI)
- Timestamp validation (runs must take 30+ seconds each)

Control theme runs must include all proof fields. NO FABRICATION.
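For orientation, the summary statistics listed above reduce to a short computation. A minimal sketch, assuming one total score per run; the exact `summary.yaml` layout is owned by `/benchmark`, so treat the returned field names as illustrative:

```python
import math
import statistics

def summarize(scores: list[float]) -> dict:
    """Baseline summary stats: mean, std_dev, and a 95% CI of the mean."""
    n = len(scores)
    mean = statistics.mean(scores)
    std_dev = statistics.stdev(scores) if n > 1 else 0.0
    margin = 1.96 * std_dev / math.sqrt(n)  # normal approximation
    return {"sample_size": n, "mean": mean, "std_dev": std_dev,
            "ci_95": [mean - margin, mean + margin]}
```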
</critical-integrity-requirements>

<usage>
```
# Pick scenario interactively
/benchmark-control sm
/benchmark-control reviewer

# Specify scenario directly
/benchmark-control reviewer --scenario order-service
/benchmark-control dev --scenario tdd-shopping-cart --runs 15
```

**Arguments:**
- `agent` - The agent role (e.g., `sm`, `dev`, `reviewer`, `architect`)
- `--scenario` - (Optional) Scenario name. If omitted, shows matching scenarios.
- `--runs N` - Number of runs (default: 10 for baselines, max: 20)
</usage>

<on-invoke>
The user invoked this command with: $ARGUMENTS

**This is a shortcut.** Translate the arguments and invoke `/benchmark`:

1. Prepend `control` as the theme
2. Pass through all other arguments

**Examples:**
- `/benchmark-control sm` → `/benchmark control sm` (runs control:sm)
- `/benchmark-control reviewer --scenario order-service` → `/benchmark control reviewer --scenario order-service` (runs control:reviewer)
- `/benchmark-control dev --runs 15` → `/benchmark control dev --runs 15` (runs control:dev)

**Default runs override:** If `--runs` is not specified, default to 10 (instead of 4) since baselines need more data.
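A minimal sketch of this translation (a hypothetical helper; the command itself performs the step in prose):

```python
def to_benchmark_cmd(arguments: str) -> str:
    """Prepend the control theme; default --runs to 10 if absent."""
    args = arguments.strip()
    if "--runs" not in args:
        args = f"{args} --runs 10".strip()
    return f"/benchmark control {args}"

# /benchmark-control sm            -> "/benchmark control sm --runs 10"
# /benchmark-control dev --runs 15 -> "/benchmark control dev --runs 15"
```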
Now execute the equivalent `/benchmark` command with the translated arguments.
</on-invoke>

<reference>
- Main command: `.claude/project/commands/benchmark.md`
- Baselines location: `internal/results/baselines/{scenario}/{role}/`
- Results README: `internal/results/README.md`
</reference>
package/commands/benchmark.md
@@ -0,0 +1,485 @@
---
description: Compare an agent's performance against a stored baseline
argument-hint: <theme> <agent> [--as <role>] [--scenario <name>] [--runs N]
---

# Benchmark

<purpose>
Compare a persona agent's performance against the established control baseline. Runs the agent on the scenario and calculates statistical measures including effect size (Cohen's d) and significance.

Default: 4 runs for comparison (a balance between reliability and runtime). Runs execute in parallel for faster results.

**Simplified Usage:** Just specify theme and agent role - you'll be presented with matching scenarios to choose from.
</purpose>

<critical-integrity-requirements>
## DO NOT FABRICATE COMPARISON DATA

Comparisons are only meaningful if BOTH the baseline AND the contestant runs are real.

**Before comparing:**
1. Validate the baseline has proof-of-work (check runs have `proof.*` fields)
2. Actually run `/solo` for the contestant with real Task tool calls
3. Validate contestant runs have proof-of-work before calculating statistics

**Baseline Validation:**
Before using a baseline, spot-check at least one run file:
- Read a run from `internal/results/baselines/{scenario}/{agent}/runs/*.json`
- Verify it has `proof.agent_task_id`, `proof.agent_response_text`, `proof.judge_task_id`
- Verify `proof.agent_response_text` is at least 200 characters
- Verify `token_usage.input_tokens` > 0

**If baseline validation fails:**
```markdown
Error: Baseline for '{scenario}' appears to be fabricated (missing proof-of-work).
Run `/benchmark-control {agent} --scenario {scenario}` to create a real baseline.
```

**Contestant runs MUST include proof-of-work.** See `/solo` for requirements.
</critical-integrity-requirements>
<usage>
```
# Simple: Pick scenario interactively
/benchmark the-expanse sm
/benchmark discworld reviewer

# Direct: Specify scenario explicitly
/benchmark discworld reviewer --scenario order-service
/benchmark ted-lasso dev --scenario tdd-shopping-cart --runs 8

# Cross-role: Run any character as any role
/benchmark shakespeare prospero --as dev --scenario django-10554
/benchmark discworld granny --as dev --scenario tdd-shopping-cart
```

**Arguments:**
- `theme` - The persona theme (e.g., `discworld`, `the-expanse`, `ted-lasso`)
- `agent` - The agent role OR character name (if using `--as`)
- `--as <role>` - (Optional) Override role for cross-role testing. Makes `agent` a character name lookup.
- `--scenario` - (Optional) Scenario name. If omitted, shows matching scenarios to choose from.
- `--runs N` - Number of evaluation runs (default: 4, max: 20)

**Cross-Role Testing:**
The `--as` flag enables running any character as any role:
```
/benchmark shakespeare prospero --as dev --scenario django-10554
```
This uses Prospero's persona traits (wise orchestrator) but gives him a dev task.
The scenario's role determines what the agent is asked to do; the character determines HOW they do it.

**Examples:**
```
# Let me pick from SM scenarios
/benchmark the-expanse sm

# Let me pick from code review scenarios
/benchmark discworld reviewer

# Run a specific scenario directly
/benchmark princess-bride reviewer --scenario order-service --runs 8

# Cross-role: Prospero (SM) doing dev work
/benchmark shakespeare prospero --as dev --scenario tdd-shopping-cart --runs 4
```
</usage>
<on-invoke>
The user invoked this command with: $ARGUMENTS

## Step 1: Parse Arguments

Parse the arguments to extract (a parsing sketch follows the list):
- `theme`: First positional argument (e.g., `discworld`, `the-expanse`)
- `agent_or_character`: Second positional argument (role name OR character name if `--as` is used)
- `role_override`: Value after `--as` (OPTIONAL - enables cross-role mode)
- `scenario_name`: Value after `--scenario` (OPTIONAL)
- `runs`: Value after `--runs` (default: 4, max: 20)
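A minimal parsing sketch under the rules above (a hypothetical helper, not shipped with the package; well-formed input assumed):

```python
import shlex

def parse_benchmark_args(arguments: str) -> dict:
    """Split $ARGUMENTS into theme, agent/character, and flags."""
    tokens = shlex.split(arguments)
    parsed = {"theme": None, "agent_or_character": None,
              "role_override": None, "scenario_name": None, "runs": 4}
    positionals = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok == "--as":
            parsed["role_override"] = tokens[i + 1]
            i += 2
        elif tok == "--scenario":
            parsed["scenario_name"] = tokens[i + 1]
            i += 2
        elif tok == "--runs":
            parsed["runs"] = int(tokens[i + 1])
            i += 2
        else:
            positionals.append(tok)
            i += 1
    # Legacy format: "discworld:reviewer" as a single first argument
    if positionals and ":" in positionals[0]:
        parsed["theme"], parsed["agent_or_character"] = positionals[0].split(":", 1)
    elif len(positionals) >= 2:
        parsed["theme"], parsed["agent_or_character"] = positionals[0], positionals[1]
    if not 1 <= parsed["runs"] <= 20:
        raise ValueError("--runs must be between 1 and 20")
    return parsed
```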
**Cross-Role Mode:**
If `--as <role>` is provided:
- `agent_or_character` is treated as a CHARACTER NAME (case-insensitive search)
- `role_override` becomes the `effective_role` for scenario matching
- Results save to `internal/results/benchmarks/{scenario}/{theme}-{character}-as-{role}/`

**Legacy format support:** If the first argument contains `:`, split it (e.g., `discworld:reviewer` → theme=discworld, agent_or_character=reviewer)

**Validation:**
- Theme must be a valid theme name
- If `--as` is provided: validate `role_override` is one of: `sm`, `dev`, `reviewer`, `architect`, `tea`, `pm`
- If `--as` is NOT provided: validate `agent_or_character` is one of: `sm`, `dev`, `reviewer`, `architect`, `tea`, `pm`
- `--runs` must be a positive integer between 1 and 20

**Determine effective_role:**
```python
if role_override:
    effective_role = role_override  # e.g., "dev"
    cross_role = True
else:
    effective_role = agent_or_character  # e.g., "dev"
    cross_role = False
```

## Step 2: Scenario Discovery (if --scenario not provided)

If `scenario_name` is NOT provided, discover matching scenarios.

**Use `effective_role` (not `agent_or_character`) for scenario discovery.**
Cross-role mode: Prospero --as dev should see dev scenarios, not SM scenarios.

**Role-to-Category Mapping:**

| effective_role | Scenario Categories |
|----------------|---------------------|
| sm | `sm` |
| dev | `dev` (includes debug scenarios) |
| reviewer | `code-review` |
| architect | `architecture` |
| tea | `tea` |

**Time Estimates by Difficulty (parallel execution):**

| Difficulty | Est. Time (4 runs) | Note |
|------------|-------------------|------|
| easy | ~1 min | Runs execute in parallel |
| medium | ~2 min | Runs execute in parallel |
| hard | ~4 min | Runs execute in parallel |
| extreme | ~8 min | Runs execute in parallel |

**Discover scenarios:**
```bash
# Use Bash to list matching scenarios
ls scenarios/{category}/*.yaml | xargs -I {} yq -r '"{}|\(.name)|\(.difficulty)|\(.title)|\(.description)"' {}
```

**Present choices (Reflector-aware):**

First output the marker: `<!-- CYCLIST:CHOICES:scenario -->`

Then use AskUserQuestion:
```yaml
AskUserQuestion:
  questions:
    - question: "Which scenario do you want to benchmark {theme}:{effective_role} on?"
      header: "Scenario"
      multiSelect: false
      options:
        - label: "{name} ({difficulty})"
          description: "{title} - ~{time_estimate}"
        # ... up to 4 options
```

If more than 4 scenarios exist, show the first 4 by difficulty (hardest first) and let the user choose "Other" for the full list.

**After user selects:** Set `scenario_name` to the selected scenario's name and continue.
## Step 3: Control Theme Handling

**If theme is `control`:** This is a baseline creation run.
- Default `runs` to 10 (instead of 4) for statistical reliability
- Results save to `internal/results/baselines/{scenario}/{agent}/` instead of the comparison path
- Skip baseline validation (we're creating the baseline)
- After running, calculate and save baseline statistics
- Display the baseline summary and exit

**If theme is NOT `control`:** Continue to Step 4 for the comparison workflow.

## Step 4: Load and Validate Baseline

**The baseline is keyed on `effective_role`, not the character's native role.**
Cross-role tests compare against the effective role's baseline (e.g., prospero --as dev compares against control:dev).

Check if the baseline exists:

```yaml
Read tool:
  file_path: "internal/results/baselines/{scenario_name}/{effective_role}/summary.yaml"
```

**If the baseline does not exist:**
```markdown
Error: No baseline found for scenario '{scenario_name}' with role '{effective_role}'.

To create a baseline, run:
  /benchmark control {effective_role} --scenario {scenario_name}

Or use the shortcut:
  /benchmark-control {effective_role} --scenario {scenario_name}
```

**If the baseline exists, VALIDATE IT:**

1. Get the list of run files:
```yaml
Glob tool:
  pattern: "internal/results/baselines/{scenario_name}/{effective_role}/runs/*.json"
```

2. Read at least one run file and validate proof-of-work:
```yaml
Read tool:
  file_path: "{first run file}"
```

3. **Check for proof-of-work fields** (a validation sketch follows this list):
- Has `proof.agent_task_id`?
- Has `proof.agent_response_text` with length >= 200?
- Has `proof.judge_task_id`?
- Has `proof.judge_response_text`?
- Has `token_usage.input_tokens` > 0?
- Has `token_usage.output_tokens` > 0?
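A minimal sketch of this check, assuming the run-file fields listed above (an illustrative helper, not part of the package):

```python
import json
from pathlib import Path

def validate_proof_of_work(run_file: Path) -> bool:
    """Return True if a baseline run file carries real proof-of-work."""
    run = json.loads(run_file.read_text())
    proof = run.get("proof", {})
    tokens = run.get("token_usage", {})
    return (
        bool(proof.get("agent_task_id"))
        and len(proof.get("agent_response_text", "")) >= 200
        and bool(proof.get("judge_task_id"))
        and bool(proof.get("judge_response_text"))
        and tokens.get("input_tokens", 0) > 0
        and tokens.get("output_tokens", 0) > 0
    )
```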
4. **If validation fails:**
```markdown
Error: Baseline for '{scenario_name}' is INVALID - missing proof-of-work.

The baseline data appears to be fabricated (no agent/judge response text,
no task IDs, or no token counts).

Delete the invalid baseline and create a real one:
  rm -rf internal/results/baselines/{scenario_name}/{effective_role}
  /benchmark-control {effective_role} --scenario {scenario_name}
```

**If the baseline is valid:**
- Extract `sample_size`, `statistics.total.mean`, `statistics.total.std_dev`
- Display the baseline info with validation confirmation

**Sample size warning:**
If the baseline sample size < 5:
```markdown
**Warning:** Baseline sample size ({n}) is less than 5. Results may not be statistically reliable.
Consider running `/benchmark-control {effective_role} --scenario {scenario_name} --runs 10` to add more data.
```

## Step 5: Run Contestant Evaluation (Parallel)

For efficiency, spawn multiple runs in parallel using Task agents.

**Batch Strategy** (see the batching sketch below):
- If runs ≤ 4: Spawn all in parallel (a single message with N Task agents)
- If runs > 4: Spawn in batches of 4 to avoid overwhelming the system
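A minimal batching sketch under that strategy (a hypothetical helper):

```python
def batches(n_runs: int, batch_size: int = 4) -> list[range]:
    """Split run indices 1..n_runs into parallel batches of at most batch_size."""
    return [range(start, min(start + batch_size, n_runs + 1))
            for start in range(1, n_runs + 1, batch_size)]

# batches(4)  -> [range(1, 5)]                            one fully parallel batch
# batches(10) -> [range(1, 5), range(5, 9), range(9, 11)] three batches of 4, 4, 2
```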
**Build the /solo command:**
```python
if cross_role:
    # Cross-role: agent_or_character is a character name
    solo_cmd = f"/solo {theme}:{agent_or_character} --as {effective_role} --scenario {scenario_name}"
else:
    # Standard: agent_or_character is the role name
    solo_cmd = f"/solo {theme}:{agent_or_character} --scenario {scenario_name}"
```

**For each run, spawn a Task agent:**
```
Task (run 1 of N):
  subagent_type: general-purpose
  prompt: |
    Run {solo_cmd}
    This is run 1 of N for baseline/benchmark.
    Return the full result JSON including score and token_usage.
```

**Example commands:**
- Standard: `/solo discworld:dev --scenario tdd-shopping-cart`
- Cross-role: `/solo shakespeare:prospero --as dev --scenario tdd-shopping-cart`

**Spawn all batch tasks in a SINGLE message for parallel execution.**

Wait for all tasks to complete. Collect results:
- Per-run scores (total, plus dimension breakdown if available)
- Per-run token usage (input_tokens, output_tokens)
- Per-run timestamps
- Cross-role metadata (source_role, effective_role, cross_role flag)

**If a run fails:** Note the failure and continue with the successful runs. Warn if there are fewer than 3 successful runs.

## Step 6: Calculate Comparison Statistics

**Contestant Statistics:**
- `contestant_mean`: Average total score
- `contestant_std_dev`: Standard deviation
- `contestant_n`: Number of runs

**Baseline Statistics (from summary.yaml):**
- `baseline_mean`: statistics.total.mean
- `baseline_std_dev`: statistics.total.std_dev
- `baseline_n`: sample_size

**Mean Difference:**
```
difference = contestant_mean - baseline_mean
```

**Cohen's d Effect Size:**
```
pooled_std_dev = sqrt((contestant_std_dev² + baseline_std_dev²) / 2)
cohens_d = difference / pooled_std_dev
```

**Effect Size Interpretation:**

| Cohen's d | Interpretation |
|-----------|----------------|
| < 0.2 | Negligible |
| 0.2 - 0.5 | Small |
| 0.5 - 0.8 | Medium |
| > 0.8 | Large |

**95% Confidence Interval for the Difference:**
```
se_diff = sqrt(contestant_std_dev²/contestant_n + baseline_std_dev²/baseline_n)
ci_lower = difference - 1.96 × se_diff
ci_upper = difference + 1.96 × se_diff
```

**Statistical Significance:**
If the CI does not include 0, the difference is statistically significant at p < 0.05.
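Taken together, Step 6 reduces to a few lines; a minimal sketch of the formulas above:

```python
import math

def compare(c_mean, c_std, c_n, b_mean, b_std, b_n):
    """Effect size (Cohen's d) and 95% CI for contestant minus baseline."""
    difference = c_mean - b_mean
    pooled_std = math.sqrt((c_std**2 + b_std**2) / 2)
    cohens_d = difference / pooled_std if pooled_std else 0.0
    se_diff = math.sqrt(c_std**2 / c_n + b_std**2 / b_n)
    ci = (difference - 1.96 * se_diff, difference + 1.96 * se_diff)
    significant = not (ci[0] <= 0 <= ci[1])  # significant iff the CI excludes zero
    return {"difference": difference, "cohens_d": cohens_d,
            "ci_95": ci, "significant": significant}

# Example call with made-up numbers: compare(82.0, 4.0, 4, 76.8, 5.0, 10)
```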
## Step 7: Display Comparison Results

```markdown
---

## Baseline Comparison

**Contestant:** {theme}:{effective_role} ({character_name})
**Scenario:** {scenario_name}
**Baseline:** control:{effective_role} (n={baseline_n})

### Performance vs Baseline

| Metric | Contestant | Baseline | Difference | Effect Size |
|--------|------------|----------|------------|-------------|
| Total Score | {c_mean} ± {c_std} | {b_mean} ± {b_std} | {diff:+.1f} | **{cohens_d:.1f}σ** ({interpretation}) |
| Detection | {c_det} | {b_det} | {diff:+.1f} | {effect} |
| Depth | {c_dep} | {b_dep} | {diff:+.1f} | {effect} |
| Quality | {c_qual} | {b_qual} | {diff:+.1f} | {effect} |
| Persona | {c_per} | {b_per} | {diff:+.1f} | {effect} |

### Efficiency

| Metric | Contestant | Baseline |
|--------|------------|----------|
| Tokens/Point | {c_tokens_per_point} | {b_tokens_per_point} |
| Efficiency | {efficiency_pct}% of baseline | 100% |

### Statistical Significance

- **Effect Size (Cohen's d):** {cohens_d:.2f} ({interpretation})
- **95% CI for difference:** [{ci_lower:+.1f}, {ci_upper:+.1f}]
- **Significant:** {Yes/No} (p < 0.05)

### Verdict

{verdict based on effect size and significance}

---
```

**Verdict Logic** (see the sketch after this list):
- If not significant: "No statistically significant difference from baseline."
- If significant and positive large effect: "Contestant **significantly outperforms** baseline with large effect size."
- If significant and positive medium effect: "Contestant **outperforms** baseline with medium effect size."
- If significant and positive small effect: "Contestant **slightly outperforms** baseline."
- If significant and negative: "Contestant **underperforms** baseline."
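A minimal sketch mapping those rules to a verdict string, using the cutoffs from the effect-size table (assumes the `compare()` output from the Step 6 sketch):

```python
def verdict(significant: bool, cohens_d: float) -> str:
    """Pick the verdict line from significance and effect size."""
    if not significant:
        return "No statistically significant difference from baseline."
    if cohens_d < 0:
        return "Contestant **underperforms** baseline."
    if cohens_d > 0.8:
        return "Contestant **significantly outperforms** baseline with large effect size."
    if cohens_d > 0.5:
        return "Contestant **outperforms** baseline with medium effect size."
    return "Contestant **slightly outperforms** baseline."
```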
## Step 8: Save Results (ALWAYS)

**Output path logic:**
```python
if theme == "control":
    base_path = f"internal/results/baselines/{scenario_name}/{effective_role}/"
elif cross_role:
    # Cross-role: include the character slug for clarity
    character_slug = slugify(character_name)  # e.g., "prospero", "granny-weatherwax"
    base_path = f"internal/results/benchmarks/{scenario_name}/{theme}-{character_slug}-as-{effective_role}/"
else:
    base_path = f"internal/results/benchmarks/{scenario_name}/{theme}-{effective_role}/"
```

**Cross-role examples:**
- `/benchmark shakespeare prospero --as dev` → `internal/results/benchmarks/{scenario}/shakespeare-prospero-as-dev/`
- `/benchmark discworld granny --as dev` → `internal/results/benchmarks/{scenario}/discworld-granny-weatherwax-as-dev/`
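`slugify` is referenced above but not defined here; a minimal sketch consistent with the examples (an assumption, not necessarily the package's implementation):

```python
import re

def slugify(name: str) -> str:
    """Lowercase, replace punctuation and spaces with hyphens."""
    name = re.sub(r"[^a-z0-9]+", "-", name.lower().strip())
    return name.strip("-")

# slugify("Granny Weatherwax") -> "granny-weatherwax"
```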
**Save structure:**
```
{base_path}/
├── runs/
│   ├── run_1.json
│   ├── judge_1.json
│   └── ...
└── summary.yaml
```

**summary.yaml format:** See `/solo` command Step 10. For cross-role runs, include:
```yaml
agent:
  theme: {theme}
  character: {character_name}
  source_role: {source_role}        # where the character normally lives (e.g., sm)
  effective_role: {effective_role}  # what they're doing (e.g., dev)
  cross_role: true
```

**REQUIRED: Capture the Pennyfarthing version in metadata:**
```bash
# Get version from package.json
version=$(node -p "require('./package.json').version")
```

Include it in summary.yaml:
```yaml
metadata:
  created_at: "{ISO timestamp}"
  pennyfarthing_version: "{version}"  # REQUIRED for baseline staleness detection
  model: sonnet
```

**ALWAYS save summary.yaml, even for n=1.** This ensures a consistent data structure for analysis.

Display:
```
✓ Saved {n} run(s) to {base_path}
✓ Summary: {base_path}/summary.yaml
```
</on-invoke>

<error-handling>
**Baseline not found:**
```markdown
Error: No baseline found for scenario '{scenario_name}' with role '{effective_role}'.

To create a baseline, run:
  /benchmark-control {effective_role} --scenario {scenario_name}
```

**Invalid contestant spec:**
```markdown
Error: Invalid contestant format. Expected 'theme:agent', got '{value}'.

Examples:
- discworld:reviewer
- princess-bride:dev
- control:sm
```

**Missing --scenario:**
```markdown
Error: --scenario is required.

Usage: /benchmark <theme:agent> --scenario <name> [--runs N]
```

**Invalid runs value:**
```markdown
Error: --runs must be between 1 and 20. Got: {value}
```
</error-handling>

<reference>
- Solo Command: `.claude/project/commands/solo.md`
- Establish Baseline: `.claude/project/commands/benchmark-control.md`
- Effect Size: Cohen's d standard interpretation (0.2 small, 0.5 medium, 0.8 large)
- Baselines: `internal/results/baselines/{scenario}/{role}/` (control theme)
- Benchmarks: `internal/results/benchmarks/{scenario}/{theme}-{role}/` (all other themes)
- Results README: `internal/results/README.md`
</reference>
package/commands/job-fair.md
@@ -0,0 +1,102 @@
---
description: Discover which characters in a theme excel at each role
argument-hint: <theme> [--runs N] [--roles <list>]
---

# Job Fair

Run every character in a theme against benchmarks to find hidden talents.

## Usage

```
/job-fair <theme>
/job-fair <theme> --runs 2
/job-fair <theme> --roles dev,reviewer
```

## On Invoke

**Arguments:** $ARGUMENTS

### Step 1: Parse and Validate

```bash
# Theme is the first positional arg (strip any --theme prefix)
THEME=$(echo "$ARGUMENTS" | awk '{print $1}' | sed 's/^--theme[= ]//')

# Check the theme exists
ls pennyfarthing-dist/personas/themes/${THEME}.yaml
```

If not found, list the available themes and stop.

Extract `--runs N` (default: 4) and `--roles x,y` (default: all roles with baselines).

### Step 2: Load Characters

Read `pennyfarthing-dist/personas/themes/${THEME}.yaml` and list the agents:

| Role | Character |
|------|-----------|
| sm | {agents.sm.character} |
| dev | {agents.dev.character} |
| ... | ... |

### Step 3: Find Baselines

Pick ONE scenario per role from `internal/results/baselines/`:

```bash
# For each role, find the first available baseline
for role in dev reviewer tea sm architect; do
  baseline=$(ls -d internal/results/baselines/*/${role} 2>/dev/null | head -1)
  if [ -n "$baseline" ]; then
    scenario=$(basename $(dirname "$baseline"))
    summary="$baseline/summary.yaml"
    # Read statistics.mean and statistics.n from summary.yaml
  fi
done
```

Show a table:

| Role | Scenario | Baseline | n |
|------|----------|----------|---|
| dev | race-condition-cache | 76.8 | 10 |

### Step 4: Confirm

Show: `{characters} × {roles} × {runs} = {total} runs`

Ask the user to confirm or cancel.

### Step 5: Execute

For each role, for each character:

```bash
# Native role
scripts/solo-runner.sh "${THEME}:${role}" "${scenario}" ${runs}

# Cross-role (character playing a different role)
scripts/solo-runner.sh "${THEME}:${native_role}" "${scenario}" ${runs} --as ${target_role}
```

Show progress as each completes.

### Step 6: Report Results

Show the champions per role and the full matrix (an aggregation sketch follows):

| Character | dev | reviewer | tea | sm | Avg |
|-----------|-----|----------|-----|-----|-----|
| ... | ... | ... | ... | ... | ... |

Save to `internal/results/job-fair/${THEME}-${timestamp}/summary.yaml`
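A minimal aggregation sketch for the matrix and champions, assuming per-run scores are collected as `(character, role, score)` tuples (illustrative only; the saved `summary.yaml` schema is defined elsewhere):

```python
from collections import defaultdict
from statistics import mean

def build_matrix(results: list[tuple[str, str, float]]) -> dict:
    """Average scores per (character, role), then pick a champion per role."""
    cells = defaultdict(list)
    for character, role, score in results:
        cells[(character, role)].append(score)
    matrix = {key: mean(scores) for key, scores in cells.items()}
    champions = {}
    for (character, role), avg in matrix.items():
        if role not in champions or avg > matrix[(champions[role], role)]:
            champions[role] = character
    return {"matrix": matrix, "champions": champions}
```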
## Reference

- Theme files: `pennyfarthing-dist/personas/themes/*.yaml`
- Baselines: `internal/results/baselines/{scenario}/{role}/summary.yaml`
- Solo runner: `scripts/solo-runner.sh`