@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,644 @@
---
name: judge
description: Evaluate agent responses using standardized rubrics. Use when scoring benchmark results, comparing agent performance, grading code review quality, or running evaluation pipelines.
---

# Judge Skill

Canonical evaluation of agent responses. All judging goes through this skill.

<run>
Judge is invoked via CLI with `/judge --mode <mode> --data <json>` to evaluate agent responses using standardized rubrics. Modes include solo (single response), compare (two responses), phase-specific modes (SM/TEA/Dev/Reviewer), coherence (chain coherence), swebench (SWE-bench evaluation), and ground-truth (patch comparison).
</run>

<output>
Judge returns structured JSON output containing evaluation scores, weighted totals, reasoning, and token usage information. Output format varies by mode: solo/compare return individual or comparative scores with dimensions (correctness, depth, quality, persona); phase modes return team evaluations; coherence returns a rating (excellent/good/poor); swebench/ground-truth return deterministic scores via Python scripts. All responses include validation of results and error handling for failed evaluations.
</output>

## Invocation

```
/judge --mode <mode> --data <json>
```

**Modes:**
- `solo` - Single response, absolute rubric (or checklist if baseline_issues provided)
- `compare` - Two responses, comparative rubric
- `phase-sm` - Relay SM phase rubric
- `phase-tea` - Relay TEA phase rubric
- `phase-dev` - Relay Dev phase rubric
- `phase-reviewer` - Relay Reviewer phase rubric
- `coherence` - Relay chain coherence rating
- `swebench` - Deterministic SWE-bench evaluation (Python script)
- `ground-truth` - Ground-truth patch comparison (Python script)

## Unified Rubric (solo/compare)

| Dimension | Weight | Criteria |
|-----------|--------|----------|
| **Correctness** | 25% | Technical accuracy. Right issues? Valid solutions? |
| **Depth** | 25% | Thoroughness. Root causes? Implications? |
| **Quality** | 25% | Clarity and actionability. Organized? Useful? |
| **Persona** | 25% | Character embodiment. Consistent? Added value? |

**Formula:** `(correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL`

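To make the formula concrete, here is a minimal Python sketch (illustrative only; the helper name is not part of the package) that reproduces the weighted total for the example scores used later in this skill.

```python
# Minimal sketch of the unified solo/compare formula: four 1-10 dimension
# scores, each weighted 25%, scaled to a 0-100 weighted total.
def unified_weighted_total(correctness: float, depth: float,
                           quality: float, persona: float) -> float:
    for score in (correctness, depth, quality, persona):
        if not 1 <= score <= 10:
            raise ValueError("dimension scores must be between 1 and 10")
    return (correctness * 2.5) + (depth * 2.5) + (quality * 2.5) + (persona * 2.5)

# Example scores from the solo-mode JSON below: 8, 7, 9, 8 -> 80.0
assert unified_weighted_total(8, 7, 9, 8) == 80.0
```
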
## Relay Phase Rubrics

<details>
<summary><strong>SM Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Clarity | 30% |
| Handoff | 40% |
| Completeness | 30% |

</details>

<details>
<summary><strong>TEA Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Coverage | 35% |
| RED State | 35% |
| Handoff | 30% |

</details>

<details>
<summary><strong>Dev Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| GREEN State | 40% |
| Code Quality | 30% |
| Handoff | 30% |

</details>

<details>
<summary><strong>Reviewer Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Detection | 40% |
| Verdict | 30% |
| Persona | 30% |

</details>

<details>
<summary><strong>Chain Coherence Multipliers</strong></summary>

| Rating | Multiplier |
|--------|------------|
| excellent | 1.2x |
| good | 1.0x |
| poor | 0.8x |

</details>

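The skill does not spell out here how the coherence multiplier is applied, so the sketch below only illustrates the table under the assumption that the rating scales a chain's combined phase total; the function and constant names are invented for illustration.

```python
# Hedged sketch (assumption): the chain coherence rating scales a relay
# chain's combined phase score. The mapping mirrors the multiplier table;
# how it is actually applied is defined by the benchmark runner, not here.
COHERENCE_MULTIPLIERS = {"excellent": 1.2, "good": 1.0, "poor": 0.8}

def apply_coherence(phase_total: float, rating: str) -> float:
    """Scale a chain's combined phase total by its coherence multiplier."""
    return phase_total * COHERENCE_MULTIPLIERS[rating]

# e.g. a chain scoring 75.0 with an "excellent" coherence rating -> 90.0
assert apply_coherence(75.0, "excellent") == 90.0
```
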
## On Invoke

### Step 1: Parse Arguments

Extract:
- `mode`: One of the modes listed above
- `data`: JSON object with required fields for that mode

**Data requirements by mode:**

| Mode | Required Fields | Optional Fields |
|------|-----------------|-----------------|
| solo | `spec`, `character`, `challenge`, `response` | `code`, `baseline_issues`, `baseline_criteria`, `bonus_issues`, `bonus_criteria` |
| compare | `contestants[]` (each with spec, character, response), `challenge` | `baseline_issues`, `baseline_criteria` |
| phase-* | `team1`, `team2` (each with theme, response), `context` | |
| coherence | `theme`, `sm_response`, `tea_response`, `dev_response`, `reviewer_response` | |
| swebench | `scenario`, `response_file` | |
| ground-truth | `scenario`, `response_file` | |

**Note:** When checklist data is provided, solo mode uses checklist-based evaluation:
- `baseline_issues` → code-review, tea, dev scenarios (things to FIND)
- `baseline_criteria` → SM scenarios (behaviors to DEMONSTRATE)
- `bonus_issues` / `bonus_criteria` → Extra credit items (optional)

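As a concrete illustration of the solo-mode fields above, the following hedged Python sketch assembles a `data` payload; all values are placeholders and the helper is not part of the package.

```python
# Hedged sketch: assembling a solo-mode `data` payload from the required
# fields in the table above. Values are placeholders; how the JSON is quoted
# on the /judge command line depends on the caller's shell.
import json

solo_data = {
    "spec": "senior-reviewer",   # placeholder contestant spec
    "character": "Ada",          # placeholder persona name
    "challenge": "Review this pull request for security issues.",
    "response": "Full text of the agent response being judged.",
    # Optional keys: "code", "baseline_issues", "baseline_criteria",
    # "bonus_issues", "bonus_criteria"
}

print(json.dumps(solo_data, indent=2))  # passed as the --data argument to /judge
```
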
### Step 2: Build Judge Prompt

Based on mode, construct the appropriate prompt:

<details>
<summary><strong>Solo Mode Prompt (Generic Rubric)</strong></summary>

**If NO baseline_issues provided, use generic rubric:**

```
You are an impartial judge evaluating an AI agent's response.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

## Response
{response}

## Evaluation

Score 1-10 on each dimension:

1. **Correctness (25%)** - Technical accuracy
2. **Depth (25%)** - Thoroughness
3. **Quality (25%)** - Clarity and actionability
4. **Persona (25%)** - Character embodiment

Formula: (correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL

**IMPORTANT: Output your evaluation as JSON only. No markdown, no extra text.**

```json
{
  "scores": {
    "correctness": { "value": 8, "reasoning": "..." },
    "depth": { "value": 7, "reasoning": "..." },
    "quality": { "value": 9, "reasoning": "..." },
    "persona": { "value": 8, "reasoning": "..." }
  },
  "weighted_total": 80.0,
  "assessment": "2-3 sentence overall assessment"
}
```
```

</details>

<details>
<summary><strong>Solo Mode Prompt (Checklist Rubric v2 - Precision/Recall)</strong></summary>

**If baseline_issues IS provided, use checklist rubric (v2 - precision/recall):**

```
You are an impartial judge evaluating an AI agent's response against a checklist of expected findings.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

{if code provided}
## Code Under Review
{code}
{endif}

## Expected Findings

Below are the known issues/requirements. Severity indicates weight:
- CRITICAL: weight 15 (must find)
- HIGH: weight 10 (should find)
- MEDIUM: weight 5 (good to find)
- LOW: weight 2 (bonus)
- (unlabeled categories like happy_path, validation: weight 5 each)

{baseline_issues formatted as checklist}

## Response to Evaluate
{response}

## Evaluation Instructions

Evaluate the response and output ONLY valid JSON (no markdown, no extra text):

```json
{
  "baseline_findings": [
    {"id": "ISSUE_ID", "severity": "critical|high|medium|low", "found": true, "evidence": "quote or null"}
  ],
  "novel_findings": [
    {"description": "...", "valid": true, "reasoning": "..."}
  ],
  "false_positives": [
    {"claim": "...", "why_invalid": "..."}
  ],
  "detection": {
    "by_severity": {
      "critical": {"found": 5, "total": 6},
      "high": {"found": 4, "total": 6},
      "medium": {"found": 3, "total": 8},
      "low": {"found": 1, "total": 2}
    },
    "novel_valid": 2,
    "false_positive_count": 1,
    "metrics": {
      "weighted_found": 98,
      "weighted_total": 120,
      "recall": 0.817,
      "precision": 0.929,
      "f2_score": 0.843
    },
    "components": {
      "recall_score": 24.5,
      "precision_score": 9.3,
      "novel_bonus": 6.0
    },
    "subtotal": 39.8
  },
  "quality": {
    "clear_explanations": 8,
    "actionable_fixes": 7,
    "subtotal": 18.75
  },
  "persona": {
    "in_character": 9,
    "professional_tone": 8,
    "subtotal": 21.25
  },
  "weighted_total": 79.8,
  "assessment": "2-3 sentence summary of strengths and gaps"
}
```

**Detection Scoring Rules (v2 - Precision/Recall):**

- Severity Weights: critical=15, high=10, medium=5, low=2
- recall = weighted_found / weighted_total
- precision = true_positives / (true_positives + false_positives)
- f2_score = 5 × (precision × recall) / (4 × precision + recall)
- detection.subtotal = (recall × 30) + (precision × 10) + min(novel_valid × 3, 10)

**Other Dimensions:**
- Quality (25 max): (clear_explanations/10 × 12.5) + (actionable_fixes/10 × 12.5)
- Persona (25 max): (in_character/10 × 12.5) + (professional_tone/10 × 12.5)
- weighted_total = detection.subtotal + quality.subtotal + persona.subtotal
```

</details>

<details>
<summary><strong>Detection Scoring Deep Dive</strong></summary>

**Metric Calculations:**
```
weighted_found = Σ(found_issues × severity_weight)
weighted_total = Σ(all_baseline_issues × severity_weight)

recall = weighted_found / weighted_total
precision = true_positives / (true_positives + false_positives)
f2_score = 5 × (precision × recall) / (4 × precision + recall)
```

**Component Scores (Detection = 50 max):**
```
recall_score    = recall × 30               # max 30 pts - coverage matters most
precision_score = precision × 10            # max 10 pts - penalizes hallucinations
novel_bonus     = min(novel_valid × 3, 10)  # max 10 pts - rewards thoroughness

detection.subtotal = recall_score + precision_score + novel_bonus
```

**Why this design:**
- **Recall weighted 3x precision**: Missing a critical vulnerability is worse than a false positive
- **Severity-weighted recall**: Finding 5 critical issues > finding 5 low issues
- **Separate novel bonus**: Rewards thoroughness beyond baseline without affecting precision
- **Visible metrics**: recall, precision, f2_score all reported for transparency

**Example Calculations:**
```
Scenario: 6 critical (90 pts), 6 high (60 pts), 8 medium (40 pts), 2 low (4 pts) = 194 weighted total
Agent finds: 5 critical, 4 high, 3 medium, 1 low = 75+40+15+2 = 132 weighted found
Agent flags: 14 true positives, 1 false positive, 2 valid novel findings

recall = 132/194 = 0.680
precision = 14/15 = 0.933
f2_score = 5 × (0.933 × 0.680) / (4 × 0.933 + 0.680) = 0.718

recall_score = 0.680 × 30 = 20.4
precision_score = 0.933 × 10 = 9.3
novel_bonus = min(2 × 3, 10) = 6.0

detection.subtotal = 20.4 + 9.3 + 6.0 = 35.7
```

**Checklist Scoring Notes:**
- **Recall dominates** (30/50 pts): Comprehensive coverage is primary goal
- **Precision matters** (10/50 pts): Penalizes hallucinated issues proportionally
- **Novel findings rewarded** (10/50 pts): Encourages going beyond baseline
- **Severity-weighted**: Critical issues count 7.5x more than low issues
- **Transparent metrics**: All intermediate values visible for debugging
- Quality/Persona still matter (25% each) - not just about finding issues

</details>

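The v2 arithmetic can be reproduced with a short Python sketch (illustrative, not one of the packaged scripts); it recomputes the worked example from the deep dive above.

```python
# Sketch of the v2 detection scoring described above (illustrative only).
SEVERITY_WEIGHTS = {"critical": 15, "high": 10, "medium": 5, "low": 2}

def detection_subtotal(found, totals, true_positives, false_positives, novel_valid):
    """found/totals: per-severity counts; returns (recall, precision, f2, subtotal)."""
    weighted_found = sum(found[s] * w for s, w in SEVERITY_WEIGHTS.items())
    weighted_total = sum(totals[s] * w for s, w in SEVERITY_WEIGHTS.items())
    recall = weighted_found / weighted_total
    precision = true_positives / (true_positives + false_positives)
    f2 = 5 * (precision * recall) / (4 * precision + recall)
    subtotal = (recall * 30) + (precision * 10) + min(novel_valid * 3, 10)
    return recall, precision, f2, subtotal

recall, precision, f2, subtotal = detection_subtotal(
    found={"critical": 5, "high": 4, "medium": 3, "low": 1},
    totals={"critical": 6, "high": 6, "medium": 8, "low": 2},
    true_positives=14, false_positives=1, novel_valid=2,
)
print(round(recall, 3), round(precision, 3), round(f2, 3), round(subtotal, 1))
# ≈ 0.680 0.933 0.719 35.7  (the 0.718 in the text rounds intermediates first)
```
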
<details>
<summary><strong>Solo Mode Prompt (Behavior Checklist - SM Scenarios)</strong></summary>

**If baseline_criteria IS provided (SM scenarios), use behavior checklist:**

```
You are an impartial judge evaluating an AI agent's facilitation/management response.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

## Expected Behaviors

Below are the behaviors a good response should demonstrate:

**BASELINE CRITERIA (5 pts each):**
{baseline_criteria formatted by category}

**BONUS CRITERIA (3 pts each, if present):**
{bonus_criteria formatted, or "None specified"}

## Response to Evaluate
{response}

## Evaluation Instructions

Evaluate the response and output ONLY valid JSON (no markdown, no extra text):

```json
{
  "baseline_behaviors": [
    {"id": "BEHAVIOR_ID", "category": "...", "demonstrated": true, "evidence": "quote or null"}
  ],
  "bonus_behaviors": [
    {"id": "BONUS_ID", "category": "...", "demonstrated": true, "evidence": "quote or null"}
  ],
  "execution": {
    "baseline_count": 8,
    "bonus_count": 2,
    "subtotal": 46
  },
  "quality": {
    "clear_actionable": 8,
    "well_structured": 7,
    "subtotal": 18.75
  },
  "persona": {
    "in_character": 9,
    "enhances_delivery": 8,
    "subtotal": 21.25
  },
  "weighted_total": 86.0,
  "assessment": "2-3 sentence summary of facilitation effectiveness"
}
```

Scoring rules:
- Execution (50 max): baseline×5 (cap 40) + bonus×3 (cap 10)
- Quality (25 max): (clear_actionable/10 × 12.5) + (well_structured/10 × 12.5)
- Persona (25 max): (in_character/10 × 12.5) + (enhances_delivery/10 × 12.5)
- weighted_total = execution.subtotal + quality.subtotal + persona.subtotal
```

</details>

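For a quick sanity check of the behavior-checklist math, here is an illustrative Python sketch that mirrors the scoring rules above and reproduces the example values in the JSON; it is not part of the packaged scripts.

```python
# Sketch of the SM behavior-checklist scoring rules above (illustrative).
def sm_weighted_total(baseline_count, bonus_count,
                      clear_actionable, well_structured,
                      in_character, enhances_delivery):
    execution = min(baseline_count * 5, 40) + min(bonus_count * 3, 10)         # 50 max
    quality = (clear_actionable / 10 * 12.5) + (well_structured / 10 * 12.5)   # 25 max
    persona = (in_character / 10 * 12.5) + (enhances_delivery / 10 * 12.5)     # 25 max
    return execution + quality + persona

# Example from the JSON above: 8 baseline, 2 bonus, quality 8/7, persona 9/8 -> 86.0
assert sm_weighted_total(8, 2, 8, 7, 9, 8) == 86.0
```
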
<details>
<summary><strong>Compare Mode Prompt</strong></summary>

```
You are an impartial judge comparing two AI personas.

## Contestants
- **{spec1}** ({character1})
- **{spec2}** ({character2})

## Challenge
{challenge}

## Response from {character1}
{response1}

## Response from {character2}
{response2}

## Evaluation

Score both on each dimension (1-10). Output ONLY valid JSON (no markdown, no extra text):

```json
{
  "contestants": {
    "{spec1}": {
      "scores": {
        "correctness": { "value": 8, "reasoning": "..." },
        "depth": { "value": 7, "reasoning": "..." },
        "quality": { "value": 9, "reasoning": "..." },
        "persona": { "value": 8, "reasoning": "..." }
      },
      "weighted_total": 80.0
    },
    "{spec2}": {
      "scores": {
        "correctness": { "value": 7, "reasoning": "..." },
        "depth": { "value": 8, "reasoning": "..." },
        "quality": { "value": 7, "reasoning": "..." },
        "persona": { "value": 9, "reasoning": "..." }
      },
      "weighted_total": 77.5
    }
  },
  "winner": "{spec1}",
  "justification": "Brief explanation of why winner was chosen"
}
```
```

</details>

<details>
<summary><strong>Phase Mode and Coherence Mode Prompts</strong></summary>

**Phase Mode Prompts:**

Use phase-specific rubrics from tables above. Evaluate both teams. Output JSON format.

**Coherence Mode Prompt:**

```
Evaluate chain coherence for {theme}.

## Chain
SM: {sm_response}
TEA: {tea_response}
Dev: {dev_response}
Reviewer: {reviewer_response}

Output ONLY valid JSON (no markdown, no extra text):

```json
{
  "rating": "excellent|good|poor",
  "reasoning": "explanation of coherence assessment"
}
```
```

</details>

<details>
<summary><strong>SWE-bench Mode (Deterministic Python Evaluation)</strong></summary>

**For `swebench` and `ground-truth` modes, use Python scripts instead of LLM-as-judge.**

These modes use deterministic scoring based on ground-truth patches from the SWE-bench dataset.

**Prerequisites:**
```bash
# Ensure SWE-bench data is downloaded (one-time)
.pennyfarthing/scripts/test/ensure-swebench-data.sh
```

**swebench mode:**
Uses structured rubric + ground truth validation. Scores:
- root_cause (30%): Bug location + explanation
- fix_quality (40%): Addresses issue + minimal + syntax correct
- completeness (20%): Edge cases + test coverage
- persona (10%): In-character delivery

```bash
# Execute via Python script
python3 .pennyfarthing/scripts/test/swebench-judge.py <scenario_name> <response_file>

# Example
python3 .pennyfarthing/scripts/test/swebench-judge.py flask-5014 /tmp/run_1.json
```

**ground-truth mode:**
Compares fix against actual SWE-bench patch. Scores:
- file_identification (20%): Correct files identified
- location_identification (20%): Correct functions/locations
- fix_logic_match (40%): Code matches ground truth
- completeness (20%): Has all elements of good fix

```bash
# Execute via Python script
python3 .pennyfarthing/scripts/test/ground-truth-judge.py <scenario_name> <response_file>

# Example
python3 .pennyfarthing/scripts/test/ground-truth-judge.py django-10554 /tmp/run_1.json
```

**Response file format:**
Both scripts expect JSON with either:
- `result`: The agent's response text
- `response_text`: Alternative field name

**Output:**
Scripts print scores to stdout and save detailed JSON to `{input_path.replace('run_', 'swebench_judge_')}` or `{input_path.replace('run_', 'gt_judge_')}`.

</details>

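As an illustration of that response-file format, the hedged Python sketch below writes a minimal file the scripts would accept; the path and text are placeholders.

```python
# Sketch: writing a response file in the format the SWE-bench judge scripts
# expect (a JSON object with a `result` or `response_text` field). The path
# and response text are placeholders.
import json
from pathlib import Path

Path("/tmp/run_1.json").write_text(json.dumps({
    "result": "Root cause analysis and proposed fix go here as plain text."
}))
# Then, for example:
#   python3 .pennyfarthing/scripts/test/swebench-judge.py flask-5014 /tmp/run_1.json
```
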
<details>
<summary><strong>Step 3: Execute Judge via CLI</strong></summary>

**CRITICAL: Follow this execution pattern for all contexts (main session, skills, subagents).**

**Three rules to avoid shell parsing errors:**

1. **Use Write tool for prompt files** - NOT `echo` in Bash (handles special characters)
2. **Use file redirection for output** - NOT variable capture `$(...)` (avoids zsh parse errors)
3. **Use pipe syntax** - NOT heredocs (works in subagents)

**Why variable capture fails:**
```bash
# This FAILS - zsh tries to parse JSON with () characters
OUTPUT=$(cat prompt.txt | claude -p --output-format json --tools "")
# Error: parse error near ')'
```

**Correct pattern:**

```bash
# Step 1: Use Write tool to create prompt file (NOT echo in Bash)
# The Write tool handles escaping properly in all contexts

# Step 2: Capture timestamp (simple command, safe to capture)
date -u +%Y-%m-%dT%H:%M:%SZ > .scratch/judge_ts.txt

# Step 3: Execute with FILE REDIRECTION (NOT variable capture)
cat .scratch/judge_prompt.txt | claude -p --output-format json --tools "" > .scratch/judge_output.json

# Step 4: Extract from files (reading files is always safe)
JUDGE_RESPONSE=$(jq -r '.result' .scratch/judge_output.json)
JUDGE_INPUT_TOKENS=$(jq -r '.usage.input_tokens // 0' .scratch/judge_output.json)
JUDGE_OUTPUT_TOKENS=$(jq -r '.usage.output_tokens // 0' .scratch/judge_output.json)
```

**Key insight:** The shell never parses the JSON when using file redirection.
The output goes directly to a file, then jq reads it safely.

</details>

<details>
<summary><strong>Step 4: Extract Scores</strong></summary>

```bash
# All modes now output JSON - parse with jq
# Solo mode
SCORE=$(echo "$JUDGE_RESPONSE" | jq -r '.weighted_total // empty')

# Compare mode
SCORE1=$(echo "$JUDGE_RESPONSE" | jq -r '.contestants["{spec1}"].weighted_total // empty')
SCORE2=$(echo "$JUDGE_RESPONSE" | jq -r '.contestants["{spec2}"].weighted_total // empty')
WINNER=$(echo "$JUDGE_RESPONSE" | jq -r '.winner // empty')

# Coherence mode
RATING=$(echo "$JUDGE_RESPONSE" | jq -r '.rating // empty')

# Fallback: try grep if JSON parsing fails (backwards compatibility)
if [[ -z "$SCORE" ]]; then
  SCORE=$(echo "$JUDGE_RESPONSE" | grep -oE "weighted_total[\"':]*\s*([0-9.]+)" | grep -oE "[0-9.]+" | tail -1)
fi
```

</details>

### Step 5: Validate Results

| Check | Requirement |
|-------|-------------|
| `JUDGE_TIMESTAMP` | Valid ISO8601 |
| `JUDGE_RESPONSE` | At least 200 chars |
| `SCORE` (if applicable) | Number 1-100 |
| `RATING` (if coherence) | One of: excellent, good, poor |
| `JUDGE_INPUT_TOKENS` | > 0 |
| `JUDGE_OUTPUT_TOKENS` | > 0 |

**If validation fails:** Return error, do NOT estimate.

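A minimal Python sketch of these checks, assuming the values have already been extracted as in Step 4; the helper is illustrative and not part of the packaged skill.

```python
# Sketch of the Step 5 checks (illustrative helper mirroring the table above).
from datetime import datetime

def validate_judge_result(timestamp, response, score=None, rating=None,
                          input_tokens=0, output_tokens=0):
    errors = []
    try:
        datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
    except (ValueError, AttributeError):
        errors.append("JUDGE_TIMESTAMP is not valid ISO8601")
    if not response or len(response) < 200:
        errors.append("JUDGE_RESPONSE shorter than 200 chars")
    if score is not None and not (1 <= score <= 100):
        errors.append("SCORE not in 1-100")
    if rating is not None and rating not in ("excellent", "good", "poor"):
        errors.append("RATING not one of excellent/good/poor")
    if input_tokens <= 0 or output_tokens <= 0:
        errors.append("token counts must be > 0")
    return errors  # empty list means valid; on failure, return an error, never estimate
```
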
### Step 6: Return Results

Output structured result for caller:

```json
{
  "success": true,
  "mode": "{mode}",
  "timestamp": "{JUDGE_TIMESTAMP}",
  "scores": {
    "{spec1}": {score1},
    "{spec2}": {score2}
  },
  "winner": "{winner_spec}",
  "token_usage": {
    "input": {JUDGE_INPUT_TOKENS},
    "output": {JUDGE_OUTPUT_TOKENS}
  },
  "response_text": "{JUDGE_RESPONSE}"
}
```

## Error Handling

```
❌ Judge validation failed: {reason}
❌ Mode: {mode}
❌ DO NOT estimate scores
```