@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/skills/judge/SKILL.md
@@ -0,0 +1,644 @@
---
name: judge
description: Evaluate agent responses using standardized rubrics. Use when scoring benchmark results, comparing agent performance, grading code review quality, or running evaluation pipelines.
---

# Judge Skill

Canonical evaluation of agent responses. All judging goes through this skill.

<run>
Judge is invoked via CLI with `/judge --mode <mode> --data <json>` to evaluate agent responses using standardized rubrics. Modes include solo (single response), compare (two responses), phase-specific modes (SM/TEA/Dev/Reviewer), coherence (chain coherence), swebench (SWE-bench evaluation), and ground-truth (patch comparison).
</run>

<output>
Judge returns structured JSON containing evaluation scores, weighted totals, reasoning, and token usage. The output format varies by mode: solo/compare return individual or comparative scores across four dimensions (correctness, depth, quality, persona); phase modes return team evaluations; coherence returns a rating (excellent/good/poor); swebench/ground-truth return deterministic scores computed by Python scripts. Results are validated before being returned, and failed evaluations yield an explicit error rather than estimated scores.
</output>

## Invocation

```
/judge --mode <mode> --data <json>
```

**Modes:**
- `solo` - Single response, absolute rubric (or checklist if baseline_issues provided)
- `compare` - Two responses, comparative rubric
- `phase-sm` - Relay SM phase rubric
- `phase-tea` - Relay TEA phase rubric
- `phase-dev` - Relay Dev phase rubric
- `phase-reviewer` - Relay Reviewer phase rubric
- `coherence` - Relay chain coherence rating
- `swebench` - Deterministic SWE-bench evaluation (Python script)
- `ground-truth` - Ground-truth patch comparison (Python script)

## Unified Rubric (solo/compare)

| Dimension | Weight | Criteria |
|-----------|--------|----------|
| **Correctness** | 25% | Technical accuracy. Right issues? Valid solutions? |
| **Depth** | 25% | Thoroughness. Root causes? Implications? |
| **Quality** | 25% | Clarity and actionability. Organized? Useful? |
| **Persona** | 25% | Character embodiment. Consistent? Added value? |

**Formula:** `(correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL`

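For reference, the formula is a plain weighted sum: each dimension is scored 1-10 and carries 25% of a 100-point total. A minimal Python sketch (illustrative only, not part of the packaged scripts) using the same scores that appear in the solo-mode example later in this document:

```python
# Illustrative only: the unified-rubric weighted total from 1-10 dimension scores.
DIMENSIONS = ("correctness", "depth", "quality", "persona")

def weighted_total(scores: dict) -> float:
    """Each dimension carries 25% of a 100-point total, i.e. score x 2.5."""
    return sum(scores[d] * 2.5 for d in DIMENSIONS)

# Matches the solo-mode example output below: 8, 7, 9, 8 -> 80.0
print(weighted_total({"correctness": 8, "depth": 7, "quality": 9, "persona": 8}))
```
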
## Relay Phase Rubrics

<details>
<summary><strong>SM Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Clarity | 30% |
| Handoff | 40% |
| Completeness | 30% |

</details>

<details>
<summary><strong>TEA Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Coverage | 35% |
| RED State | 35% |
| Handoff | 30% |

</details>

<details>
<summary><strong>Dev Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| GREEN State | 40% |
| Code Quality | 30% |
| Handoff | 30% |

</details>

<details>
<summary><strong>Reviewer Phase Rubric</strong></summary>

| Dimension | Weight |
|-----------|--------|
| Detection | 40% |
| Verdict | 30% |
| Persona | 30% |

</details>

<details>
<summary><strong>Chain Coherence Multipliers</strong></summary>

| Rating | Multiplier |
|--------|------------|
| excellent | 1.2x |
| good | 1.0x |
| poor | 0.8x |

</details>

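This skill does not spell out how the four phase scores and the coherence multiplier combine into a single chain score, so the sketch below is an assumption-laden illustration: dimension scores (1-10) are weighted per the tables above, the four phase scores are averaged, and the result is scaled by the coherence multiplier. The function names and the averaging step are hypothetical.

```python
# Hypothetical aggregation of relay phase scores with the coherence multiplier.
PHASE_WEIGHTS = {
    "sm":       {"clarity": 0.30, "handoff": 0.40, "completeness": 0.30},
    "tea":      {"coverage": 0.35, "red_state": 0.35, "handoff": 0.30},
    "dev":      {"green_state": 0.40, "code_quality": 0.30, "handoff": 0.30},
    "reviewer": {"detection": 0.40, "verdict": 0.30, "persona": 0.30},
}
COHERENCE_MULTIPLIER = {"excellent": 1.2, "good": 1.0, "poor": 0.8}

def phase_score(phase: str, scores: dict) -> float:
    """Weighted phase score on a 0-100 scale; dimension scores are 1-10."""
    return sum(scores[dim] * 10 * weight for dim, weight in PHASE_WEIGHTS[phase].items())

def chain_score(phase_scores: dict, coherence: str) -> float:
    """Assumption: average the four phase scores, then apply the coherence multiplier."""
    average = sum(phase_scores.values()) / len(phase_scores)
    return average * COHERENCE_MULTIPLIER[coherence]
```
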
## On Invoke

### Step 1: Parse Arguments

Extract:
- `mode`: One of the modes listed above
- `data`: JSON object with required fields for that mode

**Data requirements by mode:**

| Mode | Required Fields | Optional Fields |
|------|-----------------|-----------------|
| solo | `spec`, `character`, `challenge`, `response` | `code`, `baseline_issues`, `baseline_criteria`, `bonus_issues`, `bonus_criteria` |
| compare | `contestants[]` (each with spec, character, response), `challenge` | `baseline_issues`, `baseline_criteria` |
| phase-* | `team1`, `team2` (each with theme, response), `context` | |
| coherence | `theme`, `sm_response`, `tea_response`, `dev_response`, `reviewer_response` | |
| swebench | `scenario`, `response_file` | |
| ground-truth | `scenario`, `response_file` | |

**Note:** When checklist data is provided, solo mode uses checklist-based evaluation:
- `baseline_issues` → code-review, tea, dev scenarios (things to FIND)
- `baseline_criteria` → SM scenarios (behaviors to DEMONSTRATE)
- `bonus_issues` / `bonus_criteria` → Extra credit items (optional)

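A small argument check that mirrors the table above can catch malformed invocations before any prompt is built. The helper below is hypothetical (it is not shipped in the package); only the field names come directly from the table.

```python
# Hypothetical helper mirroring the data-requirements table above; not part of the package.
REQUIRED_FIELDS = {
    "solo": ["spec", "character", "challenge", "response"],
    "compare": ["contestants", "challenge"],
    "phase-sm": ["team1", "team2", "context"],
    "phase-tea": ["team1", "team2", "context"],
    "phase-dev": ["team1", "team2", "context"],
    "phase-reviewer": ["team1", "team2", "context"],
    "coherence": ["theme", "sm_response", "tea_response", "dev_response", "reviewer_response"],
    "swebench": ["scenario", "response_file"],
    "ground-truth": ["scenario", "response_file"],
}

def validate_args(mode: str, data: dict) -> list:
    """Return a list of problems; an empty list means the invocation is well-formed."""
    if mode not in REQUIRED_FIELDS:
        return [f"unknown mode: {mode}"]
    return [f"missing field: {field}" for field in REQUIRED_FIELDS[mode] if field not in data]
```
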
### Step 2: Build Judge Prompt

Based on mode, construct the appropriate prompt:

<details>
<summary><strong>Solo Mode Prompt (Generic Rubric)</strong></summary>

**If NO baseline_issues provided, use generic rubric:**

```
You are an impartial judge evaluating an AI agent's response.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

## Response
{response}

## Evaluation

Score 1-10 on each dimension:

1. **Correctness (25%)** - Technical accuracy
2. **Depth (25%)** - Thoroughness
3. **Quality (25%)** - Clarity and actionability
4. **Persona (25%)** - Character embodiment

Formula: (correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL

**IMPORTANT: Output your evaluation as JSON only. No markdown, no extra text.**

```json
{
  "scores": {
    "correctness": { "value": 8, "reasoning": "..." },
    "depth": { "value": 7, "reasoning": "..." },
    "quality": { "value": 9, "reasoning": "..." },
    "persona": { "value": 8, "reasoning": "..." }
  },
  "weighted_total": 80.0,
  "assessment": "2-3 sentence overall assessment"
}
```
```

</details>

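As a rough sketch of Step 2 for this mode, the generic-rubric prompt is simple string templating over the `data` fields. The template below is abbreviated (the rubric and JSON instructions shown above would follow it) and the helper is illustrative, not the package's actual builder.

```python
# Illustrative prompt assembly for solo mode (generic rubric); abbreviated template.
SOLO_TEMPLATE = """You are an impartial judge evaluating an AI agent's response.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

## Response
{response}
"""

def build_solo_prompt(data: dict) -> str:
    """Fill the generic-rubric template; the full rubric and output instructions follow in practice."""
    return SOLO_TEMPLATE.format(
        spec=data["spec"],
        character=data["character"],
        challenge=data["challenge"],
        response=data["response"],
    )
```
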
<details>
<summary><strong>Solo Mode Prompt (Checklist Rubric v2 - Precision/Recall)</strong></summary>

**If baseline_issues IS provided, use checklist rubric (v2 - precision/recall):**

```
You are an impartial judge evaluating an AI agent's response against a checklist of expected findings.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

{if code provided}
## Code Under Review
{code}
{endif}

## Expected Findings

Below are the known issues/requirements. Severity indicates weight:
- CRITICAL: weight 15 (must find)
- HIGH: weight 10 (should find)
- MEDIUM: weight 5 (good to find)
- LOW: weight 2 (bonus)
- (unlabeled categories like happy_path, validation: weight 5 each)

{baseline_issues formatted as checklist}

## Response to Evaluate
{response}

## Evaluation Instructions

Evaluate the response and output ONLY valid JSON (no markdown, no extra text):

```json
{
  "baseline_findings": [
    {"id": "ISSUE_ID", "severity": "critical|high|medium|low", "found": true, "evidence": "quote or null"}
  ],
  "novel_findings": [
    {"description": "...", "valid": true, "reasoning": "..."}
  ],
  "false_positives": [
    {"claim": "...", "why_invalid": "..."}
  ],
  "detection": {
    "by_severity": {
      "critical": {"found": 5, "total": 6},
      "high": {"found": 4, "total": 6},
      "medium": {"found": 3, "total": 8},
      "low": {"found": 1, "total": 2}
    },
    "novel_valid": 2,
    "false_positive_count": 1,
    "metrics": {
      "weighted_found": 132,
      "weighted_total": 194,
      "recall": 0.680,
      "precision": 0.929,
      "f2_score": 0.719
    },
    "components": {
      "recall_score": 20.4,
      "precision_score": 9.3,
      "novel_bonus": 6.0
    },
    "subtotal": 35.7
  },
  "quality": {
    "clear_explanations": 8,
    "actionable_fixes": 7,
    "subtotal": 18.75
  },
  "persona": {
    "in_character": 9,
    "professional_tone": 8,
    "subtotal": 21.25
  },
  "weighted_total": 75.7,
  "assessment": "2-3 sentence summary of strengths and gaps"
}
```

**Detection Scoring Rules (v2 - Precision/Recall):**

- Severity Weights: critical=15, high=10, medium=5, low=2
- recall = weighted_found / weighted_total
- precision = true_positives / (true_positives + false_positives)
- f2_score = 5 × (precision × recall) / (4 × precision + recall)
- detection.subtotal = (recall × 30) + (precision × 10) + min(novel_valid × 3, 10)

**Other Dimensions:**
- Quality (25 max): (clear_explanations/10 × 12.5) + (actionable_fixes/10 × 12.5)
- Persona (25 max): (in_character/10 × 12.5) + (professional_tone/10 × 12.5)
- weighted_total = detection.subtotal + quality.subtotal + persona.subtotal
```

</details>

<details>
<summary><strong>Detection Scoring Deep Dive</strong></summary>

**Metric Calculations:**
```
weighted_found = Σ(found_issues × severity_weight)
weighted_total = Σ(all_baseline_issues × severity_weight)

recall = weighted_found / weighted_total
precision = true_positives / (true_positives + false_positives)
f2_score = 5 × (precision × recall) / (4 × precision + recall)
```

**Component Scores (Detection = 50 max):**
```
recall_score = recall × 30 # max 30 pts - coverage matters most
precision_score = precision × 10 # max 10 pts - penalizes hallucinations
novel_bonus = min(novel_valid × 3, 10) # max 10 pts - rewards thoroughness

detection.subtotal = recall_score + precision_score + novel_bonus
```

**Why this design:**
- **Recall weighted 3x precision**: Missing a critical vulnerability is worse than a false positive
- **Severity-weighted recall**: Finding 5 critical issues > finding 5 low issues
- **Separate novel bonus**: Rewards thoroughness beyond baseline without affecting precision
- **Visible metrics**: recall, precision, f2_score all reported for transparency

**Example Calculations:**
```
Scenario: 6 critical (90 pts), 6 high (60 pts), 8 medium (40 pts), 2 low (4 pts) = 194 weighted total
Agent finds: 5 critical, 4 high, 3 medium, 1 low = 75+40+15+2 = 132 weighted found
Agent flags: 13 true positives, 1 false positive, 2 valid novel findings

recall = 132/194 = 0.680
precision = 13/14 = 0.929
f2_score = 5 × (0.929 × 0.680) / (4 × 0.929 + 0.680) = 0.719

recall_score = 0.680 × 30 = 20.4
precision_score = 0.929 × 10 = 9.3
novel_bonus = min(2 × 3, 10) = 6.0

detection.subtotal = 20.4 + 9.3 + 6.0 = 35.7
```

**Checklist Scoring Notes:**
- **Recall dominates** (30/50 pts): Comprehensive coverage is primary goal
- **Precision matters** (10/50 pts): Penalizes hallucinated issues proportionally
- **Novel findings rewarded** (10/50 pts): Encourages going beyond baseline
- **Severity-weighted**: Critical issues count 7.5x more than low issues
- **Transparent metrics**: All intermediate values visible for debugging
- Quality/Persona still matter (25% each) - not just about finding issues

</details>

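The formulas above are easy to sanity-check in a few lines of Python. The sketch below reproduces the worked example (recall 0.680, precision 0.929, detection subtotal 35.7); it is an illustration of the stated rules, not the packaged judge, and it assumes precision counts baseline true positives only, with novel findings handled by the separate bonus.

```python
# Illustrative check of the detection formulas above; not part of the packaged scripts.
SEVERITY_WEIGHTS = {"critical": 15, "high": 10, "medium": 5, "low": 2}

def detection_subtotal(found: dict, totals: dict, false_positives: int, novel_valid: int) -> dict:
    """Compute recall, precision, F2, and the 50-point detection subtotal."""
    weighted_found = sum(found[s] * w for s, w in SEVERITY_WEIGHTS.items())
    weighted_total = sum(totals[s] * w for s, w in SEVERITY_WEIGHTS.items())
    true_positives = sum(found.values())  # baseline findings only; novel items are a separate bonus
    recall = weighted_found / weighted_total
    precision = true_positives / (true_positives + false_positives)
    f2 = 5 * (precision * recall) / (4 * precision + recall)
    subtotal = recall * 30 + precision * 10 + min(novel_valid * 3, 10)
    return {"recall": round(recall, 3), "precision": round(precision, 3),
            "f2_score": round(f2, 3), "subtotal": round(subtotal, 1)}

# Reproduces the worked example: recall 0.680, precision 0.929, subtotal 35.7
print(detection_subtotal(found={"critical": 5, "high": 4, "medium": 3, "low": 1},
                         totals={"critical": 6, "high": 6, "medium": 8, "low": 2},
                         false_positives=1, novel_valid=2))
```
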
<details>
<summary><strong>Solo Mode Prompt (Behavior Checklist - SM Scenarios)</strong></summary>

**If baseline_criteria IS provided (SM scenarios), use behavior checklist:**

```
You are an impartial judge evaluating an AI agent's facilitation/management response.

## Contestant
- **{spec}** ({character})

## Challenge
{challenge}

## Expected Behaviors

Below are the behaviors a good response should demonstrate:

**BASELINE CRITERIA (5 pts each):**
{baseline_criteria formatted by category}

**BONUS CRITERIA (3 pts each, if present):**
{bonus_criteria formatted, or "None specified"}

## Response to Evaluate
{response}

## Evaluation Instructions

Evaluate the response and output ONLY valid JSON (no markdown, no extra text):

```json
{
  "baseline_behaviors": [
    {"id": "BEHAVIOR_ID", "category": "...", "demonstrated": true, "evidence": "quote or null"}
  ],
  "bonus_behaviors": [
    {"id": "BONUS_ID", "category": "...", "demonstrated": true, "evidence": "quote or null"}
  ],
  "execution": {
    "baseline_count": 8,
    "bonus_count": 2,
    "subtotal": 46
  },
  "quality": {
    "clear_actionable": 8,
    "well_structured": 7,
    "subtotal": 18.75
  },
  "persona": {
    "in_character": 9,
    "enhances_delivery": 8,
    "subtotal": 21.25
  },
  "weighted_total": 86.0,
  "assessment": "2-3 sentence summary of facilitation effectiveness"
}
```

Scoring rules:
- Execution (50 max): baseline×5 (cap 40) + bonus×3 (cap 10)
- Quality (25 max): (clear_actionable/10 × 12.5) + (well_structured/10 × 12.5)
- Persona (25 max): (in_character/10 × 12.5) + (enhances_delivery/10 × 12.5)
- weighted_total = execution.subtotal + quality.subtotal + persona.subtotal
```

</details>

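The scoring rules for this mode are likewise simple arithmetic. A hypothetical helper (not shipped with the package) that reproduces the example JSON's 86.0 total:

```python
# Hypothetical helper for the SM behavior-checklist arithmetic above.
def sm_weighted_total(baseline_count: int, bonus_count: int,
                      clear_actionable: int, well_structured: int,
                      in_character: int, enhances_delivery: int) -> float:
    """Execution (max 50) + Quality (max 25) + Persona (max 25)."""
    execution = min(baseline_count * 5, 40) + min(bonus_count * 3, 10)
    quality = (clear_actionable / 10) * 12.5 + (well_structured / 10) * 12.5
    persona = (in_character / 10) * 12.5 + (enhances_delivery / 10) * 12.5
    return round(execution + quality + persona, 2)

# Matches the example JSON: 8 baseline + 2 bonus behaviors, quality 8/7, persona 9/8 -> 86.0
print(sm_weighted_total(8, 2, 8, 7, 9, 8))
```
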
<details>
<summary><strong>Compare Mode Prompt</strong></summary>

```
You are an impartial judge comparing two AI personas.

## Contestants
- **{spec1}** ({character1})
- **{spec2}** ({character2})

## Challenge
{challenge}

## Response from {character1}
{response1}

## Response from {character2}
{response2}

## Evaluation

Score both on each dimension (1-10). Output ONLY valid JSON (no markdown, no extra text):

```json
{
  "contestants": {
    "{spec1}": {
      "scores": {
        "correctness": { "value": 8, "reasoning": "..." },
        "depth": { "value": 7, "reasoning": "..." },
        "quality": { "value": 9, "reasoning": "..." },
        "persona": { "value": 8, "reasoning": "..." }
      },
      "weighted_total": 80.0
    },
    "{spec2}": {
      "scores": {
        "correctness": { "value": 7, "reasoning": "..." },
        "depth": { "value": 8, "reasoning": "..." },
        "quality": { "value": 7, "reasoning": "..." },
        "persona": { "value": 9, "reasoning": "..." }
      },
      "weighted_total": 77.5
    }
  },
  "winner": "{spec1}",
  "justification": "Brief explanation of why winner was chosen"
}
```
```

</details>

<details>
<summary><strong>Phase Mode and Coherence Mode Prompts</strong></summary>

**Phase Mode Prompts:**

Use phase-specific rubrics from tables above. Evaluate both teams. Output JSON format.

**Coherence Mode Prompt:**

```
Evaluate chain coherence for {theme}.

## Chain
SM: {sm_response}
TEA: {tea_response}
Dev: {dev_response}
Reviewer: {reviewer_response}

Output ONLY valid JSON (no markdown, no extra text):

```json
{
  "rating": "excellent|good|poor",
  "reasoning": "explanation of coherence assessment"
}
```
```

</details>

<details>
<summary><strong>SWE-bench Mode (Deterministic Python Evaluation)</strong></summary>

**For `swebench` and `ground-truth` modes, use Python scripts instead of LLM-as-judge.**

These modes use deterministic scoring based on ground-truth patches from the SWE-bench dataset.

**Prerequisites:**
```bash
# Ensure SWE-bench data is downloaded (one-time)
.pennyfarthing/scripts/test/ensure-swebench-data.sh
```

**swebench mode:**
Uses structured rubric + ground truth validation. Scores:
- root_cause (30%): Bug location + explanation
- fix_quality (40%): Addresses issue + minimal + syntax correct
- completeness (20%): Edge cases + test coverage
- persona (10%): In-character delivery

```bash
# Execute via Python script
python3 .pennyfarthing/scripts/test/swebench-judge.py <scenario_name> <response_file>

# Example
python3 .pennyfarthing/scripts/test/swebench-judge.py flask-5014 /tmp/run_1.json
```

**ground-truth mode:**
Compares fix against actual SWE-bench patch. Scores:
- file_identification (20%): Correct files identified
- location_identification (20%): Correct functions/locations
- fix_logic_match (40%): Code matches ground truth
- completeness (20%): Has all elements of good fix

```bash
# Execute via Python script
python3 .pennyfarthing/scripts/test/ground-truth-judge.py <scenario_name> <response_file>

# Example
python3 .pennyfarthing/scripts/test/ground-truth-judge.py django-10554 /tmp/run_1.json
```

**Response file format:**
Both scripts expect JSON with either:
- `result`: The agent's response text
- `response_text`: Alternative field name

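A minimal loader for this format might look like the sketch below (illustrative only; it assumes nothing beyond the two field names listed above):

```python
# Illustrative loader for the response file format described above.
import json

def load_response_text(path: str) -> str:
    """Read the agent's response from a run file, accepting either field name."""
    with open(path) as f:
        data = json.load(f)
    text = data.get("result") or data.get("response_text")
    if not text:
        raise ValueError(f"{path}: expected a 'result' or 'response_text' field")
    return text
```
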
533
+ **Output:**
534
+ Scripts print scores to stdout and save detailed JSON to `{input_path.replace('run_', 'swebench_judge_')}` or `{input_path.replace('run_', 'gt_judge_')}`.
535
+
536
+ </details>
537
+
<details>
<summary><strong>Step 3: Execute Judge via CLI</strong></summary>

**CRITICAL: Follow this execution pattern for all contexts (main session, skills, subagents).**

**Three rules to avoid shell parsing errors:**

1. **Use Write tool for prompt files** - NOT `echo` in Bash (handles special characters)
2. **Use file redirection for output** - NOT variable capture `$(...)` (avoids zsh parse errors)
3. **Use pipe syntax** - NOT heredocs (works in subagents)

**Why variable capture fails:**
```bash
# This FAILS - zsh tries to parse JSON with () characters
OUTPUT=$(cat prompt.txt | claude -p --output-format json --tools "")
# Error: parse error near ')'
```

**Correct pattern:**

```bash
# Step 1: Use Write tool to create prompt file (NOT echo in Bash)
# The Write tool handles escaping properly in all contexts

# Step 2: Capture timestamp (simple command, safe to capture)
date -u +%Y-%m-%dT%H:%M:%SZ > .scratch/judge_ts.txt

# Step 3: Execute with FILE REDIRECTION (NOT variable capture)
cat .scratch/judge_prompt.txt | claude -p --output-format json --tools "" > .scratch/judge_output.json

# Step 4: Extract from files (reading files is always safe)
JUDGE_RESPONSE=$(jq -r '.result' .scratch/judge_output.json)
JUDGE_INPUT_TOKENS=$(jq -r '.usage.input_tokens // 0' .scratch/judge_output.json)
JUDGE_OUTPUT_TOKENS=$(jq -r '.usage.output_tokens // 0' .scratch/judge_output.json)
```

**Key insight:** The shell never parses the JSON when using file redirection.
The output goes directly to a file, then jq reads it safely.

</details>

<details>
<summary><strong>Step 4: Extract Scores</strong></summary>

```bash
# All modes now output JSON - parse with jq
# Solo mode
SCORE=$(echo "$JUDGE_RESPONSE" | jq -r '.weighted_total // empty')

# Compare mode
SCORE1=$(echo "$JUDGE_RESPONSE" | jq -r '.contestants["{spec1}"].weighted_total // empty')
SCORE2=$(echo "$JUDGE_RESPONSE" | jq -r '.contestants["{spec2}"].weighted_total // empty')
WINNER=$(echo "$JUDGE_RESPONSE" | jq -r '.winner // empty')

# Coherence mode
RATING=$(echo "$JUDGE_RESPONSE" | jq -r '.rating // empty')

# Fallback: try grep if JSON parsing fails (backwards compatibility)
if [[ -z "$SCORE" ]]; then
  SCORE=$(echo "$JUDGE_RESPONSE" | grep -oE "weighted_total[\"':]*\s*([0-9.]+)" | grep -oE "[0-9.]+" | tail -1)
fi
```

</details>

### Step 5: Validate Results

| Check | Requirement |
|-------|-------------|
| `JUDGE_TIMESTAMP` | Valid ISO8601 |
| `JUDGE_RESPONSE` | At least 200 chars |
| `SCORE` (if applicable) | Number 1-100 |
| `RATING` (if coherence) | One of: excellent, good, poor |
| `JUDGE_INPUT_TOKENS` | > 0 |
| `JUDGE_OUTPUT_TOKENS` | > 0 |

**If validation fails:** Return error, do NOT estimate.

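The checks in the table are mechanical, so they can be expressed as a small helper. The sketch below is hypothetical (the skill performs these checks in-session rather than via a shipped script) and mirrors the table's thresholds:

```python
# Hypothetical validation helper mirroring the Step 5 table.
from datetime import datetime

def validate_judge_result(timestamp: str, response: str, score: float = None,
                          rating: str = None, input_tokens: int = 0,
                          output_tokens: int = 0) -> list:
    """Return a list of validation failures; an empty list means the run can be reported."""
    errors = []
    try:
        datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
    except ValueError:
        errors.append("JUDGE_TIMESTAMP is not valid ISO8601")
    if len(response) < 200:
        errors.append("JUDGE_RESPONSE shorter than 200 chars")
    if score is not None and not (1 <= score <= 100):
        errors.append("SCORE outside 1-100")
    if rating is not None and rating not in ("excellent", "good", "poor"):
        errors.append("RATING not one of excellent/good/poor")
    if input_tokens <= 0 or output_tokens <= 0:
        errors.append("token counts must be > 0")
    return errors
```
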
### Step 6: Return Results

Output structured result for caller:

```json
{
  "success": true,
  "mode": "{mode}",
  "timestamp": "{JUDGE_TIMESTAMP}",
  "scores": {
    "{spec1}": {score1},
    "{spec2}": {score2}
  },
  "winner": "{winner_spec}",
  "token_usage": {
    "input": {JUDGE_INPUT_TOKENS},
    "output": {JUDGE_OUTPUT_TOKENS}
  },
  "response_text": "{JUDGE_RESPONSE}"
}
```

## Error Handling

```
❌ Judge validation failed: {reason}
❌ Mode: {mode}
❌ DO NOT estimate scores
```