@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0

package/scripts/solo-runner.sh
@@ -0,0 +1,344 @@
+ #!/bin/bash
+ # Solo Benchmark Runner
+ # Executes a single agent benchmark with proper pipe syntax (NOT heredocs)
+ # Usage: ./scripts/solo-runner.sh <theme:agent> <scenario> [output_dir] [--as <role>]
+ #
+ # Cross-role mode:
+ # ./scripts/solo-runner.sh shakespeare:prospero race-condition /tmp --as dev
+ # This runs Prospero (normally SM) as a dev on the scenario.
+
+ set -e
+
+ SPEC="$1"
+ SCENARIO="$2"
+ OUTPUT_DIR="${3:-/tmp/solo-results}"
+ ROLE_OVERRIDE=""
+
+ # Check for --as flag (can be in position 4 or after output_dir)
+ if [[ "$4" == "--as" && -n "$5" ]]; then
+     ROLE_OVERRIDE="$5"
+ elif [[ "$3" == "--as" && -n "$4" ]]; then
+     # Handle case where output_dir is omitted: theme:agent scenario --as role
+     OUTPUT_DIR="/tmp/solo-results"
+     ROLE_OVERRIDE="$4"
+ fi
+
+ if [[ -z "$SPEC" || -z "$SCENARIO" ]]; then
+     echo "Usage: $0 <theme:agent> <scenario> [output_dir] [--as <role>]" >&2
+     echo "" >&2
+     echo "Cross-role mode:" >&2
+     echo " $0 shakespeare:prospero scenario /tmp --as dev" >&2
+     exit 1
+ fi
+
+ # Valid roles for --as validation
+ VALID_ROLES="sm dev reviewer architect tea pm orchestrator tech-writer ux-designer devops"
+
+ if [[ -n "$ROLE_OVERRIDE" ]]; then
+     if ! echo "$VALID_ROLES" | grep -qw "$ROLE_OVERRIDE"; then
+         echo "Error: Invalid role '$ROLE_OVERRIDE'. Must be one of: $VALID_ROLES" >&2
+         exit 1
+     fi
+ fi
+
+ # Parse spec
+ THEME="${SPEC%%:*}"
+ AGENT="${SPEC##*:}"
+
+ if [[ "$THEME" == "$AGENT" ]]; then
+     echo "Error: Invalid spec format. Expected theme:agent, got: $SPEC" >&2
+     exit 1
+ fi
+
+ # Find paths
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+ PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+ PERSONA_FILE="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${THEME}.yaml"
+ SCENARIO_FILE=$(find "$SCRIPT_DIR/../scenarios" -name "${SCENARIO}.yaml" 2>/dev/null | head -1)
+
+ if [[ ! -f "$PERSONA_FILE" ]]; then
+     echo "Error: Theme not found: $PERSONA_FILE" >&2
+     exit 1
+ fi
+
+ if [[ -z "$SCENARIO_FILE" || ! -f "$SCENARIO_FILE" ]]; then
+     echo "Error: Scenario not found: $SCENARIO" >&2
+     exit 1
+ fi
+
+ # Function to find which role a character belongs to (case-insensitive search)
+ find_character_in_theme() {
+     local theme_file="$1"
+     local query="$2"
+
+     # Search all agents for matching character (case-insensitive)
+     # Use (?i) prefix for case-insensitive matching in yq
+     yq -r ".agents | to_entries[] | select(.value.character | test(\"(?i)$query\")) | .key" "$theme_file" | head -1
+ }
+
+ # Determine source role and effective role
+ CROSS_ROLE=false
+ if [[ -n "$ROLE_OVERRIDE" ]]; then
+     # Cross-role mode: AGENT is a character name, not a role
+     CHARACTER_QUERY="$AGENT"
+     SOURCE_ROLE=$(find_character_in_theme "$PERSONA_FILE" "$CHARACTER_QUERY")
+
+     if [[ -z "$SOURCE_ROLE" ]]; then
+         echo "Error: Character '$CHARACTER_QUERY' not found in theme '$THEME'" >&2
+         echo "Available characters:" >&2
+         yq -r '.agents | to_entries[] | " - \(.key): \(.value.character)"' "$PERSONA_FILE" >&2
+         exit 1
+     fi
+
+     EFFECTIVE_ROLE="$ROLE_OVERRIDE"
+     CROSS_ROLE=true
+     # Use SOURCE_ROLE for persona lookup
+     LOOKUP_ROLE="$SOURCE_ROLE"
+ else
+     # Standard mode: AGENT is the role name
+     SOURCE_ROLE="$AGENT"
+     EFFECTIVE_ROLE="$AGENT"
+     LOOKUP_ROLE="$AGENT"
+ fi
+
+ # Create output directory and temp dir
+ mkdir -p "$OUTPUT_DIR"
+ TMPDIR=$(mktemp -d)
+ trap "rm -rf $TMPDIR" EXIT
+
+ # Generate run ID
+ TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+ RUN_ID=$(echo "$SPEC-$SCENARIO-$$" | md5sum | head -c 8)
+
+ # Extract persona info using yq (use LOOKUP_ROLE for persona, EFFECTIVE_ROLE for task)
+ CHARACTER=$(yq -r ".agents.${LOOKUP_ROLE}.character // \"Unknown\"" "$PERSONA_FILE")
+ STYLE=$(yq -r ".agents.${LOOKUP_ROLE}.style // \"Professional\"" "$PERSONA_FILE")
+ EXPERTISE=$(yq -r ".agents.${LOOKUP_ROLE}.expertise // \"Software development\"" "$PERSONA_FILE")
+ CATCHPHRASES=$(yq -r ".agents.${LOOKUP_ROLE}.catchphrases // [] | .[]" "$PERSONA_FILE" 2>/dev/null | sed 's/^/ - /' | head -5)
+
+ # Generate character slug for cross-role output paths
+ CHARACTER_SLUG=$(echo "$CHARACTER" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-//' | sed 's/-$//')
+
+ # Extract scenario info to files (avoids shell escaping issues)
+ yq -r '.prompt' "$SCENARIO_FILE" > "$TMPDIR/prompt.txt"
+ yq -r '.code // ""' "$SCENARIO_FILE" > "$TMPDIR/code.txt"
+
+ # Build agent prompt file directly
+ cat > "$TMPDIR/agent_prompt.txt" << AGENT_PROMPT_EOF
+ You are ${CHARACTER}.
+
+ **Style:** ${STYLE}
+ **Expertise:** ${EXPERTISE}
+ **Catchphrases:**
+ ${CATCHPHRASES}
+
+ ---
+
+ ## Challenge
+
+ $(cat "$TMPDIR/prompt.txt")
+
+ ## Code
+
+ \`\`\`go
+ $(cat "$TMPDIR/code.txt")
+ \`\`\`
+
+ ---
+
+ Respond fully in character. Under 500 words.
+
+ **IMPORTANT:** Provide your complete response directly. Do not attempt to use tools, read files, or make function calls.
+ AGENT_PROMPT_EOF
+
+ # Execute agent via claude CLI with PIPE SYNTAX
+ AGENT_OUTPUT=$(cat "$TMPDIR/agent_prompt.txt" | claude -p --output-format json --tools "" 2>/dev/null || echo '{"error": "CLI failed"}')
+
+ # Extract agent results
+ echo "$AGENT_OUTPUT" | jq -r '.result // .error // "No response"' > "$TMPDIR/response.txt"
+ INPUT_TOKENS=$(echo "$AGENT_OUTPUT" | jq -r '.usage.input_tokens // 0')
+ OUTPUT_TOKENS=$(echo "$AGENT_OUTPUT" | jq -r '.usage.output_tokens // 0')
+ RESPONSE_LENGTH=$(wc -c < "$TMPDIR/response.txt" | tr -d ' ')
+ # Extract model usage for per-model token tracking
+ echo "$AGENT_OUTPUT" | jq -c '.modelUsage // {}' > "$TMPDIR/agent_model_usage.json"
+ AGENT_COST=$(echo "$AGENT_OUTPUT" | jq -r '.total_cost_usd // 0')
+
+ # Validate response
+ if [[ $RESPONSE_LENGTH -lt 100 ]]; then
+     echo "Error: Response too short ($RESPONSE_LENGTH chars)" >&2
+     cat "$TMPDIR/response.txt" >&2
+     exit 1
+ fi
+
+ # Save agent response using jq to properly escape the response
+ jq -n \
+     --arg run_id "$RUN_ID" \
+     --arg spec "$SPEC" \
+     --arg theme "$THEME" \
+     --arg agent "$AGENT" \
+     --arg character "$CHARACTER" \
+     --arg character_slug "$CHARACTER_SLUG" \
+     --arg source_role "$SOURCE_ROLE" \
+     --arg effective_role "$EFFECTIVE_ROLE" \
+     --argjson cross_role "$CROSS_ROLE" \
+     --arg scenario "$SCENARIO" \
+     --arg timestamp "$TIMESTAMP" \
+     --rawfile response "$TMPDIR/response.txt" \
+     --argjson response_length "$RESPONSE_LENGTH" \
+     --argjson input_tokens "$INPUT_TOKENS" \
+     --argjson output_tokens "$OUTPUT_TOKENS" \
+     --slurpfile model_usage "$TMPDIR/agent_model_usage.json" \
+     --argjson cost_usd "$AGENT_COST" \
+     '{
+         run_id: $run_id,
+         spec: $spec,
+         theme: $theme,
+         agent: $agent,
+         character: $character,
+         character_slug: $character_slug,
+         source_role: $source_role,
+         effective_role: $effective_role,
+         cross_role: $cross_role,
+         scenario: $scenario,
+         timestamp: $timestamp,
+         response: $response,
+         response_length: $response_length,
+         input_tokens: $input_tokens,
+         output_tokens: $output_tokens,
+         model_usage: $model_usage[0],
+         cost_usd: $cost_usd
+     }' > "$OUTPUT_DIR/agent_${RUN_ID}.json"
+
+ # Build judge prompt file
+ JUDGE_TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+
+ cat > "$TMPDIR/judge_prompt.txt" << JUDGE_PROMPT_EOF
+ You are an impartial judge evaluating an AI agent's response.
+
+ ## Contestant
+ - **${SPEC}** (${CHARACTER})
+
+ ## Challenge
+ $(cat "$TMPDIR/prompt.txt")
+
+ ## Response
+ $(cat "$TMPDIR/response.txt")
+
+ ## Evaluation
+
+ Score 1-10 on each dimension:
+
+ 1. **Correctness (25%)** - Technical accuracy
+ 2. **Depth (25%)** - Thoroughness
+ 3. **Quality (25%)** - Clarity and actionability
+ 4. **Persona (25%)** - Character embodiment
+
+ Formula: (correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL
+
+ **IMPORTANT: Output your evaluation as JSON only. No markdown, no extra text.**
+
+ {
+   "scores": {
+     "correctness": { "value": N, "reasoning": "..." },
+     "depth": { "value": N, "reasoning": "..." },
+     "quality": { "value": N, "reasoning": "..." },
+     "persona": { "value": N, "reasoning": "..." }
+   },
+   "weighted_total": NN.N,
+   "assessment": "2-3 sentence overall assessment"
+ }
+ JUDGE_PROMPT_EOF
+
+ # Execute judge via pipe syntax
+ JUDGE_OUTPUT=$(cat "$TMPDIR/judge_prompt.txt" | claude -p --output-format json --tools "" 2>/dev/null || echo '{"error": "Judge CLI failed"}')
+
+ # Extract judge results
+ echo "$JUDGE_OUTPUT" | jq -r '.result // .error // "No response"' > "$TMPDIR/judge_response.txt"
+ JUDGE_INPUT_TOKENS=$(echo "$JUDGE_OUTPUT" | jq -r '.usage.input_tokens // 0')
+ JUDGE_OUTPUT_TOKENS=$(echo "$JUDGE_OUTPUT" | jq -r '.usage.output_tokens // 0')
+ echo "$JUDGE_OUTPUT" | jq -c '.modelUsage // {}' > "$TMPDIR/judge_model_usage.json"
+ JUDGE_COST=$(echo "$JUDGE_OUTPUT" | jq -r '.total_cost_usd // 0')
+
+ # Extract score from judge response - try JSON first, then regex fallback
+ SCORE=$(cat "$TMPDIR/judge_response.txt" | jq -r '.weighted_total // empty' 2>/dev/null || true)
+ if [[ -z "$SCORE" ]]; then
+     # Fallback: extract from text
+     SCORE=$(grep -oE '"weighted_total"[^0-9]*([0-9.]+)' "$TMPDIR/judge_response.txt" | grep -oE '[0-9.]+' | tail -1 || true)
+ fi
+ if [[ -z "$SCORE" ]]; then
+     # Last resort: look for any WEIGHTED_TOTAL pattern
+     SCORE=$(grep -oE 'WEIGHTED_TOTAL[^0-9]*([0-9.]+)' "$TMPDIR/judge_response.txt" | grep -oE '[0-9.]+' | tail -1 || true)
+ fi
+
+ if [[ -z "$SCORE" || "$SCORE" == "null" ]]; then
+     echo "Error: Could not extract score from judge response" >&2
+     cat "$TMPDIR/judge_response.txt" >&2
+     exit 1
+ fi
+
+ # Save judge response using jq
+ jq -n \
+     --arg run_id "$RUN_ID" \
+     --arg spec "$SPEC" \
+     --arg scenario "$SCENARIO" \
+     --arg timestamp "$JUDGE_TIMESTAMP" \
+     --argjson score "$SCORE" \
+     --rawfile judge_response "$TMPDIR/judge_response.txt" \
+     --argjson input_tokens "$JUDGE_INPUT_TOKENS" \
+     --argjson output_tokens "$JUDGE_OUTPUT_TOKENS" \
+     --slurpfile model_usage "$TMPDIR/judge_model_usage.json" \
+     --argjson cost_usd "$JUDGE_COST" \
+     '{
+         run_id: $run_id,
+         spec: $spec,
+         scenario: $scenario,
+         timestamp: $timestamp,
+         score: $score,
+         judge_response: $judge_response,
+         input_tokens: $input_tokens,
+         output_tokens: $output_tokens,
+         model_usage: $model_usage[0],
+         cost_usd: $cost_usd
+     }' > "$OUTPUT_DIR/judge_${RUN_ID}.json"
+
+ # Output summary JSON to stdout
+ jq -n \
+     --argjson success true \
+     --arg run_id "$RUN_ID" \
+     --arg spec "$SPEC" \
+     --arg theme "$THEME" \
+     --arg character "$CHARACTER" \
+     --arg character_slug "$CHARACTER_SLUG" \
+     --arg source_role "$SOURCE_ROLE" \
+     --arg effective_role "$EFFECTIVE_ROLE" \
+     --argjson cross_role "$CROSS_ROLE" \
+     --arg scenario "$SCENARIO" \
+     --argjson score "$SCORE" \
+     --argjson agent_tokens "$((INPUT_TOKENS + OUTPUT_TOKENS))" \
+     --argjson judge_tokens "$((JUDGE_INPUT_TOKENS + JUDGE_OUTPUT_TOKENS))" \
+     --slurpfile agent_model_usage "$TMPDIR/agent_model_usage.json" \
+     --slurpfile judge_model_usage "$TMPDIR/judge_model_usage.json" \
+     --argjson agent_cost "$AGENT_COST" \
+     --argjson judge_cost "$JUDGE_COST" \
+     --arg agent_file "$OUTPUT_DIR/agent_${RUN_ID}.json" \
+     --arg judge_file "$OUTPUT_DIR/judge_${RUN_ID}.json" \
+     '{
+         success: $success,
+         run_id: $run_id,
+         spec: $spec,
+         theme: $theme,
+         character: $character,
+         character_slug: $character_slug,
+         source_role: $source_role,
+         effective_role: $effective_role,
+         cross_role: $cross_role,
+         scenario: $scenario,
+         score: $score,
+         agent_tokens: $agent_tokens,
+         judge_tokens: $judge_tokens,
+         agent_model_usage: $agent_model_usage[0],
+         judge_model_usage: $judge_model_usage[0],
+         total_cost_usd: ($agent_cost + $judge_cost),
+         agent_file: $agent_file,
+         judge_file: $judge_file
+     }'

package/scripts/test/ensure-swebench-data.sh
@@ -0,0 +1,59 @@
+ #!/usr/bin/env bash
+ # ensure-swebench-data.sh - Downloads SWE-bench data if not present
+ #
+ # Usage: ensure-swebench-data.sh [--force]
+ #
+ # Downloads SWE-bench Verified dataset from HuggingFace to /tmp/swebench_all.json
+ # This is a dependency for:
+ # - swebench-judge.py
+ # - ground-truth-judge.py
+ #
+ # Options:
+ # --force Re-download even if file exists
+
+ set -euo pipefail
+
+ CACHE_PATH="/tmp/swebench_all.json"
+ DATASET_URL="https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified/resolve/main/data/test.jsonl"
+
+ force=false
+ if [[ "${1:-}" == "--force" ]]; then
+     force=true
+ fi
+
+ # Check if already present
+ if [[ -f "$CACHE_PATH" ]] && [[ "$force" == "false" ]]; then
+     echo "SWE-bench data already cached at $CACHE_PATH"
+     exit 0
+ fi
+
+ echo "Downloading SWE-bench Verified dataset..."
+
+ # Download JSONL and convert to JSON array
+ if command -v curl &>/dev/null; then
+     curl -sL "$DATASET_URL" | python3 -c "
+ import json
+ import sys
+ lines = [json.loads(line) for line in sys.stdin if line.strip()]
+ print(json.dumps(lines, indent=2))
+ " > "$CACHE_PATH"
+ elif command -v wget &>/dev/null; then
+     wget -qO- "$DATASET_URL" | python3 -c "
+ import json
+ import sys
+ lines = [json.loads(line) for line in sys.stdin if line.strip()]
+ print(json.dumps(lines, indent=2))
+ " > "$CACHE_PATH"
+ else
+     echo "Error: curl or wget required to download SWE-bench data"
+     exit 1
+ fi
+
+ # Verify download
+ if [[ -f "$CACHE_PATH" ]]; then
+     count=$(python3 -c "import json; print(len(json.load(open('$CACHE_PATH'))))")
+     echo "Downloaded $count SWE-bench scenarios to $CACHE_PATH"
+ else
+     echo "Error: Failed to download SWE-bench data"
+     exit 1
+ fi

package/scripts/test/ground-truth-judge.py
@@ -0,0 +1,220 @@
+ #!/usr/bin/env python3
+ """
+ Ground-truth judge for SWE-bench scenarios.
+
+ Compares Claude's proposed fix against the actual SWE-bench patch.
+ Scores based on:
+ - File identification (20%)
+ - Function/location identification (20%)
+ - Fix logic match (40%)
+ - Completeness (20%)
+ """
+
+ import json
+ import re
+ import sys
+ from pathlib import Path
+ from difflib import SequenceMatcher
+
+ # Add parent to path for pennyfarthing_scripts imports
+ sys.path.insert(0, str(Path(__file__).resolve().parents[4]))
+
+ from pennyfarthing_scripts.swebench import (
+     extract_patch_info,
+     extract_problem_keywords,
+     find_scenario,
+     get_meaningful_patterns,
+     load_swebench_data,
+ )
+
+
+ def score_response(response_text, ground_truth):
+     """Score a response against ground truth patch."""
+     patch_info = extract_patch_info(ground_truth['patch'])
+
+     scores = {
+         'file_identification': 0,
+         'location_identification': 0,
+         'fix_logic_match': 0,
+         'completeness': 0,
+         'details': {}
+     }
+
+     response_lower = response_text.lower()
+
+     # 1. FILE IDENTIFICATION (20 points)
+     files_found = 0
+     for f in patch_info.files:
+         # Check various forms of the filename
+         filename = Path(f).name
+         if filename.lower() in response_lower or f.lower() in response_lower:
+             files_found += 1
+
+     if patch_info.files:
+         file_score = (files_found / len(patch_info.files)) * 20
+         scores['file_identification'] = min(20, file_score)
+         scores['details']['files_expected'] = patch_info.files
+         scores['details']['files_found'] = files_found
+     else:
+         scores['file_identification'] = 20  # No specific file in patch
+
+     # 2. LOCATION IDENTIFICATION (20 points)
+     # Look for function/class names mentioned in the patch
+     locations_found = 0
+     for func in patch_info.functions:
+         # Extract the function/class name
+         func_match = re.search(r'(def|class)\s+(\w+)', func)
+         if func_match:
+             func_name = func_match.group(2)
+             if func_name.lower() in response_lower:
+                 locations_found += 1
+         elif func.strip() and func.strip().split()[0] in response_lower:
+             locations_found += 1
+
+     if patch_info.functions:
+         loc_score = (locations_found / len(patch_info.functions)) * 20
+         scores['location_identification'] = min(20, loc_score)
+         scores['details']['locations_expected'] = patch_info.functions[:3]
+         scores['details']['locations_found'] = locations_found
+     else:
+         scores['location_identification'] = 10  # Partial credit
+
+     # 3. FIX LOGIC MATCH (40 points)
+     # Check if key code patterns from the fix appear in the response
+     meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)
+
+     patterns_found = 0
+     for pattern in meaningful_patterns:
+         if pattern.lower() in response_lower:
+             patterns_found += 1
+
+     if meaningful_patterns:
+         pattern_score = (patterns_found / len(meaningful_patterns)) * 20
+         scores['details']['patterns_expected'] = meaningful_patterns[:10]
+         scores['details']['patterns_found'] = patterns_found
+     else:
+         pattern_score = 10
+
+     # Check for actual code additions
+     additions_matched = 0
+     for addition in patch_info.additions[:5]:  # Check first 5 additions
+         # Normalize and check
+         addition_normalized = re.sub(r'\s+', ' ', addition.lower())
+         response_normalized = re.sub(r'\s+', ' ', response_lower)
+
+         # Use fuzzy matching
+         similarity = SequenceMatcher(None, addition_normalized, response_normalized).ratio()
+         if similarity > 0.6 or addition_normalized in response_normalized:
+             additions_matched += 1
+
+     if patch_info.additions:
+         addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
+         scores['details']['additions_matched'] = additions_matched
+     else:
+         addition_score = 10
+
+     scores['fix_logic_match'] = min(40, pattern_score + addition_score)
+
+     # 4. COMPLETENESS (20 points)
+     # Does the response have all the elements of a good fix?
+     completeness_score = 0
+
+     # Has code block?
+     if '```' in response_text:
+         completeness_score += 5
+
+     # Has test considerations?
+     if 'test' in response_lower:
+         completeness_score += 5
+
+     # Mentions the specific error/issue?
+     problem_keywords = extract_problem_keywords(ground_truth.get('problem_statement', ''))
+     keywords_found = sum(1 for kw in problem_keywords if kw.lower() in response_lower)
+     if problem_keywords:
+         completeness_score += min(5, (keywords_found / len(problem_keywords)) * 5)
+     else:
+         completeness_score += 2.5
+
+     # Has explanation of why fix works?
+     explanation_words = ['because', 'this fixes', 'this resolves', 'the issue', 'the problem', 'solution']
+     if any(word in response_lower for word in explanation_words):
+         completeness_score += 5
+
+     scores['completeness'] = min(20, completeness_score)
+
+     # Total
+     scores['total'] = round(
+         scores['file_identification'] +
+         scores['location_identification'] +
+         scores['fix_logic_match'] +
+         scores['completeness']
+     , 1)
+
+     return scores
+
+
+ def main():
+     if len(sys.argv) < 3:
+         print("Usage: ground-truth-judge.py <scenario_name> <response_file>")
+         print("Example: ground-truth-judge.py flask-5014 run_20260102T134237Z.json")
+         sys.exit(1)
+
+     scenario_name = sys.argv[1]
+     response_file = sys.argv[2]
+
+     # Load SWE-bench data
+     swebench_data = load_swebench_data()
+
+     # Find scenario
+     scenario = find_scenario(swebench_data, scenario_name)
+     if not scenario:
+         print(f"Error: Scenario '{scenario_name}' not found in SWE-bench data")
+         sys.exit(1)
+
+     # Load response
+     with open(response_file, 'r') as f:
+         response_data = json.load(f)
+
+     response_text = response_data.get('result', '')
+     if not response_text:
+         print("Error: No 'result' field in response file")
+         sys.exit(1)
+
+     # Score
+     scores = score_response(response_text, scenario)
+
+     # Output
+     print(f"\n{'='*60}")
+     print(f"GROUND TRUTH EVALUATION: {scenario_name}")
+     print(f"{'='*60}")
+     print(f"\nScores:")
+     print(f"  File Identification: {scores['file_identification']:5.1f}/20")
+     print(f"  Location Identification: {scores['location_identification']:5.1f}/20")
+     print(f"  Fix Logic Match: {scores['fix_logic_match']:5.1f}/40")
+     print(f"  Completeness: {scores['completeness']:5.1f}/20")
+     print(f"  {'─'*40}")
+     print(f"  TOTAL: {scores['total']:5.1f}/100")
+
+     print(f"\nDetails:")
+     for key, value in scores['details'].items():
+         print(f"  {key}: {value}")
+
+     # Output JSON for programmatic use
+     output = {
+         'scenario': scenario_name,
+         'instance_id': scenario.get('instance_id'),
+         'scores': scores,
+         'ground_truth_patch_preview': scenario.get('patch', '')[:300]
+     }
+
+     # Save judge output
+     output_path = response_file.replace('run_', 'gt_judge_')
+     with open(output_path, 'w') as f:
+         json.dump(output, f, indent=2)
+     print(f"\nSaved to: {output_path}")
+
+     return scores
+
+
+ if __name__ == '__main__':
+     main()
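
Note: ground-truth-judge.py imports extract_patch_info, extract_problem_keywords, find_scenario, get_meaningful_patterns, and load_swebench_data from pennyfarthing_scripts.swebench, a module that is not part of this diff. Purely for orientation, a minimal sketch of what the two loader helpers might look like, assuming the /tmp/swebench_all.json cache written by ensure-swebench-data.sh and SWE-bench instance_id values such as pallets__flask-5014; the names, matching rule, and cache path here are assumptions inferred from the call sites above, not the package's actual implementation.

# Hypothetical sketch only; pennyfarthing_scripts.swebench is not included in this package diff.
import json
from pathlib import Path

SWEBENCH_CACHE = Path("/tmp/swebench_all.json")  # written by ensure-swebench-data.sh

def load_swebench_data():
    """Load the cached SWE-bench Verified instances as a list of dicts."""
    with SWEBENCH_CACHE.open() as f:
        return json.load(f)

def find_scenario(swebench_data, scenario_name):
    """Match a short name like 'flask-5014' against instance_id values such as
    'pallets__flask-5014'; return the first matching instance dict, or None."""
    needle = scenario_name.lower()
    for instance in swebench_data:
        if needle in instance.get("instance_id", "").lower():
            return instance
    return None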