@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
package/scripts/solo-runner.sh
@@ -0,0 +1,344 @@
#!/bin/bash
# Solo Benchmark Runner
# Executes a single agent benchmark with proper pipe syntax (NOT heredocs)
# Usage: ./scripts/solo-runner.sh <theme:agent> <scenario> [output_dir] [--as <role>]
#
# Cross-role mode:
#   ./scripts/solo-runner.sh shakespeare:prospero race-condition /tmp --as dev
#   This runs Prospero (normally SM) as a dev on the scenario.

set -e

SPEC="$1"
SCENARIO="$2"
OUTPUT_DIR="${3:-/tmp/solo-results}"
ROLE_OVERRIDE=""

# Check for --as flag (can be in position 4 or after output_dir)
if [[ "$4" == "--as" && -n "$5" ]]; then
  ROLE_OVERRIDE="$5"
elif [[ "$3" == "--as" && -n "$4" ]]; then
  # Handle case where output_dir is omitted: theme:agent scenario --as role
  OUTPUT_DIR="/tmp/solo-results"
  ROLE_OVERRIDE="$4"
fi

if [[ -z "$SPEC" || -z "$SCENARIO" ]]; then
  echo "Usage: $0 <theme:agent> <scenario> [output_dir] [--as <role>]" >&2
  echo "" >&2
  echo "Cross-role mode:" >&2
  echo "  $0 shakespeare:prospero scenario /tmp --as dev" >&2
  exit 1
fi

# Valid roles for --as validation
VALID_ROLES="sm dev reviewer architect tea pm orchestrator tech-writer ux-designer devops"

if [[ -n "$ROLE_OVERRIDE" ]]; then
  if ! echo "$VALID_ROLES" | grep -qw "$ROLE_OVERRIDE"; then
    echo "Error: Invalid role '$ROLE_OVERRIDE'. Must be one of: $VALID_ROLES" >&2
    exit 1
  fi
fi

# Parse spec
THEME="${SPEC%%:*}"
AGENT="${SPEC##*:}"

if [[ "$THEME" == "$AGENT" ]]; then
  echo "Error: Invalid spec format. Expected theme:agent, got: $SPEC" >&2
  exit 1
fi

# Find paths
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
PERSONA_FILE="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${THEME}.yaml"
SCENARIO_FILE=$(find "$SCRIPT_DIR/../scenarios" -name "${SCENARIO}.yaml" 2>/dev/null | head -1)

if [[ ! -f "$PERSONA_FILE" ]]; then
  echo "Error: Theme not found: $PERSONA_FILE" >&2
  exit 1
fi

if [[ -z "$SCENARIO_FILE" || ! -f "$SCENARIO_FILE" ]]; then
  echo "Error: Scenario not found: $SCENARIO" >&2
  exit 1
fi

# Function to find which role a character belongs to (case-insensitive search)
find_character_in_theme() {
  local theme_file="$1"
  local query="$2"

  # Search all agents for matching character (case-insensitive)
  # Use (?i) prefix for case-insensitive matching in yq
  yq -r ".agents | to_entries[] | select(.value.character | test(\"(?i)$query\")) | .key" "$theme_file" | head -1
}

# Determine source role and effective role
CROSS_ROLE=false
if [[ -n "$ROLE_OVERRIDE" ]]; then
  # Cross-role mode: AGENT is a character name, not a role
  CHARACTER_QUERY="$AGENT"
  SOURCE_ROLE=$(find_character_in_theme "$PERSONA_FILE" "$CHARACTER_QUERY")

  if [[ -z "$SOURCE_ROLE" ]]; then
    echo "Error: Character '$CHARACTER_QUERY' not found in theme '$THEME'" >&2
    echo "Available characters:" >&2
    yq -r '.agents | to_entries[] | " - \(.key): \(.value.character)"' "$PERSONA_FILE" >&2
    exit 1
  fi

  EFFECTIVE_ROLE="$ROLE_OVERRIDE"
  CROSS_ROLE=true
  # Use SOURCE_ROLE for persona lookup
  LOOKUP_ROLE="$SOURCE_ROLE"
else
  # Standard mode: AGENT is the role name
  SOURCE_ROLE="$AGENT"
  EFFECTIVE_ROLE="$AGENT"
  LOOKUP_ROLE="$AGENT"
fi

# Create output directory and temp dir
mkdir -p "$OUTPUT_DIR"
TMPDIR=$(mktemp -d)
trap "rm -rf $TMPDIR" EXIT

# Generate run ID
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
RUN_ID=$(echo "$SPEC-$SCENARIO-$$" | md5sum | head -c 8)

# Extract persona info using yq (use LOOKUP_ROLE for persona, EFFECTIVE_ROLE for task)
CHARACTER=$(yq -r ".agents.${LOOKUP_ROLE}.character // \"Unknown\"" "$PERSONA_FILE")
STYLE=$(yq -r ".agents.${LOOKUP_ROLE}.style // \"Professional\"" "$PERSONA_FILE")
EXPERTISE=$(yq -r ".agents.${LOOKUP_ROLE}.expertise // \"Software development\"" "$PERSONA_FILE")
CATCHPHRASES=$(yq -r ".agents.${LOOKUP_ROLE}.catchphrases // [] | .[]" "$PERSONA_FILE" 2>/dev/null | sed 's/^/ - /' | head -5)

# Generate character slug for cross-role output paths
CHARACTER_SLUG=$(echo "$CHARACTER" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g' | sed 's/^-//' | sed 's/-$//')

# Extract scenario info to files (avoids shell escaping issues)
yq -r '.prompt' "$SCENARIO_FILE" > "$TMPDIR/prompt.txt"
yq -r '.code // ""' "$SCENARIO_FILE" > "$TMPDIR/code.txt"

# Build agent prompt file directly
cat > "$TMPDIR/agent_prompt.txt" << AGENT_PROMPT_EOF
You are ${CHARACTER}.

**Style:** ${STYLE}
**Expertise:** ${EXPERTISE}
**Catchphrases:**
${CATCHPHRASES}

---

## Challenge

$(cat "$TMPDIR/prompt.txt")

## Code

\`\`\`go
$(cat "$TMPDIR/code.txt")
\`\`\`

---

Respond fully in character. Under 500 words.

**IMPORTANT:** Provide your complete response directly. Do not attempt to use tools, read files, or make function calls.
AGENT_PROMPT_EOF

# Execute agent via claude CLI with PIPE SYNTAX
AGENT_OUTPUT=$(cat "$TMPDIR/agent_prompt.txt" | claude -p --output-format json --tools "" 2>/dev/null || echo '{"error": "CLI failed"}')

# Extract agent results
echo "$AGENT_OUTPUT" | jq -r '.result // .error // "No response"' > "$TMPDIR/response.txt"
INPUT_TOKENS=$(echo "$AGENT_OUTPUT" | jq -r '.usage.input_tokens // 0')
OUTPUT_TOKENS=$(echo "$AGENT_OUTPUT" | jq -r '.usage.output_tokens // 0')
RESPONSE_LENGTH=$(wc -c < "$TMPDIR/response.txt" | tr -d ' ')
# Extract model usage for per-model token tracking
echo "$AGENT_OUTPUT" | jq -c '.modelUsage // {}' > "$TMPDIR/agent_model_usage.json"
AGENT_COST=$(echo "$AGENT_OUTPUT" | jq -r '.total_cost_usd // 0')

# Validate response
if [[ $RESPONSE_LENGTH -lt 100 ]]; then
  echo "Error: Response too short ($RESPONSE_LENGTH chars)" >&2
  cat "$TMPDIR/response.txt" >&2
  exit 1
fi

# Save agent response using jq to properly escape the response
jq -n \
  --arg run_id "$RUN_ID" \
  --arg spec "$SPEC" \
  --arg theme "$THEME" \
  --arg agent "$AGENT" \
  --arg character "$CHARACTER" \
  --arg character_slug "$CHARACTER_SLUG" \
  --arg source_role "$SOURCE_ROLE" \
  --arg effective_role "$EFFECTIVE_ROLE" \
  --argjson cross_role "$CROSS_ROLE" \
  --arg scenario "$SCENARIO" \
  --arg timestamp "$TIMESTAMP" \
  --rawfile response "$TMPDIR/response.txt" \
  --argjson response_length "$RESPONSE_LENGTH" \
  --argjson input_tokens "$INPUT_TOKENS" \
  --argjson output_tokens "$OUTPUT_TOKENS" \
  --slurpfile model_usage "$TMPDIR/agent_model_usage.json" \
  --argjson cost_usd "$AGENT_COST" \
  '{
    run_id: $run_id,
    spec: $spec,
    theme: $theme,
    agent: $agent,
    character: $character,
    character_slug: $character_slug,
    source_role: $source_role,
    effective_role: $effective_role,
    cross_role: $cross_role,
    scenario: $scenario,
    timestamp: $timestamp,
    response: $response,
    response_length: $response_length,
    input_tokens: $input_tokens,
    output_tokens: $output_tokens,
    model_usage: $model_usage[0],
    cost_usd: $cost_usd
  }' > "$OUTPUT_DIR/agent_${RUN_ID}.json"

# Build judge prompt file
JUDGE_TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)

cat > "$TMPDIR/judge_prompt.txt" << JUDGE_PROMPT_EOF
You are an impartial judge evaluating an AI agent's response.

## Contestant
- **${SPEC}** (${CHARACTER})

## Challenge
$(cat "$TMPDIR/prompt.txt")

## Response
$(cat "$TMPDIR/response.txt")

## Evaluation

Score 1-10 on each dimension:

1. **Correctness (25%)** - Technical accuracy
2. **Depth (25%)** - Thoroughness
3. **Quality (25%)** - Clarity and actionability
4. **Persona (25%)** - Character embodiment

Formula: (correctness × 2.5) + (depth × 2.5) + (quality × 2.5) + (persona × 2.5) = WEIGHTED_TOTAL

**IMPORTANT: Output your evaluation as JSON only. No markdown, no extra text.**

{
  "scores": {
    "correctness": { "value": N, "reasoning": "..." },
    "depth": { "value": N, "reasoning": "..." },
    "quality": { "value": N, "reasoning": "..." },
    "persona": { "value": N, "reasoning": "..." }
  },
  "weighted_total": NN.N,
  "assessment": "2-3 sentence overall assessment"
}
JUDGE_PROMPT_EOF

# Execute judge via pipe syntax
JUDGE_OUTPUT=$(cat "$TMPDIR/judge_prompt.txt" | claude -p --output-format json --tools "" 2>/dev/null || echo '{"error": "Judge CLI failed"}')

# Extract judge results
echo "$JUDGE_OUTPUT" | jq -r '.result // .error // "No response"' > "$TMPDIR/judge_response.txt"
JUDGE_INPUT_TOKENS=$(echo "$JUDGE_OUTPUT" | jq -r '.usage.input_tokens // 0')
JUDGE_OUTPUT_TOKENS=$(echo "$JUDGE_OUTPUT" | jq -r '.usage.output_tokens // 0')
echo "$JUDGE_OUTPUT" | jq -c '.modelUsage // {}' > "$TMPDIR/judge_model_usage.json"
JUDGE_COST=$(echo "$JUDGE_OUTPUT" | jq -r '.total_cost_usd // 0')

# Extract score from judge response - try JSON first, then regex fallback
SCORE=$(cat "$TMPDIR/judge_response.txt" | jq -r '.weighted_total // empty' 2>/dev/null || true)
if [[ -z "$SCORE" ]]; then
  # Fallback: extract from text
  SCORE=$(grep -oE '"weighted_total"[^0-9]*([0-9.]+)' "$TMPDIR/judge_response.txt" | grep -oE '[0-9.]+' | tail -1 || true)
fi
if [[ -z "$SCORE" ]]; then
  # Last resort: look for any WEIGHTED_TOTAL pattern
  SCORE=$(grep -oE 'WEIGHTED_TOTAL[^0-9]*([0-9.]+)' "$TMPDIR/judge_response.txt" | grep -oE '[0-9.]+' | tail -1 || true)
fi

if [[ -z "$SCORE" || "$SCORE" == "null" ]]; then
  echo "Error: Could not extract score from judge response" >&2
  cat "$TMPDIR/judge_response.txt" >&2
  exit 1
fi

# Save judge response using jq
jq -n \
  --arg run_id "$RUN_ID" \
  --arg spec "$SPEC" \
  --arg scenario "$SCENARIO" \
  --arg timestamp "$JUDGE_TIMESTAMP" \
  --argjson score "$SCORE" \
  --rawfile judge_response "$TMPDIR/judge_response.txt" \
  --argjson input_tokens "$JUDGE_INPUT_TOKENS" \
  --argjson output_tokens "$JUDGE_OUTPUT_TOKENS" \
  --slurpfile model_usage "$TMPDIR/judge_model_usage.json" \
  --argjson cost_usd "$JUDGE_COST" \
  '{
    run_id: $run_id,
    spec: $spec,
    scenario: $scenario,
    timestamp: $timestamp,
    score: $score,
    judge_response: $judge_response,
    input_tokens: $input_tokens,
    output_tokens: $output_tokens,
    model_usage: $model_usage[0],
    cost_usd: $cost_usd
  }' > "$OUTPUT_DIR/judge_${RUN_ID}.json"

# Output summary JSON to stdout
jq -n \
  --argjson success true \
  --arg run_id "$RUN_ID" \
  --arg spec "$SPEC" \
  --arg theme "$THEME" \
  --arg character "$CHARACTER" \
  --arg character_slug "$CHARACTER_SLUG" \
  --arg source_role "$SOURCE_ROLE" \
  --arg effective_role "$EFFECTIVE_ROLE" \
  --argjson cross_role "$CROSS_ROLE" \
  --arg scenario "$SCENARIO" \
  --argjson score "$SCORE" \
  --argjson agent_tokens "$((INPUT_TOKENS + OUTPUT_TOKENS))" \
  --argjson judge_tokens "$((JUDGE_INPUT_TOKENS + JUDGE_OUTPUT_TOKENS))" \
  --slurpfile agent_model_usage "$TMPDIR/agent_model_usage.json" \
  --slurpfile judge_model_usage "$TMPDIR/judge_model_usage.json" \
  --argjson agent_cost "$AGENT_COST" \
  --argjson judge_cost "$JUDGE_COST" \
  --arg agent_file "$OUTPUT_DIR/agent_${RUN_ID}.json" \
  --arg judge_file "$OUTPUT_DIR/judge_${RUN_ID}.json" \
  '{
    success: $success,
    run_id: $run_id,
    spec: $spec,
    theme: $theme,
    character: $character,
    character_slug: $character_slug,
    source_role: $source_role,
    effective_role: $effective_role,
    cross_role: $cross_role,
    scenario: $scenario,
    score: $score,
    agent_tokens: $agent_tokens,
    judge_tokens: $judge_tokens,
    agent_model_usage: $agent_model_usage[0],
    judge_model_usage: $judge_model_usage[0],
    total_cost_usd: ($agent_cost + $judge_cost),
    agent_file: $agent_file,
    judge_file: $judge_file
  }'
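For orientation, a minimal invocation sketch follows. The theme, character, and scenario names are illustrative placeholders drawn from the script's own usage comments and the bundled scenario list, and the output location assumes the default /tmp/solo-results:

    # Standard mode: run the theme's dev-role persona against a bundled scenario
    ./scripts/solo-runner.sh shakespeare:dev null-check-missing

    # Cross-role mode: run the character Prospero (normally the SM) as a dev
    ./scripts/solo-runner.sh shakespeare:prospero race-condition /tmp/solo-results --as dev

    # Each run writes agent_<run_id>.json and judge_<run_id>.json into the output
    # directory and prints a summary JSON (score, tokens, cost) to stdout.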
package/scripts/test/ensure-swebench-data.sh
@@ -0,0 +1,59 @@
#!/usr/bin/env bash
# ensure-swebench-data.sh - Downloads SWE-bench data if not present
#
# Usage: ensure-swebench-data.sh [--force]
#
# Downloads SWE-bench Verified dataset from HuggingFace to /tmp/swebench_all.json
# This is a dependency for:
#   - swebench-judge.py
#   - ground-truth-judge.py
#
# Options:
#   --force   Re-download even if file exists

set -euo pipefail

CACHE_PATH="/tmp/swebench_all.json"
DATASET_URL="https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified/resolve/main/data/test.jsonl"

force=false
if [[ "${1:-}" == "--force" ]]; then
  force=true
fi

# Check if already present
if [[ -f "$CACHE_PATH" ]] && [[ "$force" == "false" ]]; then
  echo "SWE-bench data already cached at $CACHE_PATH"
  exit 0
fi

echo "Downloading SWE-bench Verified dataset..."

# Download JSONL and convert to JSON array
if command -v curl &>/dev/null; then
  curl -sL "$DATASET_URL" | python3 -c "
import json
import sys
lines = [json.loads(line) for line in sys.stdin if line.strip()]
print(json.dumps(lines, indent=2))
" > "$CACHE_PATH"
elif command -v wget &>/dev/null; then
  wget -qO- "$DATASET_URL" | python3 -c "
import json
import sys
lines = [json.loads(line) for line in sys.stdin if line.strip()]
print(json.dumps(lines, indent=2))
" > "$CACHE_PATH"
else
  echo "Error: curl or wget required to download SWE-bench data"
  exit 1
fi

# Verify download
if [[ -f "$CACHE_PATH" ]]; then
  count=$(python3 -c "import json; print(len(json.load(open('$CACHE_PATH'))))")
  echo "Downloaded $count SWE-bench scenarios to $CACHE_PATH"
else
  echo "Error: Failed to download SWE-bench data"
  exit 1
fi
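A quick sanity check after the download might look like the following; the field names reflect the SWE-bench schema the judge scripts below rely on (instance_id, patch, problem_statement), and the reported count depends on the dataset revision:

    ./scripts/test/ensure-swebench-data.sh
    # -> Downloaded <count> SWE-bench scenarios to /tmp/swebench_all.json

    # Confirm the cache is a JSON array of instances
    python3 -c "import json; d = json.load(open('/tmp/swebench_all.json')); print(len(d), d[0]['instance_id'])"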
package/scripts/test/ground-truth-judge.py
@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""
Ground-truth judge for SWE-bench scenarios.

Compares Claude's proposed fix against the actual SWE-bench patch.
Scores based on:
- File identification (20%)
- Function/location identification (20%)
- Fix logic match (40%)
- Completeness (20%)
"""

import json
import re
import sys
from pathlib import Path
from difflib import SequenceMatcher

# Add parent to path for pennyfarthing_scripts imports
sys.path.insert(0, str(Path(__file__).resolve().parents[4]))

from pennyfarthing_scripts.swebench import (
    extract_patch_info,
    extract_problem_keywords,
    find_scenario,
    get_meaningful_patterns,
    load_swebench_data,
)


def score_response(response_text, ground_truth):
    """Score a response against ground truth patch."""
    patch_info = extract_patch_info(ground_truth['patch'])

    scores = {
        'file_identification': 0,
        'location_identification': 0,
        'fix_logic_match': 0,
        'completeness': 0,
        'details': {}
    }

    response_lower = response_text.lower()

    # 1. FILE IDENTIFICATION (20 points)
    files_found = 0
    for f in patch_info.files:
        # Check various forms of the filename
        filename = Path(f).name
        if filename.lower() in response_lower or f.lower() in response_lower:
            files_found += 1

    if patch_info.files:
        file_score = (files_found / len(patch_info.files)) * 20
        scores['file_identification'] = min(20, file_score)
        scores['details']['files_expected'] = patch_info.files
        scores['details']['files_found'] = files_found
    else:
        scores['file_identification'] = 20  # No specific file in patch

    # 2. LOCATION IDENTIFICATION (20 points)
    # Look for function/class names mentioned in the patch
    locations_found = 0
    for func in patch_info.functions:
        # Extract the function/class name
        func_match = re.search(r'(def|class)\s+(\w+)', func)
        if func_match:
            func_name = func_match.group(2)
            if func_name.lower() in response_lower:
                locations_found += 1
        elif func.strip() and func.strip().split()[0] in response_lower:
            locations_found += 1

    if patch_info.functions:
        loc_score = (locations_found / len(patch_info.functions)) * 20
        scores['location_identification'] = min(20, loc_score)
        scores['details']['locations_expected'] = patch_info.functions[:3]
        scores['details']['locations_found'] = locations_found
    else:
        scores['location_identification'] = 10  # Partial credit

    # 3. FIX LOGIC MATCH (40 points)
    # Check if key code patterns from the fix appear in the response
    meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)

    patterns_found = 0
    for pattern in meaningful_patterns:
        if pattern.lower() in response_lower:
            patterns_found += 1

    if meaningful_patterns:
        pattern_score = (patterns_found / len(meaningful_patterns)) * 20
        scores['details']['patterns_expected'] = meaningful_patterns[:10]
        scores['details']['patterns_found'] = patterns_found
    else:
        pattern_score = 10

    # Check for actual code additions
    additions_matched = 0
    for addition in patch_info.additions[:5]:  # Check first 5 additions
        # Normalize and check
        addition_normalized = re.sub(r'\s+', ' ', addition.lower())
        response_normalized = re.sub(r'\s+', ' ', response_lower)

        # Use fuzzy matching
        similarity = SequenceMatcher(None, addition_normalized, response_normalized).ratio()
        if similarity > 0.6 or addition_normalized in response_normalized:
            additions_matched += 1

    if patch_info.additions:
        addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
        scores['details']['additions_matched'] = additions_matched
    else:
        addition_score = 10

    scores['fix_logic_match'] = min(40, pattern_score + addition_score)

    # 4. COMPLETENESS (20 points)
    # Does the response have all the elements of a good fix?
    completeness_score = 0

    # Has code block?
    if '```' in response_text:
        completeness_score += 5

    # Has test considerations?
    if 'test' in response_lower:
        completeness_score += 5

    # Mentions the specific error/issue?
    problem_keywords = extract_problem_keywords(ground_truth.get('problem_statement', ''))
    keywords_found = sum(1 for kw in problem_keywords if kw.lower() in response_lower)
    if problem_keywords:
        completeness_score += min(5, (keywords_found / len(problem_keywords)) * 5)
    else:
        completeness_score += 2.5

    # Has explanation of why fix works?
    explanation_words = ['because', 'this fixes', 'this resolves', 'the issue', 'the problem', 'solution']
    if any(word in response_lower for word in explanation_words):
        completeness_score += 5

    scores['completeness'] = min(20, completeness_score)

    # Total
    scores['total'] = round(
        scores['file_identification'] +
        scores['location_identification'] +
        scores['fix_logic_match'] +
        scores['completeness']
    , 1)

    return scores


def main():
    if len(sys.argv) < 3:
        print("Usage: ground-truth-judge.py <scenario_name> <response_file>")
        print("Example: ground-truth-judge.py flask-5014 run_20260102T134237Z.json")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load SWE-bench data
    swebench_data = load_swebench_data()

    # Find scenario
    scenario = find_scenario(swebench_data, scenario_name)
    if not scenario:
        print(f"Error: Scenario '{scenario_name}' not found in SWE-bench data")
        sys.exit(1)

    # Load response
    with open(response_file, 'r') as f:
        response_data = json.load(f)

    response_text = response_data.get('result', '')
    if not response_text:
        print("Error: No 'result' field in response file")
        sys.exit(1)

    # Score
    scores = score_response(response_text, scenario)

    # Output
    print(f"\n{'='*60}")
    print(f"GROUND TRUTH EVALUATION: {scenario_name}")
    print(f"{'='*60}")
    print(f"\nScores:")
    print(f"  File Identification:      {scores['file_identification']:5.1f}/20")
    print(f"  Location Identification:  {scores['location_identification']:5.1f}/20")
    print(f"  Fix Logic Match:          {scores['fix_logic_match']:5.1f}/40")
    print(f"  Completeness:             {scores['completeness']:5.1f}/20")
    print(f"  {'─'*40}")
    print(f"  TOTAL:                    {scores['total']:5.1f}/100")

    print(f"\nDetails:")
    for key, value in scores['details'].items():
        print(f"  {key}: {value}")

    # Output JSON for programmatic use
    output = {
        'scenario': scenario_name,
        'instance_id': scenario.get('instance_id'),
        'scores': scores,
        'ground_truth_patch_preview': scenario.get('patch', '')[:300]
    }

    # Save judge output
    output_path = response_file.replace('run_', 'gt_judge_')
    with open(output_path, 'w') as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved to: {output_path}")

    return scores


if __name__ == '__main__':
    main()
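A rough usage sketch, reusing the example file name from the script's own usage message (paths relative to the package root are assumed; the response file only needs a top-level result field holding the agent's answer):

    # Cache the dataset once, then judge a saved response against the ground-truth patch
    ./scripts/test/ensure-swebench-data.sh
    python3 ./scripts/test/ground-truth-judge.py flask-5014 run_20260102T134237Z.json
    # Prints the four sub-scores plus TOTAL/100 and writes gt_judge_20260102T134237Z.json next to the input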