@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SWE-bench scenario judge using:
|
|
4
|
+
1. Scenario-specific scoring rubric from YAML
|
|
5
|
+
2. Ground-truth validation from actual SWE-bench patches
|
|
6
|
+
|
|
7
|
+
Scoring structure:
|
|
8
|
+
- root_cause (30%): IDENTIFIES_BUG_LOCATION (15) + EXPLAINS_WHY_BROKEN (15)
|
|
9
|
+
- fix_quality (40%): FIX_ADDRESSES_ISSUE (20) + FIX_IS_MINIMAL (10) + FIX_SYNTAX_CORRECT (10)
|
|
10
|
+
- completeness (20%): EDGE_CASES (10) + TEST_COVERAGE (10)
|
|
11
|
+
- persona (10%): IN_CHARACTER (10)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from difflib import SequenceMatcher
|
|
19
|
+
|
|
20
|
+
# Add parent to path for pennyfarthing_scripts imports
|
|
21
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[4]))
|
|
22
|
+
|
|
23
|
+
from pennyfarthing_scripts.swebench import (
|
|
24
|
+
extract_patch_info,
|
|
25
|
+
find_scenario,
|
|
26
|
+
load_swebench_data,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def score_identifies_bug_location(response, ground_truth):
    """Score IDENTIFIES_BUG_LOCATION (15 pts) using ground truth.

    Awards up to 7.5 pts for mentioning the patched files and up to
    7.5 pts for mentioning the patched functions/classes.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    haystack = response.lower()

    score = 0
    details = []

    # File credit (7.5 pts): a file counts if the response mentions its
    # basename or its full path, case-insensitively.
    hit_files = sum(
        1 for path in patch_info.files
        if Path(path).name.lower() in haystack or path.lower() in haystack
    )
    if patch_info.files:
        score += (hit_files / len(patch_info.files)) * 7.5
        details.append(f"Files: {hit_files}/{len(patch_info.files)} found")

    # Function credit (7.5 pts): pull the identifier out of each
    # "def ..."/"class ..." header recorded in the patch info.
    hit_funcs = 0
    for header in patch_info.functions:
        match = re.search(r'(def|class)\s+(\w+)', header)
        if match and match.group(2).lower() in haystack:
            hit_funcs += 1
    if patch_info.functions:
        score += min(7.5, (hit_funcs / len(patch_info.functions)) * 7.5)
        details.append(f"Functions: {hit_funcs}/{len(patch_info.functions)} found")
    else:
        # Partial credit when the patch touches no named function/class.
        score += 3.75

    return min(15, score), details
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def score_explains_why_broken(response, ground_truth):
    """Score EXPLAINS_WHY_BROKEN (15 pts).

    Half credit for explanatory/causal language, half for echoing key
    terms extracted from the SWE-bench problem statement.
    """
    haystack = response.lower()
    problem = ground_truth.get('problem_statement', '').lower()

    score = 0
    details = []

    # Key terms: quoted/backticked snippets plus *Error/*Exception names
    # from the problem statement, deduplicated and capped at 10.
    terms = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem)
    terms += re.findall(r'\b\w+Error\b|\b\w+Exception\b', problem, re.IGNORECASE)
    terms = list(set(terms))[:10]

    # Explanation credit (7.5 pts): any causal/diagnostic phrasing.
    markers = ('because', 'this happens', 'the issue', 'the problem',
               'fails when', 'breaks when', 'causes')
    if any(marker in haystack for marker in markers):
        score += 7.5
        details.append("Has explanation of why broken")

    # Term credit (7.5 pts): proportion of key terms mentioned.
    if terms:
        mentioned = sum(1 for term in terms if term.lower() in haystack)
        score += (mentioned / len(terms)) * 7.5
        details.append(f"Key terms: {mentioned}/{len(terms)}")
    else:
        # No extractable terms: neutral half credit for this half.
        score += 3.75

    return min(15, score), details
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def score_fix_addresses_issue(response, ground_truth):
    """Score FIX_ADDRESSES_ISSUE (20 pts) using ground truth patch.

    Up to 15 pts for matching the patch's added lines (exact
    whitespace-insensitive match = 1, fuzzy match = 0.5), plus 5 pts for
    presenting the fix in a fenced code block.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    response_lower = response.lower()
    # Hoisted out of the loop (was recomputed every iteration): the
    # normalized response never changes.
    response_norm = re.sub(r'\s+', ' ', response_lower)

    score = 0
    details = []

    # Compare (up to) the first 5 added lines against the response.
    additions_matched = 0
    for addition in patch_info.additions[:5]:
        addition_norm = re.sub(r'\s+', ' ', addition.lower())

        if addition_norm in response_norm:
            additions_matched += 1  # exact (whitespace-insensitive) match
        else:
            # Fuzzy match: half credit above 0.7 similarity.
            sim = SequenceMatcher(None, addition_norm, response_norm).ratio()
            if sim > 0.7:
                additions_matched += 0.5

    if patch_info.additions:
        considered = min(5, len(patch_info.additions))
        score += (additions_matched / considered) * 15
        details.append(f"Code matches: {additions_matched}/{considered}")

    # 5 pts for including any fenced code block.
    if '```' in response:
        score += 5
        details.append("Has code block")

    return min(20, score), details
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def score_fix_is_minimal(response, ground_truth):
    """Score FIX_IS_MINIMAL (10 pts).

    Compares the amount of code in the response's fenced blocks against
    the ground-truth patch size; smaller ratios score higher.
    """
    patch_info = extract_patch_info(ground_truth.get('patch', ''))

    details = []

    # Size of the reference patch (added + removed lines).
    patch_lines = len(patch_info.additions) + len(patch_info.deletions)

    # Total code lines inside fenced blocks in the response.
    blocks = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
    code_lines = sum(len(block.strip().split('\n')) for block in blocks)

    if patch_lines <= 0:
        # No reference patch to compare against: neutral half credit.
        return 5, details

    # A response with no code blocks is treated as ratio 1 (minimal).
    ratio = code_lines / patch_lines if code_lines > 0 else 1
    if ratio <= 2:
        score = 10
        details.append(f"Minimal: {code_lines} lines (patch: {patch_lines})")
    elif ratio <= 4:
        score = 5
        details.append(f"Somewhat verbose: {code_lines} lines (patch: {patch_lines})")
    else:
        score = 2
        details.append(f"Over-engineered: {code_lines} lines (patch: {patch_lines})")

    return min(10, score), details
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def score_fix_syntax_correct(response):
    """Score FIX_SYNTAX_CORRECT (10 pts).

    Compiles every python-tagged (or, failing that, untagged) fenced
    code block; full credit only when all of them parse cleanly.
    """
    details = []

    # Prefer explicitly python-tagged blocks, fall back to untagged ones.
    blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)
    if not blocks:
        blocks = re.findall(r'```\n(.*?)```', response, re.DOTALL)

    if not blocks:
        details.append("No code blocks to validate")
        return 5, details

    def _compiles(snippet):
        # Only SyntaxError counts as invalid, matching a pure parse check.
        try:
            compile(snippet, '<string>', 'exec')
        except SyntaxError:
            return False
        return True

    if all(_compiles(block) for block in blocks):
        details.append("Syntax valid")
        return 10, details

    details.append("Syntax errors detected")
    return 5, details
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def score_edge_cases(response):
    """Score EDGE_CASES (10 pts): 2 pts per edge-case marker phrase, capped."""
    markers = ('edge case', 'corner case', 'what if', 'consider', 'also',
               'none', 'empty', 'null', 'zero', 'negative', 'boundary')
    haystack = response.lower()

    hits = 0
    for marker in markers:
        if marker in haystack:
            hits += 1

    return min(10, 2 * hits), [f"Edge case markers: {hits}"]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def score_test_coverage(response):
|
|
220
|
+
"""Score TEST_COVERAGE (10 pts)."""
|
|
221
|
+
response_lower = response.lower()
|
|
222
|
+
|
|
223
|
+
score = 0
|
|
224
|
+
details = []
|
|
225
|
+
|
|
226
|
+
# Check for test-related content
|
|
227
|
+
has_test_section = 'test' in response_lower
|
|
228
|
+
has_test_function = 'def test_' in response_lower or 'test_' in response
|
|
229
|
+
has_assert = 'assert' in response_lower or 'pytest' in response_lower
|
|
230
|
+
|
|
231
|
+
if has_test_function:
|
|
232
|
+
score += 5
|
|
233
|
+
details.append("Has test function")
|
|
234
|
+
if has_assert:
|
|
235
|
+
score += 3
|
|
236
|
+
details.append("Has assertions")
|
|
237
|
+
if has_test_section:
|
|
238
|
+
score += 2
|
|
239
|
+
details.append("Has test section")
|
|
240
|
+
|
|
241
|
+
return min(10, score), details
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def score_in_character(response, persona="senior developer"):
    """Score IN_CHARACTER (10 pts): 2 pts per professional-tone marker, capped.

    NOTE(review): `persona` is accepted but never read in the body; the
    check is a generic professional-tone baseline.
    """
    markers = ('i recommend', 'we should', 'this approach',
               'the fix', 'analysis', 'root cause')
    lowered = response.lower()

    hits = sum(1 for marker in markers if marker in lowered)
    return min(10, 2 * hits), [f"Professional markers: {hits}"]
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def judge_response(scenario_name, response_text, swebench_data):
    """Full judgment using scenario rubric + ground truth.

    Returns a dict with per-category scores, a 0-100 total, per-category
    detail strings, and the ground-truth file list — or an 'error' dict
    when the scenario is not in the SWE-bench data.
    """
    ground_truth = find_scenario(swebench_data, scenario_name)
    if not ground_truth:
        return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}

    scores = {}
    all_details = {}

    # root_cause (30%): bug location + why-it-breaks explanation.
    loc, loc_d = score_identifies_bug_location(response_text, ground_truth)
    why, why_d = score_explains_why_broken(response_text, ground_truth)
    scores['root_cause'] = {
        'IDENTIFIES_BUG_LOCATION': loc,
        'EXPLAINS_WHY_BROKEN': why,
        'subtotal': loc + why,
    }
    all_details['root_cause'] = loc_d + why_d

    # fix_quality (40%): addresses issue + minimality + syntax validity.
    fix, fix_d = score_fix_addresses_issue(response_text, ground_truth)
    minimal, minimal_d = score_fix_is_minimal(response_text, ground_truth)
    syntax, syntax_d = score_fix_syntax_correct(response_text)
    scores['fix_quality'] = {
        'FIX_ADDRESSES_ISSUE': fix,
        'FIX_IS_MINIMAL': minimal,
        'FIX_SYNTAX_CORRECT': syntax,
        'subtotal': fix + minimal + syntax,
    }
    all_details['fix_quality'] = fix_d + minimal_d + syntax_d

    # completeness (20%): edge cases + test coverage.
    edge, edge_d = score_edge_cases(response_text)
    tests, tests_d = score_test_coverage(response_text)
    scores['completeness'] = {
        'EDGE_CASES': edge,
        'TEST_COVERAGE': tests,
        'subtotal': edge + tests,
    }
    all_details['completeness'] = edge_d + tests_d

    # persona (10%): professional-tone baseline.
    char, char_d = score_in_character(response_text)
    scores['persona'] = {
        'IN_CHARACTER': char,
        'subtotal': char,
    }
    all_details['persona'] = char_d

    # Grand total is just the sum of category subtotals (max 100).
    total = sum(category['subtotal'] for category in scores.values())

    patch_info = extract_patch_info(ground_truth.get('patch', ''))
    return {
        'scenario': scenario_name,
        'instance_id': ground_truth.get('instance_id'),
        'scores': scores,
        'total': round(total, 1),
        'details': all_details,
        'ground_truth_files': patch_info.files,
    }
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def main():
    """CLI entry point: judge one response file against one scenario.

    Usage: swebench-judge.py <scenario_name> <response_file>

    Reads the response JSON, scores it against the scenario's ground
    truth, prints a score breakdown, and saves the judgment as JSON.
    """
    if len(sys.argv) < 3:
        print("Usage: swebench-judge.py <scenario_name> <response_file>")
        sys.exit(1)

    scenario_name = sys.argv[1]
    response_file = sys.argv[2]

    # Load data
    swebench_data = load_swebench_data()

    with open(response_file, 'r') as f:
        response_data = json.load(f)

    # Handle different JSON structures
    response_text = response_data.get('result', '') or response_data.get('response_text', '')

    # Judge
    result = judge_response(scenario_name, response_text, swebench_data)

    # BUGFIX: judge_response returns {'error': ...} for an unknown
    # scenario; the old code crashed on result['scores'] with a KeyError
    # instead of reporting the problem.
    if 'error' in result:
        print(result['error'], file=sys.stderr)
        sys.exit(1)

    # Display
    print(f"\n{'='*60}")
    print(f"SWE-BENCH JUDGE: {scenario_name}")
    print(f"{'='*60}")

    for category, scores in result['scores'].items():
        print(f"\n{category.upper()} ({scores['subtotal']:.1f} pts)")
        for criterion, score in scores.items():
            if criterion != 'subtotal':
                print(f" {criterion}: {score:.1f}")

    print(f"\n{'─'*40}")
    print(f"TOTAL: {result['total']}/100")

    print(f"\nGround truth files: {result['ground_truth_files']}")

    # Save judgment next to the response file.
    # BUGFIX: when the filename contains no 'run_', str.replace is a
    # no-op and the old code silently overwrote the input response file.
    output_path = response_file.replace('run_', 'swebench_judge_')
    if output_path == response_file:
        output_path = response_file + '.swebench_judge.json'
    with open(output_path, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nSaved to: {output_path}")


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# test-cache.sh - Read/write test result cache in session files
|
|
3
|
+
#
|
|
4
|
+
# Usage:
|
|
5
|
+
# source test-cache.sh
|
|
6
|
+
#
|
|
7
|
+
# # Check if valid cache exists (returns 0 if valid, 1 if stale/missing)
|
|
8
|
+
# if test_cache_valid "$SESSION_FILE"; then
|
|
9
|
+
# RESULT=$(test_cache_get "$SESSION_FILE" "result")
|
|
10
|
+
# echo "Using cached result: $RESULT"
|
|
11
|
+
# fi
|
|
12
|
+
#
|
|
13
|
+
# # Write cache after running tests
|
|
14
|
+
# test_cache_write "$SESSION_FILE" "GREEN" 42 0 1 "12s"
|
|
15
|
+
|
|
16
|
+
set -euo pipefail
|
|
17
|
+
|
|
18
|
+
# Cache validity window (5 minutes)
|
|
19
|
+
CACHE_MAX_AGE_MINUTES=5
|
|
20
|
+
|
|
21
|
+
# Check if test cache exists and is valid (same SHA, < 5 min old)
|
|
22
|
+
# Returns: 0 if valid, 1 if invalid/missing
|
|
23
|
+
test_cache_valid() {
  # Validate the cached test results embedded in a session markdown file.
  # A cache is valid only when: the file exists, it contains a
  # "## Test Cache" section, the recorded Git SHA matches current HEAD,
  # and the recorded timestamp is younger than CACHE_MAX_AGE_MINUTES.
  #
  # Arguments: $1 - path to the session markdown file
  # Returns:   0 if the cache is valid, 1 otherwise
  local session_file="$1"

  # No session file at all -> no cache.
  if [[ ! -f "$session_file" ]]; then
    return 1
  fi

  # No "## Test Cache" section -> cache was never written.
  if ! grep -q "^## Test Cache" "$session_file" 2>/dev/null; then
    return 1
  fi

  # Cache is keyed to a commit: results recorded at another SHA are stale.
  # Outside a git repo this yields "unknown", which only matches a cache
  # that was also written outside a repo.
  local current_sha
  current_sha=$(git rev-parse HEAD 2>/dev/null || echo "unknown")

  # Extract the value column of the "| Git SHA | <sha> |" table row;
  # xargs trims the whitespace left around the sed capture.
  local cache_sha
  cache_sha=$(grep "| Git SHA |" "$session_file" 2>/dev/null | sed 's/.*| \([^ ]*\) |.*/\1/' | xargs)

  if [[ "$cache_sha" != "$current_sha" ]]; then
    return 1
  fi

  # Same extraction for the "| Last Run | <ISO-8601 UTC> |" row.
  local cache_time
  cache_time=$(grep "| Last Run |" "$session_file" 2>/dev/null | sed 's/.*| \([^ ]*\) |.*/\1/' | xargs)

  local cache_epoch now_epoch age_minutes
  # Try BSD/macOS date (-j -f) first, then GNU date (-d). An empty or
  # unparseable timestamp falls through to epoch 0, so the cache reads
  # as ancient and therefore invalid.
  cache_epoch=$(date -j -f "%Y-%m-%dT%H:%M:%SZ" "$cache_time" +%s 2>/dev/null || \
    date -d "$cache_time" +%s 2>/dev/null || echo 0)
  now_epoch=$(date +%s)
  age_minutes=$(( (now_epoch - cache_epoch) / 60 ))

  if [[ $age_minutes -ge $CACHE_MAX_AGE_MINUTES ]]; then
    return 1
  fi

  return 0
}
|
|
60
|
+
|
|
61
|
+
# Get a field from the test cache
|
|
62
|
+
# Usage: test_cache_get "$SESSION_FILE" "result|pass|fail|skip|duration|sha|time"
|
|
63
|
+
# Get a field from the test cache table in a session file.
# Arguments: $1 - session file path
#            $2 - one of: result|pass|fail|skip|duration|sha|time
# Outputs:   the field's value (trimmed) on stdout
# Returns:   1 for an unknown field name
test_cache_get() {
  local session_file="$1"
  local field="$2"
  local label

  # Map the short field name to its markdown table row label.
  case "$field" in
    result)   label="Result" ;;
    pass)     label="Pass" ;;
    fail)     label="Fail" ;;
    skip)     label="Skip" ;;
    duration) label="Duration" ;;
    sha)      label="Git SHA" ;;
    time)     label="Last Run" ;;
    *) echo "Unknown field: $field" >&2; return 1 ;;
  esac

  # Pull the value column out of "| <label> | <value> |"; xargs trims
  # the surrounding whitespace from the sed capture.
  grep "| $label |" "$session_file" 2>/dev/null | sed 's/.*| \([^ ]*\) |.*/\1/' | xargs
}
|
|
78
|
+
|
|
79
|
+
# Write test cache to session file
|
|
80
|
+
# Usage: test_cache_write "$SESSION_FILE" "GREEN" 42 0 1 "12s"
|
|
81
|
+
# Write (or overwrite) the "## Test Cache" section in a session file.
# Globals:   none written; runs `git rev-parse HEAD` and `date -u`
# Arguments: $1 - session file path (must already exist)
#            $2 - result: GREEN, RED, YELLOW
#            $3 - pass count    $4 - fail count    $5 - skip count
#            $6 - duration, e.g. "12s"
# Outputs:   one confirmation line on stdout
# Returns:   1 if the session file does not exist
test_cache_write() {
  local session_file="$1"
  local result="$2" # GREEN, RED, YELLOW
  local pass="$3" # pass count
  local fail="$4" # fail count
  local skip="$5" # skip count
  local duration="$6" # e.g., "12s"

  local git_sha timestamp
  git_sha=$(git rev-parse HEAD 2>/dev/null || echo "unknown")
  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  # Rendered markdown section; the literal newlines in this assignment
  # are intentional (the table is written verbatim into the file).
  local cache_section="## Test Cache

| Field | Value |
|-------|-------|
| Last Run | $timestamp |
| Git SHA | $git_sha |
| Result | $result |
| Pass | $pass |
| Fail | $fail |
| Skip | $skip |
| Duration | $duration |"

  if [[ ! -f "$session_file" ]]; then
    echo "Session file not found: $session_file" >&2
    return 1
  fi

  # Check if Test Cache section exists
  if grep -q "^## Test Cache" "$session_file" 2>/dev/null; then
    # Replace existing section - find start and end, replace content
    local temp_file
    temp_file=$(mktemp)

    # awk state machine: on the "## Test Cache" header print the new
    # section and start skipping; the next "## " header (or EOF, when
    # the cache is the last section) ends the skip. Old section lines,
    # including its trailing blank line, are dropped.
    awk -v new_section="$cache_section" '
      /^## Test Cache/ {
        in_section = 1
        print new_section
        next
      }
      /^## / && in_section {
        in_section = 0
      }
      !in_section { print }
    ' "$session_file" > "$temp_file"

    mv "$temp_file" "$session_file"
  else
    # Append before "## Workflow Tracking" if present, else at end
    if grep -q "^## Workflow Tracking" "$session_file" 2>/dev/null; then
      local temp_file
      temp_file=$(mktemp)

      # Insert the new section (plus a separating blank line) just
      # before the Workflow Tracking header; everything else is copied.
      awk -v new_section="$cache_section" '
        /^## Workflow Tracking/ {
          print new_section
          print ""
        }
        { print }
      ' "$session_file" > "$temp_file"

      mv "$temp_file" "$session_file"
    else
      echo "" >> "$session_file"
      echo "$cache_section" >> "$session_file"
    fi
  fi

  echo "Test cache written: $result ($pass passed, $fail failed, $skip skipped)"
}
|
|
152
|
+
|
|
153
|
+
# Print cache status for debugging
|
|
154
|
+
# Print a human-readable cache status for debugging.
# Arguments: $1 - session file path
test_cache_status() {
  local session_file="$1"

  # Guard clause: bail out early when the cache cannot be used.
  if ! test_cache_valid "$session_file"; then
    echo "Cache: INVALID or MISSING"
    return 0
  fi

  echo "Cache: VALID"
  echo " Result: $(test_cache_get "$session_file" "result")"
  echo " SHA: $(test_cache_get "$session_file" "sha")"
  echo " Time: $(test_cache_get "$session_file" "time")"
}
|