@pennyfarthing/core 7.6.1 → 7.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +109 -201
- package/package.json +1 -1
- package/packages/core/dist/cli/commands/doctor.d.ts.map +1 -1
- package/packages/core/dist/cli/commands/doctor.js +91 -0
- package/packages/core/dist/cli/commands/doctor.js.map +1 -1
- package/packages/core/dist/cli/commands/init.js +31 -0
- package/packages/core/dist/cli/commands/init.js.map +1 -1
- package/packages/core/dist/cli/commands/update.js +31 -0
- package/packages/core/dist/cli/commands/update.js.map +1 -1
- package/pennyfarthing-dist/agents/architect.md +48 -53
- package/pennyfarthing-dist/agents/dev.md +74 -164
- package/pennyfarthing-dist/agents/devops.md +44 -39
- package/pennyfarthing-dist/agents/handoff.md +46 -23
- package/pennyfarthing-dist/agents/orchestrator.md +84 -255
- package/pennyfarthing-dist/agents/pm.md +40 -50
- package/pennyfarthing-dist/agents/reviewer-preflight.md +58 -26
- package/pennyfarthing-dist/agents/reviewer.md +107 -298
- package/pennyfarthing-dist/agents/sm-file-summary.md +51 -30
- package/pennyfarthing-dist/agents/sm-finish.md +59 -38
- package/pennyfarthing-dist/agents/sm-handoff.md +40 -33
- package/pennyfarthing-dist/agents/sm-setup.md +89 -47
- package/pennyfarthing-dist/agents/sm.md +171 -558
- package/pennyfarthing-dist/agents/tea.md +77 -146
- package/pennyfarthing-dist/agents/tech-writer.md +43 -24
- package/pennyfarthing-dist/agents/testing-runner.md +73 -30
- package/pennyfarthing-dist/agents/ux-designer.md +39 -25
- package/pennyfarthing-dist/agents/workflow-status-check.md +34 -16
- package/pennyfarthing-dist/commands/benchmark.md +19 -1
- package/pennyfarthing-dist/commands/continue-session.md +1 -1
- package/pennyfarthing-dist/commands/solo.md +5 -0
- package/pennyfarthing-dist/commands/theme-maker.md +5 -5
- package/pennyfarthing-dist/commands/work.md +1 -1
- package/pennyfarthing-dist/guides/XML-TAGS.md +179 -0
- package/pennyfarthing-dist/guides/agent-behavior.md +22 -9
- package/pennyfarthing-dist/guides/agent-tag-taxonomy.md +432 -0
- package/pennyfarthing-dist/guides/patterns/approval-gates-pattern.md +27 -7
- package/pennyfarthing-dist/guides/scale-levels.md +114 -0
- package/pennyfarthing-dist/personas/themes/gilligans-island.yaml +2 -2
- package/pennyfarthing-dist/personas/themes/star-trek-tos.yaml +1 -1
- package/pennyfarthing-dist/scripts/core/agent-session.sh +13 -7
- package/pennyfarthing-dist/scripts/core/check-context.sh +6 -1
- package/pennyfarthing-dist/scripts/core/prime.sh +57 -32
- package/pennyfarthing-dist/scripts/git/create-feature-branches.sh +45 -4
- package/pennyfarthing-dist/scripts/git/git-status-all.sh +32 -7
- package/pennyfarthing-dist/scripts/hooks/bell-mode-hook.sh +30 -11
- package/pennyfarthing-dist/scripts/hooks/pre-commit.sh +80 -23
- package/pennyfarthing-dist/scripts/hooks/question-reflector-check.mjs +66 -53
- package/pennyfarthing-dist/scripts/hooks/question-reflector-check.sh +4 -4
- package/pennyfarthing-dist/scripts/hooks/question_reflector_check.py +402 -0
- package/pennyfarthing-dist/scripts/hooks/session-stop.sh +7 -0
- package/pennyfarthing-dist/scripts/hooks/welcome-hook.sh +94 -0
- package/pennyfarthing-dist/scripts/jira/jira-claim-story.sh +10 -152
- package/pennyfarthing-dist/scripts/jira/jira-sync-story.sh +14 -4
- package/pennyfarthing-dist/scripts/jira/jira-sync.sh +12 -4
- package/pennyfarthing-dist/scripts/jira/sync-epic-jira.sh +11 -99
- package/pennyfarthing-dist/scripts/lib/common.sh +55 -0
- package/pennyfarthing-dist/scripts/maintenance/sidecar-health.sh +97 -0
- package/pennyfarthing-dist/scripts/misc/statusline.sh +27 -22
- package/pennyfarthing-dist/scripts/story/create-story.sh +14 -154
- package/pennyfarthing-dist/scripts/story/size-story.sh +12 -192
- package/pennyfarthing-dist/scripts/story/story-template.sh +12 -156
- package/pennyfarthing-dist/scripts/test/ground-truth-judge.py +24 -93
- package/pennyfarthing-dist/scripts/test/swebench-judge.py +33 -59
- package/pennyfarthing-dist/scripts/validation/validate-agent-schema.sh +575 -0
- package/pennyfarthing-dist/scripts/workflow/check.py +502 -0
- package/pennyfarthing-dist/skills/skill-registry.yaml +52 -16
- package/pennyfarthing-dist/skills/sprint/skill.md +1 -1
- package/pennyfarthing-dist/templates/settings.local.json.template +11 -0
|
@@ -16,71 +16,21 @@ import sys
|
|
|
16
16
|
from pathlib import Path
|
|
17
17
|
from difflib import SequenceMatcher
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# Try various matching strategies
|
|
30
|
-
if scenario_name in instance_id.replace('__', '-'):
|
|
31
|
-
return item
|
|
32
|
-
if scenario_name.replace('-', '__') in instance_id:
|
|
33
|
-
return item
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
def extract_patch_elements(patch_text):
|
|
37
|
-
"""Extract key elements from a patch."""
|
|
38
|
-
elements = {
|
|
39
|
-
'files': [],
|
|
40
|
-
'functions': [],
|
|
41
|
-
'additions': [],
|
|
42
|
-
'deletions': [],
|
|
43
|
-
'key_patterns': []
|
|
44
|
-
}
|
|
19
|
+
# Add parent to path for pennyfarthing_scripts imports
|
|
20
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
21
|
+
|
|
22
|
+
from pennyfarthing_scripts.swebench import (
|
|
23
|
+
extract_patch_info,
|
|
24
|
+
extract_problem_keywords,
|
|
25
|
+
find_scenario,
|
|
26
|
+
get_meaningful_patterns,
|
|
27
|
+
load_swebench_data,
|
|
28
|
+
)
|
|
45
29
|
|
|
46
|
-
current_file = None
|
|
47
|
-
for line in patch_text.split('\n'):
|
|
48
|
-
# File changes
|
|
49
|
-
if line.startswith('diff --git'):
|
|
50
|
-
match = re.search(r'b/(.+)$', line)
|
|
51
|
-
if match:
|
|
52
|
-
current_file = match.group(1)
|
|
53
|
-
elements['files'].append(current_file)
|
|
54
|
-
|
|
55
|
-
# Function/class context
|
|
56
|
-
if line.startswith('@@'):
|
|
57
|
-
match = re.search(r'@@.*@@\s*(.+)$', line)
|
|
58
|
-
if match:
|
|
59
|
-
elements['functions'].append(match.group(1).strip())
|
|
60
|
-
|
|
61
|
-
# Additions
|
|
62
|
-
if line.startswith('+') and not line.startswith('+++'):
|
|
63
|
-
clean_line = line[1:].strip()
|
|
64
|
-
if clean_line and not clean_line.startswith('#'):
|
|
65
|
-
elements['additions'].append(clean_line)
|
|
66
|
-
# Extract key patterns (function calls, variable names, etc.)
|
|
67
|
-
patterns = re.findall(r'\b\w+\b', clean_line)
|
|
68
|
-
elements['key_patterns'].extend(patterns)
|
|
69
|
-
|
|
70
|
-
# Deletions
|
|
71
|
-
if line.startswith('-') and not line.startswith('---'):
|
|
72
|
-
clean_line = line[1:].strip()
|
|
73
|
-
if clean_line and not clean_line.startswith('#'):
|
|
74
|
-
elements['deletions'].append(clean_line)
|
|
75
|
-
|
|
76
|
-
# Deduplicate
|
|
77
|
-
elements['key_patterns'] = list(set(elements['key_patterns']))
|
|
78
|
-
|
|
79
|
-
return elements
|
|
80
30
|
|
|
81
31
|
def score_response(response_text, ground_truth):
|
|
82
32
|
"""Score a response against ground truth patch."""
|
|
83
|
-
|
|
33
|
+
patch_info = extract_patch_info(ground_truth['patch'])
|
|
84
34
|
|
|
85
35
|
scores = {
|
|
86
36
|
'file_identification': 0,
|
|
@@ -94,16 +44,16 @@ def score_response(response_text, ground_truth):
|
|
|
94
44
|
|
|
95
45
|
# 1. FILE IDENTIFICATION (20 points)
|
|
96
46
|
files_found = 0
|
|
97
|
-
for f in
|
|
47
|
+
for f in patch_info.files:
|
|
98
48
|
# Check various forms of the filename
|
|
99
49
|
filename = Path(f).name
|
|
100
50
|
if filename.lower() in response_lower or f.lower() in response_lower:
|
|
101
51
|
files_found += 1
|
|
102
52
|
|
|
103
|
-
if
|
|
104
|
-
file_score = (files_found / len(
|
|
53
|
+
if patch_info.files:
|
|
54
|
+
file_score = (files_found / len(patch_info.files)) * 20
|
|
105
55
|
scores['file_identification'] = min(20, file_score)
|
|
106
|
-
scores['details']['files_expected'] =
|
|
56
|
+
scores['details']['files_expected'] = patch_info.files
|
|
107
57
|
scores['details']['files_found'] = files_found
|
|
108
58
|
else:
|
|
109
59
|
scores['file_identification'] = 20 # No specific file in patch
|
|
@@ -111,7 +61,7 @@ def score_response(response_text, ground_truth):
|
|
|
111
61
|
# 2. LOCATION IDENTIFICATION (20 points)
|
|
112
62
|
# Look for function/class names mentioned in the patch
|
|
113
63
|
locations_found = 0
|
|
114
|
-
for func in
|
|
64
|
+
for func in patch_info.functions:
|
|
115
65
|
# Extract the function/class name
|
|
116
66
|
func_match = re.search(r'(def|class)\s+(\w+)', func)
|
|
117
67
|
if func_match:
|
|
@@ -121,20 +71,17 @@ def score_response(response_text, ground_truth):
|
|
|
121
71
|
elif func.strip() and func.strip().split()[0] in response_lower:
|
|
122
72
|
locations_found += 1
|
|
123
73
|
|
|
124
|
-
if
|
|
125
|
-
loc_score = (locations_found / len(
|
|
74
|
+
if patch_info.functions:
|
|
75
|
+
loc_score = (locations_found / len(patch_info.functions)) * 20
|
|
126
76
|
scores['location_identification'] = min(20, loc_score)
|
|
127
|
-
scores['details']['locations_expected'] =
|
|
77
|
+
scores['details']['locations_expected'] = patch_info.functions[:3]
|
|
128
78
|
scores['details']['locations_found'] = locations_found
|
|
129
79
|
else:
|
|
130
80
|
scores['location_identification'] = 10 # Partial credit
|
|
131
81
|
|
|
132
82
|
# 3. FIX LOGIC MATCH (40 points)
|
|
133
83
|
# Check if key code patterns from the fix appear in the response
|
|
134
|
-
|
|
135
|
-
# Filter to meaningful patterns (not common words)
|
|
136
|
-
common_words = {'if', 'else', 'return', 'self', 'def', 'class', 'for', 'in', 'not', 'and', 'or', 'is', 'none', 'true', 'false'}
|
|
137
|
-
meaningful_patterns = [p for p in key_patterns if p.lower() not in common_words and len(p) > 2]
|
|
84
|
+
meaningful_patterns = get_meaningful_patterns(patch_info.key_patterns)
|
|
138
85
|
|
|
139
86
|
patterns_found = 0
|
|
140
87
|
for pattern in meaningful_patterns:
|
|
@@ -150,7 +97,7 @@ def score_response(response_text, ground_truth):
|
|
|
150
97
|
|
|
151
98
|
# Check for actual code additions
|
|
152
99
|
additions_matched = 0
|
|
153
|
-
for addition in
|
|
100
|
+
for addition in patch_info.additions[:5]: # Check first 5 additions
|
|
154
101
|
# Normalize and check
|
|
155
102
|
addition_normalized = re.sub(r'\s+', ' ', addition.lower())
|
|
156
103
|
response_normalized = re.sub(r'\s+', ' ', response_lower)
|
|
@@ -160,8 +107,8 @@ def score_response(response_text, ground_truth):
|
|
|
160
107
|
if similarity > 0.6 or addition_normalized in response_normalized:
|
|
161
108
|
additions_matched += 1
|
|
162
109
|
|
|
163
|
-
if
|
|
164
|
-
addition_score = (additions_matched / min(5, len(
|
|
110
|
+
if patch_info.additions:
|
|
111
|
+
addition_score = (additions_matched / min(5, len(patch_info.additions))) * 20
|
|
165
112
|
scores['details']['additions_matched'] = additions_matched
|
|
166
113
|
else:
|
|
167
114
|
addition_score = 10
|
|
@@ -205,23 +152,6 @@ def score_response(response_text, ground_truth):
|
|
|
205
152
|
|
|
206
153
|
return scores
|
|
207
154
|
|
|
208
|
-
def extract_problem_keywords(problem_statement):
|
|
209
|
-
"""Extract key technical terms from problem statement."""
|
|
210
|
-
if not problem_statement:
|
|
211
|
-
return []
|
|
212
|
-
|
|
213
|
-
# Find quoted strings, function names, error messages
|
|
214
|
-
keywords = []
|
|
215
|
-
|
|
216
|
-
# Find quoted terms
|
|
217
|
-
quoted = re.findall(r'[`\'"]([^`\'"]+)[`\'"]', problem_statement)
|
|
218
|
-
keywords.extend(quoted)
|
|
219
|
-
|
|
220
|
-
# Find CamelCase or snake_case identifiers
|
|
221
|
-
identifiers = re.findall(r'\b[A-Z][a-z]+[A-Z]\w*\b|\b\w+_\w+\b', problem_statement)
|
|
222
|
-
keywords.extend(identifiers)
|
|
223
|
-
|
|
224
|
-
return list(set(keywords))[:10]
|
|
225
155
|
|
|
226
156
|
def main():
|
|
227
157
|
if len(sys.argv) < 3:
|
|
@@ -285,5 +215,6 @@ def main():
|
|
|
285
215
|
|
|
286
216
|
return scores
|
|
287
217
|
|
|
218
|
+
|
|
288
219
|
if __name__ == '__main__':
|
|
289
220
|
main()
|
|
@@ -17,52 +17,15 @@ import sys
|
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
from difflib import SequenceMatcher
|
|
19
19
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
with open(cache_path, 'r') as f:
|
|
23
|
-
return json.load(f)
|
|
24
|
-
|
|
25
|
-
def find_ground_truth(data, scenario_name):
|
|
26
|
-
"""Find scenario in SWE-bench data."""
|
|
27
|
-
for item in data:
|
|
28
|
-
instance_id = item.get('instance_id', '')
|
|
29
|
-
if scenario_name in instance_id.replace('__', '-'):
|
|
30
|
-
return item
|
|
31
|
-
return None
|
|
32
|
-
|
|
33
|
-
def extract_patch_info(patch_text):
|
|
34
|
-
"""Extract structured info from patch."""
|
|
35
|
-
info = {
|
|
36
|
-
'files': [],
|
|
37
|
-
'functions': [],
|
|
38
|
-
'additions': [],
|
|
39
|
-
'deletions': [],
|
|
40
|
-
'key_code': []
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
for line in patch_text.split('\n'):
|
|
44
|
-
if line.startswith('diff --git'):
|
|
45
|
-
match = re.search(r'b/(.+)$', line)
|
|
46
|
-
if match:
|
|
47
|
-
info['files'].append(match.group(1))
|
|
48
|
-
|
|
49
|
-
if line.startswith('@@'):
|
|
50
|
-
match = re.search(r'@@.*@@\s*(.+)$', line)
|
|
51
|
-
if match:
|
|
52
|
-
info['functions'].append(match.group(1).strip())
|
|
53
|
-
|
|
54
|
-
if line.startswith('+') and not line.startswith('+++'):
|
|
55
|
-
clean = line[1:].strip()
|
|
56
|
-
if clean and not clean.startswith('#'):
|
|
57
|
-
info['additions'].append(clean)
|
|
58
|
-
info['key_code'].append(clean)
|
|
20
|
+
# Add parent to path for pennyfarthing_scripts imports
|
|
21
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[3]))
|
|
59
22
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
23
|
+
from pennyfarthing_scripts.swebench import (
|
|
24
|
+
extract_patch_info,
|
|
25
|
+
find_scenario,
|
|
26
|
+
load_swebench_data,
|
|
27
|
+
)
|
|
64
28
|
|
|
65
|
-
return info
|
|
66
29
|
|
|
67
30
|
def score_identifies_bug_location(response, ground_truth):
|
|
68
31
|
"""Score IDENTIFIES_BUG_LOCATION (15 pts) using ground truth."""
|
|
@@ -74,34 +37,35 @@ def score_identifies_bug_location(response, ground_truth):
|
|
|
74
37
|
|
|
75
38
|
# Check files (7.5 pts)
|
|
76
39
|
files_found = 0
|
|
77
|
-
for f in patch_info
|
|
40
|
+
for f in patch_info.files:
|
|
78
41
|
filename = Path(f).name.lower()
|
|
79
42
|
if filename in response_lower or f.lower() in response_lower:
|
|
80
43
|
files_found += 1
|
|
81
44
|
|
|
82
|
-
if patch_info
|
|
83
|
-
file_score = (files_found / len(patch_info
|
|
45
|
+
if patch_info.files:
|
|
46
|
+
file_score = (files_found / len(patch_info.files)) * 7.5
|
|
84
47
|
score += file_score
|
|
85
|
-
details.append(f"Files: {files_found}/{len(patch_info
|
|
48
|
+
details.append(f"Files: {files_found}/{len(patch_info.files)} found")
|
|
86
49
|
|
|
87
50
|
# Check functions/classes (7.5 pts)
|
|
88
51
|
funcs_found = 0
|
|
89
|
-
for func in patch_info
|
|
52
|
+
for func in patch_info.functions:
|
|
90
53
|
func_match = re.search(r'(def|class)\s+(\w+)', func)
|
|
91
54
|
if func_match:
|
|
92
55
|
func_name = func_match.group(2).lower()
|
|
93
56
|
if func_name in response_lower:
|
|
94
57
|
funcs_found += 1
|
|
95
58
|
|
|
96
|
-
if patch_info
|
|
97
|
-
func_score = min(7.5, (funcs_found / len(patch_info
|
|
59
|
+
if patch_info.functions:
|
|
60
|
+
func_score = min(7.5, (funcs_found / len(patch_info.functions)) * 7.5)
|
|
98
61
|
score += func_score
|
|
99
|
-
details.append(f"Functions: {funcs_found}/{len(patch_info
|
|
62
|
+
details.append(f"Functions: {funcs_found}/{len(patch_info.functions)} found")
|
|
100
63
|
else:
|
|
101
64
|
score += 3.75 # Partial credit if no specific function in patch
|
|
102
65
|
|
|
103
66
|
return min(15, score), details
|
|
104
67
|
|
|
68
|
+
|
|
105
69
|
def score_explains_why_broken(response, ground_truth):
|
|
106
70
|
"""Score EXPLAINS_WHY_BROKEN (15 pts)."""
|
|
107
71
|
response_lower = response.lower()
|
|
@@ -133,6 +97,7 @@ def score_explains_why_broken(response, ground_truth):
|
|
|
133
97
|
|
|
134
98
|
return min(15, score), details
|
|
135
99
|
|
|
100
|
+
|
|
136
101
|
def score_fix_addresses_issue(response, ground_truth):
|
|
137
102
|
"""Score FIX_ADDRESSES_ISSUE (20 pts) using ground truth patch."""
|
|
138
103
|
patch_info = extract_patch_info(ground_truth.get('patch', ''))
|
|
@@ -143,7 +108,7 @@ def score_fix_addresses_issue(response, ground_truth):
|
|
|
143
108
|
|
|
144
109
|
# Check if key additions from patch appear in response
|
|
145
110
|
additions_matched = 0
|
|
146
|
-
for addition in patch_info
|
|
111
|
+
for addition in patch_info.additions[:5]:
|
|
147
112
|
# Normalize whitespace
|
|
148
113
|
addition_norm = re.sub(r'\s+', ' ', addition.lower())
|
|
149
114
|
response_norm = re.sub(r'\s+', ' ', response_lower)
|
|
@@ -157,10 +122,10 @@ def score_fix_addresses_issue(response, ground_truth):
|
|
|
157
122
|
if sim > 0.7:
|
|
158
123
|
additions_matched += 0.5
|
|
159
124
|
|
|
160
|
-
if patch_info
|
|
161
|
-
addition_score = (additions_matched / min(5, len(patch_info
|
|
125
|
+
if patch_info.additions:
|
|
126
|
+
addition_score = (additions_matched / min(5, len(patch_info.additions))) * 15
|
|
162
127
|
score += addition_score
|
|
163
|
-
details.append(f"Code matches: {additions_matched}/{min(5, len(patch_info
|
|
128
|
+
details.append(f"Code matches: {additions_matched}/{min(5, len(patch_info.additions))}")
|
|
164
129
|
|
|
165
130
|
# Check for code block with fix
|
|
166
131
|
if '```' in response:
|
|
@@ -169,6 +134,7 @@ def score_fix_addresses_issue(response, ground_truth):
|
|
|
169
134
|
|
|
170
135
|
return min(20, score), details
|
|
171
136
|
|
|
137
|
+
|
|
172
138
|
def score_fix_is_minimal(response, ground_truth):
|
|
173
139
|
"""Score FIX_IS_MINIMAL (10 pts)."""
|
|
174
140
|
patch_info = extract_patch_info(ground_truth.get('patch', ''))
|
|
@@ -177,7 +143,7 @@ def score_fix_is_minimal(response, ground_truth):
|
|
|
177
143
|
details = []
|
|
178
144
|
|
|
179
145
|
# Count lines in patch vs lines in response code blocks
|
|
180
|
-
patch_lines = len(patch_info
|
|
146
|
+
patch_lines = len(patch_info.additions) + len(patch_info.deletions)
|
|
181
147
|
|
|
182
148
|
# Extract code blocks from response
|
|
183
149
|
code_blocks = re.findall(r'```[\w]*\n(.*?)```', response, re.DOTALL)
|
|
@@ -200,6 +166,7 @@ def score_fix_is_minimal(response, ground_truth):
|
|
|
200
166
|
|
|
201
167
|
return min(10, score), details
|
|
202
168
|
|
|
169
|
+
|
|
203
170
|
def score_fix_syntax_correct(response):
|
|
204
171
|
"""Score FIX_SYNTAX_CORRECT (10 pts)."""
|
|
205
172
|
score = 0
|
|
@@ -232,6 +199,7 @@ def score_fix_syntax_correct(response):
|
|
|
232
199
|
|
|
233
200
|
return min(10, score), details
|
|
234
201
|
|
|
202
|
+
|
|
235
203
|
def score_edge_cases(response):
|
|
236
204
|
"""Score EDGE_CASES (10 pts)."""
|
|
237
205
|
response_lower = response.lower()
|
|
@@ -247,6 +215,7 @@ def score_edge_cases(response):
|
|
|
247
215
|
|
|
248
216
|
return score, details
|
|
249
217
|
|
|
218
|
+
|
|
250
219
|
def score_test_coverage(response):
|
|
251
220
|
"""Score TEST_COVERAGE (10 pts)."""
|
|
252
221
|
response_lower = response.lower()
|
|
@@ -271,6 +240,7 @@ def score_test_coverage(response):
|
|
|
271
240
|
|
|
272
241
|
return min(10, score), details
|
|
273
242
|
|
|
243
|
+
|
|
274
244
|
def score_in_character(response, persona="senior developer"):
|
|
275
245
|
"""Score IN_CHARACTER (10 pts)."""
|
|
276
246
|
response_lower = response.lower()
|
|
@@ -287,9 +257,10 @@ def score_in_character(response, persona="senior developer"):
|
|
|
287
257
|
|
|
288
258
|
return score, details
|
|
289
259
|
|
|
260
|
+
|
|
290
261
|
def judge_response(scenario_name, response_text, swebench_data):
|
|
291
262
|
"""Full judgment using scenario rubric + ground truth."""
|
|
292
|
-
ground_truth =
|
|
263
|
+
ground_truth = find_scenario(swebench_data, scenario_name)
|
|
293
264
|
|
|
294
265
|
if not ground_truth:
|
|
295
266
|
return {'error': f'Scenario {scenario_name} not found in SWE-bench data'}
|
|
@@ -345,15 +316,17 @@ def judge_response(scenario_name, response_text, swebench_data):
|
|
|
345
316
|
scores['persona']['subtotal']
|
|
346
317
|
)
|
|
347
318
|
|
|
319
|
+
patch_info = extract_patch_info(ground_truth.get('patch', ''))
|
|
348
320
|
return {
|
|
349
321
|
'scenario': scenario_name,
|
|
350
322
|
'instance_id': ground_truth.get('instance_id'),
|
|
351
323
|
'scores': scores,
|
|
352
324
|
'total': round(total, 1),
|
|
353
325
|
'details': all_details,
|
|
354
|
-
'ground_truth_files':
|
|
326
|
+
'ground_truth_files': patch_info.files
|
|
355
327
|
}
|
|
356
328
|
|
|
329
|
+
|
|
357
330
|
def main():
|
|
358
331
|
if len(sys.argv) < 3:
|
|
359
332
|
print("Usage: swebench-judge.py <scenario_name> <response_file>")
|
|
@@ -396,5 +369,6 @@ def main():
|
|
|
396
369
|
json.dump(result, f, indent=2)
|
|
397
370
|
print(f"\nSaved to: {output_path}")
|
|
398
371
|
|
|
372
|
+
|
|
399
373
|
if __name__ == '__main__':
|
|
400
374
|
main()
|