@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
---
|
|
2
|
+
# SWE-bench Verified Scenario
|
|
3
|
+
# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
|
|
4
|
+
# Instance: django__django-10973
|
|
5
|
+
|
|
6
|
+
name: django-10973
|
|
7
|
+
title: "Use subprocess.run and PGPASSWORD for client in postgres backend"
|
|
8
|
+
category: dev
|
|
9
|
+
difficulty: medium # SWE-bench: 15 min - 1 hour
|
|
10
|
+
version: "1.0"
|
|
11
|
+
|
|
12
|
+
source:
|
|
13
|
+
benchmark: swe-bench-verified
|
|
14
|
+
instance_id: django__django-10973
|
|
15
|
+
repo: django/django
|
|
16
|
+
base_commit: ddb293685235
|
|
17
|
+
|
|
18
|
+
description: |
|
|
19
|
+
Real GitHub issue from django/django requiring code changes to resolve.
|
|
20
|
+
This is a human-validated problem from the SWE-bench Verified dataset.
|
|
21
|
+
|
|
22
|
+
prompt: |
|
|
23
|
+
You are working on the django/django repository at commit ddb293685235.
|
|
24
|
+
|
|
25
|
+
A user has reported the following issue:
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
Use subprocess.run and PGPASSWORD for client in postgres backend
|
|
29
|
+
Description
|
|
30
|
+
|
|
31
|
+
subprocess.run was added in python 3.5 (which is the minimum version since Django 2.1). This function allows you to pass a custom environment for the subprocess.
|
|
32
|
+
Using this in django.db.backends.postgres.client to set PGPASSWORD simplifies the code and makes it more reliable.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
Analyze this issue and provide:
|
|
37
|
+
1. Root cause analysis - what is causing the bug?
|
|
38
|
+
2. Proposed fix - what code changes would resolve this?
|
|
39
|
+
3. Test considerations - how would you verify the fix works?
|
|
40
|
+
|
|
41
|
+
Provide your response with specific file paths and code changes.
|
|
42
|
+
|
|
43
|
+
scoring:
|
|
44
|
+
# Adapted for SWE-bench bug-fix scenarios
|
|
45
|
+
categories:
|
|
46
|
+
- name: root_cause
|
|
47
|
+
weight: 30
|
|
48
|
+
description: "Correctly identifies the underlying cause of the bug"
|
|
49
|
+
criteria:
|
|
50
|
+
- id: IDENTIFIES_BUG_LOCATION
|
|
51
|
+
description: "Points to correct file(s) and function(s)"
|
|
52
|
+
points: 15
|
|
53
|
+
- id: EXPLAINS_WHY_BROKEN
|
|
54
|
+
description: "Explains why current code fails"
|
|
55
|
+
points: 15
|
|
56
|
+
|
|
57
|
+
- name: fix_quality
|
|
58
|
+
weight: 40
|
|
59
|
+
description: "Proposes a correct and complete fix"
|
|
60
|
+
criteria:
|
|
61
|
+
- id: FIX_ADDRESSES_ISSUE
|
|
62
|
+
description: "Fix would resolve the reported problem"
|
|
63
|
+
points: 20
|
|
64
|
+
- id: FIX_IS_MINIMAL
|
|
65
|
+
description: "Fix is appropriately scoped, not over-engineered"
|
|
66
|
+
points: 10
|
|
67
|
+
- id: FIX_SYNTAX_CORRECT
|
|
68
|
+
description: "Code changes are syntactically valid"
|
|
69
|
+
points: 10
|
|
70
|
+
|
|
71
|
+
- name: completeness
|
|
72
|
+
weight: 20
|
|
73
|
+
description: "Considers edge cases and testing"
|
|
74
|
+
criteria:
|
|
75
|
+
- id: EDGE_CASES
|
|
76
|
+
description: "Considers related scenarios that might break"
|
|
77
|
+
points: 10
|
|
78
|
+
- id: TEST_COVERAGE
|
|
79
|
+
description: "Suggests appropriate test cases"
|
|
80
|
+
points: 10
|
|
81
|
+
|
|
82
|
+
- name: persona
|
|
83
|
+
weight: 10
|
|
84
|
+
description: "Maintains character while solving"
|
|
85
|
+
criteria:
|
|
86
|
+
- id: IN_CHARACTER
|
|
87
|
+
description: "Response reflects persona traits"
|
|
88
|
+
points: 10
|
|
89
|
+
|
|
90
|
+
# Metadata for full harness evaluation (optional)
|
|
91
|
+
swebench_metadata:
|
|
92
|
+
fail_to_pass: ["test_accent (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)", "test_basic (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)", "test_column (dbshell.test_postgresql.PostgreSqlDbshellCommandTestCase)"]
|
|
93
|
+
environment_version: "3.0"
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
---
|
|
2
|
+
# SWE-bench Reviewer Scenario
|
|
3
|
+
# Adapted from: pallets__flask-5014
|
|
4
|
+
# Role: Code Reviewer evaluating a proposed fix
|
|
5
|
+
|
|
6
|
+
name: flask-5014-reviewer
|
|
7
|
+
title: "Review: Require non-empty name for Blueprints"
|
|
8
|
+
category: reviewer
|
|
9
|
+
difficulty: easy
|
|
10
|
+
version: "1.0"
|
|
11
|
+
|
|
12
|
+
source:
|
|
13
|
+
benchmark: swe-bench-verified
|
|
14
|
+
instance_id: pallets__flask-5014
|
|
15
|
+
repo: pallets/flask
|
|
16
|
+
base_commit: 7ee9ceb71e86
|
|
17
|
+
adapted_for: reviewer
|
|
18
|
+
|
|
19
|
+
description: |
|
|
20
|
+
Code review scenario adapted from SWE-bench. The reviewer must evaluate
|
|
21
|
+
a proposed patch for correctness, completeness, and code quality.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
You are reviewing a pull request for the pallets/flask repository.
|
|
25
|
+
|
|
26
|
+
## Issue Being Fixed
|
|
27
|
+
|
|
28
|
+
**Title:** Require a non-empty name for Blueprints
|
|
29
|
+
|
|
30
|
+
**Description:**
|
|
31
|
+
Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
|
|
32
|
+
It would be helpful if a `ValueError` was raised when trying to do that.
|
|
33
|
+
|
|
34
|
+
## Proposed Patch
|
|
35
|
+
|
|
36
|
+
The developer has submitted the following changes:
|
|
37
|
+
|
|
38
|
+
**src/flask/blueprints.py** - In the Blueprint.__init__ method:
|
|
39
|
+
```python
|
|
40
|
+
def __init__(
|
|
41
|
+
self,
|
|
42
|
+
name: str,
|
|
43
|
+
import_name: str,
|
|
44
|
+
...
|
|
45
|
+
) -> None:
|
|
46
|
+
# NEW: Validate non-empty name
|
|
47
|
+
if not name:
|
|
48
|
+
raise ValueError("'name' may not be empty.")
|
|
49
|
+
|
|
50
|
+
if "." in name:
|
|
51
|
+
raise ValueError("'name' may not contain a dot '.' character.")
|
|
52
|
+
...
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**tests/test_blueprints.py** - New test added:
|
|
56
|
+
```python
|
|
57
|
+
def test_empty_name_not_allowed(app, client):
|
|
58
|
+
with pytest.raises(ValueError):
|
|
59
|
+
flask.Blueprint("", __name__)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**CHANGES.rst** - Added to changelog:
|
|
63
|
+
```
|
|
64
|
+
- If a blueprint is created with an empty name it raises a ValueError.
|
|
65
|
+
:issue:`5010`
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Your Review Task
|
|
69
|
+
|
|
70
|
+
Evaluate this pull request and provide:
|
|
71
|
+
1. **Correctness** - Does the fix address the issue? Any bugs?
|
|
72
|
+
2. **Edge Cases** - What about whitespace-only names? None values?
|
|
73
|
+
3. **Test Coverage** - Are the tests sufficient?
|
|
74
|
+
4. **Code Quality** - Style, error message clarity, placement
|
|
75
|
+
5. **Verdict** - APPROVE, REQUEST_CHANGES, or COMMENT
|
|
76
|
+
|
|
77
|
+
Be thorough but fair. Identify real issues, not nitpicks.
|
|
78
|
+
|
|
79
|
+
scoring:
|
|
80
|
+
categories:
|
|
81
|
+
- name: issue_detection
|
|
82
|
+
weight: 35
|
|
83
|
+
description: "Identifies legitimate issues with the patch"
|
|
84
|
+
criteria:
|
|
85
|
+
- id: WHITESPACE_EDGE_CASE
|
|
86
|
+
description: "Notes that whitespace-only names like ' ' would pass"
|
|
87
|
+
points: 15
|
|
88
|
+
severity: high
|
|
89
|
+
- id: NONE_HANDLING
|
|
90
|
+
description: "Considers behavior if name=None is passed"
|
|
91
|
+
points: 10
|
|
92
|
+
severity: medium
|
|
93
|
+
- id: ERROR_MESSAGE_QUALITY
|
|
94
|
+
description: "Comments on error message clarity/consistency"
|
|
95
|
+
points: 5
|
|
96
|
+
severity: low
|
|
97
|
+
- id: TEST_COVERAGE_GAP
|
|
98
|
+
description: "Notes missing edge case tests"
|
|
99
|
+
points: 5
|
|
100
|
+
severity: low
|
|
101
|
+
|
|
102
|
+
- name: review_quality
|
|
103
|
+
weight: 30
|
|
104
|
+
description: "Quality of the review feedback"
|
|
105
|
+
criteria:
|
|
106
|
+
- id: CONSTRUCTIVE_FEEDBACK
|
|
107
|
+
description: "Provides actionable suggestions, not just complaints"
|
|
108
|
+
points: 15
|
|
109
|
+
- id: CORRECT_ASSESSMENT
|
|
110
|
+
description: "Doesn't flag false positives or miss obvious issues"
|
|
111
|
+
points: 15
|
|
112
|
+
|
|
113
|
+
- name: verdict_appropriateness
|
|
114
|
+
weight: 20
|
|
115
|
+
description: "Appropriate review decision"
|
|
116
|
+
criteria:
|
|
117
|
+
- id: REASONABLE_VERDICT
|
|
118
|
+
description: "Verdict matches severity of issues found"
|
|
119
|
+
points: 10
|
|
120
|
+
- id: JUSTIFIED_DECISION
|
|
121
|
+
description: "Decision is well-reasoned"
|
|
122
|
+
points: 10
|
|
123
|
+
|
|
124
|
+
- name: persona
|
|
125
|
+
weight: 15
|
|
126
|
+
description: "Maintains reviewer character"
|
|
127
|
+
criteria:
|
|
128
|
+
- id: IN_CHARACTER
|
|
129
|
+
description: "Response reflects persona traits"
|
|
130
|
+
points: 15
|
|
131
|
+
|
|
132
|
+
baseline_issues:
|
|
133
|
+
critical: []
|
|
134
|
+
high:
|
|
135
|
+
- id: WHITESPACE_EDGE_CASE
|
|
136
|
+
description: "Empty check 'if not name' passes for whitespace-only strings"
|
|
137
|
+
expected_fix: "Use 'if not name or not name.strip()' or similar"
|
|
138
|
+
medium:
|
|
139
|
+
- id: NONE_HANDLING
|
|
140
|
+
description: "If name=None, 'if not name' works but error message is misleading"
|
|
141
|
+
low:
|
|
142
|
+
- id: ERROR_MESSAGE_CONSISTENCY
|
|
143
|
+
description: "Error message style differs from existing dot check"
|
|
144
|
+
- id: MISSING_WHITESPACE_TEST
|
|
145
|
+
description: "No test for whitespace-only names"
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
---
|
|
2
|
+
# SWE-bench TEA Scenario
|
|
3
|
+
# Adapted from: pallets__flask-5014
|
|
4
|
+
# Role: Test Engineer writing failing tests BEFORE the fix
|
|
5
|
+
|
|
6
|
+
name: flask-5014-tea
|
|
7
|
+
title: "RED Phase: Write tests for empty Blueprint name validation"
|
|
8
|
+
category: tea
|
|
9
|
+
difficulty: easy
|
|
10
|
+
version: "1.0"
|
|
11
|
+
|
|
12
|
+
source:
|
|
13
|
+
benchmark: swe-bench-verified
|
|
14
|
+
instance_id: pallets__flask-5014
|
|
15
|
+
repo: pallets/flask
|
|
16
|
+
base_commit: 7ee9ceb71e86
|
|
17
|
+
adapted_for: tea
|
|
18
|
+
|
|
19
|
+
description: |
|
|
20
|
+
TDD RED phase scenario adapted from SWE-bench. The test engineer must write
|
|
21
|
+
failing tests that specify the expected behavior BEFORE seeing any fix.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
You are a Test Engineer working on the pallets/flask repository.
|
|
25
|
+
|
|
26
|
+
## Bug Report
|
|
27
|
+
|
|
28
|
+
**Title:** Require a non-empty name for Blueprints
|
|
29
|
+
|
|
30
|
+
**Description:**
|
|
31
|
+
Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
|
|
32
|
+
It would be helpful if a `ValueError` was raised when trying to do that.
|
|
33
|
+
|
|
34
|
+
**Current Behavior:** Creating `Blueprint("", __name__)` succeeds but causes
|
|
35
|
+
problems downstream when the blueprint is registered.
|
|
36
|
+
|
|
37
|
+
**Expected Behavior:** A `ValueError` should be raised immediately when
|
|
38
|
+
attempting to create a Blueprint with an empty name.
|
|
39
|
+
|
|
40
|
+
## Your Task (RED Phase)
|
|
41
|
+
|
|
42
|
+
Write comprehensive failing tests that:
|
|
43
|
+
1. Specify the expected behavior (ValueError on empty name)
|
|
44
|
+
2. Cover edge cases (whitespace, None, etc.)
|
|
45
|
+
3. Will FAIL against the current codebase
|
|
46
|
+
4. Will PASS once the fix is implemented
|
|
47
|
+
|
|
48
|
+
Use pytest style. Provide complete, runnable test code.
|
|
49
|
+
|
|
50
|
+
**Important:** You are writing tests BEFORE the fix exists. These tests
|
|
51
|
+
should fail now and define what "correct" means.
|
|
52
|
+
|
|
53
|
+
scoring:
|
|
54
|
+
categories:
|
|
55
|
+
- name: test_coverage
|
|
56
|
+
weight: 40
|
|
57
|
+
description: "Comprehensive test cases"
|
|
58
|
+
criteria:
|
|
59
|
+
- id: EMPTY_STRING_TEST
|
|
60
|
+
description: "Tests that '' raises ValueError"
|
|
61
|
+
points: 15
|
|
62
|
+
severity: critical
|
|
63
|
+
- id: WHITESPACE_TEST
|
|
64
|
+
description: "Tests that ' ' (whitespace-only) should also fail"
|
|
65
|
+
points: 10
|
|
66
|
+
severity: high
|
|
67
|
+
- id: NONE_TEST
|
|
68
|
+
description: "Tests behavior when name=None"
|
|
69
|
+
points: 10
|
|
70
|
+
severity: medium
|
|
71
|
+
- id: VALID_NAME_TEST
|
|
72
|
+
description: "Includes sanity check that valid names still work"
|
|
73
|
+
points: 5
|
|
74
|
+
severity: low
|
|
75
|
+
|
|
76
|
+
- name: test_quality
|
|
77
|
+
weight: 30
|
|
78
|
+
description: "Well-written test code"
|
|
79
|
+
criteria:
|
|
80
|
+
- id: PYTEST_STYLE
|
|
81
|
+
description: "Uses pytest idioms correctly (pytest.raises, fixtures)"
|
|
82
|
+
points: 10
|
|
83
|
+
- id: CLEAR_ASSERTIONS
|
|
84
|
+
description: "Assertions clearly express expected behavior"
|
|
85
|
+
points: 10
|
|
86
|
+
- id: ERROR_MESSAGE_CHECK
|
|
87
|
+
description: "Verifies error message content, not just exception type"
|
|
88
|
+
points: 10
|
|
89
|
+
|
|
90
|
+
- name: red_phase_understanding
|
|
91
|
+
weight: 15
|
|
92
|
+
description: "Understands TDD RED phase"
|
|
93
|
+
criteria:
|
|
94
|
+
- id: TESTS_SHOULD_FAIL
|
|
95
|
+
description: "Acknowledges tests will fail against current code"
|
|
96
|
+
points: 8
|
|
97
|
+
- id: DEFINES_BEHAVIOR
|
|
98
|
+
description: "Tests define expected behavior, not implementation"
|
|
99
|
+
points: 7
|
|
100
|
+
|
|
101
|
+
- name: persona
|
|
102
|
+
weight: 15
|
|
103
|
+
description: "Maintains TEA character"
|
|
104
|
+
criteria:
|
|
105
|
+
- id: IN_CHARACTER
|
|
106
|
+
description: "Response reflects persona traits"
|
|
107
|
+
points: 15
|
|
108
|
+
|
|
109
|
+
baseline_criteria:
|
|
110
|
+
required_tests:
|
|
111
|
+
- id: EMPTY_STRING
|
|
112
|
+
description: "Blueprint('', __name__) raises ValueError"
|
|
113
|
+
- id: WHITESPACE_ONLY
|
|
114
|
+
description: "Blueprint(' ', __name__) raises ValueError"
|
|
115
|
+
- id: VALID_STILL_WORKS
|
|
116
|
+
description: "Blueprint('valid_name', __name__) succeeds"
|
|
117
|
+
bonus_tests:
|
|
118
|
+
- id: NONE_VALUE
|
|
119
|
+
description: "Blueprint(None, __name__) behavior specified"
|
|
120
|
+
- id: ERROR_MESSAGE
|
|
121
|
+
description: "Asserts on error message content"
|
|
122
|
+
- id: SINGLE_SPACE
|
|
123
|
+
description: "Blueprint(' ', __name__) edge case"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
---
|
|
2
|
+
# SWE-bench Verified Scenario
|
|
3
|
+
# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
|
|
4
|
+
# Instance: pallets__flask-5014
|
|
5
|
+
|
|
6
|
+
name: flask-5014
|
|
7
|
+
title: "Require a non-empty name for Blueprints"
|
|
8
|
+
category: dev
|
|
9
|
+
difficulty: easy # SWE-bench: <15 min fix
|
|
10
|
+
version: "1.0"
|
|
11
|
+
|
|
12
|
+
source:
|
|
13
|
+
benchmark: swe-bench-verified
|
|
14
|
+
instance_id: pallets__flask-5014
|
|
15
|
+
repo: pallets/flask
|
|
16
|
+
base_commit: 7ee9ceb71e86
|
|
17
|
+
|
|
18
|
+
description: |
|
|
19
|
+
Real GitHub issue from pallets/flask requiring code changes to resolve.
|
|
20
|
+
This is a human-validated problem from the SWE-bench Verified dataset.
|
|
21
|
+
|
|
22
|
+
prompt: |
|
|
23
|
+
You are working on the pallets/flask repository at commit 7ee9ceb71e86.
|
|
24
|
+
|
|
25
|
+
A user has reported the following issue:
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
Require a non-empty name for Blueprints
|
|
29
|
+
Things do not work correctly if a Blueprint is given an empty name (e.g. #4944).
|
|
30
|
+
It would be helpful if a `ValueError` was raised when trying to do that.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
Analyze this issue and provide:
|
|
35
|
+
1. Root cause analysis - what is causing the bug?
|
|
36
|
+
2. Proposed fix - what code changes would resolve this?
|
|
37
|
+
3. Test considerations - how would you verify the fix works?
|
|
38
|
+
|
|
39
|
+
Provide your response with specific file paths and code changes.
|
|
40
|
+
|
|
41
|
+
scoring:
|
|
42
|
+
# Adapted for SWE-bench bug-fix scenarios
|
|
43
|
+
categories:
|
|
44
|
+
- name: root_cause
|
|
45
|
+
weight: 30
|
|
46
|
+
description: "Correctly identifies the underlying cause of the bug"
|
|
47
|
+
criteria:
|
|
48
|
+
- id: IDENTIFIES_BUG_LOCATION
|
|
49
|
+
description: "Points to correct file(s) and function(s)"
|
|
50
|
+
points: 15
|
|
51
|
+
- id: EXPLAINS_WHY_BROKEN
|
|
52
|
+
description: "Explains why current code fails"
|
|
53
|
+
points: 15
|
|
54
|
+
|
|
55
|
+
- name: fix_quality
|
|
56
|
+
weight: 40
|
|
57
|
+
description: "Proposes a correct and complete fix"
|
|
58
|
+
criteria:
|
|
59
|
+
- id: FIX_ADDRESSES_ISSUE
|
|
60
|
+
description: "Fix would resolve the reported problem"
|
|
61
|
+
points: 20
|
|
62
|
+
- id: FIX_IS_MINIMAL
|
|
63
|
+
description: "Fix is appropriately scoped, not over-engineered"
|
|
64
|
+
points: 10
|
|
65
|
+
- id: FIX_SYNTAX_CORRECT
|
|
66
|
+
description: "Code changes are syntactically valid"
|
|
67
|
+
points: 10
|
|
68
|
+
|
|
69
|
+
- name: completeness
|
|
70
|
+
weight: 20
|
|
71
|
+
description: "Considers edge cases and testing"
|
|
72
|
+
criteria:
|
|
73
|
+
- id: EDGE_CASES
|
|
74
|
+
description: "Considers related scenarios that might break"
|
|
75
|
+
points: 10
|
|
76
|
+
- id: TEST_COVERAGE
|
|
77
|
+
description: "Suggests appropriate test cases"
|
|
78
|
+
points: 10
|
|
79
|
+
|
|
80
|
+
- name: persona
|
|
81
|
+
weight: 10
|
|
82
|
+
description: "Maintains character while solving"
|
|
83
|
+
criteria:
|
|
84
|
+
- id: IN_CHARACTER
|
|
85
|
+
description: "Response reflects persona traits"
|
|
86
|
+
points: 10
|
|
87
|
+
|
|
88
|
+
# Metadata for full harness evaluation (optional)
|
|
89
|
+
swebench_metadata:
|
|
90
|
+
fail_to_pass: ["tests/test_blueprints.py::test_empty_name_not_allowed"]
|
|
91
|
+
environment_version: "2.3"
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Import SWE-bench Verified scenarios into Pennyfarthing format.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python import-swebench.py [--count N] [--difficulty LEVEL]
|
|
7
|
+
|
|
8
|
+
This script:
|
|
9
|
+
1. Loads the cached SWE-bench data from /tmp/swebench_all.json
|
|
10
|
+
2. Selects a stratified sample across difficulty levels
|
|
11
|
+
3. Generates Pennyfarthing scenario YAML files
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from textwrap import dedent, indent
|
|
20
|
+
|
|
21
|
+
# Difficulty mapping: SWE-bench -> Pennyfarthing
# Keys are the free-text time-estimate labels used by the SWE-bench Verified
# dataset; values are this project's four-level difficulty scale. Lookups
# elsewhere in this script fall back to 'medium' for unknown labels.
DIFFICULTY_MAP = {
    "<15 min fix": "easy",
    "15 min - 1 hour": "medium",
    "1-4 hours": "hard",
    ">4 hours": "extreme"
}

# How many to import per difficulty level.
# Used as the stratified-sample quota passed to select_scenarios() in main().
DEFAULT_COUNTS = {
    "easy": 5,
    "medium": 5,
    "hard": 3,
    "extreme": 2
}
|
|
36
|
+
|
|
37
|
+
def sanitize_name(instance_id: str) -> str:
    """Convert a SWE-bench instance id into a filename-safe scenario name.

    A standard two-part id such as ``astropy__astropy-12907`` is reduced to
    its second half (``astropy-12907``); anything else is lowercased with
    every ``__`` separator turned into a hyphen.
    """
    prefix, separator, suffix = instance_id.partition("__")
    # Exactly one "__" separator means the canonical "<org>__<repo>-<n>" form.
    if separator and "__" not in suffix:
        return suffix.lower()
    return instance_id.lower().replace("__", "-")
|
|
44
|
+
|
|
45
|
+
def truncate_text(text: str, max_chars: int = 4000) -> str:
    """Return *text* unchanged if it fits, else cut it near *max_chars*.

    The cut prefers the last newline inside the budget (so no line is split
    mid-way), but only when that newline sits past 80% of the budget;
    otherwise a hard character cut is used. A truncation marker is appended.
    """
    if len(text) <= max_chars:
        return text
    clipped = text[:max_chars]
    newline_pos = clipped.rfind('\n')
    # Back off to a line boundary only when it keeps most of the budget.
    if newline_pos > max_chars * 0.8:
        clipped = clipped[:newline_pos]
    return f"{clipped}\n\n[... truncated for brevity ...]"
|
|
55
|
+
|
|
56
|
+
def generate_scenario_yaml(instance: dict) -> str:
    """Generate a Pennyfarthing scenario YAML document from a SWE-bench instance.

    Args:
        instance: One record from the SWE-bench Verified dataset. Reads the
            keys 'instance_id', 'repo', 'problem_statement', 'base_commit',
            and optionally 'difficulty', 'FAIL_TO_PASS', and 'version'.

    Returns:
        The complete YAML scenario file content as a string.
    """
    instance_id = instance['instance_id']
    repo = instance['repo']
    problem = instance['problem_statement']
    swe_difficulty = instance.get('difficulty', '15 min - 1 hour')
    pf_difficulty = DIFFICULTY_MAP.get(swe_difficulty, 'medium')
    # A 12-char commit prefix is enough to identify the checkout point.
    base_commit = instance['base_commit'][:12]

    # Parse repo for a cleaner name (e.g. "django/django" -> "django").
    repo_name = repo.split('/')[-1]
    scenario_name = sanitize_name(instance_id)

    # Extract first line as title (usually the issue title).
    lines = problem.strip().split('\n')
    title = lines[0][:80] if lines else f"Issue in {repo_name}"
    # Clean markdown formatting from the title.
    title = re.sub(r'[`*#]', '', title).strip()
    if title.startswith('- '):
        title = title[2:]
    # FIX: the title is emitted inside double quotes below; an embedded '"'
    # would produce invalid YAML, so downgrade it to a single quote.
    title = title.replace('"', "'")

    # Truncate the problem statement so prompts stay a manageable size.
    problem_truncated = truncate_text(problem, 6000)

    # FAIL_TO_PASS is a JSON-encoded list in the HF dataset export, but be
    # tolerant of caches that already hold a decoded list.
    fail_to_pass = instance.get('FAIL_TO_PASS', '[]')
    if isinstance(fail_to_pass, str):
        fail_to_pass = json.loads(fail_to_pass)

    yaml_content = f'''---
# SWE-bench Verified Scenario
# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
# Instance: {instance_id}

name: {scenario_name}
title: "{title}"
category: dev
difficulty: {pf_difficulty} # SWE-bench: {swe_difficulty}
version: "1.0"

source:
  benchmark: swe-bench-verified
  instance_id: {instance_id}
  repo: {repo}
  base_commit: {base_commit}

description: |
  Real GitHub issue from {repo} requiring code changes to resolve.
  This is a human-validated problem from the SWE-bench Verified dataset.

prompt: |
  You are working on the {repo} repository at commit {base_commit}.

  A user has reported the following issue:

  ---
{indent(problem_truncated, "  ")}
  ---

  Analyze this issue and provide:
  1. Root cause analysis - what is causing the bug?
  2. Proposed fix - what code changes would resolve this?
  3. Test considerations - how would you verify the fix works?

  Provide your response with specific file paths and code changes.

scoring:
  # Adapted for SWE-bench bug-fix scenarios
  categories:
    - name: root_cause
      weight: 30
      description: "Correctly identifies the underlying cause of the bug"
      criteria:
        - id: IDENTIFIES_BUG_LOCATION
          description: "Points to correct file(s) and function(s)"
          points: 15
        - id: EXPLAINS_WHY_BROKEN
          description: "Explains why current code fails"
          points: 15

    - name: fix_quality
      weight: 40
      description: "Proposes a correct and complete fix"
      criteria:
        - id: FIX_ADDRESSES_ISSUE
          description: "Fix would resolve the reported problem"
          points: 20
        - id: FIX_IS_MINIMAL
          description: "Fix is appropriately scoped, not over-engineered"
          points: 10
        - id: FIX_SYNTAX_CORRECT
          description: "Code changes are syntactically valid"
          points: 10

    - name: completeness
      weight: 20
      description: "Considers edge cases and testing"
      criteria:
        - id: EDGE_CASES
          description: "Considers related scenarios that might break"
          points: 10
        - id: TEST_COVERAGE
          description: "Suggests appropriate test cases"
          points: 10

    - name: persona
      weight: 10
      description: "Maintains character while solving"
      criteria:
        - id: IN_CHARACTER
          description: "Response reflects persona traits"
          points: 10

# Metadata for full harness evaluation (optional)
swebench_metadata:
  fail_to_pass: {json.dumps(fail_to_pass[:3])}
  environment_version: "{instance.get('version', 'unknown')}"
'''

    return yaml_content
|
|
172
|
+
|
|
173
|
+
def select_scenarios(all_instances: list, counts: dict) -> list:
    """Select a stratified sample of scenarios across difficulty levels.

    Instances are bucketed by their mapped Pennyfarthing difficulty, then for
    each requested level up to ``counts[level]`` instances are picked — first
    preferring at most one per repository for diversity, then topping up from
    the remaining pool in order.
    """
    # Bucket every instance by its Pennyfarthing difficulty level.
    buckets = {}
    for candidate in all_instances:
        swe_label = candidate.get('difficulty', '15 min - 1 hour')
        pf_label = DIFFICULTY_MAP.get(swe_label, 'medium')
        buckets.setdefault(pf_label, []).append(candidate)

    chosen = []
    for level, quota in counts.items():
        pool = buckets.get(level, [])
        picks = []

        # First pass: at most one scenario per repository, for diversity.
        used_repos = set()
        for candidate in pool:
            if len(picks) >= quota:
                break
            if candidate['repo'] not in used_repos:
                used_repos.add(candidate['repo'])
                picks.append(candidate)

        # Second pass: if diversity alone can't meet the quota, top up
        # from the remaining pool regardless of repository.
        if len(picks) < quota:
            for candidate in pool:
                if len(picks) >= quota:
                    break
                if candidate not in picks:
                    picks.append(candidate)

        chosen.extend(picks[:quota])

    return chosen
|
|
210
|
+
|
|
211
|
+
def main():
    """Load cached SWE-bench data, pick a stratified sample, write YAML files.

    Reads /tmp/swebench_all.json (produced by a separate download script),
    selects scenarios per DEFAULT_COUNTS, and writes one <name>.yaml per
    selected instance next to this script. Exits with status 1 if the cache
    file is missing.
    """
    # Load cached data.
    cache_file = Path('/tmp/swebench_all.json')
    if not cache_file.exists():
        print("Error: Run the download script first to create /tmp/swebench_all.json")
        sys.exit(1)

    with open(cache_file) as f:
        all_instances = json.load(f)

    print(f"Loaded {len(all_instances)} SWE-bench instances")

    # Select scenarios.
    selected = select_scenarios(all_instances, DEFAULT_COUNTS)
    print(f"Selected {len(selected)} scenarios for import")

    # Scenario files are written next to this script.
    output_dir = Path(__file__).parent

    # Generate YAML files.
    for inst in selected:
        scenario_name = sanitize_name(inst['instance_id'])
        output_file = output_dir / f"{scenario_name}.yaml"

        yaml_content = generate_scenario_yaml(inst)

        with open(output_file, 'w') as f:
            f.write(yaml_content)

        # FIX: use the same missing-'difficulty' default as select_scenarios()
        # and generate_scenario_yaml() (previously 'medium', which is not a
        # DIFFICULTY_MAP key — same resulting value, but misleading intent).
        pf_diff = DIFFICULTY_MAP.get(inst.get('difficulty', '15 min - 1 hour'), 'medium')
        print(f" Created: {scenario_name}.yaml ({pf_diff})")

    print(f"\nImport complete! {len(selected)} scenarios created in {output_dir}")

if __name__ == '__main__':
    main()
|