@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/swe-bench/matplotlib-13989.yaml
@@ -0,0 +1,139 @@
+---
+# SWE-bench Verified Scenario
+# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+# Instance: matplotlib__matplotlib-13989
+
+name: matplotlib-13989
+title: "hist() no longer respects range=... when density=True"
+category: dev
+difficulty: easy # SWE-bench: <15 min fix
+version: "1.0"
+
+source:
+  benchmark: swe-bench-verified
+  instance_id: matplotlib__matplotlib-13989
+  repo: matplotlib/matplotlib
+  base_commit: a3e2897bfaf9
+
+description: |
+  Real GitHub issue from matplotlib/matplotlib requiring code changes to resolve.
+  This is a human-validated problem from the SWE-bench Verified dataset.
+
+prompt: |
+  You are working on the matplotlib/matplotlib repository at commit a3e2897bfaf9.
+
+  A user has reported the following issue:
+
+  ---
+  hist() no longer respects range=... when density=True
+  <!--To help us understand and resolve your issue, please fill out the form to the best of your ability.-->
+  <!--You can feel free to delete the sections that do not apply.-->
+
+  ### Bug report
+
+  **Bug summary**
+
+  <!--A short 1-2 sentences that succinctly describes the bug-->
+
+  **Code for reproduction**
+
+  <!--A minimum code snippet required to reproduce the bug.
+  Please make sure to minimize the number of dependencies required, and provide
+  any necessary plotted data.
+  Avoid using threads, as Matplotlib is (explicitly) not thread-safe.-->
+
+  ```python
+  _, bins, _ = plt.hist(np.random.rand(10), "auto", range=(0, 1), density=True)
+  print(bins)
+  ```
+
+  **Actual outcome**
+
+  <!--The output produced by the above code, which may be a screenshot, console output, etc.-->
+
+  ```
+  [0.00331535 0.18930174 0.37528813 0.56127453 0.74726092 0.93324731]
+  ```
+
+  **Expected outcome**
+
+  Some array where the first value is 0 and the last one is 1.
+
+  Note that this bug doesn't happen if density=False.
+
+  Bisects to https://github.com/matplotlib/matplotlib/pull/8638/commits/239be7b18e311c57a1393b6eeefc62b7cc629339 (#8638).
+
+  **Matplotlib version**
+  <!--Please specify your platform and versions of the relevant libraries you are using:-->
+  * Operating system: linux
+  * Matplotlib version: master
+  * Matplotlib backend (`print(matplotlib.get_backend())`): any
+  * Python version: 37
+  * Jupyter version (if applicable): no
+  * Other libraries: numpy 1.16.2
+
+  <!--Please tell us how you installed matplotlib and python e.g., from source, pip, conda-->
+  <!--If you installed from conda, please specify which channel you used if not the default-->
+
+
+
+  ---
+
+  Analyze this issue and provide:
+  1. Root cause analysis - what is causing the bug?
+  2. Proposed fix - what code changes would resolve this?
+  3. Test considerations - how would you verify the fix works?
+
+  Provide your response with specific file paths and code changes.
+
+scoring:
+  # Adapted for SWE-bench bug-fix scenarios
+  categories:
+    - name: root_cause
+      weight: 30
+      description: "Correctly identifies the underlying cause of the bug"
+      criteria:
+        - id: IDENTIFIES_BUG_LOCATION
+          description: "Points to correct file(s) and function(s)"
+          points: 15
+        - id: EXPLAINS_WHY_BROKEN
+          description: "Explains why current code fails"
+          points: 15
+
+    - name: fix_quality
+      weight: 40
+      description: "Proposes a correct and complete fix"
+      criteria:
+        - id: FIX_ADDRESSES_ISSUE
+          description: "Fix would resolve the reported problem"
+          points: 20
+        - id: FIX_IS_MINIMAL
+          description: "Fix is appropriately scoped, not over-engineered"
+          points: 10
+        - id: FIX_SYNTAX_CORRECT
+          description: "Code changes are syntactically valid"
+          points: 10
+
+    - name: completeness
+      weight: 20
+      description: "Considers edge cases and testing"
+      criteria:
+        - id: EDGE_CASES
+          description: "Considers related scenarios that might break"
+          points: 10
+        - id: TEST_COVERAGE
+          description: "Suggests appropriate test cases"
+          points: 10
+
+    - name: persona
+      weight: 10
+      description: "Maintains character while solving"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Response reflects persona traits"
+          points: 10
+
+# Metadata for full harness evaluation (optional)
+swebench_metadata:
+  fail_to_pass: ["lib/matplotlib/tests/test_axes.py::test_hist_range_and_density"]
+  environment_version: "3.0"
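All of the scenario files in this release share the rubric shape shown above: the top-level category `weight`s sum to 100, and each category's criterion `points` add up to that category's weight. As a minimal sketch (assuming PyYAML is installed; `check_rubric` is a hypothetical helper, not part of this package), that invariant can be checked like so:

```python
# Hypothetical rubric sanity check for the scenario YAML layout shown above.
import yaml  # PyYAML


def check_rubric(path: str) -> None:
    with open(path, encoding="utf-8") as fh:
        scenario = yaml.safe_load(fh)

    categories = scenario["scoring"]["categories"]

    # Category weights should cover the full 100-point scale.
    total_weight = sum(cat["weight"] for cat in categories)
    assert total_weight == 100, f"weights sum to {total_weight}, expected 100"

    # Within each category, criterion points should add up to the weight.
    for cat in categories:
        points = sum(crit["points"] for crit in cat["criteria"])
        assert points == cat["weight"], (
            f"{cat['name']}: criterion points ({points}) != weight ({cat['weight']})"
        )


check_rubric("package/scenarios/swe-bench/matplotlib-13989.yaml")
```

Run against the file above, this holds: 30 + 40 + 20 + 10 = 100, and each category's points match its weight.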

package/scenarios/swe-bench/matplotlib-14623.yaml
@@ -0,0 +1,127 @@
+---
+# SWE-bench Verified Scenario
+# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+# Instance: matplotlib__matplotlib-14623
+
+name: matplotlib-14623
+title: "Inverting an axis using its limits does not work for log scale"
+category: dev
+difficulty: medium # SWE-bench: 15 min - 1 hour
+version: "1.0"
+
+source:
+  benchmark: swe-bench-verified
+  instance_id: matplotlib__matplotlib-14623
+  repo: matplotlib/matplotlib
+  base_commit: d65c9ca20ddf
+
+description: |
+  Real GitHub issue from matplotlib/matplotlib requiring code changes to resolve.
+  This is a human-validated problem from the SWE-bench Verified dataset.
+
+prompt: |
+  You are working on the matplotlib/matplotlib repository at commit d65c9ca20ddf.
+
+  A user has reported the following issue:
+
+  ---
+  Inverting an axis using its limits does not work for log scale
+  ### Bug report
+
+  **Bug summary**
+  Starting in matplotlib 3.1.0 it is no longer possible to invert a log axis using its limits.
+
+  **Code for reproduction**
+  ```python
+  import numpy as np
+  import matplotlib.pyplot as plt
+
+
+  y = np.linspace(1000e2, 1, 100)
+  x = np.exp(-np.linspace(0, 1, y.size))
+
+  for yscale in ('linear', 'log'):
+      fig, ax = plt.subplots()
+      ax.plot(x, y)
+      ax.set_yscale(yscale)
+      ax.set_ylim(y.max(), y.min())
+  ```
+
+  **Actual outcome**
+  The yaxis is only inverted for the ``"linear"`` scale.
+
+  
+
+  
+
+  **Expected outcome**
+  I would expect the yaxis to be inverted for both the ``"linear"`` and the ``"log"`` scale.
+
+  **Matplotlib version**
+  * Operating system: Linux and MacOS
+  * Matplotlib version: 3.1.0
+  * Python version: 3.7.3
+
+  Python and matplotlib have been installed using conda.
+
+
+  ---
+
+  Analyze this issue and provide:
+  1. Root cause analysis - what is causing the bug?
+  2. Proposed fix - what code changes would resolve this?
+  3. Test considerations - how would you verify the fix works?
+
+  Provide your response with specific file paths and code changes.
+
+scoring:
+  # Adapted for SWE-bench bug-fix scenarios
+  categories:
+    - name: root_cause
+      weight: 30
+      description: "Correctly identifies the underlying cause of the bug"
+      criteria:
+        - id: IDENTIFIES_BUG_LOCATION
+          description: "Points to correct file(s) and function(s)"
+          points: 15
+        - id: EXPLAINS_WHY_BROKEN
+          description: "Explains why current code fails"
+          points: 15
+
+    - name: fix_quality
+      weight: 40
+      description: "Proposes a correct and complete fix"
+      criteria:
+        - id: FIX_ADDRESSES_ISSUE
+          description: "Fix would resolve the reported problem"
+          points: 20
+        - id: FIX_IS_MINIMAL
+          description: "Fix is appropriately scoped, not over-engineered"
+          points: 10
+        - id: FIX_SYNTAX_CORRECT
+          description: "Code changes are syntactically valid"
+          points: 10
+
+    - name: completeness
+      weight: 20
+      description: "Considers edge cases and testing"
+      criteria:
+        - id: EDGE_CASES
+          description: "Considers related scenarios that might break"
+          points: 10
+        - id: TEST_COVERAGE
+          description: "Suggests appropriate test cases"
+          points: 10
+
+    - name: persona
+      weight: 10
+      description: "Maintains character while solving"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Response reflects persona traits"
+          points: 10
+
+# Metadata for full harness evaluation (optional)
+swebench_metadata:
+  fail_to_pass: ["lib/matplotlib/tests/test_axes.py::test_inverted_limits"]
+  environment_version: "3.1"
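For orientation, this scenario targets the behavior exercised by `test_inverted_limits`: explicitly passing descending limits to a log-scaled axis should invert it. A rough verification sketch using only public matplotlib APIs (this is not the repository's actual test, and the outcome depends on the matplotlib version installed):

```python
# Standalone check of the behavior described in the issue: on a fixed
# matplotlib, descending ylim values on a log axis leave the axis inverted.
import matplotlib
matplotlib.use("Agg")  # headless backend so this runs without a display
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3], [1, 10, 100])
ax.set_yscale("log")
ax.set_ylim(100, 1)          # descending limits request an inverted axis
print(ax.get_ylim())         # expected (100.0, 1.0) once the bug is fixed
assert ax.yaxis_inverted()   # fails with the buggy 3.1.0 behaviour reported above
```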

package/scenarios/swe-bench/requests-1142-reviewer.yaml
@@ -0,0 +1,144 @@
+---
+# SWE-bench Reviewer Scenario
+# Adapted from: psf__requests-1142
+# Role: Code Reviewer evaluating a proposed fix
+
+name: requests-1142-reviewer
+title: "Review: Stop sending Content-Length on GET requests"
+category: reviewer
+difficulty: medium
+version: "1.0"
+
+source:
+  benchmark: swe-bench-verified
+  instance_id: psf__requests-1142
+  repo: psf/requests
+  base_commit: 22623bd8c265
+  adapted_for: reviewer
+
+description: |
+  Code review scenario adapted from SWE-bench. The reviewer must evaluate
+  a proposed patch that modifies HTTP header handling.
+
+prompt: |
+  You are reviewing a pull request for the psf/requests library.
+
+  ## Issue Being Fixed
+
+  **Title:** requests.get is ALWAYS sending content length
+
+  **Description:**
+  It seems like request.get always adds 'content-length' header to the request.
+  I think that the right behavior is not to add this header automatically in
+  GET requests or add the possibility to not send it.
+
+  For example http://amazon.com returns 503 for every get request that
+  contains 'content-length' header.
+
+  ## Proposed Patch
+
+  **requests/models.py** - Modified prepare_content_length method:
+  ```python
+  def prepare_content_length(self, body):
+      if hasattr(body, 'seek') and hasattr(body, 'tell'):
+          body.seek(0, 2)
+          self.headers['Content-Length'] = str(body.tell())
+          body.seek(0, 0)
+      elif body is not None:
+          self.headers['Content-Length'] = str(len(body))
+      elif self.method not in ('GET', 'HEAD'):
+          self.headers['Content-Length'] = '0'
+  ```

+  **Before:** Always set Content-Length to '0' when body is None
+  **After:** Only set Content-Length to '0' for non-GET/HEAD when body is None
+
+  **test_requests.py** - New test:
+  ```python
+  def test_no_content_length(self):
+      get_req = requests.Request('GET', httpbin('get')).prepare()
+      self.assertTrue('Content-Length' not in get_req.headers)
+      head_req = requests.Request('HEAD', httpbin('head')).prepare()
+      self.assertTrue('Content-Length' not in head_req.headers)
+  ```

+  ## Your Review Task
+
+  Evaluate this pull request and provide:
+  1. **Correctness** - Does the fix properly address the issue?
+  2. **HTTP Compliance** - Is this behavior correct per RFC 7230/7231?
+  3. **Edge Cases** - What about OPTIONS, DELETE, other methods?
+  4. **Backwards Compatibility** - Could this break existing code?
+  5. **Test Coverage** - Are the tests comprehensive enough?
+  6. **Verdict** - APPROVE, REQUEST_CHANGES, or COMMENT
+
+  Be thorough. Consider HTTP semantics carefully.
+
+scoring:
+  categories:
+    - name: issue_detection
+      weight: 35
+      description: "Identifies legitimate issues with the patch"
+      criteria:
+        - id: OTHER_METHODS_CONCERN
+          description: "Questions whether OPTIONS/DELETE/etc should also skip Content-Length"
+          points: 10
+          severity: medium
+        - id: BODY_WITH_GET
+          description: "Notes GET with body is technically allowed and would still get Content-Length"
+          points: 10
+          severity: medium
+        - id: RFC_COMPLIANCE
+          description: "References or considers HTTP spec compliance"
+          points: 10
+          severity: medium
+        - id: BACKWARDS_COMPAT
+          description: "Considers if existing code relies on Content-Length: 0"
+          points: 5
+          severity: low
+
+    - name: review_quality
+      weight: 30
+      description: "Quality of the review feedback"
+      criteria:
+        - id: CONSTRUCTIVE_FEEDBACK
+          description: "Provides actionable suggestions"
+          points: 15
+        - id: TECHNICAL_ACCURACY
+          description: "HTTP knowledge is correct"
+          points: 15
+
+    - name: verdict_appropriateness
+      weight: 20
+      description: "Appropriate review decision"
+      criteria:
+        - id: REASONABLE_VERDICT
+          description: "Verdict matches analysis"
+          points: 10
+        - id: JUSTIFIED_DECISION
+          description: "Decision is well-reasoned"
+          points: 10
+
+    - name: persona
+      weight: 15
+      description: "Maintains reviewer character"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Response reflects persona traits"
+          points: 15
+
+baseline_issues:
+  critical: []
+  high: []
+  medium:
+    - id: OTHER_METHODS
+      description: "Hardcoded GET/HEAD - what about OPTIONS which also typically has no body?"
+    - id: GET_WITH_BODY
+      description: "If someone sends GET with a body (unusual but valid), Content-Length is still set"
+    - id: RFC_REFERENCE
+      description: "Should reference RFC 7230 section 3.3.2 on Content-Length requirements"
+  low:
+    - id: TEST_OTHER_METHODS
+      description: "Tests only cover GET/HEAD, not POST/PUT to verify they still work"
+    - id: BACKWARDS_COMPAT_NOTE
+      description: "Should note this is a behavior change for changelog"
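Since the review hinges on what `PreparedRequest` actually puts on the wire, one way a reviewer might ground the verdict is to inspect headers at `prepare()` time, with no network traffic. A small sketch using requests' public API (the exact output depends on the requests version installed):

```python
# Spot-check which methods get a Content-Length header at prepare() time.
# With the proposed patch, GET/HEAD without a body should omit the header,
# while methods that normally carry a body keep it.
import requests

for method in ("GET", "HEAD", "POST", "PUT", "OPTIONS", "DELETE"):
    prepared = requests.Request(method, "http://httpbin.org/anything").prepare()
    print(method, prepared.headers.get("Content-Length"))
```

With the patch applied, GET and HEAD print `None` while POST and PUT still report a Content-Length of `0`; OPTIONS and DELETE would also report `0`, which is exactly the edge case the baseline issues above flag.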

package/scenarios/swe-bench/requests-1142-tea.yaml
@@ -0,0 +1,135 @@
+---
+# SWE-bench TEA Scenario
+# Adapted from: psf__requests-1142
+# Role: Test Engineer writing failing tests BEFORE the fix
+
+name: requests-1142-tea
+title: "RED Phase: Write tests for Content-Length header behavior"
+category: tea
+difficulty: medium
+version: "1.0"
+
+source:
+  benchmark: swe-bench-verified
+  instance_id: psf__requests-1142
+  repo: psf/requests
+  base_commit: 22623bd8c265
+  adapted_for: tea
+
+description: |
+  TDD RED phase scenario adapted from SWE-bench. The test engineer must write
+  failing tests that specify correct HTTP header behavior.
+
+prompt: |
+  You are a Test Engineer working on the psf/requests library.
+
+  ## Bug Report
+
+  **Title:** requests.get is ALWAYS sending content length
+
+  **Description:**
+  It seems like request.get always adds 'content-length' header to the request.
+  I think that the right behavior is not to add this header automatically in
+  GET requests or add the possibility to not send it.
+
+  For example http://amazon.com returns 503 for every get request that
+  contains 'content-length' header.
+
+  **Current Behavior:** `requests.get(url)` always sends `Content-Length: 0`
+
+  **Expected Behavior:** GET and HEAD requests without a body should NOT
+  include a Content-Length header.
+
+  ## Context
+
+  Per HTTP/1.1 specs (RFC 7230), Content-Length indicates the size of the
+  message body. For requests without a body (typical GET/HEAD), this header
+  is unnecessary and some servers reject it.
+
+  ## Your Task (RED Phase)
+
+  Write comprehensive failing tests that:
+  1. Verify GET requests don't include Content-Length when no body
+  2. Verify HEAD requests don't include Content-Length when no body
+  3. Verify POST/PUT still include Content-Length (even if 0)
+  4. Will FAIL against the current codebase
+  5. Will PASS once the fix is implemented
+
+  Use unittest style (matching existing test_requests.py).
+  Provide complete, runnable test code.
+
+scoring:
+  categories:
+    - name: test_coverage
+      weight: 40
+      description: "Comprehensive test cases"
+      criteria:
+        - id: GET_NO_CONTENT_LENGTH
+          description: "Tests GET without body has no Content-Length"
+          points: 12
+          severity: critical
+        - id: HEAD_NO_CONTENT_LENGTH
+          description: "Tests HEAD without body has no Content-Length"
+          points: 8
+          severity: high
+        - id: POST_HAS_CONTENT_LENGTH
+          description: "Tests POST still includes Content-Length"
+          points: 8
+          severity: high
+        - id: GET_WITH_BODY
+          description: "Tests GET with body DOES have Content-Length"
+          points: 6
+          severity: medium
+        - id: OTHER_METHODS
+          description: "Tests OPTIONS, DELETE, etc."
+          points: 6
+          severity: medium
+
+    - name: test_quality
+      weight: 30
+      description: "Well-written test code"
+      criteria:
+        - id: PROPER_ASSERTIONS
+          description: "Uses correct assertion methods"
+          points: 10
+        - id: PREPARE_VS_SEND
+          description: "Tests at prepare() level, not requiring network"
+          points: 10
+        - id: CLEAR_STRUCTURE
+          description: "Well-organized, readable tests"
+          points: 10
+
+    - name: red_phase_understanding
+      weight: 15
+      description: "Understands TDD RED phase"
+      criteria:
+        - id: TESTS_SHOULD_FAIL
+          description: "Acknowledges these fail against current code"
+          points: 8
+        - id: BEHAVIOR_FOCUS
+          description: "Tests specify behavior, not implementation"
+          points: 7
+
+    - name: persona
+      weight: 15
+      description: "Maintains TEA character"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Response reflects persona traits"
+          points: 15
+
+baseline_criteria:
+  required_tests:
+    - id: GET_NO_CL
+      description: "GET request without body -> no Content-Length header"
+    - id: HEAD_NO_CL
+      description: "HEAD request without body -> no Content-Length header"
+    - id: POST_HAS_CL
+      description: "POST request -> has Content-Length (regression check)"
+  bonus_tests:
+    - id: GET_WITH_BODY
+      description: "GET with body -> has Content-Length (unusual but valid)"
+    - id: OPTIONS_CHECK
+      description: "OPTIONS method behavior tested"
+    - id: PUT_REGRESSION
+      description: "PUT still has Content-Length"
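For context on what this TEA scenario expects, here is a rough sketch of the three `required_tests` above in the unittest style the prompt asks for. The class name and base URL are illustrative only; the real suite would reuse the `httpbin()` helper and harness already in `test_requests.py`:

```python
# Illustrative RED-phase tests: these should fail on the pre-fix codebase
# (which always sends Content-Length) and pass once GET/HEAD stop sending it.
import unittest

import requests

HTTPBIN = "http://httpbin.org"  # placeholder; the real suite uses its httpbin() helper


class ContentLengthHeaderTests(unittest.TestCase):
    def test_get_has_no_content_length(self):
        # GET with no body should not advertise a body at all.
        prepared = requests.Request("GET", HTTPBIN + "/get").prepare()
        self.assertNotIn("Content-Length", prepared.headers)

    def test_head_has_no_content_length(self):
        prepared = requests.Request("HEAD", HTTPBIN + "/get").prepare()
        self.assertNotIn("Content-Length", prepared.headers)

    def test_post_still_sends_content_length(self):
        # Regression check: methods that normally carry a body keep the header.
        prepared = requests.Request("POST", HTTPBIN + "/post", data={"k": "v"}).prepare()
        self.assertIn("Content-Length", prepared.headers)


if __name__ == "__main__":
    unittest.main()
```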

package/scenarios/swe-bench/requests-1142.yaml
@@ -0,0 +1,100 @@
+---
+# SWE-bench Verified Scenario
+# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+# Instance: psf__requests-1142
+
+name: requests-1142
+title: "requests.get is ALWAYS sending content length"
+category: dev
+difficulty: easy # SWE-bench: <15 min fix
+version: "1.0"
+
+source:
+  benchmark: swe-bench-verified
+  instance_id: psf__requests-1142
+  repo: psf/requests
+  base_commit: 22623bd8c265
+
+description: |
+  Real GitHub issue from psf/requests requiring code changes to resolve.
+  This is a human-validated problem from the SWE-bench Verified dataset.
+
+prompt: |
+  You are working on the psf/requests repository at commit 22623bd8c265.
+
+  A user has reported the following issue:
+
+  ---
+  requests.get is ALWAYS sending content length
+  Hi,
+
+  It seems like that request.get always adds 'content-length' header to the request.
+  I think that the right behavior is not to add this header automatically in GET requests or add the possibility to not send it.
+
+  For example http://amazon.com returns 503 for every get request that contains 'content-length' header.
+
+  Thanks,
+
+  Oren
+
+
+  ---
+
+  Analyze this issue and provide:
+  1. Root cause analysis - what is causing the bug?
+  2. Proposed fix - what code changes would resolve this?
+  3. Test considerations - how would you verify the fix works?
+
+  Provide your response with specific file paths and code changes.
+
+scoring:
+  # Adapted for SWE-bench bug-fix scenarios
+  categories:
+    - name: root_cause
+      weight: 30
+      description: "Correctly identifies the underlying cause of the bug"
+      criteria:
+        - id: IDENTIFIES_BUG_LOCATION
+          description: "Points to correct file(s) and function(s)"
+          points: 15
+        - id: EXPLAINS_WHY_BROKEN
+          description: "Explains why current code fails"
+          points: 15
+
+    - name: fix_quality
+      weight: 40
+      description: "Proposes a correct and complete fix"
+      criteria:
+        - id: FIX_ADDRESSES_ISSUE
+          description: "Fix would resolve the reported problem"
+          points: 20
+        - id: FIX_IS_MINIMAL
+          description: "Fix is appropriately scoped, not over-engineered"
+          points: 10
+        - id: FIX_SYNTAX_CORRECT
+          description: "Code changes are syntactically valid"
+          points: 10
+
+    - name: completeness
+      weight: 20
+      description: "Considers edge cases and testing"
+      criteria:
+        - id: EDGE_CASES
+          description: "Considers related scenarios that might break"
+          points: 10
+        - id: TEST_COVERAGE
+          description: "Suggests appropriate test cases"
+          points: 10
+
+    - name: persona
+      weight: 10
+      description: "Maintains character while solving"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Response reflects persona traits"
+          points: 10
+
+# Metadata for full harness evaluation (optional)
+swebench_metadata:
+  fail_to_pass: ["test_requests.py::RequestsTestCase::test_no_content_length"]
+  environment_version: "1.1"