@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/scenarios/swe-bench/matplotlib-13989.yaml
@@ -0,0 +1,139 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: matplotlib__matplotlib-13989
+
+ name: matplotlib-13989
+ title: "hist() no longer respects range=... when density=True"
+ category: dev
+ difficulty: easy # SWE-bench: <15 min fix
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: matplotlib__matplotlib-13989
+   repo: matplotlib/matplotlib
+   base_commit: a3e2897bfaf9
+
+ description: |
+   Real GitHub issue from matplotlib/matplotlib requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the matplotlib/matplotlib repository at commit a3e2897bfaf9.
+
+   A user has reported the following issue:
+
+   ---
+   hist() no longer respects range=... when density=True
+   <!--To help us understand and resolve your issue, please fill out the form to the best of your ability.-->
+   <!--You can feel free to delete the sections that do not apply.-->
+
+   ### Bug report
+
+   **Bug summary**
+
+   <!--A short 1-2 sentences that succinctly describes the bug-->
+
+   **Code for reproduction**
+
+   <!--A minimum code snippet required to reproduce the bug.
+   Please make sure to minimize the number of dependencies required, and provide
+   any necessary plotted data.
+   Avoid using threads, as Matplotlib is (explicitly) not thread-safe.-->
+
+   ```python
+   _, bins, _ = plt.hist(np.random.rand(10), "auto", range=(0, 1), density=True)
+   print(bins)
+   ```
+
+   **Actual outcome**
+
+   <!--The output produced by the above code, which may be a screenshot, console output, etc.-->
+
+   ```
+   [0.00331535 0.18930174 0.37528813 0.56127453 0.74726092 0.93324731]
+   ```
+
+   **Expected outcome**
+
+   Some array where the first value is 0 and the last one is 1.
+
+   Note that this bug doesn't happen if density=False.
+
+   Bisects to https://github.com/matplotlib/matplotlib/pull/8638/commits/239be7b18e311c57a1393b6eeefc62b7cc629339 (#8638).
+
+   **Matplotlib version**
+   <!--Please specify your platform and versions of the relevant libraries you are using:-->
+   * Operating system: linux
+   * Matplotlib version: master
+   * Matplotlib backend (`print(matplotlib.get_backend())`): any
+   * Python version: 37
+   * Jupyter version (if applicable): no
+   * Other libraries: numpy 1.16.2
+
+   <!--Please tell us how you installed matplotlib and python e.g., from source, pip, conda-->
+   <!--If you installed from conda, please specify which channel you used if not the default-->
+
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["lib/matplotlib/tests/test_axes.py::test_hist_range_and_density"]
+   environment_version: "3.0"
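For orientation, the fail_to_pass target above corresponds to a check along the lines of the sketch below. The function name is taken from the metadata, but the body is only an assumption that mirrors the issue's expected outcome (bin edges spanning exactly 0 to 1); the actual test in matplotlib's suite may differ.

```python
# Sketch of the regression check implied by the issue above; the real
# test_hist_range_and_density in lib/matplotlib/tests/test_axes.py may differ.
import numpy as np
import matplotlib.pyplot as plt


def check_hist_range_and_density():
    # Reproduction from the report: explicit range=(0, 1) with density=True.
    _, bins, _ = plt.hist(np.random.rand(10), "auto",
                          range=(0, 1), density=True)
    # Expected outcome from the report: first edge is 0, last edge is 1.
    assert bins[0] == 0
    assert bins[-1] == 1
```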
package/scenarios/swe-bench/matplotlib-14623.yaml
@@ -0,0 +1,127 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: matplotlib__matplotlib-14623
+
+ name: matplotlib-14623
+ title: "Inverting an axis using its limits does not work for log scale"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: matplotlib__matplotlib-14623
+   repo: matplotlib/matplotlib
+   base_commit: d65c9ca20ddf
+
+ description: |
+   Real GitHub issue from matplotlib/matplotlib requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the matplotlib/matplotlib repository at commit d65c9ca20ddf.
+
+   A user has reported the following issue:
+
+   ---
+   Inverting an axis using its limits does not work for log scale
+   ### Bug report
+
+   **Bug summary**
+   Starting in matplotlib 3.1.0 it is no longer possible to invert a log axis using its limits.
+
+   **Code for reproduction**
+   ```python
+   import numpy as np
+   import matplotlib.pyplot as plt
+
+
+   y = np.linspace(1000e2, 1, 100)
+   x = np.exp(-np.linspace(0, 1, y.size))
+
+   for yscale in ('linear', 'log'):
+       fig, ax = plt.subplots()
+       ax.plot(x, y)
+       ax.set_yscale(yscale)
+       ax.set_ylim(y.max(), y.min())
+   ```
+
+   **Actual outcome**
+   The yaxis is only inverted for the ``"linear"`` scale.
+
+   ![linear](https://user-images.githubusercontent.com/9482218/60081191-99245e80-9731-11e9-9e4a-eadb3ef58666.png)
+
+   ![log](https://user-images.githubusercontent.com/9482218/60081203-9e81a900-9731-11e9-8bae-0be1c9762b16.png)
+
+   **Expected outcome**
+   I would expect the yaxis to be inverted for both the ``"linear"`` and the ``"log"`` scale.
+
+   **Matplotlib version**
+   * Operating system: Linux and MacOS
+   * Matplotlib version: 3.1.0
+   * Python version: 3.7.3
+
+   Python and matplotlib have been installed using conda.
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["lib/matplotlib/tests/test_axes.py::test_inverted_limits"]
+   environment_version: "3.1"
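Similarly, the test_inverted_limits target above can be pictured as the rough sketch below. It reuses the reproduction code from the issue and asserts only that the requested limits stay inverted on both scales; the real matplotlib test may be organized differently.

```python
# Sketch of an inverted-limits check for the issue above; the actual
# test_inverted_limits in matplotlib's suite may differ in structure.
import numpy as np
import matplotlib.pyplot as plt


def check_inverted_limits():
    y = np.linspace(1000e2, 1, 100)
    x = np.exp(-np.linspace(0, 1, y.size))
    for yscale in ('linear', 'log'):
        fig, ax = plt.subplots()
        ax.plot(x, y)
        ax.set_yscale(yscale)
        # Request limits in descending order, i.e. an inverted axis.
        ax.set_ylim(y.max(), y.min())
        bottom, top = ax.get_ylim()
        # Expected outcome from the report: inversion holds on both scales.
        assert bottom > top, f"y-axis not inverted for {yscale} scale"
```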
package/scenarios/swe-bench/requests-1142-reviewer.yaml
@@ -0,0 +1,144 @@
+ ---
+ # SWE-bench Reviewer Scenario
+ # Adapted from: psf__requests-1142
+ # Role: Code Reviewer evaluating a proposed fix
+
+ name: requests-1142-reviewer
+ title: "Review: Stop sending Content-Length on GET requests"
+ category: reviewer
+ difficulty: medium
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: psf__requests-1142
+   repo: psf/requests
+   base_commit: 22623bd8c265
+   adapted_for: reviewer
+
+ description: |
+   Code review scenario adapted from SWE-bench. The reviewer must evaluate
+   a proposed patch that modifies HTTP header handling.
+
+ prompt: |
+   You are reviewing a pull request for the psf/requests library.
+
+   ## Issue Being Fixed
+
+   **Title:** requests.get is ALWAYS sending content length
+
+   **Description:**
+   It seems like request.get always adds 'content-length' header to the request.
+   I think that the right behavior is not to add this header automatically in
+   GET requests or add the possibility to not send it.
+
+   For example http://amazon.com returns 503 for every get request that
+   contains 'content-length' header.
+
+   ## Proposed Patch
+
+   **requests/models.py** - Modified prepare_content_length method:
+   ```python
+   def prepare_content_length(self, body):
+       if hasattr(body, 'seek') and hasattr(body, 'tell'):
+           body.seek(0, 2)
+           self.headers['Content-Length'] = str(body.tell())
+           body.seek(0, 0)
+       elif body is not None:
+           self.headers['Content-Length'] = str(len(body))
+       elif self.method not in ('GET', 'HEAD'):
+           self.headers['Content-Length'] = '0'
+   ```
+
+   **Before:** Always set Content-Length to '0' when body is None
+   **After:** Only set Content-Length to '0' for non-GET/HEAD when body is None
+
+   **test_requests.py** - New test:
+   ```python
+   def test_no_content_length(self):
+       get_req = requests.Request('GET', httpbin('get')).prepare()
+       self.assertTrue('Content-Length' not in get_req.headers)
+       head_req = requests.Request('HEAD', httpbin('head')).prepare()
+       self.assertTrue('Content-Length' not in head_req.headers)
+   ```
+
+   ## Your Review Task
+
+   Evaluate this pull request and provide:
+   1. **Correctness** - Does the fix properly address the issue?
+   2. **HTTP Compliance** - Is this behavior correct per RFC 7230/7231?
+   3. **Edge Cases** - What about OPTIONS, DELETE, other methods?
+   4. **Backwards Compatibility** - Could this break existing code?
+   5. **Test Coverage** - Are the tests comprehensive enough?
+   6. **Verdict** - APPROVE, REQUEST_CHANGES, or COMMENT
+
+   Be thorough. Consider HTTP semantics carefully.
+
+ scoring:
+   categories:
+     - name: issue_detection
+       weight: 35
+       description: "Identifies legitimate issues with the patch"
+       criteria:
+         - id: OTHER_METHODS_CONCERN
+           description: "Questions whether OPTIONS/DELETE/etc should also skip Content-Length"
+           points: 10
+           severity: medium
+         - id: BODY_WITH_GET
+           description: "Notes GET with body is technically allowed and would still get Content-Length"
+           points: 10
+           severity: medium
+         - id: RFC_COMPLIANCE
+           description: "References or considers HTTP spec compliance"
+           points: 10
+           severity: medium
+         - id: BACKWARDS_COMPAT
+           description: "Considers if existing code relies on Content-Length: 0"
+           points: 5
+           severity: low
+
+     - name: review_quality
+       weight: 30
+       description: "Quality of the review feedback"
+       criteria:
+         - id: CONSTRUCTIVE_FEEDBACK
+           description: "Provides actionable suggestions"
+           points: 15
+         - id: TECHNICAL_ACCURACY
+           description: "HTTP knowledge is correct"
+           points: 15
+
+     - name: verdict_appropriateness
+       weight: 20
+       description: "Appropriate review decision"
+       criteria:
+         - id: REASONABLE_VERDICT
+           description: "Verdict matches analysis"
+           points: 10
+         - id: JUSTIFIED_DECISION
+           description: "Decision is well-reasoned"
+           points: 10
+
+     - name: persona
+       weight: 15
+       description: "Maintains reviewer character"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 15
+
+ baseline_issues:
+   critical: []
+   high: []
+   medium:
+     - id: OTHER_METHODS
+       description: "Hardcoded GET/HEAD - what about OPTIONS which also typically has no body?"
+     - id: GET_WITH_BODY
+       description: "If someone sends GET with a body (unusual but valid), Content-Length is still set"
+     - id: RFC_REFERENCE
+       description: "Should reference RFC 7230 section 3.3.2 on Content-Length requirements"
+   low:
+     - id: TEST_OTHER_METHODS
+       description: "Tests only cover GET/HEAD, not POST/PUT to verify they still work"
+     - id: BACKWARDS_COMPAT_NOTE
+       description: "Should note this is a behavior change for changelog"
package/scenarios/swe-bench/requests-1142-tea.yaml
@@ -0,0 +1,135 @@
+ ---
+ # SWE-bench TEA Scenario
+ # Adapted from: psf__requests-1142
+ # Role: Test Engineer writing failing tests BEFORE the fix
+
+ name: requests-1142-tea
+ title: "RED Phase: Write tests for Content-Length header behavior"
+ category: tea
+ difficulty: medium
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: psf__requests-1142
+   repo: psf/requests
+   base_commit: 22623bd8c265
+   adapted_for: tea
+
+ description: |
+   TDD RED phase scenario adapted from SWE-bench. The test engineer must write
+   failing tests that specify correct HTTP header behavior.
+
+ prompt: |
+   You are a Test Engineer working on the psf/requests library.
+
+   ## Bug Report
+
+   **Title:** requests.get is ALWAYS sending content length
+
+   **Description:**
+   It seems like request.get always adds 'content-length' header to the request.
+   I think that the right behavior is not to add this header automatically in
+   GET requests or add the possibility to not send it.
+
+   For example http://amazon.com returns 503 for every get request that
+   contains 'content-length' header.
+
+   **Current Behavior:** `requests.get(url)` always sends `Content-Length: 0`
+
+   **Expected Behavior:** GET and HEAD requests without a body should NOT
+   include a Content-Length header.
+
+   ## Context
+
+   Per HTTP/1.1 specs (RFC 7230), Content-Length indicates the size of the
+   message body. For requests without a body (typical GET/HEAD), this header
+   is unnecessary and some servers reject it.
+
+   ## Your Task (RED Phase)
+
+   Write comprehensive failing tests that:
+   1. Verify GET requests don't include Content-Length when no body
+   2. Verify HEAD requests don't include Content-Length when no body
+   3. Verify POST/PUT still include Content-Length (even if 0)
+   4. Will FAIL against the current codebase
+   5. Will PASS once the fix is implemented
+
+   Use unittest style (matching existing test_requests.py).
+   Provide complete, runnable test code.
+
+ scoring:
+   categories:
+     - name: test_coverage
+       weight: 40
+       description: "Comprehensive test cases"
+       criteria:
+         - id: GET_NO_CONTENT_LENGTH
+           description: "Tests GET without body has no Content-Length"
+           points: 12
+           severity: critical
+         - id: HEAD_NO_CONTENT_LENGTH
+           description: "Tests HEAD without body has no Content-Length"
+           points: 8
+           severity: high
+         - id: POST_HAS_CONTENT_LENGTH
+           description: "Tests POST still includes Content-Length"
+           points: 8
+           severity: high
+         - id: GET_WITH_BODY
+           description: "Tests GET with body DOES have Content-Length"
+           points: 6
+           severity: medium
+         - id: OTHER_METHODS
+           description: "Tests OPTIONS, DELETE, etc."
+           points: 6
+           severity: medium
+
+     - name: test_quality
+       weight: 30
+       description: "Well-written test code"
+       criteria:
+         - id: PROPER_ASSERTIONS
+           description: "Uses correct assertion methods"
+           points: 10
+         - id: PREPARE_VS_SEND
+           description: "Tests at prepare() level, not requiring network"
+           points: 10
+         - id: CLEAR_STRUCTURE
+           description: "Well-organized, readable tests"
+           points: 10
+
+     - name: red_phase_understanding
+       weight: 15
+       description: "Understands TDD RED phase"
+       criteria:
+         - id: TESTS_SHOULD_FAIL
+           description: "Acknowledges these fail against current code"
+           points: 8
+         - id: BEHAVIOR_FOCUS
+           description: "Tests specify behavior, not implementation"
+           points: 7
+
+     - name: persona
+       weight: 15
+       description: "Maintains TEA character"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 15
+
+ baseline_criteria:
+   required_tests:
+     - id: GET_NO_CL
+       description: "GET request without body -> no Content-Length header"
+     - id: HEAD_NO_CL
+       description: "HEAD request without body -> no Content-Length header"
+     - id: POST_HAS_CL
+       description: "POST request -> has Content-Length (regression check)"
+   bonus_tests:
+     - id: GET_WITH_BODY
+       description: "GET with body -> has Content-Length (unusual but valid)"
+     - id: OPTIONS_CHECK
+       description: "OPTIONS method behavior tested"
+     - id: PUT_REGRESSION
+       description: "PUT still has Content-Length"
package/scenarios/swe-bench/requests-1142.yaml
@@ -0,0 +1,100 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: psf__requests-1142
+
+ name: requests-1142
+ title: "requests.get is ALWAYS sending content length"
+ category: dev
+ difficulty: easy # SWE-bench: <15 min fix
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: psf__requests-1142
+   repo: psf/requests
+   base_commit: 22623bd8c265
+
+ description: |
+   Real GitHub issue from psf/requests requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the psf/requests repository at commit 22623bd8c265.
+
+   A user has reported the following issue:
+
+   ---
+   requests.get is ALWAYS sending content length
+   Hi,
+
+   It seems like that request.get always adds 'content-length' header to the request.
+   I think that the right behavior is not to add this header automatically in GET requests or add the possibility to not send it.
+
+   For example http://amazon.com returns 503 for every get request that contains 'content-length' header.
+
+   Thanks,
+
+   Oren
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["test_requests.py::RequestsTestCase::test_no_content_length"]
+   environment_version: "1.1"