@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,177 @@
1
+ ---
2
+ # SWE-bench Verified Scenario
3
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
4
+ # Instance: astropy__astropy-13398
5
+
6
+ name: astropy-13398
7
+ title: "A direct approach to ITRS to Observed transformations that stays within the ITRS"
8
+ category: dev
9
+ difficulty: hard # SWE-bench: 1-4 hours
10
+ version: "1.0"
11
+
12
+ source:
13
+ benchmark: swe-bench-verified
14
+ instance_id: astropy__astropy-13398
15
+ repo: astropy/astropy
16
+ base_commit: 6500928dc0e5
17
+
18
+ description: |
19
+ Real GitHub issue from astropy/astropy requiring code changes to resolve.
20
+ This is a human-validated problem from the SWE-bench Verified dataset.
21
+
22
+ prompt: |
23
+ You are working on the astropy/astropy repository at commit 6500928dc0e5.
24
+
25
+ A user has reported the following issue:
26
+
27
+ ---
28
+ A direct approach to ITRS to Observed transformations that stays within the ITRS.
29
+ <!-- This comments are hidden when you submit the issue,
30
+ so you do not need to remove them! -->
31
+
32
+ <!-- Please be sure to check out our contributing guidelines,
33
+ https://github.com/astropy/astropy/blob/main/CONTRIBUTING.md .
34
+ Please be sure to check out our code of conduct,
35
+ https://github.com/astropy/astropy/blob/main/CODE_OF_CONDUCT.md . -->
36
+
37
+ <!-- Please have a search on our GitHub repository to see if a similar
38
+ issue has already been posted.
39
+ If a similar issue is closed, have a quick look to see if you are satisfied
40
+ by the resolution.
41
+ If not please go ahead and open an issue! -->
42
+
43
+ ### Description
44
+ <!-- Provide a general description of the feature you would like. -->
45
+ <!-- If you want to, you can suggest a draft design or API. -->
46
+ <!-- This way we have a deeper discussion on the feature. -->
47
+ We have experienced recurring issues raised by folks that want to observe satellites and such (airplanes?, mountains?, neighboring buildings?) regarding the apparent inaccuracy of the ITRS to AltAz transform. I tire of explaining the problem of geocentric versus topocentric aberration and proposing the entirely nonintuitive solution laid out in `test_intermediate_transformations.test_straight_overhead()`. So, for the latest such issue (#13319), I came up with a more direct approach. This approach stays entirely within the ITRS and merely converts between ITRS, AltAz, and HADec coordinates.
48
+
49
+ I have put together the makings of a pull request that follows this approach for transforms between these frames (i.e. ITRS<->AltAz, ITRS<->HADec). One feature of this approach is that it treats the ITRS position as time invariant. It makes no sense to be doing an ITRS->ITRS transform for differing `obstimes` between the input and output frame, so the `obstime` of the output frame is simply adopted. Even if it ends up being `None` in the case of an `AltAz` or `HADec` output frame where that is the default. This is because the current ITRS->ITRS transform refers the ITRS coordinates to the SSB rather than the rotating ITRF. Since ITRS positions tend to be nearby, any transform from one time to another leaves the poor ITRS position lost in the wake of the Earth's orbit around the SSB, perhaps millions of kilometers from where it is intended to be.
50
+
51
+ Would folks be receptive to this approach? If so, I will submit my pull request.
52
+
53
+ ### Additional context
54
+ <!-- Add any other context or screenshots about the feature request here. -->
55
+ <!-- This part is optional. -->
56
+ Here is the basic concept, which is tested and working. I have yet to add refraction, but I can do so if it is deemed important to do so:
57
+ ```python
58
+ import numpy as np
59
+ from astropy import units as u
60
+ from astropy.coordinates.matrix_utilities import rotation_matrix, matrix_transpose
61
+ from astropy.coordinates.baseframe import frame_transform_graph
62
+ from astropy.coordinates.transformations import FunctionTransformWithFiniteDifference
63
+ from .altaz import AltAz
64
+ from .hadec import HADec
65
+ from .itrs import ITRS
66
+ from .utils import PIOVER2
67
+
68
+ def itrs_to_observed_mat(observed_frame):
69
+
70
+ lon, lat, height = observed_frame.location.to_geodetic('WGS84')
71
+ elong = lon.to_value(u.radian)
72
+
73
+ if isinstance(observed_frame, AltAz):
74
+ # form ITRS to AltAz matrix
75
+ elat = lat.to_value(u.radian)
76
+ # AltAz frame is left handed
77
+ minus_x = np.eye(3)
78
+ minus_x[0][0] = -1.0
79
+ mat = (minus_x
80
+ @ rotation_matrix(PIOVER2 - elat, 'y', unit=u.radian)
81
+ @ rotation_matrix(elong, 'z', unit=u.radian))
82
+
83
+ else:
84
+ # form ITRS to HADec matrix
85
+ # HADec frame is left handed
86
+ minus_y = np.eye(3)
87
+ minus_y[1][1] = -1.0
88
+ mat = (minus_y
89
+ @ rotation_matrix(elong, 'z', unit=u.radian))
90
+ return mat
91
+
92
+ @frame_transform_graph.transform(FunctionTransformWithFiniteDifference, ITRS, AltAz)
93
+ @frame_transform_graph.transform(FunctionTransformWithFiniteDifference, ITRS, HADec)
94
+ def itrs_to_observed(itrs_coo, observed_frame):
95
+ # Trying to synchronize the obstimes here makes no sense. In fact,
96
+ # it's a real gotcha as doing an ITRS->ITRS transform references
97
+ # ITRS coordinates, which should be tied to the Earth, to the SSB.
98
+ # Instead, we treat ITRS coordinates as time invariant here.
99
+
100
+ # form the Topocentric ITRS position
101
+ topocentric_itrs_repr = (itrs_coo.cartesian
102
+ - observed_frame.location.get_itrs().cartesian)
103
+ rep = topocentric_itrs_repr.transform(itrs_to_observed_mat(observed_frame))
104
+ return observed_frame.realize_frame(rep)
105
+
106
+ @frame_transform_graph.transform(FunctionTransformWithFiniteDifference, AltAz, ITRS)
107
+ @frame_transform_graph.transform(FunctionTransformWithFiniteDifference, HADec, ITRS)
108
+ def observed_to_itrs(observed_coo, itrs_frame):
109
+
110
+ # form the Topocentric ITRS position
111
+ topocentric_itrs_repr = observed_coo.cartesian.transform(matrix_transpose(
112
+ itrs_to_observed_mat(observed_coo)))
113
+ # form the Geocentric ITRS position
114
+ rep = topocentric_itrs_repr + observed_coo.location.get_itrs().cartesian
115
+ return itrs_frame.realize_frame(rep)
116
+ ```
117
+
118
+ ---
119
+
120
+ Analyze this issue and provide:
121
+ 1. Root cause analysis - what is causing the bug?
122
+ 2. Proposed fix - what code changes would resolve this?
123
+ 3. Test considerations - how would you verify the fix works?
124
+
125
+ Provide your response with specific file paths and code changes.
126
+
127
+ scoring:
128
+ # Adapted for SWE-bench bug-fix scenarios
129
+ categories:
130
+ - name: root_cause
131
+ weight: 30
132
+ description: "Correctly identifies the underlying cause of the bug"
133
+ criteria:
134
+ - id: IDENTIFIES_BUG_LOCATION
135
+ description: "Points to correct file(s) and function(s)"
136
+ points: 15
137
+ - id: EXPLAINS_WHY_BROKEN
138
+ description: "Explains why current code fails"
139
+ points: 15
140
+
141
+ - name: fix_quality
142
+ weight: 40
143
+ description: "Proposes a correct and complete fix"
144
+ criteria:
145
+ - id: FIX_ADDRESSES_ISSUE
146
+ description: "Fix would resolve the reported problem"
147
+ points: 20
148
+ - id: FIX_IS_MINIMAL
149
+ description: "Fix is appropriately scoped, not over-engineered"
150
+ points: 10
151
+ - id: FIX_SYNTAX_CORRECT
152
+ description: "Code changes are syntactically valid"
153
+ points: 10
154
+
155
+ - name: completeness
156
+ weight: 20
157
+ description: "Considers edge cases and testing"
158
+ criteria:
159
+ - id: EDGE_CASES
160
+ description: "Considers related scenarios that might break"
161
+ points: 10
162
+ - id: TEST_COVERAGE
163
+ description: "Suggests appropriate test cases"
164
+ points: 10
165
+
166
+ - name: persona
167
+ weight: 10
168
+ description: "Maintains character while solving"
169
+ criteria:
170
+ - id: IN_CHARACTER
171
+ description: "Response reflects persona traits"
172
+ points: 10
173
+
174
+ # Metadata for full harness evaluation (optional)
175
+ swebench_metadata:
176
+ fail_to_pass: ["astropy/coordinates/tests/test_intermediate_transformations.py::test_itrs_topo_to_altaz_with_refraction", "astropy/coordinates/tests/test_intermediate_transformations.py::test_itrs_topo_to_hadec_with_refraction", "astropy/coordinates/tests/test_intermediate_transformations.py::test_cirs_itrs_topo"]
177
+ environment_version: "5.0"
@@ -0,0 +1,180 @@
1
+ ---
2
+ # SWE-bench Verified Scenario
3
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
4
+ # Instance: astropy__astropy-14309
5
+
6
+ name: astropy-14309
7
+ title: "IndexError: tuple index out of range in identify_format (io.registry)"
8
+ category: dev
9
+ difficulty: easy # SWE-bench: <15 min fix
10
+ version: "1.0"
11
+
12
+ source:
13
+ benchmark: swe-bench-verified
14
+ instance_id: astropy__astropy-14309
15
+ repo: astropy/astropy
16
+ base_commit: cdb66059a2fe
17
+
18
+ description: |
19
+ Real GitHub issue from astropy/astropy requiring code changes to resolve.
20
+ This is a human-validated problem from the SWE-bench Verified dataset.
21
+
22
+ prompt: |
23
+ You are working on the astropy/astropy repository at commit cdb66059a2fe.
24
+
25
+ A user has reported the following issue:
26
+
27
+ ---
28
+ IndexError: tuple index out of range in identify_format (io.registry)
29
+ <!-- This comments are hidden when you submit the issue,
30
+ so you do not need to remove them! -->
31
+
32
+ <!-- Please be sure to check out our contributing guidelines,
33
+ https://github.com/astropy/astropy/blob/main/CONTRIBUTING.md .
34
+ Please be sure to check out our code of conduct,
35
+ https://github.com/astropy/astropy/blob/main/CODE_OF_CONDUCT.md . -->
36
+
37
+ <!-- Please have a search on our GitHub repository to see if a similar
38
+ issue has already been posted.
39
+ If a similar issue is closed, have a quick look to see if you are satisfied
40
+ by the resolution.
41
+ If not please go ahead and open an issue! -->
42
+
43
+ <!-- Please check that the development version still produces the same bug.
44
+ You can install development version with
45
+ pip install git+https://github.com/astropy/astropy
46
+ command. -->
47
+
48
+ ### Description
49
+ Cron tests in HENDRICS using identify_format have started failing in `devdeps` (e.g. [here](https://github.com/StingraySoftware/HENDRICS/actions/runs/3983832171/jobs/6829483945)) with this error:
50
+ ```
51
+ File "/home/runner/work/HENDRICS/HENDRICS/.tox/py310-test-devdeps/lib/python3.10/site-packages/hendrics/io.py", line 386, in get_file_format
52
+ fmts = identify_format("write", Table, fname, None, [], {})
53
+ File "/home/runner/work/HENDRICS/HENDRICS/.tox/py310-test-devdeps/lib/python3.10/site-packages/astropy/io/registry/compat.py", line 52, in wrapper
54
+ return getattr(registry, method_name)(*args, **kwargs)
55
+ File "/home/runner/work/HENDRICS/HENDRICS/.tox/py310-test-devdeps/lib/python3.10/site-packages/astropy/io/registry/base.py", line 313, in identify_format
56
+ if self._identifiers[(data_format, data_class)](
57
+ File "/home/runner/work/HENDRICS/HENDRICS/.tox/py310-test-devdeps/lib/python3.10/site-packages/astropy/io/fits/connect.py", line 72, in is_fits
58
+ return isinstance(args[0], (HDUList, TableHDU, BinTableHDU, GroupsHDU))
59
+ IndexError: tuple index out of range
60
+ ```
61
+
62
+ As per a Slack conversation with @saimn and @pllim, this should be related to https://github.com/astropy/astropy/commit/2a0c5c6f5b982a76615c544854cd6e7d35c67c7f
63
+
64
+ Citing @saimn: When `filepath` is a string without a FITS extension, the function was returning None, now it executes `isinstance(args[0], ...)`
65
+
66
+ ### Steps to Reproduce
67
+ <!-- Ideally a code example could be provided so we can run it ourselves. -->
68
+ <!-- If you are pasting code, use triple backticks (```) around
69
+ your code snippet. -->
70
+ <!-- If necessary, sanitize your screen output to be pasted so you do not
71
+ reveal secrets like tokens and passwords. -->
72
+ ```
73
+ In [1]: from astropy.io.registry import identify_format
74
+ In [3]: from astropy.table import Table
75
+
76
+ In [4]: identify_format("write", Table, "bububu.ecsv", None, [], {})
77
+ ---------------------------------------------------------------------------
78
+ IndexError Traceback (most recent call last)
79
+ Cell In [4], line 1
80
+ ----> 1 identify_format("write", Table, "bububu.ecsv", None, [], {})
81
+
82
+ File ~/opt/anaconda3/envs/py310/lib/python3.10/site-packages/astropy/io/registry/compat.py:52, in _make_io_func.<locals>.wrapper(registry, *args, **kwargs)
83
+ 50 registry = default_registry
84
+ 51 # get and call bound method from registry instance
85
+ ---> 52 return getattr(registry, method_name)(*args, **kwargs)
86
+
87
+ File ~/opt/anaconda3/envs/py310/lib/python3.10/site-packages/astropy/io/registry/base.py:313, in _UnifiedIORegistryBase.identify_format(self, origin, data_class_required, path, fileobj, args, kwargs)
88
+ 311 for data_format, data_class in self._identifiers:
89
+ 312 if self._is_best_match(data_class_required, data_class, self._identifiers):
90
+ --> 313 if self._identifiers[(data_format, data_class)](
91
+ 314 origin, path, fileobj, *args, **kwargs
92
+ 315 ):
93
+ 316 valid_formats.append(data_format)
94
+ 318 return valid_formats
95
+
96
+ File ~/opt/anaconda3/envs/py310/lib/python3.10/site-packages/astropy/io/fits/connect.py:72, in is_fits(origin, filepath, fileobj, *args, **kwargs)
97
+ 68 if filepath.lower().endswith(
98
+ 69 (".fits", ".fits.gz", ".fit", ".fit.gz", ".fts", ".fts.gz")
99
+ 70 ):
100
+ 71 return True
101
+ ---> 72 return isinstance(args[0], (HDUList, TableHDU, BinTableHDU, GroupsHDU))
102
+
103
+ IndexError: tuple index out of range
104
+
105
+ ```
106
+
107
+
108
+ ### System Details
109
+ <!-- Even if you do not think this is necessary, it is useful information for the maintainers.
110
+ Please run the following snippet and paste the output below:
111
+ import platform; print(platform.platform())
112
+ import sys; print("Python", sys.version)
113
+ import numpy; print("Numpy", numpy.__version__)
114
+ import erfa; print("pyerfa", erfa.__version__)
115
+ import astropy; print("astropy", astropy.__version__)
116
+ import scipy; print("Scipy", scipy.__version__)
117
+ import matplotlib; print("Matplotlib", matplotlib.__version__)
118
+ -->
119
+
120
+
121
+ ---
122
+
123
+ Analyze this issue and provide:
124
+ 1. Root cause analysis - what is causing the bug?
125
+ 2. Proposed fix - what code changes would resolve this?
126
+ 3. Test considerations - how would you verify the fix works?
127
+
128
+ Provide your response with specific file paths and code changes.
129
+
130
+ scoring:
131
+ # Adapted for SWE-bench bug-fix scenarios
132
+ categories:
133
+ - name: root_cause
134
+ weight: 30
135
+ description: "Correctly identifies the underlying cause of the bug"
136
+ criteria:
137
+ - id: IDENTIFIES_BUG_LOCATION
138
+ description: "Points to correct file(s) and function(s)"
139
+ points: 15
140
+ - id: EXPLAINS_WHY_BROKEN
141
+ description: "Explains why current code fails"
142
+ points: 15
143
+
144
+ - name: fix_quality
145
+ weight: 40
146
+ description: "Proposes a correct and complete fix"
147
+ criteria:
148
+ - id: FIX_ADDRESSES_ISSUE
149
+ description: "Fix would resolve the reported problem"
150
+ points: 20
151
+ - id: FIX_IS_MINIMAL
152
+ description: "Fix is appropriately scoped, not over-engineered"
153
+ points: 10
154
+ - id: FIX_SYNTAX_CORRECT
155
+ description: "Code changes are syntactically valid"
156
+ points: 10
157
+
158
+ - name: completeness
159
+ weight: 20
160
+ description: "Considers edge cases and testing"
161
+ criteria:
162
+ - id: EDGE_CASES
163
+ description: "Considers related scenarios that might break"
164
+ points: 10
165
+ - id: TEST_COVERAGE
166
+ description: "Suggests appropriate test cases"
167
+ points: 10
168
+
169
+ - name: persona
170
+ weight: 10
171
+ description: "Maintains character while solving"
172
+ criteria:
173
+ - id: IN_CHARACTER
174
+ description: "Response reflects persona traits"
175
+ points: 10
176
+
177
+ # Metadata for full harness evaluation (optional)
178
+ swebench_metadata:
179
+ fail_to_pass: ["astropy/io/fits/tests/test_connect.py::test_is_fits_gh_14305"]
180
+ environment_version: "5.1"
@@ -0,0 +1,106 @@
1
+ ---
2
+ # SWE-bench Verified Scenario
3
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
4
+ # Instance: django__django-10097
5
+
6
+ name: django-10097
7
+ title: "Make URLValidator reject invalid characters in the username and password"
8
+ category: dev
9
+ difficulty: easy # SWE-bench: <15 min fix
10
+ version: "1.0"
11
+
12
+ source:
13
+ benchmark: swe-bench-verified
14
+ instance_id: django__django-10097
15
+ repo: django/django
16
+ base_commit: b9cf764be62e
17
+
18
+ description: |
19
+ Real GitHub issue from django/django requiring code changes to resolve.
20
+ This is a human-validated problem from the SWE-bench Verified dataset.
21
+
22
+ prompt: |
23
+ You are working on the django/django repository at commit b9cf764be62e.
24
+
25
+ A user has reported the following issue:
26
+
27
+ ---
28
+ Make URLValidator reject invalid characters in the username and password
29
+ Description
30
+
31
+ (last modified by Tim Bell)
32
+
33
+ Since #20003, core.validators.URLValidator accepts URLs with usernames and passwords. RFC 1738 section 3.1 requires "Within the user and password field, any ":", "@", or "/" must be encoded"; however, those characters are currently accepted without being %-encoded. That allows certain invalid URLs to pass validation incorrectly. (The issue originates in Diego Perini's ​gist, from which the implementation in #20003 was derived.)
34
+ An example URL that should be invalid is http://foo/bar@example.com; furthermore, many of the test cases in tests/validators/invalid_urls.txt would be rendered valid under the current implementation by appending a query string of the form ?m=foo@example.com to them.
35
+ I note Tim Graham's concern about adding complexity to the validation regex. However, I take the opposite position to Danilo Bargen about invalid URL edge cases: it's not fine if invalid URLs (even so-called "edge cases") are accepted when the regex could be fixed simply to reject them correctly. I also note that a URL of the form above was encountered in a production setting, so that this is a genuine use case, not merely an academic exercise.
36
+ Pull request: https://github.com/django/django/pull/10097
37
+ Make URLValidator reject invalid characters in the username and password
38
+ Description
39
+
40
+ (last modified by Tim Bell)
41
+
42
+ Since #20003, core.validators.URLValidator accepts URLs with usernames and passwords. RFC 1738 section 3.1 requires "Within the user and password field, any ":", "@", or "/" must be encoded"; however, those characters are currently accepted without being %-encoded. That allows certain invalid URLs to pass validation incorrectly. (The issue originates in Diego Perini's gist, from which the implementation in #20003 was derived.)
43
+ An example URL that should be invalid is http://foo/bar@example.com; furthermore, many of the test cases in tests/validators/invalid_urls.txt would be rendered valid under the current implementation by appending a query string of the form ?m=foo@example.com to them.
44
+ I note Tim Graham's concern about adding complexity to the validation regex. However, I take the opposite position to Danilo Bargen about invalid URL edge cases: it's not fine if invalid URLs (even so-called "edge cases") are accepted when the regex could be fixed simply to reject them correctly. I also note that a URL of the form above was encountered in a production setting, so that this is a genuine use case, not merely an academic exercise.
45
+ Pull request: https://github.com/django/django/pull/10097
46
+
47
+ ---
48
+
49
+ Analyze this issue and provide:
50
+ 1. Root cause analysis - what is causing the bug?
51
+ 2. Proposed fix - what code changes would resolve this?
52
+ 3. Test considerations - how would you verify the fix works?
53
+
54
+ Provide your response with specific file paths and code changes.
55
+
56
+ scoring:
57
+ # Adapted for SWE-bench bug-fix scenarios
58
+ categories:
59
+ - name: root_cause
60
+ weight: 30
61
+ description: "Correctly identifies the underlying cause of the bug"
62
+ criteria:
63
+ - id: IDENTIFIES_BUG_LOCATION
64
+ description: "Points to correct file(s) and function(s)"
65
+ points: 15
66
+ - id: EXPLAINS_WHY_BROKEN
67
+ description: "Explains why current code fails"
68
+ points: 15
69
+
70
+ - name: fix_quality
71
+ weight: 40
72
+ description: "Proposes a correct and complete fix"
73
+ criteria:
74
+ - id: FIX_ADDRESSES_ISSUE
75
+ description: "Fix would resolve the reported problem"
76
+ points: 20
77
+ - id: FIX_IS_MINIMAL
78
+ description: "Fix is appropriately scoped, not over-engineered"
79
+ points: 10
80
+ - id: FIX_SYNTAX_CORRECT
81
+ description: "Code changes are syntactically valid"
82
+ points: 10
83
+
84
+ - name: completeness
85
+ weight: 20
86
+ description: "Considers edge cases and testing"
87
+ criteria:
88
+ - id: EDGE_CASES
89
+ description: "Considers related scenarios that might break"
90
+ points: 10
91
+ - id: TEST_COVERAGE
92
+ description: "Suggests appropriate test cases"
93
+ points: 10
94
+
95
+ - name: persona
96
+ weight: 10
97
+ description: "Maintains character while solving"
98
+ criteria:
99
+ - id: IN_CHARACTER
100
+ description: "Response reflects persona traits"
101
+ points: 10
102
+
103
+ # Metadata for full harness evaluation (optional)
104
+ swebench_metadata:
105
+ fail_to_pass: ["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]
106
+ environment_version: "2.2"
@@ -0,0 +1,140 @@
1
+ ---
2
+ # SWE-bench Verified Scenario
3
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
4
+ # Instance: django__django-10554
5
+
6
+ name: django-10554
7
+ title: "Union queryset with ordering breaks on ordering with derived querysets"
8
+ category: dev
9
+ difficulty: hard # SWE-bench: 1-4 hours
10
+ version: "1.0"
11
+
12
+ source:
13
+ benchmark: swe-bench-verified
14
+ instance_id: django__django-10554
15
+ repo: django/django
16
+ base_commit: 14d026cccb14
17
+
18
+ description: |
19
+ Real GitHub issue from django/django requiring code changes to resolve.
20
+ This is a human-validated problem from the SWE-bench Verified dataset.
21
+
22
+ prompt: |
23
+ You are working on the django/django repository at commit 14d026cccb14.
24
+
25
+ A user has reported the following issue:
26
+
27
+ ---
28
+ Union queryset with ordering breaks on ordering with derived querysets
29
+ Description
30
+
31
+ (last modified by Sergei Maertens)
32
+
33
+ May be related to #29692
34
+ Simple reproduction (the exact models are not relevant I think):
35
+ >>> Dimension.objects.values_list('id', flat=True)
36
+ <QuerySet [10, 11, 12, 13, 14, 15, 16, 17, 18]>
37
+ >>> qs = (
38
+ Dimension.objects.filter(pk__in=[10, 11])
39
+ .union(Dimension.objects.filter(pk__in=[16, 17]))
40
+ .order_by('order')
41
+ )
42
+ >>> qs
43
+ <QuerySet [<Dimension: boeksoort>, <Dimension: grootboek>, <Dimension: kenteken>, <Dimension: activa>]>
44
+ # this causes re-evaluation of the original qs to break
45
+ >>> qs.order_by().values_list('pk', flat=True)
46
+ <QuerySet [16, 11, 10, 17]>
47
+ >>> qs
48
+ [breaks]
49
+ Traceback:
50
+ Traceback (most recent call last):
51
+ File "<input>", line 1, in <module>
52
+ qs
53
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/models/query.py", line 248, in __repr__
54
+ data = list(self[:REPR_OUTPUT_SIZE + 1])
55
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/models/query.py", line 272, in __iter__
56
+ self._fetch_all()
57
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/models/query.py", line 1179, in _fetch_all
58
+ self._result_cache = list(self._iterable_class(self))
59
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/models/query.py", line 53, in __iter__
60
+ results = compiler.execute_sql(chunked_fetch=self.chunked_fetch, chunk_size=self.chunk_size)
61
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 1068, in execute_sql
62
+ cursor.execute(sql, params)
63
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/backends/utils.py", line 100, in execute
64
+ return super().execute(sql, params)
65
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/backends/utils.py", line 68, in execute
66
+ return self._execute_with_wrappers(sql, params, many=False, executor=self._execute)
67
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/backends/utils.py", line 77, in _execute_with_wrappers
68
+ return executor(sql, params, many, context)
69
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/backends/utils.py", line 85, in _execute
70
+ return self.cursor.execute(sql, params)
71
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/utils.py", line 89, in __exit__
72
+ raise dj_exc_value.with_traceback(traceback) from exc_value
73
+ File "/home/bbt/.virtualenvs/ispnext/lib/python3.6/site-packages/django/db/backends/utils.py", line 85, in _execute
74
+ return self.cursor.execute(sql, params)
75
+ django.db.utils.ProgrammingError: ORDER BY position 4 is not in select list
76
+ LINE 1: ...dimensions_dimension"."id" IN (16, 17)) ORDER BY (4) ASC LIM...
77
+ ^
78
+ Evaluating the qs instead of creating a new qs makes the code proceed as expected.
79
+ [dim.id for dim in qs]
80
+
81
+ ---
82
+
83
+ Analyze this issue and provide:
84
+ 1. Root cause analysis - what is causing the bug?
85
+ 2. Proposed fix - what code changes would resolve this?
86
+ 3. Test considerations - how would you verify the fix works?
87
+
88
+ Provide your response with specific file paths and code changes.
89
+
90
+ scoring:
91
+ # Adapted for SWE-bench bug-fix scenarios
92
+ categories:
93
+ - name: root_cause
94
+ weight: 30
95
+ description: "Correctly identifies the underlying cause of the bug"
96
+ criteria:
97
+ - id: IDENTIFIES_BUG_LOCATION
98
+ description: "Points to correct file(s) and function(s)"
99
+ points: 15
100
+ - id: EXPLAINS_WHY_BROKEN
101
+ description: "Explains why current code fails"
102
+ points: 15
103
+
104
+ - name: fix_quality
105
+ weight: 40
106
+ description: "Proposes a correct and complete fix"
107
+ criteria:
108
+ - id: FIX_ADDRESSES_ISSUE
109
+ description: "Fix would resolve the reported problem"
110
+ points: 20
111
+ - id: FIX_IS_MINIMAL
112
+ description: "Fix is appropriately scoped, not over-engineered"
113
+ points: 10
114
+ - id: FIX_SYNTAX_CORRECT
115
+ description: "Code changes are syntactically valid"
116
+ points: 10
117
+
118
+ - name: completeness
119
+ weight: 20
120
+ description: "Considers edge cases and testing"
121
+ criteria:
122
+ - id: EDGE_CASES
123
+ description: "Considers related scenarios that might break"
124
+ points: 10
125
+ - id: TEST_COVERAGE
126
+ description: "Suggests appropriate test cases"
127
+ points: 10
128
+
129
+ - name: persona
130
+ weight: 10
131
+ description: "Maintains character while solving"
132
+ criteria:
133
+ - id: IN_CHARACTER
134
+ description: "Response reflects persona traits"
135
+ points: 10
136
+
137
+ # Metadata for full harness evaluation (optional)
138
+ swebench_metadata:
139
+ fail_to_pass: ["test_union_with_values_list_and_order (queries.test_qs_combinators.QuerySetSetOperationTests)", "test_union_with_values_list_on_annotated_and_unannotated (queries.test_qs_combinators.QuerySetSetOperationTests)"]
140
+ environment_version: "3.0"