@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
package/scenarios/swe-bench/requests-2931.yaml
@@ -0,0 +1,98 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: psf__requests-2931
+
+ name: requests-2931
+ title: "Request with binary payload fails due to calling to_native_string"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: psf__requests-2931
+   repo: psf/requests
+   base_commit: 5f7a3a74aab1
+
+ description: |
+   Real GitHub issue from psf/requests requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the psf/requests repository at commit 5f7a3a74aab1.
+
+   A user has reported the following issue:
+
+   ---
+   Request with binary payload fails due to calling to_native_string
+   Introduced with https://github.com/kennethreitz/requests/issues/2844
+
+   ```
+   import requests
+   requests.put("http://httpbin.org/put", data=u"ööö".encode("utf-8"))
+   ```
+
+   This works with 2.8.1, but not with 2.9.
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["test_requests.py::TestRequests::test_binary_put"]
+   environment_version: "2.9"
package/scenarios/swe-bench/seaborn-3069.yaml
@@ -0,0 +1,102 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: mwaskom__seaborn-3069
+
+ name: seaborn-3069
+ title: "Nominal scale should be drawn the same way as categorical scales"
+ category: dev
+ difficulty: medium # SWE-bench: 15 min - 1 hour
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: mwaskom__seaborn-3069
+   repo: mwaskom/seaborn
+   base_commit: 54cab15bdacf
+
+ description: |
+   Real GitHub issue from mwaskom/seaborn requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the mwaskom/seaborn repository at commit 54cab15bdacf.
+
+   A user has reported the following issue:
+
+   ---
+   Nominal scale should be drawn the same way as categorical scales
+   Three distinctive things happen on the categorical axis in seaborn's categorical plots:
+
+   1. The scale is drawn to +/- 0.5 from the first and last tick, rather than using the normal margin logic
+   2. A grid is not shown, even when it otherwise would be with the active style
+   3. If on the y axis, the axis is inverted
+
+   It probably makes sense to have `so.Nominal` scales (including inferred ones) do this too. Some comments on implementation:
+
+   1. This is actually trickier than you'd think; I may have posted an issue over in matplotlib about this at one point, or just discussed on their gitter. I believe the suggested approach is to add an invisible artist with sticky edges and set the margin to 0. Feels like a hack! I might have looked into setting the sticky edges _on the spine artist_ at one point?
+
+   2. Probably straightforward to do in `Plotter._finalize_figure`. Always a good idea? How do we defer to the theme if the user wants to force a grid? Should the grid be something that is set in the scale object itself
+
+   3. Probably straightforward to implement but I am not exactly sure where would be best.
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["tests/_core/test_plot.py::TestScaling::test_nominal_x_axis_tweaks", "tests/_core/test_plot.py::TestScaling::test_nominal_y_axis_tweaks"]
+   environment_version: "0.12"
package/scenarios/swe-bench/sphinx-7590.yaml
@@ -0,0 +1,108 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: sphinx-doc__sphinx-7590
+
+ name: sphinx-7590
+ title: "C++ User Defined Literals not supported"
+ category: dev
+ difficulty: extreme # SWE-bench: >4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: sphinx-doc__sphinx-7590
+   repo: sphinx-doc/sphinx
+   base_commit: 2e506c5ab457
+
+ description: |
+   Real GitHub issue from sphinx-doc/sphinx requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the sphinx-doc/sphinx repository at commit 2e506c5ab457.
+
+   A user has reported the following issue:
+
+   ---
+   C++ User Defined Literals not supported
+   The code as below
+
+   ```cpp
+   namespace units::si {
+
+   inline constexpr auto planck_constant = 6.62607015e-34q_J * 1q_s;
+
+   }
+   ```
+
+   causes the following error:
+
+   ```
+   WARNING: Invalid definition: Expected end of definition. [error at 58]
+   [build] constexpr auto units::si::planck_constant = 6.62607015e-34q_J * 1q_s
+   [build] ----------------------------------------------------------^
+   ```
+
+   According to <https://github.com/sphinx-doc/sphinx/blob/3.x/sphinx/domains/cpp.py#L4770> Sphinx seems to not have features for UDLs. Could you please add those?
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["tests/test_domain_cpp.py::test_expressions"]
+   environment_version: "3.1"
package/scenarios/swe-bench/xarray-3993.yaml
@@ -0,0 +1,104 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: pydata__xarray-3993
+
+ name: xarray-3993
+ title: "DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg"
+ category: dev
+ difficulty: hard # SWE-bench: 1-4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pydata__xarray-3993
+   repo: pydata/xarray
+   base_commit: 8cc34cb412ba
+
+ description: |
+   Real GitHub issue from pydata/xarray requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the pydata/xarray repository at commit 8cc34cb412ba.
+
+   A user has reported the following issue:
+
+   ---
+   DataArray.integrate has a 'dim' arg, but Dataset.integrate has a 'coord' arg
+   This is just a minor gripe but I think it should be fixed.
+
+   The API syntax is inconsistent:
+   ```python
+   ds.differentiate(coord='x')
+   da.differentiate(coord='x')
+   ds.integrate(coord='x')
+   da.integrate(dim='x') # why dim??
+   ```
+   It should definitely be `coord` - IMO it doesn't make sense to integrate or differentiate over a dim because a dim by definition has no information about the distance between grid points. I think because the distinction between dims and coords is one of the things that new users have to learn about, we should be strict to not confuse up the meanings in the documentation/API.
+
+   The discussion on the original PR [seems to agree](https://github.com/pydata/xarray/pull/2653#discussion_r246164990), so I think this was just an small oversight.
+
+   The only question is whether it requires a deprecation cycle?
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["xarray/tests/test_dataset.py::test_integrate[True]", "xarray/tests/test_dataset.py::test_integrate[False]"]
+   environment_version: "0.12"
package/scenarios/swe-bench/xarray-6992.yaml
@@ -0,0 +1,136 @@
+ ---
+ # SWE-bench Verified Scenario
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
+ # Instance: pydata__xarray-6992
+
+ name: xarray-6992
+ title: "index refactor: more _coord_names than _variables on Dataset"
+ category: dev
+ difficulty: extreme # SWE-bench: >4 hours
+ version: "1.0"
+
+ source:
+   benchmark: swe-bench-verified
+   instance_id: pydata__xarray-6992
+   repo: pydata/xarray
+   base_commit: 45c0a114e2b7
+
+ description: |
+   Real GitHub issue from pydata/xarray requiring code changes to resolve.
+   This is a human-validated problem from the SWE-bench Verified dataset.
+
+ prompt: |
+   You are working on the pydata/xarray repository at commit 45c0a114e2b7.
+
+   A user has reported the following issue:
+
+   ---
+   index refactor: more `_coord_names` than `_variables` on Dataset
+   ### What happened?
+
+   `xr.core.dataset.DataVariables` assumes that everything that is in `ds._dataset._variables` and not in `self._dataset._coord_names` is a "data variable". However, since the index refactor we can end up with more `_coord_names` than `_variables` which breaks a number of stuff (e.g. the repr).
+
+   ### What did you expect to happen?
+
+   Well it seems this assumption is now wrong.
+
+   ### Minimal Complete Verifiable Example
+
+   ```Python
+   ds = xr.Dataset(coords={"a": ("x", [1, 2, 3]), "b": ("x", ['a', 'b', 'c'])})
+   ds.set_index(z=['a', 'b']).reset_index("z", drop=True)
+   ```
+
+
+   ### MVCE confirmation
+
+   - [ ] Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
+   - [ ] Complete example — the example is self-contained, including all data and the text of any traceback.
+   - [ ] Verifiable example — the example copy & pastes into an IPython prompt or [Binder notebook](https://mybinder.org/v2/gh/pydata/xarray/main?urlpath=lab/tree/doc/examples/blank_template.ipynb), returning the result.
+   - [ ] New issue — a search of GitHub Issues suggests this is not a duplicate.
+
+   ### Relevant log output
+
+   ```Python
+   ValueError: __len__() should return >= 0
+   ```
+
+
+   ### Anything else we need to know?
+
+   The error comes from here
+
+   https://github.com/pydata/xarray/blob/63ba862d03c8d0cd8b44d2071bc360e9fed4519d/xarray/core/dataset.py#L368
+
+   Bisected to #5692 - which probably does not help too much.
+
+
+   ### Environment
+
+   <details>
+
+
+
+   </details>
+
+
+   ---
+
+   Analyze this issue and provide:
+   1. Root cause analysis - what is causing the bug?
+   2. Proposed fix - what code changes would resolve this?
+   3. Test considerations - how would you verify the fix works?
+
+   Provide your response with specific file paths and code changes.
+
+ scoring:
+   # Adapted for SWE-bench bug-fix scenarios
+   categories:
+     - name: root_cause
+       weight: 30
+       description: "Correctly identifies the underlying cause of the bug"
+       criteria:
+         - id: IDENTIFIES_BUG_LOCATION
+           description: "Points to correct file(s) and function(s)"
+           points: 15
+         - id: EXPLAINS_WHY_BROKEN
+           description: "Explains why current code fails"
+           points: 15
+
+     - name: fix_quality
+       weight: 40
+       description: "Proposes a correct and complete fix"
+       criteria:
+         - id: FIX_ADDRESSES_ISSUE
+           description: "Fix would resolve the reported problem"
+           points: 20
+         - id: FIX_IS_MINIMAL
+           description: "Fix is appropriately scoped, not over-engineered"
+           points: 10
+         - id: FIX_SYNTAX_CORRECT
+           description: "Code changes are syntactically valid"
+           points: 10
+
+     - name: completeness
+       weight: 20
+       description: "Considers edge cases and testing"
+       criteria:
+         - id: EDGE_CASES
+           description: "Considers related scenarios that might break"
+           points: 10
+         - id: TEST_COVERAGE
+           description: "Suggests appropriate test cases"
+           points: 10
+
+     - name: persona
+       weight: 10
+       description: "Maintains character while solving"
+       criteria:
+         - id: IN_CHARACTER
+           description: "Response reflects persona traits"
+           points: 10
+
+ # Metadata for full harness evaluation (optional)
+ swebench_metadata:
+   fail_to_pass: ["xarray/tests/test_dataarray.py::TestDataArray::test_reset_index", "xarray/tests/test_dataset.py::TestDataset::test_reset_index", "xarray/tests/test_dataset.py::TestDataset::test_reset_index_drop_dims"]
+   environment_version: "2022.06"