@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,397 @@
1
+ ---
2
+ # Scenario: Three-Sprint Failure Recovery
3
+ # Category: sm
4
+ # Purpose: Test recovery leadership in catastrophic team dysfunction
5
+
6
+ id: sm-007
7
+ name: three-sprint-failure
8
+ title: "Three-Sprint Failure: The Remediation Meeting"
9
+ category: sm
10
+ difficulty: extreme
11
+ version: "1.0"
12
+
13
+ # Target baseline: mean 55-70 (extreme scenario)
14
+ # Difficulty drivers: Cascading failures, broken trust, no quick fixes, bad news delivery
15
+
16
+ description: |
17
+ Team has missed commitments for 3 consecutive sprints. Management demands
18
+ a "remediation plan" by Friday. Two top performers resigned in the last two weeks.
19
+ Remaining team members blame each other. Technical debt is crushing velocity.
20
+ Backlog is 6 months behind. Previous retrospectives produced nothing useful.
21
+ SM must somehow chart a path forward when there isn't one.
22
+
23
+ purpose: |
24
+ This scenario tests leadership in catastrophic failure. There is no solution
25
+ that works quickly. The SM must deliver bad news to management, rebuild a
26
+ broken team, address root causes that run deep, and do so while the team
27
+ is actively dysfunctional and hostile. Every "right" answer creates new
28
+ problems. Measures ability to lead when leadership seems impossible.
29
+
30
+ prompt: |
31
+ You are the Scrum Master for Team Phoenix. You've been with the team for
32
+ 6 months. You've just been summoned to a meeting with VP Engineering.
33
+
34
+ **THE MEETING (VP Chen, 30 minutes ago):**
35
+
36
+ "I need to be direct. Phoenix has missed sprint commitments three times in
37
+ a row. We're 6 months behind on the roadmap. Two of your best people just
38
+ quit. Customer Success is escalating daily. The board is asking questions.
39
+
40
+ I need a remediation plan on my desk by Friday. Not another 'we'll try harder'
41
+ plan. A real plan with metrics, milestones, and accountability. I'm giving you
42
+ one more sprint to show improvement, or we're going to have to consider...
43
+ other options.
44
+
45
+ I'm not blaming you. But I need you to fix this. What's your plan?"
46
+
47
+ You have 3 days to produce a plan. You have a team meeting in 2 hours.
48
+
49
+ **THE TEAM SITUATION:**
50
+
51
+ **Who Left (last 2 weeks):**
52
+ - Senior Dev Maya (4 years): Exit interview - "Toxic environment, no leadership,
53
+ management doesn't listen. I've been saying we need to address tech debt for
54
+ a year. Nobody cared until now." Joined competitor.
55
+ - Tech Lead Derek (3 years): Gave 2 weeks notice, working it out. Barely speaking
56
+ to anyone. When asked why: "I'm tired of being set up to fail."
57
+
58
+ **Who Remains (4 people):**
59
+
60
+ **Senior Dev Pat:**
61
+ Furious at Maya and Derek for leaving: "They abandoned us. Now we're supposed
62
+ to pick up their work with no handoff? And we're the ones on the hook for a
63
+ 'remediation plan'? They caused this mess and got to walk away."
64
+
65
+ Actually, Pat has been the primary blocker on code reviews, sitting on PRs for
66
+ days. Team doesn't mention this to Pat's face.
67
+
68
+ **Dev Jordan:**
69
+ Was Maya's mentee, now lost without guidance. Has been making more errors since
70
+ Maya announced departure. Quietly told you: "I think Pat is part of the problem,
71
+ but I can't say that out loud. Pat has been here longer than me. Who would
72
+ believe me?"
73
+
74
+ **Dev Sam:**
75
+ Burned out, phoning it in. Takes every meeting from home with camera off.
76
+ Responds to Slack hours late. Other team members suspect Sam is interviewing.
77
+ When confronted gently, Sam said: "What's the point? We're just going to miss
78
+ again anyway."
79
+
80
+ **Junior Dev Alex:**
81
+ Only been here 4 months. Trying hard but constantly blocked by Pat's review
82
+ delays and lack of documentation from departed seniors. Asked you privately:
83
+ "Is the team always like this? Should I be looking for another job too?"
84
+
85
+ **THE HISTORICAL PATTERN:**
86
+
87
+ Sprint -3: Committed 30 points, delivered 18.
88
+ - Retro action items: "Improve estimation", "More pairing"
89
+ - What happened: Nothing changed
90
+
91
+ Sprint -2: Committed 25 points, delivered 12.
92
+ - Retro action items: "Address tech debt", "Reduce WIP"
93
+ - What happened: Derek's resignation announced mid-sprint
94
+
95
+ Sprint -1: Committed 20 points, delivered 8.
96
+ - Retro action items: "Team needs support", "Escalate to management"
97
+ - What happened: Maya quit. Sam checked out. Pat became hostile.
98
+
99
+ **THE UNDERLYING PROBLEMS (that no one wants to name):**
100
+
101
+ 1. **Tech Debt:** 3 years of shortcuts. Deployment takes 4 hours. Test suite
102
+ takes 2 hours. CI fails randomly 30% of the time. No one knows how the
103
+ payment module works: its original author left 2 years ago, and Maya, the only remaining expert, just quit.
104
+
105
+ 2. **Pat Problem:** Senior team member is a bottleneck and morale drain, but
106
+ has been here 5 years and management sees them as "the reliable one."
107
+
108
+ 3. **Management Disconnect:** VP Chen hasn't attended a sprint review in 8
109
+ months. Previous escalations were acknowledged and ignored.
110
+
111
+ 4. **Process Theater:** Team does "Scrum" but it's just meetings. Stories
112
+ aren't properly refined. Acceptance criteria are vague. Definition of
113
+ Done is ignored.
114
+
115
+ 5. **Capacity Lie:** Team has been overcommitting for a year because management
116
+ pressures for "stretch goals." Failure was predictable.
117
+
118
+ **YOUR CHALLENGE:**
119
+
120
+ VP Chen wants a remediation plan by Friday. You need to:
121
+
122
+ 1. Somehow produce a plan that's honest but doesn't get you fired
123
+ 2. Run a team meeting in 2 hours with a hostile, demoralized, fragmenting team
124
+ 3. Address the Pat problem without evidence and without Pat's cooperation
125
+ 4. Rebuild a team that has no trust in each other or leadership
126
+ 5. Deliver bad news (this will take months to fix) to a VP who wants results in 1 sprint
127
+ 6. Prevent Alex from quitting, reengage Sam, and manage Jordan's grief
128
+ 7. Handle Derek's remaining time productively
129
+
130
+ **CONSTRAINTS:**
131
+ - You cannot fire Pat (not your authority, and HR requires documentation you don't have)
132
+ - You cannot promise quick results (would be a lie)
133
+ - You cannot blame departed team members (unprofessional and inaccurate)
134
+ - You cannot hide the severity from VP Chen (will backfire)
135
+ - You cannot force Sam to engage (they'll just quit)
136
+ - You cannot skip the team meeting (makes everything worse)
137
+
138
+ **YOUR TASK:**
139
+ 1. What do you put in the remediation plan for VP Chen?
140
+ 2. How do you run the team meeting in 2 hours?
141
+ 3. What do you say to each team member?
142
+ 4. How do you address the Pat situation?
143
+ 5. What's your honest assessment of how long recovery will take?
144
+ 6. What do you escalate, and how?
145
+
146
+ There is no quick fix. What do you do?
147
+
148
+ context:
149
+ failure_metrics:
150
+ sprints_missed: 3
151
+ roadmap_delay: "6 months"
152
+ velocity_trend:
153
+ committed: [30, 25, 20]
154
+ delivered: [18, 12, 8]
155
+ attrition: "2 of 6 (33%) in 2 weeks"
156
+ remaining_capacity: "~60% of historical (generous estimate)"
157
+
158
+ departed_team:
159
+ maya:
160
+ role: Senior Dev
161
+ tenure: 4 years
162
+ reason: "Toxic environment, tech debt ignored"
163
+ status: "Gone, joined competitor"
164
+ knowledge_gap: "Primary payment module expert"
165
+
166
+ derek:
167
+ role: Tech Lead
168
+ tenure: 3 years
169
+ reason: "Set up to fail"
170
+ status: "Working notice, disengaged"
171
+ knowledge_gap: "Architecture decisions, deployment pipeline"
172
+
173
+ remaining_team:
174
+ pat:
175
+ role: Senior Dev
176
+ tenure: 5 years
177
+ state: "Hostile, blaming departed"
178
+ problem: "Code review bottleneck, morale drain"
179
+ politics: "Protected by tenure, management perception"
180
+
181
+ jordan:
182
+ role: Developer
183
+ tenure: 2 years
184
+ state: "Lost without Maya, afraid to speak up"
185
+ need: "Mentorship replacement, psychological safety"
186
+
187
+ sam:
188
+ role: Developer
189
+ tenure: 3 years
190
+ state: "Checked out, likely interviewing"
191
+ risk: "Will quit if pushed"
192
+
193
+ alex:
194
+ role: Junior Dev
195
+ tenure: 4 months
196
+ state: "Confused, considering leaving"
197
+ need: "Stability, hope, unblocking"
198
+
199
+ systemic_issues:
200
+ tech_debt:
201
+ deployment_time: "4 hours"
202
+ test_suite_time: "2 hours"
203
+ ci_failure_rate: "30% random failures"
204
+ undocumented_systems: ["payment module", "legacy integrations"]
205
+
206
+ process_failures:
207
+ - "Retro actions never implemented"
208
+ - "Stories not refined"
209
+ - "DoD ignored"
210
+ - "Chronic overcommitment"
211
+
212
+ management_failures:
213
+ - "VP absent from reviews for 8 months"
214
+ - "Previous escalations ignored"
215
+ - "Unrealistic 'stretch goals' pressure"
216
+
217
+ timeline_pressure:
218
+ plan_due: "Friday (3 days)"
219
+ improvement_deadline: "1 sprint"
220
+ consequence: "'Other options' (unclear but threatening)"
221
+
222
+ # =============================================================================
223
+ # EVALUATION CRITERIA - Extreme difficulty
224
+ # =============================================================================
225
+
226
+ baseline_criteria:
227
+ honest_assessment:
228
+ - id: NAMES_TIMELINE
229
+ description: "States realistic recovery timeline (months, not weeks)"
230
+ difficulty: "Very hard - VP wants quick results"
231
+
232
+ - id: IDENTIFIES_ROOT_CAUSES
233
+ description: "Names tech debt, process, management issues"
234
+ difficulty: "Hard - some are politically sensitive"
235
+
236
+ - id: DOESNT_SCAPEGOAT
237
+ description: "Avoids blaming departed team members"
238
+ difficulty: "Medium - tempting and expected"
239
+
240
+ team_leadership:
241
+ - id: HANDLES_HOSTILE_ROOM
242
+ description: "Runs meeting with angry, demoralized team"
243
+ difficulty: "Very hard - no good opening"
244
+
245
+ - id: INDIVIDUAL_APPROACHES
246
+ description: "Different strategy for Pat, Jordan, Sam, Alex"
247
+ difficulty: "Hard - requires nuanced leadership"
248
+
249
+ - id: ADDRESSES_PAT
250
+ description: "Finds way to address bottleneck without evidence/authority"
251
+ difficulty: "Extreme - political, interpersonal, structural barriers"
252
+
253
+ upward_management:
254
+ - id: DELIVERS_BAD_NEWS
255
+ description: "Tells VP Chen the truth about timeline"
256
+ difficulty: "Very hard - career risk"
257
+
258
+ - id: PROPOSES_REALISTIC_PLAN
259
+ description: "Creates plan that could actually work"
260
+ difficulty: "Hard - needs to balance honesty with hope"
261
+
262
+ - id: ASKS_FOR_SUPPORT
263
+ description: "Requests what's actually needed (time, resources, air cover)"
264
+ difficulty: "Medium - requires knowing what to ask for"
265
+
266
+ recovery_strategy:
267
+ - id: IMMEDIATE_ACTIONS
268
+ description: "Identifies what to do THIS sprint"
269
+ difficulty: "Medium - still need to show progress"
270
+
271
+ - id: MEDIUM_TERM_PLAN
272
+ description: "Outlines 3-month realistic recovery"
273
+ difficulty: "Hard - many moving pieces"
274
+
275
+ - id: PREVENTS_MORE_ATTRITION
276
+ description: "Strategy to keep Alex and reengage Sam"
277
+ difficulty: "Hard - limited tools available"
278
+
279
+ bonus_criteria:
280
+ exceptional_leadership:
281
+ - id: DOCUMENTS_MANAGEMENT_ROLE
282
+ description: "Diplomatically notes management's contribution to failure"
283
+
284
+ - id: CREATES_ACCOUNTABILITY
285
+ description: "Gets VP Chen commitment to support"
286
+
287
+ - id: FINDS_PAT_APPROACH
288
+ description: "Creative solution to Pat bottleneck"
289
+
290
+ - id: DEREK_KNOWLEDGE_TRANSFER
291
+ description: "Uses remaining Derek time productively"
292
+
293
+ # =============================================================================
294
+ # SCORING - Heavily weighted toward hard parts
295
+ # =============================================================================
296
+
297
+ scoring:
298
+ categories:
299
+ - name: honest_assessment
300
+ weight: 30
301
+ criteria:
302
+ - id: REALITY_CHECK
303
+ description: "Names true scope of problem"
304
+ points: 15
305
+ - id: ROOT_CAUSES
306
+ description: "Identifies systemic issues"
307
+ points: 10
308
+ - id: TIMELINE
309
+ description: "Realistic recovery timeline"
310
+ points: 5
311
+
312
+ - name: team_leadership
313
+ weight: 30
314
+ criteria:
315
+ - id: MEETING_MANAGEMENT
316
+ description: "Handles hostile team meeting"
317
+ points: 12
318
+ - id: INDIVIDUAL_CARE
319
+ description: "Appropriate approach per person"
320
+ points: 10
321
+ - id: PAT_STRATEGY
322
+ description: "Addresses bottleneck somehow"
323
+ points: 8
324
+
325
+ - name: upward_management
326
+ weight: 25
327
+ criteria:
328
+ - id: BAD_NEWS_DELIVERY
329
+ description: "Honest with VP Chen"
330
+ points: 10
331
+ - id: PLAN_QUALITY
332
+ description: "Viable remediation plan"
333
+ points: 10
334
+ - id: SUPPORT_REQUEST
335
+ description: "Asks for what's needed"
336
+ points: 5
337
+
338
+ - name: persona
339
+ weight: 15
340
+ criteria:
341
+ - id: CHARACTER_CONSISTENCY
342
+ description: "Stays in character under extreme pressure"
343
+ points: 8
344
+ - id: PERSONA_LEADERSHIP
345
+ description: "Persona influences leadership style"
346
+ points: 7
347
+
348
+ # =============================================================================
349
+ # PERSONA INFLUENCE
350
+ # =============================================================================
351
+
352
+ persona_influence:
353
+ dimensions:
354
+ - name: honesty_level
355
+ description: "How much truth to tell VP Chen"
356
+ spectrum:
357
+ full: "Complete transparency about all issues"
358
+ strategic: "Key truths, framed carefully"
359
+ minimal: "What's necessary, no more"
360
+
361
+ - name: team_approach
362
+ description: "How to handle the dysfunctional team"
363
+ spectrum:
364
+ directive: "Take charge, set expectations"
365
+ facilitative: "Create space for team to find path"
366
+ supportive: "Focus on individual care first"
367
+
368
+ - name: pat_handling
369
+ description: "How to address the Pat problem"
370
+ spectrum:
371
+ confrontational: "Address directly in meeting"
372
+ indirect: "Work around through process changes"
373
+ avoidant: "Focus elsewhere, hope it resolves"
374
+
375
+ expected_tendencies:
376
+ discworld_sm:
377
+ character: "Captain Carrot"
378
+ expected_traits:
379
+ - "Strong sense of right and wrong"
380
+ - "May be too optimistic about recovery"
381
+ - "Likely to be direct with Pat"
382
+ prediction: "High integrity, may underestimate difficulty"
383
+
384
+ star_trek_sm:
385
+ character: "Deanna Troi"
386
+ expected_traits:
387
+ - "Strong empathy for individuals"
388
+ - "May focus too much on feelings vs. plan"
389
+ - "Good at reading room"
390
+ prediction: "Emotional support strong, may lack strategic edge"
391
+
392
+ control_sm:
393
+ character: "None (baseline)"
394
+ expected_traits:
395
+ - "Standard Scrum Master approach"
396
+ - "May rely on process when process isn't the answer"
397
+ prediction: "Will struggle with depth of dysfunction"
@@ -0,0 +1,57 @@
1
+ # SWE-bench Verified Scenarios
2
+
3
+ External benchmark imported from [princeton-nlp/SWE-bench_Verified](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified).
4
+
5
+ ## Source
6
+
7
+ SWE-bench is a benchmark developed by Princeton that evaluates language models on real-world GitHub issue resolution. SWE-bench Verified is a curated subset of 500 human-validated problems.
8
+
9
+ ## Difficulty Mapping
10
+
11
+ | SWE-bench Label | Pennyfarthing Difficulty | Count |
12
+ |-----------------|-------------------------|-------|
13
+ | `<15 min fix` | easy | 194 |
14
+ | `15 min - 1 hour` | medium | 261 |
15
+ | `1-4 hours` | hard | 42 |
16
+ | `>4 hours` | extreme | 3 |
17
+
18
+ ## Repository Distribution (top 10)
19
+
20
+ - django/django: 231
21
+ - sympy/sympy: 75
22
+ - sphinx-doc/sphinx: 44
23
+ - matplotlib/matplotlib: 34
24
+ - scikit-learn/scikit-learn: 32
25
+ - astropy/astropy: 22
26
+ - pydata/xarray: 22
27
+ - pytest-dev/pytest: 19
28
+ - pylint-dev/pylint: 10
29
+ - psf/requests: 8
30
+
31
+ ## Imported Subset
32
+
33
+ We import a representative subset stratified by:
34
+ 1. Difficulty level (covering all 4 bands)
35
+ 2. Repository diversity (multiple projects)
36
+ 3. Problem type variety
37
+
38
+ ## Evaluation Modes
39
+
40
+ ### Mode 1: LLM-as-Judge (Default)
41
+ Evaluates the proposed solution approach without executing code.
42
+ Uses our standard scoring rubric adapted for bug-fix scenarios.
43
+
44
+ ### Mode 2: Full Harness (Advanced)
45
+ Requires Docker and the SWE-bench evaluation harness.
46
+ Executes actual tests against generated patches.
47
+
48
+ ## Citation
49
+
50
+ ```bibtex
51
+ @inproceedings{jimenez2024swebench,
52
+ title={{SWE}-bench: Can Language Models Resolve Real-world Github Issues?},
53
+ author={Jimenez, Carlos E and Yang, John and Wettig, Alexander and Yao, Shunyu and Pei, Kexin and Press, Ofir and Narasimhan, Karthik},
54
+ booktitle={ICLR},
55
+ year={2024}
56
+ }
57
+ ```
@@ -0,0 +1,128 @@
1
+ ---
2
+ # SWE-bench Verified Scenario
3
+ # Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
4
+ # Instance: astropy__astropy-12907
5
+
6
+ name: astropy-12907
7
+ title: "Modeling's separability_matrix does not compute separability correctly for nested CompoundModels"
8
+ category: dev
9
+ difficulty: medium # SWE-bench: 15 min - 1 hour
10
+ version: "1.0"
11
+
12
+ source:
13
+ benchmark: swe-bench-verified
14
+ instance_id: astropy__astropy-12907
15
+ repo: astropy/astropy
16
+ base_commit: d16bfe05a744
17
+
18
+ description: |
19
+ Real GitHub issue from astropy/astropy requiring code changes to resolve.
20
+ This is a human-validated problem from the SWE-bench Verified dataset.
21
+
22
+ prompt: |
23
+ You are working on the astropy/astropy repository at commit d16bfe05a744.
24
+
25
+ A user has reported the following issue:
26
+
27
+ ---
28
+ Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
29
+ Consider the following model:
30
+
31
+ ```python
32
+ from astropy.modeling import models as m
33
+ from astropy.modeling.separable import separability_matrix
34
+
35
+ cm = m.Linear1D(10) & m.Linear1D(5)
36
+ ```
37
+
38
+ It's separability matrix as you might expect is a diagonal:
39
+
40
+ ```python
41
+ >>> separability_matrix(cm)
42
+ array([[ True, False],
43
+ [False, True]])
44
+ ```
45
+
46
+ If I make the model more complex:
47
+ ```python
48
+ >>> separability_matrix(m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5))
49
+ array([[ True, True, False, False],
50
+ [ True, True, False, False],
51
+ [False, False, True, False],
52
+ [False, False, False, True]])
53
+ ```
54
+
55
+ The output matrix is again, as expected, the outputs and inputs to the linear models are separable and independent of each other.
56
+
57
+ If however, I nest these compound models:
58
+ ```python
59
+ >>> separability_matrix(m.Pix2Sky_TAN() & cm)
60
+ array([[ True, True, False, False],
61
+ [ True, True, False, False],
62
+ [False, False, True, True],
63
+ [False, False, True, True]])
64
+ ```
65
+ Suddenly the inputs and outputs are no longer separable?
66
+
67
+ This feels like a bug to me, but I might be missing something?
68
+
69
+ ---
70
+
71
+ Analyze this issue and provide:
72
+ 1. Root cause analysis - what is causing the bug?
73
+ 2. Proposed fix - what code changes would resolve this?
74
+ 3. Test considerations - how would you verify the fix works?
75
+
76
+ Provide your response with specific file paths and code changes.
77
+
78
+ scoring:
79
+ # Adapted for SWE-bench bug-fix scenarios
80
+ categories:
81
+ - name: root_cause
82
+ weight: 30
83
+ description: "Correctly identifies the underlying cause of the bug"
84
+ criteria:
85
+ - id: IDENTIFIES_BUG_LOCATION
86
+ description: "Points to correct file(s) and function(s)"
87
+ points: 15
88
+ - id: EXPLAINS_WHY_BROKEN
89
+ description: "Explains why current code fails"
90
+ points: 15
91
+
92
+ - name: fix_quality
93
+ weight: 40
94
+ description: "Proposes a correct and complete fix"
95
+ criteria:
96
+ - id: FIX_ADDRESSES_ISSUE
97
+ description: "Fix would resolve the reported problem"
98
+ points: 20
99
+ - id: FIX_IS_MINIMAL
100
+ description: "Fix is appropriately scoped, not over-engineered"
101
+ points: 10
102
+ - id: FIX_SYNTAX_CORRECT
103
+ description: "Code changes are syntactically valid"
104
+ points: 10
105
+
106
+ - name: completeness
107
+ weight: 20
108
+ description: "Considers edge cases and testing"
109
+ criteria:
110
+ - id: EDGE_CASES
111
+ description: "Considers related scenarios that might break"
112
+ points: 10
113
+ - id: TEST_COVERAGE
114
+ description: "Suggests appropriate test cases"
115
+ points: 10
116
+
117
+ - name: persona
118
+ weight: 10
119
+ description: "Maintains character while solving"
120
+ criteria:
121
+ - id: IN_CHARACTER
122
+ description: "Response reflects persona traits"
123
+ points: 10
124
+
125
+ # Metadata for full harness evaluation (optional)
126
+ swebench_metadata:
127
+ fail_to_pass: ["astropy/modeling/tests/test_separable.py::test_separable[compound_model6-result6]", "astropy/modeling/tests/test_separable.py::test_separable[compound_model9-result9]"]
128
+ environment_version: "4.3"