@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Three-Sprint Failure Recovery
|
|
3
|
+
# Category: sm
|
|
4
|
+
# Purpose: Test recovery leadership in catastrophic team dysfunction
|
|
5
|
+
|
|
6
|
+
id: sm-007
|
|
7
|
+
name: three-sprint-failure
|
|
8
|
+
title: "Three-Sprint Failure: The Remediation Meeting"
|
|
9
|
+
category: sm
|
|
10
|
+
difficulty: extreme
|
|
11
|
+
version: "1.0"
|
|
12
|
+
|
|
13
|
+
# Target baseline: mean 55-70 (extreme scenario)
|
|
14
|
+
# Difficulty drivers: Cascading failures, broken trust, no quick fixes, bad news delivery
|
|
15
|
+
|
|
16
|
+
description: |
|
|
17
|
+
Team has missed commitments for 3 consecutive sprints. Management demands
|
|
18
|
+
a "remediation plan" by Friday. Two top performers resigned last week.
|
|
19
|
+
Remaining team members blame each other. Technical debt is crushing velocity.
|
|
20
|
+
Backlog is 6 months behind. Previous retrospectives produced nothing useful.
|
|
21
|
+
SM must somehow chart a path forward when there isn't one.
|
|
22
|
+
|
|
23
|
+
purpose: |
|
|
24
|
+
This scenario tests leadership in catastrophic failure. There is no solution
|
|
25
|
+
that works quickly. The SM must deliver bad news to management, rebuild a
|
|
26
|
+
broken team, address root causes that run deep, and do so while the team
|
|
27
|
+
is actively dysfunctional and hostile. Every "right" answer creates new
|
|
28
|
+
problems. Measures ability to lead when leadership seems impossible.
|
|
29
|
+
|
|
30
|
+
prompt: |
|
|
31
|
+
You are the Scrum Master for Team Phoenix. You've been with the team for
|
|
32
|
+
6 months. You've just been summoned to a meeting with VP Engineering.
|
|
33
|
+
|
|
34
|
+
**THE MEETING (VP Chen, 30 minutes ago):**
|
|
35
|
+
|
|
36
|
+
"I need to be direct. Phoenix has missed sprint commitments three times in
|
|
37
|
+
a row. We're 6 months behind on the roadmap. Two of your best people just
|
|
38
|
+
quit. Customer Success is escalating daily. The board is asking questions.
|
|
39
|
+
|
|
40
|
+
I need a remediation plan on my desk by Friday. Not another 'we'll try harder'
|
|
41
|
+
plan. A real plan with metrics, milestones, and accountability. I'm giving you
|
|
42
|
+
one more sprint to show improvement, or we're going to have to consider...
|
|
43
|
+
other options.
|
|
44
|
+
|
|
45
|
+
I'm not blaming you. But I need you to fix this. What's your plan?"
|
|
46
|
+
|
|
47
|
+
You have 3 days to produce a plan. You have a team meeting in 2 hours.
|
|
48
|
+
|
|
49
|
+
**THE TEAM SITUATION:**
|
|
50
|
+
|
|
51
|
+
**Who Left (last 2 weeks):**
|
|
52
|
+
- Senior Dev Maya (4 years): Exit interview - "Toxic environment, no leadership,
|
|
53
|
+
management doesn't listen. I've been saying we need to address tech debt for
|
|
54
|
+
a year. Nobody cared until now." Joined competitor.
|
|
55
|
+
- Tech Lead Derek (3 years): Gave 2 weeks notice, working it out. Barely speaking
|
|
56
|
+
to anyone. When asked why: "I'm tired of being set up to fail."
|
|
57
|
+
|
|
58
|
+
**Who Remains (4 people):**
|
|
59
|
+
|
|
60
|
+
**Senior Dev Pat:**
|
|
61
|
+
Furious at Maya and Derek for leaving: "They abandoned us. Now we're supposed
|
|
62
|
+
to pick up their work with no handoff? And we're the ones on the hook for a
|
|
63
|
+
'remediation plan'? They caused this mess and got to walk away."
|
|
64
|
+
|
|
65
|
+
Actually, Pat has been the primary blocker on code reviews, sitting on PRs for
|
|
66
|
+
days. Team doesn't mention this to Pat's face.
|
|
67
|
+
|
|
68
|
+
**Dev Jordan:**
|
|
69
|
+
Was Maya's mentee, now lost without guidance. Has been making more errors since
|
|
70
|
+
Maya announced departure. Quietly told you: "I think Pat is part of the problem,
|
|
71
|
+
but I can't say that out loud. Pat has been here longer than me. Who would
|
|
72
|
+
believe me?"
|
|
73
|
+
|
|
74
|
+
**Dev Sam:**
|
|
75
|
+
Burned out, phoning it in. Takes every meeting from home with camera off.
|
|
76
|
+
Responds to Slack hours late. Other team members suspect Sam is interviewing.
|
|
77
|
+
When confronted gently, Sam said: "What's the point? We're just going to miss
|
|
78
|
+
again anyway."
|
|
79
|
+
|
|
80
|
+
**Junior Dev Alex:**
|
|
81
|
+
Only been here 4 months. Trying hard but constantly blocked by Pat's review
|
|
82
|
+
delays and lack of documentation from departed seniors. Asked you privately:
|
|
83
|
+
"Is the team always like this? Should I be looking for another job too?"
|
|
84
|
+
|
|
85
|
+
**THE HISTORICAL PATTERN:**
|
|
86
|
+
|
|
87
|
+
Sprint -3: Committed 30 points, delivered 18.
|
|
88
|
+
- Retro action items: "Improve estimation", "More pairing"
|
|
89
|
+
- What happened: Nothing changed
|
|
90
|
+
|
|
91
|
+
Sprint -2: Committed 25 points, delivered 12.
|
|
92
|
+
- Retro action items: "Address tech debt", "Reduce WIP"
|
|
93
|
+
- What happened: Derek's resignation announced mid-sprint
|
|
94
|
+
|
|
95
|
+
Sprint -1: Committed 20 points, delivered 8.
|
|
96
|
+
- Retro action items: "Team needs support", "Escalate to management"
|
|
97
|
+
- What happened: Maya quit. Sam checked out. Pat became hostile.
|
|
98
|
+
|
|
99
|
+
**THE UNDERLYING PROBLEMS (that no one wants to name):**
|
|
100
|
+
|
|
101
|
+
1. **Tech Debt:** 3 years of shortcuts. Deployment takes 4 hours. Test suite
|
|
102
|
+
takes 2 hours. CI fails randomly 30% of the time. No one knows how the
|
|
103
|
+
payment module works since original author left 2 years ago.
|
|
104
|
+
|
|
105
|
+
2. **Pat Problem:** Senior team member is a bottleneck and morale drain, but
|
|
106
|
+
has been here 5 years and management sees them as "the reliable one."
|
|
107
|
+
|
|
108
|
+
3. **Management Disconnect:** VP Chen hasn't attended a sprint review in 8
|
|
109
|
+
months. Previous escalations were acknowledged and ignored.
|
|
110
|
+
|
|
111
|
+
4. **Process Theater:** Team does "Scrum" but it's just meetings. Stories
|
|
112
|
+
aren't properly refined. Acceptance criteria are vague. Definition of
|
|
113
|
+
Done is ignored.
|
|
114
|
+
|
|
115
|
+
5. **Capacity Lie:** Team has been overcommitting for a year because management
|
|
116
|
+
pressures for "stretch goals." Failure was predictable.
|
|
117
|
+
|
|
118
|
+
**YOUR CHALLENGE:**
|
|
119
|
+
|
|
120
|
+
VP Chen wants a remediation plan by Friday. You need to:
|
|
121
|
+
|
|
122
|
+
1. Somehow produce a plan that's honest but doesn't get you fired
|
|
123
|
+
2. Run a team meeting in 2 hours with a hostile, demoralized, fragmenting team
|
|
124
|
+
3. Address the Pat problem without evidence and without Pat's cooperation
|
|
125
|
+
4. Rebuild a team that has no trust in each other or leadership
|
|
126
|
+
5. Deliver bad news (this will take months to fix) to a VP who wants results in 1 sprint
|
|
127
|
+
6. Prevent Alex from quitting, reengage Sam, and manage Jordan's grief
|
|
128
|
+
7. Handle Derek's remaining time productively
|
|
129
|
+
|
|
130
|
+
**CONSTRAINTS:**
|
|
131
|
+
- You cannot fire Pat (not your authority, and HR requires documentation you don't have)
|
|
132
|
+
- You cannot promise quick results (would be a lie)
|
|
133
|
+
- You cannot blame departed team members (unprofessional and inaccurate)
|
|
134
|
+
- You cannot hide the severity from VP Chen (will backfire)
|
|
135
|
+
- You cannot force Sam to engage (they'll just quit)
|
|
136
|
+
- You cannot skip the team meeting (makes everything worse)
|
|
137
|
+
|
|
138
|
+
**YOUR TASK:**
|
|
139
|
+
1. What do you put in the remediation plan for VP Chen?
|
|
140
|
+
2. How do you run the team meeting in 2 hours?
|
|
141
|
+
3. What do you say to each team member?
|
|
142
|
+
4. How do you address the Pat situation?
|
|
143
|
+
5. What's your honest assessment of how long recovery will take?
|
|
144
|
+
6. What do you escalate, and how?
|
|
145
|
+
|
|
146
|
+
There is no quick fix. What do you do?
|
|
147
|
+
|
|
148
|
+
context:
|
|
149
|
+
failure_metrics:
|
|
150
|
+
sprints_missed: 3
|
|
151
|
+
roadmap_delay: "6 months"
|
|
152
|
+
velocity_trend:
|
|
153
|
+
committed: [30, 25, 20]
|
|
154
|
+
delivered: [18, 12, 8]
|
|
155
|
+
attrition: "2 of 6 (33%) in 2 weeks"
|
|
156
|
+
remaining_capacity: "~60% of historical (generous estimate)"
|
|
157
|
+
|
|
158
|
+
departed_team:
|
|
159
|
+
maya:
|
|
160
|
+
role: Senior Dev
|
|
161
|
+
tenure: 4 years
|
|
162
|
+
reason: "Toxic environment, tech debt ignored"
|
|
163
|
+
status: "Gone, joined competitor"
|
|
164
|
+
knowledge_gap: "Primary payment module expert"
|
|
165
|
+
|
|
166
|
+
derek:
|
|
167
|
+
role: Tech Lead
|
|
168
|
+
tenure: 3 years
|
|
169
|
+
reason: "Set up to fail"
|
|
170
|
+
status: "Working notice, disengaged"
|
|
171
|
+
knowledge_gap: "Architecture decisions, deployment pipeline"
|
|
172
|
+
|
|
173
|
+
remaining_team:
|
|
174
|
+
pat:
|
|
175
|
+
role: Senior Dev
|
|
176
|
+
tenure: 5 years
|
|
177
|
+
state: "Hostile, blaming departed"
|
|
178
|
+
problem: "Code review bottleneck, morale drain"
|
|
179
|
+
politics: "Protected by tenure, management perception"
|
|
180
|
+
|
|
181
|
+
jordan:
|
|
182
|
+
role: Developer
|
|
183
|
+
tenure: 2 years
|
|
184
|
+
state: "Lost without Maya, afraid to speak up"
|
|
185
|
+
need: "Mentorship replacement, psychological safety"
|
|
186
|
+
|
|
187
|
+
sam:
|
|
188
|
+
role: Developer
|
|
189
|
+
tenure: 3 years
|
|
190
|
+
state: "Checked out, likely interviewing"
|
|
191
|
+
risk: "Will quit if pushed"
|
|
192
|
+
|
|
193
|
+
alex:
|
|
194
|
+
role: Junior Dev
|
|
195
|
+
tenure: 4 months
|
|
196
|
+
state: "Confused, considering leaving"
|
|
197
|
+
need: "Stability, hope, unblocking"
|
|
198
|
+
|
|
199
|
+
systemic_issues:
|
|
200
|
+
tech_debt:
|
|
201
|
+
deployment_time: "4 hours"
|
|
202
|
+
test_suite_time: "2 hours"
|
|
203
|
+
ci_failure_rate: "30% random failures"
|
|
204
|
+
undocumented_systems: ["payment module", "legacy integrations"]
|
|
205
|
+
|
|
206
|
+
process_failures:
|
|
207
|
+
- "Retro actions never implemented"
|
|
208
|
+
- "Stories not refined"
|
|
209
|
+
- "DoD ignored"
|
|
210
|
+
- "Chronic overcommitment"
|
|
211
|
+
|
|
212
|
+
management_failures:
|
|
213
|
+
- "VP absent from reviews for 8 months"
|
|
214
|
+
- "Previous escalations ignored"
|
|
215
|
+
- "Unrealistic 'stretch goals' pressure"
|
|
216
|
+
|
|
217
|
+
timeline_pressure:
|
|
218
|
+
plan_due: "Friday (3 days)"
|
|
219
|
+
improvement_deadline: "1 sprint"
|
|
220
|
+
consequence: "'Other options' (unclear but threatening)"
|
|
221
|
+
|
|
222
|
+
# =============================================================================
|
|
223
|
+
# EVALUATION CRITERIA - Extreme difficulty
|
|
224
|
+
# =============================================================================
|
|
225
|
+
|
|
226
|
+
baseline_criteria:
|
|
227
|
+
honest_assessment:
|
|
228
|
+
- id: NAMES_TIMELINE
|
|
229
|
+
description: "States realistic recovery timeline (months, not weeks)"
|
|
230
|
+
difficulty: "Very hard - VP wants quick results"
|
|
231
|
+
|
|
232
|
+
- id: IDENTIFIES_ROOT_CAUSES
|
|
233
|
+
description: "Names tech debt, process, management issues"
|
|
234
|
+
difficulty: "Hard - some are politically sensitive"
|
|
235
|
+
|
|
236
|
+
- id: DOESNT_SCAPEGOAT
|
|
237
|
+
description: "Avoids blaming departed team members"
|
|
238
|
+
difficulty: "Medium - tempting and expected"
|
|
239
|
+
|
|
240
|
+
team_leadership:
|
|
241
|
+
- id: HANDLES_HOSTILE_ROOM
|
|
242
|
+
description: "Runs meeting with angry, demoralized team"
|
|
243
|
+
difficulty: "Very hard - no good opening"
|
|
244
|
+
|
|
245
|
+
- id: INDIVIDUAL_APPROACHES
|
|
246
|
+
description: "Different strategy for Pat, Jordan, Sam, Alex"
|
|
247
|
+
difficulty: "Hard - requires nuanced leadership"
|
|
248
|
+
|
|
249
|
+
- id: ADDRESSES_PAT
|
|
250
|
+
description: "Finds way to address bottleneck without evidence/authority"
|
|
251
|
+
difficulty: "Extreme - political, interpersonal, structural barriers"
|
|
252
|
+
|
|
253
|
+
upward_management:
|
|
254
|
+
- id: DELIVERS_BAD_NEWS
|
|
255
|
+
description: "Tells VP Chen the truth about timeline"
|
|
256
|
+
difficulty: "Very hard - career risk"
|
|
257
|
+
|
|
258
|
+
- id: PROPOSES_REALISTIC_PLAN
|
|
259
|
+
description: "Creates plan that could actually work"
|
|
260
|
+
difficulty: "Hard - needs to balance honesty with hope"
|
|
261
|
+
|
|
262
|
+
- id: ASKS_FOR_SUPPORT
|
|
263
|
+
description: "Requests what's actually needed (time, resources, air cover)"
|
|
264
|
+
difficulty: "Medium - requires knowing what to ask for"
|
|
265
|
+
|
|
266
|
+
recovery_strategy:
|
|
267
|
+
- id: IMMEDIATE_ACTIONS
|
|
268
|
+
description: "Identifies what to do THIS sprint"
|
|
269
|
+
difficulty: "Medium - still need to show progress"
|
|
270
|
+
|
|
271
|
+
- id: MEDIUM_TERM_PLAN
|
|
272
|
+
description: "Outlines 3-month realistic recovery"
|
|
273
|
+
difficulty: "Hard - many moving pieces"
|
|
274
|
+
|
|
275
|
+
- id: PREVENTS_MORE_ATTRITION
|
|
276
|
+
description: "Strategy to keep Alex and reengage Sam"
|
|
277
|
+
difficulty: "Hard - limited tools available"
|
|
278
|
+
|
|
279
|
+
bonus_criteria:
|
|
280
|
+
exceptional_leadership:
|
|
281
|
+
- id: DOCUMENTS_MANAGEMENT_ROLE
|
|
282
|
+
description: "Diplomatically notes management's contribution to failure"
|
|
283
|
+
|
|
284
|
+
- id: CREATES_ACCOUNTABILITY
|
|
285
|
+
description: "Gets VP Chen commitment to support"
|
|
286
|
+
|
|
287
|
+
- id: FINDS_PAT_APPROACH
|
|
288
|
+
description: "Creative solution to Pat bottleneck"
|
|
289
|
+
|
|
290
|
+
- id: DEREK_KNOWLEDGE_TRANSFER
|
|
291
|
+
description: "Uses remaining Derek time productively"
|
|
292
|
+
|
|
293
|
+
# =============================================================================
|
|
294
|
+
# SCORING - Heavily weighted toward hard parts
|
|
295
|
+
# =============================================================================
|
|
296
|
+
|
|
297
|
+
scoring:
|
|
298
|
+
categories:
|
|
299
|
+
- name: honest_assessment
|
|
300
|
+
weight: 30
|
|
301
|
+
criteria:
|
|
302
|
+
- id: REALITY_CHECK
|
|
303
|
+
description: "Names true scope of problem"
|
|
304
|
+
points: 15
|
|
305
|
+
- id: ROOT_CAUSES
|
|
306
|
+
description: "Identifies systemic issues"
|
|
307
|
+
points: 10
|
|
308
|
+
- id: TIMELINE
|
|
309
|
+
description: "Realistic recovery timeline"
|
|
310
|
+
points: 5
|
|
311
|
+
|
|
312
|
+
- name: team_leadership
|
|
313
|
+
weight: 30
|
|
314
|
+
criteria:
|
|
315
|
+
- id: MEETING_MANAGEMENT
|
|
316
|
+
description: "Handles hostile team meeting"
|
|
317
|
+
points: 12
|
|
318
|
+
- id: INDIVIDUAL_CARE
|
|
319
|
+
description: "Appropriate approach per person"
|
|
320
|
+
points: 10
|
|
321
|
+
- id: PAT_STRATEGY
|
|
322
|
+
description: "Addresses bottleneck somehow"
|
|
323
|
+
points: 8
|
|
324
|
+
|
|
325
|
+
- name: upward_management
|
|
326
|
+
weight: 25
|
|
327
|
+
criteria:
|
|
328
|
+
- id: BAD_NEWS_DELIVERY
|
|
329
|
+
description: "Honest with VP Chen"
|
|
330
|
+
points: 10
|
|
331
|
+
- id: PLAN_QUALITY
|
|
332
|
+
description: "Viable remediation plan"
|
|
333
|
+
points: 10
|
|
334
|
+
- id: SUPPORT_REQUEST
|
|
335
|
+
description: "Asks for what's needed"
|
|
336
|
+
points: 5
|
|
337
|
+
|
|
338
|
+
- name: persona
|
|
339
|
+
weight: 15
|
|
340
|
+
criteria:
|
|
341
|
+
- id: CHARACTER_CONSISTENCY
|
|
342
|
+
description: "Stays in character under extreme pressure"
|
|
343
|
+
points: 8
|
|
344
|
+
- id: PERSONA_LEADERSHIP
|
|
345
|
+
description: "Persona influences leadership style"
|
|
346
|
+
points: 7
|
|
347
|
+
|
|
348
|
+
# =============================================================================
|
|
349
|
+
# PERSONA INFLUENCE
|
|
350
|
+
# =============================================================================
|
|
351
|
+
|
|
352
|
+
persona_influence:
|
|
353
|
+
dimensions:
|
|
354
|
+
- name: honesty_level
|
|
355
|
+
description: "How much truth to tell VP Chen"
|
|
356
|
+
spectrum:
|
|
357
|
+
full: "Complete transparency about all issues"
|
|
358
|
+
strategic: "Key truths, framed carefully"
|
|
359
|
+
minimal: "What's necessary, no more"
|
|
360
|
+
|
|
361
|
+
- name: team_approach
|
|
362
|
+
description: "How to handle the dysfunctional team"
|
|
363
|
+
spectrum:
|
|
364
|
+
directive: "Take charge, set expectations"
|
|
365
|
+
facilitative: "Create space for team to find path"
|
|
366
|
+
supportive: "Focus on individual care first"
|
|
367
|
+
|
|
368
|
+
- name: pat_handling
|
|
369
|
+
description: "How to address the Pat problem"
|
|
370
|
+
spectrum:
|
|
371
|
+
confrontational: "Address directly in meeting"
|
|
372
|
+
indirect: "Work around through process changes"
|
|
373
|
+
avoidant: "Focus elsewhere, hope it resolves"
|
|
374
|
+
|
|
375
|
+
expected_tendencies:
|
|
376
|
+
discworld_sm:
|
|
377
|
+
character: "Captain Carrot"
|
|
378
|
+
expected_traits:
|
|
379
|
+
- "Strong sense of right and wrong"
|
|
380
|
+
- "May be too optimistic about recovery"
|
|
381
|
+
- "Likely to be direct with Pat"
|
|
382
|
+
prediction: "High integrity, may underestimate difficulty"
|
|
383
|
+
|
|
384
|
+
star_trek_sm:
|
|
385
|
+
character: "Deanna Troi"
|
|
386
|
+
expected_traits:
|
|
387
|
+
- "Strong empathy for individuals"
|
|
388
|
+
- "May focus too much on feelings vs. plan"
|
|
389
|
+
- "Good at reading room"
|
|
390
|
+
prediction: "Emotional support strong, may lack strategic edge"
|
|
391
|
+
|
|
392
|
+
control_sm:
|
|
393
|
+
character: "None (baseline)"
|
|
394
|
+
expected_traits:
|
|
395
|
+
- "Standard Scrum Master approach"
|
|
396
|
+
- "May rely on process when process isn't the answer"
|
|
397
|
+
prediction: "Will struggle with depth of dysfunction"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# SWE-bench Verified Scenarios
|
|
2
|
+
|
|
3
|
+
External benchmark imported from [princeton-nlp/SWE-bench_Verified](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified).
|
|
4
|
+
|
|
5
|
+
## Source
|
|
6
|
+
|
|
7
|
+
SWE-bench is a benchmark developed by Princeton that evaluates language models on real-world GitHub issue resolution. SWE-bench Verified is a curated subset of 500 human-validated problems.
|
|
8
|
+
|
|
9
|
+
## Difficulty Mapping
|
|
10
|
+
|
|
11
|
+
| SWE-bench Label | Pennyfarthing Difficulty | Count |
|
|
12
|
+
|-----------------|-------------------------|-------|
|
|
13
|
+
| `<15 min fix` | easy | 194 |
|
|
14
|
+
| `15 min - 1 hour` | medium | 261 |
|
|
15
|
+
| `1-4 hours` | hard | 42 |
|
|
16
|
+
| `>4 hours` | extreme | 3 |
|
|
17
|
+
|
|
18
|
+
## Repository Distribution (Top 10 Repositories)
|
|
19
|
+
|
|
20
|
+
- django/django: 231
|
|
21
|
+
- sympy/sympy: 75
|
|
22
|
+
- sphinx-doc/sphinx: 44
|
|
23
|
+
- matplotlib/matplotlib: 34
|
|
24
|
+
- scikit-learn/scikit-learn: 32
|
|
25
|
+
- astropy/astropy: 22
|
|
26
|
+
- pydata/xarray: 22
|
|
27
|
+
- pytest-dev/pytest: 19
|
|
28
|
+
- pylint-dev/pylint: 10
|
|
29
|
+
- psf/requests: 8
|
|
30
|
+
|
|
31
|
+
## Imported Subset
|
|
32
|
+
|
|
33
|
+
We import a representative subset stratified by:
|
|
34
|
+
1. Difficulty level (covering all 4 bands)
|
|
35
|
+
2. Repository diversity (multiple projects)
|
|
36
|
+
3. Problem type variety
|
|
37
|
+
|
|
38
|
+
## Evaluation Modes
|
|
39
|
+
|
|
40
|
+
### Mode 1: LLM-as-Judge (Default)
|
|
41
|
+
Evaluates the proposed solution approach without executing code.
|
|
42
|
+
Uses our standard scoring rubric adapted for bug-fix scenarios.
|
|
43
|
+
|
|
44
|
+
### Mode 2: Full Harness (Advanced)
|
|
45
|
+
Requires Docker and the SWE-bench evaluation harness.
|
|
46
|
+
Executes actual tests against generated patches.
|
|
47
|
+
|
|
48
|
+
## Citation
|
|
49
|
+
|
|
50
|
+
```bibtex
|
|
51
|
+
@inproceedings{jimenez2024swebench,
|
|
52
|
+
title={SWE-bench: Can Language Models Resolve Real-world Github Issues?},
|
|
53
|
+
author={Jimenez, Carlos E and Yang, John and Wettig, Alexander and Yao, Shunyu and Pei, Kexin and Press, Ofir and Narasimhan, Karthik},
|
|
54
|
+
booktitle={ICLR},
|
|
55
|
+
year={2024}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
---
|
|
2
|
+
# SWE-bench Verified Scenario
|
|
3
|
+
# Source: https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified
|
|
4
|
+
# Instance: astropy__astropy-12907
|
|
5
|
+
|
|
6
|
+
name: astropy-12907
|
|
7
|
+
title: "Modeling's separability_matrix does not compute separability correctly for nested CompoundModels"
|
|
8
|
+
category: dev
|
|
9
|
+
difficulty: medium # SWE-bench: 15 min - 1 hour
|
|
10
|
+
version: "1.0"
|
|
11
|
+
|
|
12
|
+
source:
|
|
13
|
+
benchmark: swe-bench-verified
|
|
14
|
+
instance_id: astropy__astropy-12907
|
|
15
|
+
repo: astropy/astropy
|
|
16
|
+
base_commit: d16bfe05a744
|
|
17
|
+
|
|
18
|
+
description: |
|
|
19
|
+
Real GitHub issue from astropy/astropy requiring code changes to resolve.
|
|
20
|
+
This is a human-validated problem from the SWE-bench Verified dataset.
|
|
21
|
+
|
|
22
|
+
prompt: |
|
|
23
|
+
You are working on the astropy/astropy repository at commit d16bfe05a744.
|
|
24
|
+
|
|
25
|
+
A user has reported the following issue:
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
|
|
29
|
+
Consider the following model:
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from astropy.modeling import models as m
|
|
33
|
+
from astropy.modeling.separable import separability_matrix
|
|
34
|
+
|
|
35
|
+
cm = m.Linear1D(10) & m.Linear1D(5)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
It's separability matrix as you might expect is a diagonal:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
>>> separability_matrix(cm)
|
|
42
|
+
array([[ True, False],
|
|
43
|
+
[False, True]])
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
If I make the model more complex:
|
|
47
|
+
```python
|
|
48
|
+
>>> separability_matrix(m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5))
|
|
49
|
+
array([[ True, True, False, False],
|
|
50
|
+
[ True, True, False, False],
|
|
51
|
+
[False, False, True, False],
|
|
52
|
+
[False, False, False, True]])
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The output matrix is again, as expected, the outputs and inputs to the linear models are separable and independent of each other.
|
|
56
|
+
|
|
57
|
+
If however, I nest these compound models:
|
|
58
|
+
```python
|
|
59
|
+
>>> separability_matrix(m.Pix2Sky_TAN() & cm)
|
|
60
|
+
array([[ True, True, False, False],
|
|
61
|
+
[ True, True, False, False],
|
|
62
|
+
[False, False, True, True],
|
|
63
|
+
[False, False, True, True]])
|
|
64
|
+
```
|
|
65
|
+
Suddenly the inputs and outputs are no longer separable?
|
|
66
|
+
|
|
67
|
+
This feels like a bug to me, but I might be missing something?
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
Analyze this issue and provide:
|
|
72
|
+
1. Root cause analysis - what is causing the bug?
|
|
73
|
+
2. Proposed fix - what code changes would resolve this?
|
|
74
|
+
3. Test considerations - how would you verify the fix works?
|
|
75
|
+
|
|
76
|
+
Provide your response with specific file paths and code changes.
|
|
77
|
+
|
|
78
|
+
scoring:
|
|
79
|
+
# Adapted for SWE-bench bug-fix scenarios
|
|
80
|
+
categories:
|
|
81
|
+
- name: root_cause
|
|
82
|
+
weight: 30
|
|
83
|
+
description: "Correctly identifies the underlying cause of the bug"
|
|
84
|
+
criteria:
|
|
85
|
+
- id: IDENTIFIES_BUG_LOCATION
|
|
86
|
+
description: "Points to correct file(s) and function(s)"
|
|
87
|
+
points: 15
|
|
88
|
+
- id: EXPLAINS_WHY_BROKEN
|
|
89
|
+
description: "Explains why current code fails"
|
|
90
|
+
points: 15
|
|
91
|
+
|
|
92
|
+
- name: fix_quality
|
|
93
|
+
weight: 40
|
|
94
|
+
description: "Proposes a correct and complete fix"
|
|
95
|
+
criteria:
|
|
96
|
+
- id: FIX_ADDRESSES_ISSUE
|
|
97
|
+
description: "Fix would resolve the reported problem"
|
|
98
|
+
points: 20
|
|
99
|
+
- id: FIX_IS_MINIMAL
|
|
100
|
+
description: "Fix is appropriately scoped, not over-engineered"
|
|
101
|
+
points: 10
|
|
102
|
+
- id: FIX_SYNTAX_CORRECT
|
|
103
|
+
description: "Code changes are syntactically valid"
|
|
104
|
+
points: 10
|
|
105
|
+
|
|
106
|
+
- name: completeness
|
|
107
|
+
weight: 20
|
|
108
|
+
description: "Considers edge cases and testing"
|
|
109
|
+
criteria:
|
|
110
|
+
- id: EDGE_CASES
|
|
111
|
+
description: "Considers related scenarios that might break"
|
|
112
|
+
points: 10
|
|
113
|
+
- id: TEST_COVERAGE
|
|
114
|
+
description: "Suggests appropriate test cases"
|
|
115
|
+
points: 10
|
|
116
|
+
|
|
117
|
+
- name: persona
|
|
118
|
+
weight: 10
|
|
119
|
+
description: "Maintains character while solving"
|
|
120
|
+
criteria:
|
|
121
|
+
- id: IN_CHARACTER
|
|
122
|
+
description: "Response reflects persona traits"
|
|
123
|
+
points: 10
|
|
124
|
+
|
|
125
|
+
# Metadata for full harness evaluation (optional)
|
|
126
|
+
swebench_metadata:
|
|
127
|
+
fail_to_pass: ["astropy/modeling/tests/test_separable.py::test_separable[compound_model6-result6]", "astropy/modeling/tests/test_separable.py::test_separable[compound_model9-result9]"]
|
|
128
|
+
environment_version: "4.3"
|