@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,414 @@
1
+ ---
2
+ # Scenario: Cross-Team Dependency Deadlock
3
+ # Category: sm
4
+ # Purpose: Test technical coordination and dependency resolution skills
5
+
6
+ id: sm-003
7
+ name: dependency-deadlock
8
+ title: "Cross-Team Dependency Deadlock"
9
+ category: sm
10
+ difficulty: easy # Empirical: 87.20 ± 2.36 (n=10) - structured problem, clear solution
11
+ version: "1.1"
12
+
13
+ # Empirical calibration: 2026-01-01
14
+ # Control baseline: mean=87.20, std=2.36, CI=[85.5, 88.9]
15
+ # Original label "extreme" was incorrect - code interfaces provide clear solution path
16
+
17
+ description: |
18
+ Four teams are blocked in a circular dependency: Team A needs API from B,
19
+ Team B needs schema from C, Team C needs design spec from D, Team D needs
20
+ API spec from A. Each team claims it can't start until another delivers. Includes
21
+ actual code interfaces showing the dependencies. SM must break the deadlock.
22
+
23
+ purpose: |
24
+ This scenario tests both technical understanding and coordination skills.
25
+ An SM needs to understand the code dependencies well enough to propose
26
+ a breaking strategy, while also managing the political dynamics of four
27
+ teams blaming each other. Originally labeled extreme; empirical calibration rates it easy.
28
+
29
+ prompt: |
30
+ You are a Scrum Master asked to help resolve a cross-team dependency deadlock.
31
+ Four teams have been stuck for 2 weeks, each claiming the other needs to go first.
32
+
33
+ **THE SITUATION:**
34
+
35
+ **Team Alpha (Orders Service):**
36
+ "We can't implement order creation until Team Beta gives us the inventory
37
+ reservation API. We've been waiting 2 weeks. Here's what we need:"
38
+
39
+ ```typescript
40
+ // What Alpha needs from Beta
41
+ interface InventoryReservation {
42
+ reserveItems(orderId: string, items: LineItem[]): Promise<ReservationResult>;
43
+ releaseReservation(reservationId: string): Promise<void>;
44
+ }
45
+ ```
46
+
47
+ **Team Beta (Inventory Service):**
48
+ "We can't build the reservation API until Team Gamma finalizes the database
49
+ schema. They keep changing the product table structure. We need:"
50
+
51
+ ```sql
52
+ -- What Beta needs from Gamma
53
+ CREATE TABLE products (
54
+ id UUID PRIMARY KEY,
55
+ sku VARCHAR(50) UNIQUE NOT NULL,
56
+ quantity_available INTEGER NOT NULL,
57
+ quantity_reserved INTEGER NOT NULL,
58
+ -- Gamma hasn't decided: warehouse_id or location_json?
59
+ );
60
+
61
+ CREATE TABLE reservations (
62
+ id UUID PRIMARY KEY,
63
+ product_id UUID REFERENCES products(id),
64
+ -- Need to know if multi-warehouse support needed
65
+ );
66
+ ```
67
+
68
+ **Team Gamma (Platform/Data):**
69
+ "We can't finalize the schema until Team Delta gives us the UX requirements.
70
+ Are we doing multi-warehouse? Single location? They keep changing the design:"
71
+
72
+ ```
73
+ OPEN QUESTIONS FROM GAMMA:
74
+ 1. Single warehouse or multi-warehouse?
75
+ 2. If multi-warehouse, does reservation need to specify location?
76
+ 3. What's the split/consolidation logic for orders across warehouses?
77
+ 4. Need UX mockups to understand user mental model
78
+ ```
79
+
80
+ **Team Delta (Frontend/UX):**
81
+ "We can't finalize the designs until Team Alpha tells us what data the
82
+ API will return. We need to know the order response shape to design the
83
+ confirmation screen:"
84
+
85
+ ```typescript
86
+ // What Delta needs from Alpha
87
+ interface OrderConfirmation {
88
+ orderId: string;
89
+ // What fields are available?
90
+ // Will we show warehouse location?
91
+ // Estimated delivery per item or per order?
92
+ // What about partial fulfillment scenarios?
93
+ }
94
+ ```
95
+
96
+ **ADDITIONAL CONTEXT:**
97
+ - Sprint ends in 1 week
98
+ - All 4 teams have committed to stakeholders
99
+ - No single team has authority over the others
100
+ - Previous attempts at a joint meeting devolved into blame
101
+ - Product owner says "just figure it out"
102
+
103
+ **YOUR TASK:**
104
+ Break this deadlock. You must:
105
+ 1. Analyze the actual dependencies (they may not be as circular as claimed)
106
+ 2. Identify what can be parallelized with contracts/interfaces
107
+ 3. Propose a concrete sequencing or parallel work strategy
108
+ 4. Suggest technical approaches (contract-first, mocks, feature flags)
109
+ 5. Facilitate agreement across all 4 teams
110
+
111
+ Be specific about:
112
+ - Who does what first
113
+ - What decisions can be made now vs. deferred
114
+ - How to prevent this pattern in the future
115
+
116
+ code:
117
+ language: typescript
118
+ filename: dependency-overview.ts
119
+ content: |
120
+ /**
121
+ * DEPENDENCY ANALYSIS
122
+ *
123
+ * The claimed circular dependency:
124
+ * Alpha → Beta → Gamma → Delta → Alpha
125
+ *
126
+ * But is it really circular? Let's trace the actual blocks:
127
+ *
128
+ * Alpha (Orders) claims to need:
129
+ * - Beta's InventoryReservation API
130
+ * - Actually just needs: interface contract, can mock
131
+ *
132
+ * Beta (Inventory) claims to need:
133
+ * - Gamma's product schema
134
+ * - Actually needs: decision on multi-warehouse (1 bit of info)
135
+ *
136
+ * Gamma (Platform) claims to need:
137
+ * - Delta's UX requirements
138
+ * - Actually needs: business decision, not UX design
139
+ *
140
+ * Delta (UX) claims to need:
141
+ * - Alpha's API response shape
142
+ * - Actually needs: data model concepts, not implementation
143
+ *
144
+ * POSSIBLE BREAKING POINTS:
145
+ * 1. Multi-warehouse decision can be made NOW by product owner
146
+ * 2. Interface contracts can be defined before implementation
147
+ * 3. UX can design for "worst case" (multi-warehouse) and simplify later
148
+ * 4. Each team can work against interface, not implementation
149
+ */
150
+
151
+ // Alpha can define this NOW (their output contract)
152
+ interface OrderConfirmation {
153
+ orderId: string;
154
+ status: 'confirmed' | 'partial' | 'pending';
155
+ items: Array<{
156
+ productId: string;
157
+ quantity: number;
158
+ fulfillmentLocation?: string; // optional for now
159
+ estimatedDelivery: string;
160
+ }>;
161
+ // Feature flag: show warehouse details
162
+ showWarehouseDetails: boolean;
163
+ }
164
+
165
+ // Beta can define this NOW (their input contract)
166
+ interface InventoryReservation {
167
+ reserveItems(
168
+ orderId: string,
169
+ items: Array<{ productId: string; quantity: number }>
170
+ ): Promise<{
171
+ reservationId: string;
172
+ reservedItems: Array<{
173
+ productId: string;
174
+ quantity: number;
175
+ location?: string; // optional until multi-warehouse decided
176
+ }>;
177
+ partialFulfillment: boolean;
178
+ }>;
179
+
180
+ releaseReservation(reservationId: string): Promise<void>;
181
+ }
182
+
183
+ // Gamma can start with this schema (add warehouse later)
184
+ const INITIAL_SCHEMA = `
185
+ CREATE TABLE products (
186
+ id UUID PRIMARY KEY,
187
+ sku VARCHAR(50) UNIQUE NOT NULL,
188
+ quantity_available INTEGER NOT NULL,
189
+ quantity_reserved INTEGER NOT NULL DEFAULT 0,
190
+ warehouse_id UUID NULL -- nullable for now, add FK later
191
+ );
192
+
193
+ CREATE TABLE reservations (
194
+ id UUID PRIMARY KEY,
195
+ order_id UUID NOT NULL,
196
+ product_id UUID REFERENCES products(id),
197
+ quantity INTEGER NOT NULL,
198
+ warehouse_id UUID NULL, -- nullable for now
199
+ expires_at TIMESTAMP NOT NULL,
200
+ created_at TIMESTAMP DEFAULT NOW()
201
+ );
202
+ `;
203
+
204
+ // Delta can design with this abstraction
205
+ interface UXDataContract {
206
+ // Order confirmation screen needs:
207
+ order: {
208
+ id: string;
209
+ items: Array<{
210
+ name: string;
211
+ quantity: number;
212
+ delivery: string;
213
+ // Optional warehouse info (show if feature enabled)
214
+ warehouseInfo?: {
215
+ name: string;
216
+ distance: string;
217
+ };
218
+ }>;
219
+ };
220
+ }
221
+
222
+ context:
223
+ teams:
224
+ - name: Alpha
225
+ domain: Orders Service
226
+ blocked_by: Beta (inventory API)
227
+ actually_needs: Interface contract
228
+
229
+ - name: Beta
230
+ domain: Inventory Service
231
+ blocked_by: Gamma (schema)
232
+ actually_needs: Multi-warehouse decision
233
+
234
+ - name: Gamma
235
+ domain: Platform/Data
236
+ blocked_by: Delta (UX requirements)
237
+ actually_needs: Business decision
238
+
239
+ - name: Delta
240
+ domain: Frontend/UX
241
+ blocked_by: Alpha (API shape)
242
+ actually_needs: Data model concepts
243
+
244
+ breaking_strategies:
245
+ - name: Contract-First
246
+ description: Define interfaces before implementation
247
+
248
+ - name: Feature Flags
249
+ description: Implement with optional fields, enable later
250
+
251
+ - name: Decision Forcing
252
+ description: Get product owner to make warehouse decision NOW
253
+
254
+ - name: Parallel with Mocks
255
+ description: Each team works against mocked dependencies
256
+
257
+ # =============================================================================
258
+ # EVALUATION CRITERIA
259
+ # =============================================================================
260
+
261
+ baseline_criteria:
262
+ analysis:
263
+ - id: IDENTIFIES_FALSE_DEPENDENCIES
264
+ description: "Recognizes that some 'dependencies' are actually decisions"
265
+
266
+ - id: TRACES_REAL_BLOCKS
267
+ description: "Identifies what each team actually needs"
268
+
269
+ - id: FINDS_BREAKING_POINT
270
+ description: "Identifies that warehouse decision breaks the cycle"
271
+
272
+ technical_solutions:
273
+ - id: PROPOSES_CONTRACTS
274
+ description: "Suggests interface/contract-first approach"
275
+
276
+ - id: SUGGESTS_PARALLEL_WORK
277
+ description: "Shows how teams can work in parallel"
278
+
279
+ - id: USES_FEATURE_FLAGS
280
+ description: "Suggests progressive disclosure via flags"
281
+
282
+ coordination:
283
+ - id: SEQUENCES_WORK
284
+ description: "Provides clear sequencing for the teams"
285
+
286
+ - id: ASSIGNS_OWNERSHIP
287
+ description: "Clarifies who owns which decision"
288
+
289
+ - id: SETS_TIMELINE
290
+ description: "Proposes realistic timeline for resolution"
291
+
292
+ facilitation:
293
+ - id: MANAGES_BLAME
294
+ description: "Redirects blame to systemic solutions"
295
+
296
+ - id: CREATES_SHARED_UNDERSTANDING
297
+ description: "Helps all teams see the full picture"
298
+
299
+ bonus_criteria:
300
+ prevention:
301
+ - id: PROPOSES_DEPENDENCY_PROCESS
302
+ description: "Suggests cross-team dependency management process"
303
+
304
+ - id: ARCHITECTURE_INSIGHT
305
+ description: "Notes how better API design prevents this"
306
+
307
+ - id: DOCUMENTATION_PRACTICE
308
+ description: "Suggests interface documentation practices"
309
+
310
+ advanced_technical:
311
+ - id: CDC_TESTING
312
+ description: "Suggests consumer-driven contract testing"
313
+
314
+ - id: API_VERSIONING
315
+ description: "Notes API versioning as mitigation"
316
+
317
+ # =============================================================================
318
+ # SCORING
319
+ # =============================================================================
320
+
321
+ scoring:
322
+ categories:
323
+ - name: technical_analysis
324
+ weight: 30
325
+ criteria:
326
+ - id: UNDERSTANDS_CODE
327
+ description: "Correctly interprets the code dependencies"
328
+ points: 15
329
+ - id: FINDS_SOLUTION
330
+ description: "Proposes viable technical breaking strategy"
331
+ points: 15
332
+
333
+ - name: coordination
334
+ weight: 30
335
+ criteria:
336
+ - id: CLEAR_PLAN
337
+ description: "Provides actionable plan for all 4 teams"
338
+ points: 15
339
+ - id: REALISTIC
340
+ description: "Plan fits 1-week timeline"
341
+ points: 15
342
+
343
+ - name: facilitation
344
+ weight: 25
345
+ criteria:
346
+ - id: MANAGES_DYNAMICS
347
+ description: "Handles inter-team blame constructively"
348
+ points: 10
349
+ - id: GETS_AGREEMENT
350
+ description: "Proposes path to agreement"
351
+ points: 10
352
+ - id: ESCALATION
353
+ description: "Knows when to escalate (warehouse decision)"
354
+ points: 5
355
+
356
+ - name: persona
357
+ weight: 15
358
+ criteria:
359
+ - id: CHARACTER_CONSISTENCY
360
+ description: "Stays in character throughout"
361
+ points: 8
362
+ - id: PERSONA_VALUE_ADD
363
+ description: "Persona enhances technical explanation"
364
+ points: 7
365
+
366
+ # =============================================================================
367
+ # PERSONA INFLUENCE
368
+ # =============================================================================
369
+
370
+ persona_influence:
371
+ dimensions:
372
+ - name: technical_depth
373
+ description: "How deeply technical solutions are explored"
374
+ spectrum:
375
+ high_level: "Focuses on process, delegates technical details"
376
+ balanced: "Provides both process and technical guidance"
377
+ deep_technical: "Dives into code-level solutions"
378
+
379
+ - name: coordination_style
380
+ description: "How the cross-team work is organized"
381
+ spectrum:
382
+ sequential: "Clear ordering, one at a time"
383
+ parallel: "Maximum parallelization"
384
+ hybrid: "Critical path + parallel non-blocking"
385
+
386
+ - name: escalation_threshold
387
+ description: "When to escalate vs. solve locally"
388
+ spectrum:
389
+ low: "Escalates decisions to product owner early"
390
+ medium: "Tries to solve, escalates blockers"
391
+ high: "Attempts to resolve everything at team level"
392
+
393
+ expected_tendencies:
394
+ discworld_sm:
395
+ character: "Captain Carrot"
396
+ expected_traits:
397
+ - "Practical - finds workable solutions"
398
+ - "May not dive deep into code"
399
+ - "Good at getting teams to cooperate"
400
+ technical_depth_prediction: "balanced"
401
+
402
+ star_trek_sm:
403
+ character: "Deanna Troi"
404
+ expected_traits:
405
+ - "Senses underlying frustrations"
406
+ - "May focus on team dynamics over technical"
407
+ - "Good at facilitating agreement"
408
+ technical_depth_prediction: "high_level"
409
+
410
+ control_sm:
411
+ character: "None (baseline)"
412
+ expected_traits:
413
+ - "Standard coordination approach"
414
+ technical_depth_prediction: "baseline reference"