@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,415 @@
1
+ ---
2
+ # Scenario: Database Migration Disaster Debug
3
+ # Category: dev
4
+ # Purpose: Test production incident debugging and migration fix skills
5
+
6
+ id: dev-004
7
+ name: migration-disaster
8
+ title: "Database Migration Disaster"
9
+ category: dev
10
+ difficulty: medium # Calibrated 2026-01-01: mean=76.50, was hard
11
+ version: "1.0"
12
+
13
+ description: |
14
+ A database migration that passed in staging but corrupted production data.
15
+ The migration has partial state, data type coercion bugs, foreign key constraint
16
+ violations, and timezone conversion errors. Developer must identify all issues
17
+ and write a corrective migration to restore data integrity.
18
+
19
+ purpose: |
20
+ This scenario tests crisis debugging skills. A "thorough" persona should
21
+ find more issues. A "fast" persona might miss subtle corruption. Measures
22
+ ability to think systematically about data integrity under pressure.
23
+
24
+ prompt: |
25
+ INCIDENT: A database migration was applied to production and is now causing
26
+ data integrity issues. Users are reporting incorrect order totals, missing
27
+ relationships, and timezone problems.
28
+
29
+ The migration was applied partially - it ran until line 47 before failing,
30
+ leaving the database in an inconsistent state.
31
+
32
+ Your task:
33
+ 1. Analyze the migration code to identify ALL bugs
34
+ 2. Explain the impact of each bug on production data
35
+ 3. Write a corrective migration to fix the damage
36
+ 4. Document any data that cannot be recovered
37
+
38
+ For each issue:
39
+ 1. Identify the specific line and problem
40
+ 2. Classify severity (Critical/High/Medium/Low)
41
+ 3. Explain how it corrupts data
42
+ 4. Provide SQL to detect affected rows
43
+ 5. Provide SQL to fix or document the damage
44
+
45
+ IMPORTANT: This is production. Every fix must be safe and reversible.
46
+
47
+ code:
48
+ language: python
49
+ filename: migrations/0042_normalize_orders.py
50
+ content: |
51
+ """
52
+ Migration 0042: Normalize orders table and add new pricing fields
53
+ Applied: 2024-01-15 03:45:00 UTC (during maintenance window)
54
+ Status: FAILED at line 47
55
+ """
56
+
57
+ from django.db import migrations, connection
58
+
59
+ def forward_migration(apps, schema_editor):
60
+ with connection.cursor() as cursor:
61
+ # Step 1: Add new columns for normalized pricing
62
+ cursor.execute("""
63
+ ALTER TABLE orders
64
+ ADD COLUMN subtotal_cents INTEGER,
65
+ ADD COLUMN tax_cents INTEGER,
66
+ ADD COLUMN shipping_cents INTEGER,
67
+ ADD COLUMN discount_cents INTEGER,
68
+ ADD COLUMN total_cents INTEGER
69
+ """)
70
+
71
+ # Step 2: Migrate existing price data (stored as decimal dollars)
72
+ cursor.execute("""
73
+ UPDATE orders
74
+ SET subtotal_cents = subtotal * 100,
75
+ tax_cents = tax_amount * 100,
76
+ shipping_cents = shipping_cost * 100,
77
+ discount_cents = discount_amount * 100,
78
+ total_cents = total_amount * 100
79
+ """)
80
+
81
+ # Step 3: Convert timestamps from local time to UTC
82
+ # Note: Server was running in America/New_York timezone
83
+ cursor.execute("""
84
+ UPDATE orders
85
+ SET created_at = created_at - INTERVAL '5 hours',
86
+ updated_at = updated_at - INTERVAL '5 hours'
87
+ """)
88
+
89
+ # Step 4: Create new order_items table from denormalized data
90
+ cursor.execute("""
91
+ CREATE TABLE order_items_new (
92
+ id SERIAL PRIMARY KEY,
93
+ order_id INTEGER REFERENCES orders(id),
94
+ product_id INTEGER,
95
+ quantity INTEGER,
96
+ unit_price_cents INTEGER,
97
+ line_total_cents INTEGER
98
+ )
99
+ """)
100
+
101
+ # MIGRATION FAILED HERE - line 47
102
+
103
+ # Step 5: Migrate order items from JSON column
104
+ cursor.execute("""
105
+ INSERT INTO order_items_new (order_id, product_id, quantity, unit_price_cents, line_total_cents)
106
+ SELECT
107
+ o.id,
108
+ (item->>'product_id')::int,
109
+ (item->>'quantity')::int,
110
+ (item->>'price')::decimal * 100,
111
+ (item->>'quantity')::int * (item->>'price')::decimal * 100
112
+ FROM orders o,
113
+ jsonb_array_elements(o.items_json) AS item
114
+ """)
115
+
116
+ # Step 6: Update foreign keys to use new user_uuid
117
+ cursor.execute("""
118
+ UPDATE orders o
119
+ SET user_id = u.new_id
120
+ FROM users u
121
+ WHERE o.user_email = u.email
122
+ """)
123
+
124
+ # Step 7: Drop old columns
125
+ cursor.execute("""
126
+ ALTER TABLE orders
127
+ DROP COLUMN subtotal,
128
+ DROP COLUMN tax_amount,
129
+ DROP COLUMN shipping_cost,
130
+ DROP COLUMN discount_amount,
131
+ DROP COLUMN total_amount,
132
+ DROP COLUMN items_json,
133
+ DROP COLUMN user_email
134
+ """)
135
+
136
+ # Step 8: Add NOT NULL constraints
137
+ cursor.execute("""
138
+ ALTER TABLE orders
139
+ ALTER COLUMN subtotal_cents SET NOT NULL,
140
+ ALTER COLUMN total_cents SET NOT NULL,
141
+ ALTER COLUMN user_id SET NOT NULL
142
+ """)
143
+
144
+ # Step 9: Create indexes
145
+ cursor.execute("""
146
+ CREATE INDEX idx_orders_user_id ON orders(user_id);
147
+ CREATE INDEX idx_orders_created_at ON orders(created_at);
148
+ CREATE INDEX idx_order_items_order_id ON order_items_new(order_id);
149
+ """)
150
+
151
+
152
+ def reverse_migration(apps, schema_editor):
153
+ # NOTE: This reverse migration was never implemented
154
+ pass
155
+
156
+
157
+ class Migration(migrations.Migration):
158
+ dependencies = [
159
+ ('orders', '0041_add_user_uuid'),
160
+ ]
161
+
162
+ operations = [
163
+ migrations.RunPython(forward_migration, reverse_migration),
164
+ ]
165
+
166
+ # ================================================================
167
+ # POST-INCIDENT NOTES (added by on-call engineer)
168
+ # ================================================================
169
+ #
170
+ # Migration ran at 03:45 UTC but failed at Step 5.
171
+ # Steps 1-4 completed. Steps 5-9 did not run.
172
+ #
173
+ # Current database state:
174
+ # - orders table has new _cents columns (populated)
175
+ # - orders table still has old decimal columns (not dropped)
176
+ # - orders table timestamps were modified
177
+ # - order_items_new table exists (empty)
178
+ # - user_id column NOT updated
179
+ # - items_json still exists
180
+ #
181
+ # Sample data before migration:
182
+ # order_id=1234, subtotal=99.99, tax_amount=8.25, total_amount=108.24
183
+ # created_at='2024-01-10 14:30:00' (Eastern Time)
184
+ #
185
+ # Sample data after migration (current):
186
+ # order_id=1234, subtotal=99.99, subtotal_cents=9999
187
+ # tax_cents=825, total_cents=10824
188
+ # created_at='2024-01-10 09:30:00' (should be UTC)
189
+ #
190
+
191
+ # =============================================================================
192
+ # BASELINE ISSUES (minimum expected to find)
193
+ # =============================================================================
194
+
195
+ baseline_issues:
196
+ critical:
197
+ - id: FLOAT_TO_INT_TRUNCATION
198
+ location: "lines 21-26"
199
+ description: "Multiplying DECIMAL by 100 and storing as INTEGER truncates cents"
200
+ impact: "Orders with prices like $19.99 become 1998 cents instead of 1999"
201
+ affected_query: "SELECT id FROM orders WHERE (subtotal * 100) != subtotal_cents"
202
+
203
+ - id: TIMEZONE_WRONG_DIRECTION
204
+ location: "lines 29-32"
205
+ description: "Subtracting 5 hours is backwards - EST is UTC-5, should ADD"
206
+ impact: "All timestamps now 10 hours off (shifted wrong direction)"
207
+ affected_query: "All orders - timestamps are systematically wrong"
208
+
209
+ - id: TIMEZONE_DST_IGNORED
210
+ location: "lines 29-32"
211
+ description: "Fixed 5-hour offset ignores Daylight Saving Time"
212
+ impact: "Orders during EDT (summer) need a 4-hour offset and EST (winter) orders a 5-hour offset, so a fixed 5-hour shift leaves summer orders off by an extra hour"
213
+ affected_query: "Orders between March-November have different offset error"
214
+
215
+ - id: PARTIAL_MIGRATION_STATE
216
+ location: "entire migration"
217
+ description: "Migration has no transaction - partial state left on failure"
218
+ impact: "Database in inconsistent state, items not migrated but prices converted"
219
+
220
+ high:
221
+ - id: FK_WILL_FAIL_STEP5
222
+ location: "line 45"
223
+ description: "order_id references orders(id) but some items_json has invalid order refs"
224
+ impact: "Step 5 INSERT would fail on FK constraint when it runs"
225
+ note: "This is WHY the migration failed"
226
+
227
+ - id: USER_ID_ORPHAN
228
+ location: "lines 57-60"
229
+ description: "UPDATE joins on email but some orders have emails not in users table"
230
+ impact: "Orders with deleted users would get NULL user_id, then fail NOT NULL"
231
+
232
+ - id: JSON_INVALID_ENTRIES
233
+ location: "line 51"
234
+ description: "Some items_json entries have string product_id, not int"
235
+ impact: "::int cast would fail on items like {\"product_id\": \"SKU-123\"}"
236
+
237
+ - id: NO_REVERSE_MIGRATION
238
+ location: "lines 68-69"
239
+ description: "reverse_migration is not implemented"
240
+ impact: "Cannot safely rollback - manual intervention required"
241
+
242
+ medium:
243
+ - id: DECIMAL_PRECISION_LOSS
244
+ location: "lines 49-50"
245
+ description: "(item->>'price')::decimal loses precision before multiply"
246
+ impact: "Accumulated rounding errors in line_total_cents"
247
+
248
+ - id: SUBTOTAL_TAX_MISMATCH
249
+ location: "post-migration data"
250
+ description: "Some orders now have subtotal_cents + tax_cents != total_cents"
251
+ impact: "Financial reports will show discrepancies"
252
+
253
+ - id: INDEX_CREATION_ORDER
254
+ location: "lines 73-76"
255
+ description: "Creating indexes on empty table then bulk INSERT"
256
+ impact: "Would be slow - should create indexes after data load"
257
+
258
+ - id: MISSING_ORDER_ITEMS_MIGRATION
259
+ location: "current state"
260
+ description: "order_items_new exists but is empty, items_json still has data"
261
+ impact: "Application may try to read from empty table"
262
+
263
+ low:
264
+ - id: COLUMN_NULLABLE
265
+ location: "lines 10-15"
266
+ description: "New columns added without DEFAULT, remain NULL until UPDATE"
267
+ impact: "Brief window where columns are NULL"
268
+
269
+ - id: NO_BATCH_PROCESSING
270
+ location: "line 21"
271
+ description: "Single UPDATE for entire table - could timeout on large tables"
272
+ impact: "Migration might timeout on production-scale data"
273
+
274
+ # =============================================================================
275
+ # BONUS ISSUES (thorough reviewers might find these)
276
+ # =============================================================================
277
+
278
+ bonus_issues:
279
+ data_integrity:
280
+ - id: TIMEZONE_PAST_ORDERS
281
+ description: "Orders from years ago may have different DST rules"
282
+
283
+ - id: NEGATIVE_DISCOUNT
284
+ description: "Some discount_amount may be negative (store credit) - breaks cents logic"
285
+
286
+ - id: CURRENCY_ASSUMPTION
287
+ description: "Migration assumes USD - international orders may have different precision"
288
+
289
+ operational:
290
+ - id: NO_PROGRESS_TRACKING
291
+ description: "No way to resume migration from failure point"
292
+
293
+ - id: NO_VALIDATION_STEP
294
+ description: "No checksums or row counts to verify migration success"
295
+
296
+ - id: MAINTENANCE_WINDOW_SHORT
297
+ description: "Large UPDATE statements may exceed maintenance window"
298
+
299
+ recovery:
300
+ - id: ORIGINAL_DATA_GONE
301
+ description: "If columns were dropped, original decimal values unrecoverable"
302
+
303
+ - id: AUDIT_TRAIL_BROKEN
304
+ description: "Timestamp changes break audit/compliance requirements"
305
+
306
+ # =============================================================================
307
+ # SCORING
308
+ # =============================================================================
309
+
310
+ scoring:
311
+ total_baseline_issues: 14
312
+ total_bonus_issues: 8
313
+ weights:
314
+ critical: 3
315
+ high: 2
316
+ medium: 1
317
+ low: 0.5
318
+ max_baseline_score: 25 # 4*3 + 4*2 + 4*1 + 2*0.5
319
+
320
+ categories:
321
+ - name: detection
322
+ weight: 35
323
+ criteria:
324
+ - id: BASELINE_FOUND
325
+ description: "Issues from the seeded baseline list"
326
+ points: 25
327
+ - id: BONUS_DISCOVERIES
328
+ description: "Valid issues beyond the baseline"
329
+ points: 10
330
+
331
+ - name: depth
332
+ weight: 30
333
+ criteria:
334
+ - id: ROOT_CAUSE_ANALYSIS
335
+ description: "Traces to data corruption mechanism"
336
+ points: 10
337
+ - id: DETECTION_QUERIES
338
+ description: "Provides SQL to find affected rows"
339
+ points: 10
340
+ - id: RECOVERY_PLAN
341
+ description: "Provides corrective migration"
342
+ points: 10
343
+
344
+ - name: quality
345
+ weight: 20
346
+ criteria:
347
+ - id: SEVERITY_ACCURACY
348
+ description: "Correctly classifies data vs operational issues"
349
+ points: 7
350
+ - id: SAFE_FIXES
351
+ description: "Fixes are transactional and reversible"
352
+ points: 8
353
+ - id: PRIORITIZATION
354
+ description: "Addresses critical data issues first"
355
+ points: 5
356
+
357
+ - name: persona
358
+ weight: 15
359
+ criteria:
360
+ - id: CHARACTER_CONSISTENCY
361
+ description: "Stays in character throughout"
362
+ points: 8
363
+ - id: CRISIS_COMMUNICATION
364
+ description: "Communicates findings clearly for incident response"
365
+ points: 7
366
+
367
+ # =============================================================================
368
+ # PERSONA INFLUENCE
369
+ # =============================================================================
370
+
371
+ persona_influence:
372
+ dimensions:
373
+ - name: crisis_approach
374
+ description: "How the incident is prioritized"
375
+ spectrum:
376
+ systematic: "Catalogs all issues before proposing fixes"
377
+ pragmatic: "Fixes critical issues first, documents rest"
378
+ rapid: "Provides quick fix, may miss secondary issues"
379
+
380
+ - name: data_recovery_focus
381
+ description: "Balance between speed and data integrity"
382
+ spectrum:
383
+ integrity_first: "Ensures no data loss, even if slower"
384
+ balanced: "Prioritizes but accepts some data may be unrecoverable"
385
+ speed_first: "Gets system running, documents data loss"
386
+
387
+ - name: documentation_style
388
+ description: "How findings are communicated"
389
+ spectrum:
390
+ technical: "SQL-focused, precise"
391
+ narrative: "Explains impact to stakeholders"
392
+ minimal: "Just the facts and fixes"
393
+
394
+ expected_tendencies:
395
+ discworld_dev:
396
+ character: "Ponder Stibbons"
397
+ expected_traits:
398
+ - "Systematic - should catalog all issues"
399
+ - "May over-explain magical (technical) details"
400
+ - "Thorough documentation"
401
+ crisis_approach_prediction: "systematic"
402
+
403
+ star_trek_dev:
404
+ character: "Data"
405
+ expected_traits:
406
+ - "Logical - precise identification of issues"
407
+ - "May provide optimal recovery sequence"
408
+ - "Clear prioritization"
409
+ crisis_approach_prediction: "systematic"
410
+
411
+ control_dev:
412
+ character: "None (baseline)"
413
+ expected_traits:
414
+ - "Standard incident response"
415
+ crisis_approach_prediction: "baseline reference"