@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,541 @@
1
+ ---
2
+ # Scenario: Buggy User Service Fix
3
+ # Category: dev (Developer)
4
+ # Ported from: Pennyfarthing benchmarks/test-cases/dev/dev-001-buggy-service.yaml
5
+ # Purpose: Measure thoroughness in bug detection and fix quality
6
+
7
+ id: dev-001
8
+ name: buggy-user-service
9
+ title: "Buggy User Service: Fix the Authentication Nightmare"
10
+ category: dev
11
+ difficulty: hard
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A user service with multiple bugs across authentication, data handling,
16
+ and business logic. Tests the developer agent's ability to identify issues,
17
+ propose fixes, and consider edge cases during implementation.
18
+
19
+ purpose: |
20
+ This scenario tests whether persona traits affect bug detection.
21
+ A "security-minded" developer might prioritize injection attacks.
22
+ A "thorough" developer might find more total issues.
23
+ A "practical" developer might focus on high-impact fixes first.
24
+ All are valid but measurably different.
25
+
26
+ prompt: |
27
+ INCIDENT REPORT
28
+
29
+ Severity: P1
30
+ Service: user-service
31
+ Status: Security audit flagged multiple critical issues
32
+
33
+ The security team has flagged this user service for immediate remediation.
34
+ Multiple vulnerabilities were detected in a penetration test.
35
+
36
+ Your task:
37
+ 1. Review the code and identify ALL security and quality issues
38
+ 2. For each issue:
39
+ - Identify the bug and its location
40
+ - Explain the impact (how could this be exploited?)
41
+ - Provide the corrected code
42
+ - Consider edge cases your fix must handle
43
+ 3. Prioritize by severity (Critical > High > Medium > Low)
44
+
45
+ Focus on:
46
+ - Security vulnerabilities (injection, auth bypass, etc.)
47
+ - Logic errors
48
+ - Error handling gaps
49
+ - Edge cases
50
+ - Best practices violations
51
+
52
+ Be thorough. This code handles user authentication and personal data.
53
+ There are 33 known issues (22 baseline + 11 bonus). How many can you find?
54
+
55
+ code:
56
+ language: go
57
+ filename: user_service.go
58
+ content: |
59
+ package users
60
+
61
+ import (
62
+ "crypto/md5"
63
+ "database/sql"
64
+ "encoding/hex"
65
+ "encoding/json"
66
+ "fmt"
67
+ "net/http"
68
+ "regexp"
69
+ "strings"
70
+ "time"
71
+ )
72
+
73
+ type UserService struct {
74
+ db *sql.DB
75
+ }
76
+
77
+ type User struct {
78
+ ID int64 `json:"id"`
79
+ Email string `json:"email"`
80
+ PasswordHash string `json:"-"`
81
+ Name string `json:"name"`
82
+ Role string `json:"role"`
83
+ CreatedAt time.Time `json:"created_at"`
84
+ LastLogin time.Time `json:"last_login"`
85
+ FailedLogins int `json:"-"`
86
+ Locked bool `json:"locked"`
87
+ }
88
+
89
+ // RegisterUser creates a new user account
90
+ func (s *UserService) RegisterUser(w http.ResponseWriter, r *http.Request) {
91
+ var req struct {
92
+ Email string `json:"email"`
93
+ Password string `json:"password"`
94
+ Name string `json:"name"`
95
+ }
96
+ json.NewDecoder(r.Body).Decode(&req)
97
+
98
+ // Validate email
99
+ if !strings.Contains(req.Email, "@") {
100
+ http.Error(w, "Invalid email", 400)
101
+ return
102
+ }
103
+
104
+ // Hash password
105
+ hash := md5.Sum([]byte(req.Password))
106
+ passwordHash := hex.EncodeToString(hash[:])
107
+
108
+ // Create user
109
+ result, _ := s.db.Exec(
110
+ fmt.Sprintf("INSERT INTO users (email, password_hash, name, role) VALUES ('%s', '%s', '%s', 'user')",
111
+ req.Email, passwordHash, req.Name))
112
+
113
+ id, _ := result.LastInsertId()
114
+
115
+ w.Write([]byte(fmt.Sprintf(`{"id": %d, "message": "User created"}`, id)))
116
+ }
117
+
118
+ // Login authenticates a user
119
+ func (s *UserService) Login(w http.ResponseWriter, r *http.Request) {
120
+ var req struct {
121
+ Email string `json:"email"`
122
+ Password string `json:"password"`
123
+ }
124
+ json.NewDecoder(r.Body).Decode(&req)
125
+
126
+ var user User
127
+ query := fmt.Sprintf("SELECT id, email, password_hash, role, locked, failed_logins FROM users WHERE email = '%s'", req.Email)
128
+ row := s.db.QueryRow(query)
129
+ row.Scan(&user.ID, &user.Email, &user.PasswordHash, &user.Role, &user.Locked, &user.FailedLogins)
130
+
131
+ // Check password
132
+ hash := md5.Sum([]byte(req.Password))
133
+ if hex.EncodeToString(hash[:]) != user.PasswordHash {
134
+ user.FailedLogins++
135
+ s.db.Exec("UPDATE users SET failed_logins = ? WHERE id = ?", user.FailedLogins, user.ID)
136
+ http.Error(w, "Invalid credentials", 401)
137
+ return
138
+ }
139
+
140
+ // Generate session token
141
+ token := fmt.Sprintf("%d-%d", user.ID, time.Now().Unix())
142
+
143
+ // Update last login
144
+ s.db.Exec("UPDATE users SET last_login = NOW(), failed_logins = 0 WHERE id = ?", user.ID)
145
+
146
+ json.NewEncoder(w).Encode(map[string]interface{}{
147
+ "token": token,
148
+ "user": user,
149
+ })
150
+ }
151
+
152
+ // UpdateProfile allows users to update their profile
153
+ func (s *UserService) UpdateProfile(w http.ResponseWriter, r *http.Request) {
154
+ userID := r.Header.Get("X-User-ID")
155
+
156
+ var req struct {
157
+ Name string `json:"name"`
158
+ Email string `json:"email"`
159
+ Role string `json:"role"`
160
+ }
161
+ json.NewDecoder(r.Body).Decode(&req)
162
+
163
+ s.db.Exec(fmt.Sprintf(
164
+ "UPDATE users SET name = '%s', email = '%s', role = '%s' WHERE id = %s",
165
+ req.Name, req.Email, req.Role, userID))
166
+
167
+ w.Write([]byte(`{"message": "Profile updated"}`))
168
+ }
169
+
170
+ // ResetPassword handles password reset
171
+ func (s *UserService) ResetPassword(w http.ResponseWriter, r *http.Request) {
172
+ var req struct {
173
+ Email string `json:"email"`
174
+ ResetToken string `json:"reset_token"`
175
+ NewPassword string `json:"new_password"`
176
+ }
177
+ json.NewDecoder(r.Body).Decode(&req)
178
+
179
+ // Verify reset token
180
+ var storedToken string
181
+ s.db.QueryRow("SELECT reset_token FROM users WHERE email = ?", req.Email).Scan(&storedToken)
182
+
183
+ if req.ResetToken == storedToken {
184
+ hash := md5.Sum([]byte(req.NewPassword))
185
+ passwordHash := hex.EncodeToString(hash[:])
186
+ s.db.Exec("UPDATE users SET password_hash = ? WHERE email = ?", passwordHash, req.Email)
187
+ w.Write([]byte(`{"message": "Password reset successful"}`))
188
+ } else {
189
+ http.Error(w, "Invalid reset token", 400)
190
+ }
191
+ }
192
+
193
+ // DeleteUser removes a user account
194
+ func (s *UserService) DeleteUser(w http.ResponseWriter, r *http.Request) {
195
+ userID := r.URL.Query().Get("id")
196
+
197
+ s.db.Exec("DELETE FROM users WHERE id = " + userID)
198
+ s.db.Exec("DELETE FROM user_sessions WHERE user_id = " + userID)
199
+ s.db.Exec("DELETE FROM user_preferences WHERE user_id = " + userID)
200
+
201
+ w.Write([]byte(`{"message": "User deleted"}`))
202
+ }
203
+
204
+ // SearchUsers finds users matching criteria
205
+ func (s *UserService) SearchUsers(w http.ResponseWriter, r *http.Request) {
206
+ query := r.URL.Query().Get("q")
207
+ role := r.URL.Query().Get("role")
208
+
209
+ sql := fmt.Sprintf("SELECT id, email, name, role FROM users WHERE name LIKE '%%%s%%'", query)
210
+ if role != "" {
211
+ sql += fmt.Sprintf(" AND role = '%s'", role)
212
+ }
213
+
214
+ rows, _ := s.db.Query(sql)
215
+ var users []User
216
+ for rows.Next() {
217
+ var u User
218
+ rows.Scan(&u.ID, &u.Email, &u.Name, &u.Role)
219
+ users = append(users, u)
220
+ }
221
+
222
+ json.NewEncoder(w).Encode(users)
223
+ }
224
+
225
+ // ValidatePassword checks password strength
226
+ func (s *UserService) ValidatePassword(password string) bool {
227
+ if len(password) < 8 {
228
+ return false
229
+ }
230
+ hasUpper := regexp.MustCompile(`[A-Z]`).MatchString(password)
231
+ hasLower := regexp.MustCompile(`[a-z]`).MatchString(password)
232
+ hasNumber := regexp.MustCompile(`[0-9]`).MatchString(password)
233
+ return hasUpper && hasLower && hasNumber
234
+ }
235
+
236
+ // GetUserByID retrieves a user
237
+ func (s *UserService) GetUserByID(w http.ResponseWriter, r *http.Request) {
238
+ id := r.URL.Query().Get("id")
239
+
240
+ var user User
241
+ s.db.QueryRow(fmt.Sprintf("SELECT * FROM users WHERE id = %s", id)).
242
+ Scan(&user.ID, &user.Email, &user.PasswordHash, &user.Name, &user.Role)
243
+
244
+ json.NewEncoder(w).Encode(user)
245
+ }
246
+
247
+ // BulkImportUsers imports users from JSON
248
+ func (s *UserService) BulkImportUsers(w http.ResponseWriter, r *http.Request) {
249
+ var users []struct {
250
+ Email string `json:"email"`
251
+ Password string `json:"password"`
252
+ Name string `json:"name"`
253
+ Role string `json:"role"`
254
+ }
255
+ json.NewDecoder(r.Body).Decode(&users)
256
+
257
+ for _, u := range users {
258
+ hash := md5.Sum([]byte(u.Password))
259
+ passwordHash := hex.EncodeToString(hash[:])
260
+ s.db.Exec(fmt.Sprintf(
261
+ "INSERT INTO users (email, password_hash, name, role) VALUES ('%s', '%s', '%s', '%s')",
262
+ u.Email, passwordHash, u.Name, u.Role))
263
+ }
264
+
265
+ w.Write([]byte(fmt.Sprintf(`{"imported": %d}`, len(users))))
266
+ }
267
+
268
+ # =============================================================================
269
+ # BASELINE ISSUES (minimum expected to find and fix)
270
+ # =============================================================================
271
+
272
+ baseline_issues:
273
+ critical:
274
+ - id: SQL_INJECTION_REGISTER
275
+ location: "lines 51-53"
276
+ description: "SQL injection via string formatting in RegisterUser"
277
+
278
+ - id: SQL_INJECTION_LOGIN
279
+ location: "line 69"
280
+ description: "SQL injection in Login query"
281
+
282
+ - id: SQL_INJECTION_UPDATE
283
+ location: "lines 105-107"
284
+ description: "SQL injection in UpdateProfile"
285
+
286
+ - id: SQL_INJECTION_DELETE
287
+ location: "lines 139-141"
288
+ description: "SQL injection in DeleteUser"
289
+
290
+ - id: SQL_INJECTION_SEARCH
291
+ location: "lines 151-154"
292
+ description: "SQL injection in SearchUsers"
293
+
294
+ - id: WEAK_PASSWORD_HASH
295
+ location: "lines 47, 74, 126, 200"
296
+ description: "Using MD5 for password hashing (cryptographically broken)"
297
+
298
+ high:
299
+ - id: INSECURE_SESSION_TOKEN
300
+ location: "line 83"
301
+ description: "Predictable session token (user_id + timestamp)"
302
+
303
+ - id: ROLE_ESCALATION
304
+ location: "lines 95-107"
305
+ description: "User can set their own role in UpdateProfile"
306
+
307
+ - id: NO_ACCOUNT_LOCKOUT
308
+ location: "lines 61-92"
309
+ description: "No account lockout after failed logins (locked field not checked)"
310
+
311
+ - id: PASSWORD_IN_RESPONSE
312
+ location: "line 90"
313
+ description: "User struct may leak password hash if not properly excluded"
314
+
315
+ - id: NO_AUTH_DELETE
316
+ location: "lines 136-144"
317
+ description: "DeleteUser has no authorization check"
318
+
319
+ - id: NO_AUTH_SEARCH
320
+ location: "lines 147-165"
321
+ description: "SearchUsers exposes all user data without auth"
322
+
323
+ medium:
324
+ - id: WEAK_EMAIL_VALIDATION
325
+ location: "lines 41-44"
326
+ description: "Email validation only checks for @ symbol"
327
+
328
+ - id: PASSWORD_NOT_VALIDATED
329
+ location: "RegisterUser"
330
+ description: "ValidatePassword function exists but not called"
331
+
332
+ - id: TIMING_ATTACK_RESET
333
+ location: "line 125"
334
+ description: "Reset token comparison vulnerable to timing attack"
335
+
336
+ - id: TOKEN_NOT_INVALIDATED
337
+ location: "ResetPassword"
338
+ description: "Reset token not invalidated after use"
339
+
340
+ - id: ERROR_IGNORED_DECODE
341
+ location: "multiple"
342
+ description: "JSON decode errors ignored throughout"
343
+
344
+ - id: ROWS_NOT_CLOSED
345
+ location: "line 156"
346
+ description: "Database rows not closed in SearchUsers"
347
+
348
+ low:
349
+ - id: MISSING_CONTENT_TYPE
350
+ location: "multiple"
351
+ description: "JSON responses don't set Content-Type header"
352
+
353
+ - id: NO_INPUT_LENGTH_LIMITS
354
+ location: "multiple"
355
+ description: "No limits on input field lengths"
356
+
357
+ - id: INCONSISTENT_ERROR_RESPONSES
358
+ location: "multiple"
359
+ description: "Mix of http.Error and json responses"
360
+
361
+ - id: SQL_INJECTION_GETBYID
362
+ location: "line 183"
363
+ description: "SQL injection in GetUserByID"
364
+
365
+ # =============================================================================
366
+ # BONUS ISSUES (thorough developers might address)
367
+ # =============================================================================
368
+
369
+ bonus_issues:
370
+ security:
371
+ - id: NO_RATE_LIMITING
372
+ description: "No rate limiting on login/register endpoints"
373
+
374
+ - id: NO_CSRF_PROTECTION
375
+ description: "No CSRF tokens for state-changing operations"
376
+
377
+ - id: CREDENTIALS_IN_LOGS
378
+ description: "Errors could log sensitive data"
379
+
380
+ - id: NO_HTTPS_ENFORCEMENT
381
+ description: "No check for secure connection"
382
+
383
+ reliability:
384
+ - id: NO_TRANSACTION_DELETE
385
+ description: "DeleteUser should use transaction for multiple deletes"
386
+
387
+ - id: NO_CONTEXT_TIMEOUT
388
+ description: "No context/timeout on database operations"
389
+
390
+ - id: NO_CONNECTION_POOLING_CONFIG
391
+ description: "Database connection pooling not configured"
392
+
393
+ code_quality:
394
+ - id: DUPLICATE_HASH_LOGIC
395
+ description: "Password hashing duplicated in 4 places"
396
+
397
+ - id: MAGIC_STRINGS
398
+ description: "Role values as magic strings"
399
+
400
+ - id: NO_CONSTANTS
401
+ description: "No constants for error messages"
402
+
403
+ - id: MISSING_INDEXES
404
+ description: "Queries suggest missing database indexes"
405
+
406
+ # =============================================================================
407
+ # SCORING
408
+ # =============================================================================
409
+
410
+ scoring:
411
+ total_baseline_issues: 22
412
+ total_bonus_issues: 11
413
+ weights:
414
+ critical: 3
415
+ high: 2
416
+ medium: 1
417
+ low: 0.5
418
+ max_baseline_score: 38 # 6*3 + 6*2 + 6*1 + 4*0.5 = 18 + 12 + 6 + 2
419
+
420
+ categories:
421
+ - name: detection
422
+ weight: 40
423
+ description: "How many issues are found"
424
+ criteria:
425
+ - id: CRITICAL_FOUND
426
+ description: "All 6 critical issues found"
427
+ points: 20
428
+ - id: HIGH_FOUND
429
+ description: "All 6 high issues found"
430
+ points: 12
431
+ - id: MEDIUM_LOW_FOUND
432
+ description: "Medium and low issues found"
433
+ points: 8
434
+
435
+ - name: fix_quality
436
+ weight: 30
437
+ description: "Quality of proposed fixes"
438
+ criteria:
439
+ - id: CORRECT_FIXES
440
+ description: "Fixes actually solve the problem"
441
+ points: 15
442
+ - id: EDGE_CASES_HANDLED
443
+ description: "Fixes handle edge cases"
444
+ points: 10
445
+ - id: NO_NEW_BUGS
446
+ description: "Fixes don't introduce new issues"
447
+ points: 5
448
+
449
+ - name: explanation
450
+ weight: 15
451
+ description: "Quality of issue explanations"
452
+ criteria:
453
+ - id: IMPACT_EXPLAINED
454
+ description: "Explains real-world impact"
455
+ points: 8
456
+ - id: ROOT_CAUSE
457
+ description: "Identifies root cause, not just symptom"
458
+ points: 7
459
+
460
+ - name: persona
461
+ weight: 15
462
+ description: "Persona consistency and value"
463
+ criteria:
464
+ - id: IN_CHARACTER
465
+ description: "Stays in character throughout"
466
+ points: 8
467
+ - id: PERSONA_ENHANCES
468
+ description: "Persona adds value to explanations"
469
+ points: 7
470
+
471
+ # =============================================================================
472
+ # ENHANCED METRICS
473
+ # =============================================================================
474
+
475
+ enhanced_metrics:
476
+ thoroughness_ratio:
477
+ formula: "total_findings / 22"
478
+ interpretation: "100% = found all baseline issues"
479
+
480
+ bonus_discovery_rate:
481
+ formula: "bonus_found / 11"
482
+ interpretation: "Shows exceptional thoroughness"
483
+
484
+ fix_accuracy:
485
+ formula: "correct_fixes / issues_found"
486
+ interpretation: "100% = all fixes are correct"
487
+
488
+ severity_accuracy:
489
+ formula: "correctly_classified / issues_found"
490
+ interpretation: "100% = perfect severity classification"
491
+
492
+ # =============================================================================
493
+ # PERSONA INFLUENCE
494
+ # =============================================================================
495
+
496
+ persona_influence:
497
+ dimensions:
498
+ - name: issue_prioritization
499
+ description: "What types of issues are found first"
500
+ spectrum:
501
+ security_first: "SQL injection and auth issues prioritized"
502
+ quality_first: "Code quality and maintainability first"
503
+ impact_first: "Highest business impact first"
504
+
505
+ - name: fix_style
506
+ description: "How comprehensive are the fixes"
507
+ spectrum:
508
+ minimal: "Just fixes the immediate problem"
509
+ refactoring: "Cleans up surrounding code"
510
+ architectural: "Suggests broader improvements"
511
+
512
+ - name: documentation
513
+ description: "How well issues are explained"
514
+ spectrum:
515
+ brief: "Issue and fix only"
516
+ detailed: "Full impact analysis"
517
+ educational: "Teaches prevention patterns"
518
+
519
+ expected_tendencies:
520
+ discworld_dev:
521
+ character: "Ponder Stibbons"
522
+ expected_traits:
523
+ - "Academic, thorough analysis"
524
+ - "May get distracted by interesting edge cases"
525
+ - "Good at explaining why things are wrong"
526
+ thoroughness_prediction: "high - academic thoroughness"
527
+
528
+ star_trek_dev:
529
+ character: "Geordi La Forge"
530
+ expected_traits:
531
+ - "Practical, engineering focus"
532
+ - "Good at system-level thinking"
533
+ - "May add diagnostic suggestions"
534
+ thoroughness_prediction: "high - engineering discipline"
535
+
536
+ control_dev:
537
+ character: "None (baseline)"
538
+ expected_traits:
539
+ - "Standard LLM bug detection"
540
+ - "No persona influence"
541
+ thoroughness_prediction: "baseline reference"
@@ -0,0 +1,130 @@
1
+ ---
2
+ # Scenario: Null Pointer Debug Challenge
3
+ # Category: dev
4
+ # Tests debugging skills with a subtle null safety issue
5
+
6
+ name: null-pointer
7
+ title: "The Midnight NullPointerException"
8
+ category: dev
9
+ difficulty: medium
10
+ description: Debug a production NullPointerException in a user service
11
+
12
+ prompt: |
13
+ INCIDENT REPORT
14
+
15
+ Severity: P1
16
+ Time: 2:47 AM
17
+ Service: user-service
18
+ Error Rate: 15% of requests failing
19
+
20
+ The on-call engineer was woken up by PagerDuty. The user-service is throwing
21
+ NullPointerExceptions for some users but not others. The service was deployed
22
+ 3 hours ago with "minor refactoring - no functional changes" according to the PR.
23
+
24
+ Your task:
25
+ 1. Find the bug
26
+ 2. Explain why it happens intermittently
27
+ 3. Provide a fix
28
+ 4. Suggest how to prevent similar bugs
29
+
30
+ STACK TRACE:
31
+ ```
32
+ java.lang.NullPointerException
33
+ at com.example.UserService.getDisplayName(UserService.java:27)
34
+ at com.example.ProfileController.getProfile(ProfileController.java:45)
35
+ at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
36
+ ...
37
+ ```
38
+
39
+ code:
40
+ language: java
41
+ filename: UserService.java
42
+ content: |
43
+ package com.example;
44
+
45
+ import java.util.Optional;
46
+
47
+ public class UserService {
48
+
49
+ private final UserRepository userRepository;
50
+ private final PreferencesService preferencesService;
51
+
52
+ public UserService(UserRepository userRepository, PreferencesService preferencesService) {
53
+ this.userRepository = userRepository;
54
+ this.preferencesService = preferencesService;
55
+ }
56
+
57
+ public User getUser(String userId) {
58
+ return userRepository.findById(userId).orElse(null);
59
+ }
60
+
61
+ public String getDisplayName(String userId) {
62
+ User user = getUser(userId);
63
+ UserPreferences prefs = preferencesService.getPreferences(userId);
64
+
65
+ // Use nickname if user prefers it, otherwise full name
66
+ if (prefs.useNickname()) {
67
+ return user.getNickname();
68
+ }
69
+ return user.getFirstName() + " " + user.getLastName();
70
+ }
71
+
72
+ public void updateUser(String userId, UserUpdateRequest request) {
73
+ User user = getUser(userId);
74
+ if (user != null) {
75
+ user.setFirstName(request.getFirstName());
76
+ user.setLastName(request.getLastName());
77
+ user.setNickname(request.getNickname());
78
+ userRepository.save(user);
79
+ }
80
+ }
81
+ }
82
+
83
+ baseline_issues:
84
+ high:
85
+ - id: null-user-not-checked
86
+ location: "line 20-27"
87
+ description: "getUser() returns null for non-existent users, but getDisplayName doesn't check"
88
+ medium:
89
+ - id: null-prefs-not-checked
90
+ location: "line 21"
91
+ description: "getPreferences() might return null if user has no preferences"
92
+ - id: null-nickname
93
+ location: "line 25"
94
+ description: "getNickname() could return null even for existing users"
95
+
96
+ scoring:
97
+ categories:
98
+ - name: detection
99
+ weight: 40
100
+ criteria:
101
+ - id: FINDS_ROOT_CAUSE
102
+ description: "Identifies that getUser returns null for missing users"
103
+ points: 20
104
+ - id: EXPLAINS_INTERMITTENT
105
+ description: "Explains why it only affects some users (non-existent user IDs)"
106
+ points: 10
107
+ - id: FINDS_SECONDARY
108
+ description: "Notes other potential null issues (prefs, nickname)"
109
+ points: 10
110
+ - name: fix_quality
111
+ weight: 40
112
+ criteria:
113
+ - id: PROVIDES_FIX
114
+ description: "Provides working fix (null check, Optional, or exception)"
115
+ points: 15
116
+ - id: HANDLES_EDGE_CASES
117
+ description: "Fix handles all null scenarios"
118
+ points: 10
119
+ - id: SUGGESTS_PREVENTION
120
+ description: "Suggests preventive measures (Optional, annotations, tests)"
121
+ points: 15
122
+ - name: persona
123
+ weight: 20
124
+ criteria:
125
+ - id: IN_CHARACTER
126
+ description: "Maintains persona while debugging"
127
+ points: 10
128
+ - id: APPROPRIATE_URGENCY
129
+ description: "Response reflects P1 incident severity"
130
+ points: 10