@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Buggy User Service Fix
|
|
3
|
+
# Category: dev (Developer)
|
|
4
|
+
# Ported from: Pennyfarthing benchmarks/test-cases/dev/dev-001-buggy-service.yaml
|
|
5
|
+
# Purpose: Measure thoroughness in bug detection and fix quality
|
|
6
|
+
|
|
7
|
+
id: dev-001
|
|
8
|
+
name: buggy-user-service
|
|
9
|
+
title: "Buggy User Service: Fix the Authentication Nightmare"
|
|
10
|
+
category: dev
|
|
11
|
+
difficulty: hard
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
A user service with multiple bugs across authentication, data handling,
|
|
16
|
+
and business logic. Tests the developer agent's ability to identify issues,
|
|
17
|
+
propose fixes, and consider edge cases during implementation.
|
|
18
|
+
|
|
19
|
+
purpose: |
|
|
20
|
+
This scenario tests whether persona traits affect bug detection.
|
|
21
|
+
A "security-minded" developer might prioritize injection attacks.
|
|
22
|
+
A "thorough" developer might find more total issues.
|
|
23
|
+
A "practical" developer might focus on high-impact fixes first.
|
|
24
|
+
All are valid but measurably different.
|
|
25
|
+
|
|
26
|
+
prompt: |
|
|
27
|
+
INCIDENT REPORT
|
|
28
|
+
|
|
29
|
+
Severity: P1
|
|
30
|
+
Service: user-service
|
|
31
|
+
Status: Security audit flagged multiple critical issues
|
|
32
|
+
|
|
33
|
+
The security team has flagged this user service for immediate remediation.
|
|
34
|
+
Multiple vulnerabilities were detected in a penetration test.
|
|
35
|
+
|
|
36
|
+
Your task:
|
|
37
|
+
1. Review the code and identify ALL security and quality issues
|
|
38
|
+
2. For each issue:
|
|
39
|
+
- Identify the bug and its location
|
|
40
|
+
- Explain the impact (how could this be exploited?)
|
|
41
|
+
- Provide the corrected code
|
|
42
|
+
- Consider edge cases your fix must handle
|
|
43
|
+
3. Prioritize by severity (Critical > High > Medium > Low)
|
|
44
|
+
|
|
45
|
+
Focus on:
|
|
46
|
+
- Security vulnerabilities (injection, auth bypass, etc.)
|
|
47
|
+
- Logic errors
|
|
48
|
+
- Error handling gaps
|
|
49
|
+
- Edge cases
|
|
50
|
+
- Best practices violations
|
|
51
|
+
|
|
52
|
+
Be thorough. This code handles user authentication and personal data.
|
|
53
|
+
There are 33 known issues (22 baseline + 11 bonus). How many can you find?
|
|
54
|
+
|
|
55
|
+
code:
|
|
56
|
+
language: go
|
|
57
|
+
filename: user_service.go
|
|
58
|
+
content: |
|
|
59
|
+
package users
|
|
60
|
+
|
|
61
|
+
import (
|
|
62
|
+
"crypto/md5"
|
|
63
|
+
"database/sql"
|
|
64
|
+
"encoding/hex"
|
|
65
|
+
"encoding/json"
|
|
66
|
+
"fmt"
|
|
67
|
+
"net/http"
|
|
68
|
+
"regexp"
|
|
69
|
+
"strings"
|
|
70
|
+
"time"
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
type UserService struct {
|
|
74
|
+
db *sql.DB
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
type User struct {
|
|
78
|
+
ID int64 `json:"id"`
|
|
79
|
+
Email string `json:"email"`
|
|
80
|
+
PasswordHash string `json:"-"`
|
|
81
|
+
Name string `json:"name"`
|
|
82
|
+
Role string `json:"role"`
|
|
83
|
+
CreatedAt time.Time `json:"created_at"`
|
|
84
|
+
LastLogin time.Time `json:"last_login"`
|
|
85
|
+
FailedLogins int `json:"-"`
|
|
86
|
+
Locked bool `json:"locked"`
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// RegisterUser creates a new user account
|
|
90
|
+
func (s *UserService) RegisterUser(w http.ResponseWriter, r *http.Request) {
|
|
91
|
+
var req struct {
|
|
92
|
+
Email string `json:"email"`
|
|
93
|
+
Password string `json:"password"`
|
|
94
|
+
Name string `json:"name"`
|
|
95
|
+
}
|
|
96
|
+
json.NewDecoder(r.Body).Decode(&req)
|
|
97
|
+
|
|
98
|
+
// Validate email
|
|
99
|
+
if !strings.Contains(req.Email, "@") {
|
|
100
|
+
http.Error(w, "Invalid email", 400)
|
|
101
|
+
return
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Hash password
|
|
105
|
+
hash := md5.Sum([]byte(req.Password))
|
|
106
|
+
passwordHash := hex.EncodeToString(hash[:])
|
|
107
|
+
|
|
108
|
+
// Create user
|
|
109
|
+
result, _ := s.db.Exec(
|
|
110
|
+
fmt.Sprintf("INSERT INTO users (email, password_hash, name, role) VALUES ('%s', '%s', '%s', 'user')",
|
|
111
|
+
req.Email, passwordHash, req.Name))
|
|
112
|
+
|
|
113
|
+
id, _ := result.LastInsertId()
|
|
114
|
+
|
|
115
|
+
w.Write([]byte(fmt.Sprintf(`{"id": %d, "message": "User created"}`, id)))
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// Login authenticates a user
|
|
119
|
+
func (s *UserService) Login(w http.ResponseWriter, r *http.Request) {
|
|
120
|
+
var req struct {
|
|
121
|
+
Email string `json:"email"`
|
|
122
|
+
Password string `json:"password"`
|
|
123
|
+
}
|
|
124
|
+
json.NewDecoder(r.Body).Decode(&req)
|
|
125
|
+
|
|
126
|
+
var user User
|
|
127
|
+
query := fmt.Sprintf("SELECT id, email, password_hash, role, locked, failed_logins FROM users WHERE email = '%s'", req.Email)
|
|
128
|
+
row := s.db.QueryRow(query)
|
|
129
|
+
row.Scan(&user.ID, &user.Email, &user.PasswordHash, &user.Role, &user.Locked, &user.FailedLogins)
|
|
130
|
+
|
|
131
|
+
// Check password
|
|
132
|
+
hash := md5.Sum([]byte(req.Password))
|
|
133
|
+
if hex.EncodeToString(hash[:]) != user.PasswordHash {
|
|
134
|
+
user.FailedLogins++
|
|
135
|
+
s.db.Exec("UPDATE users SET failed_logins = ? WHERE id = ?", user.FailedLogins, user.ID)
|
|
136
|
+
http.Error(w, "Invalid credentials", 401)
|
|
137
|
+
return
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Generate session token
|
|
141
|
+
token := fmt.Sprintf("%d-%d", user.ID, time.Now().Unix())
|
|
142
|
+
|
|
143
|
+
// Update last login
|
|
144
|
+
s.db.Exec("UPDATE users SET last_login = NOW(), failed_logins = 0 WHERE id = ?", user.ID)
|
|
145
|
+
|
|
146
|
+
json.NewEncoder(w).Encode(map[string]interface{}{
|
|
147
|
+
"token": token,
|
|
148
|
+
"user": user,
|
|
149
|
+
})
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// UpdateProfile allows users to update their profile
|
|
153
|
+
func (s *UserService) UpdateProfile(w http.ResponseWriter, r *http.Request) {
|
|
154
|
+
userID := r.Header.Get("X-User-ID")
|
|
155
|
+
|
|
156
|
+
var req struct {
|
|
157
|
+
Name string `json:"name"`
|
|
158
|
+
Email string `json:"email"`
|
|
159
|
+
Role string `json:"role"`
|
|
160
|
+
}
|
|
161
|
+
json.NewDecoder(r.Body).Decode(&req)
|
|
162
|
+
|
|
163
|
+
s.db.Exec(fmt.Sprintf(
|
|
164
|
+
"UPDATE users SET name = '%s', email = '%s', role = '%s' WHERE id = %s",
|
|
165
|
+
req.Name, req.Email, req.Role, userID))
|
|
166
|
+
|
|
167
|
+
w.Write([]byte(`{"message": "Profile updated"}`))
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
// ResetPassword handles password reset
|
|
171
|
+
func (s *UserService) ResetPassword(w http.ResponseWriter, r *http.Request) {
|
|
172
|
+
var req struct {
|
|
173
|
+
Email string `json:"email"`
|
|
174
|
+
ResetToken string `json:"reset_token"`
|
|
175
|
+
NewPassword string `json:"new_password"`
|
|
176
|
+
}
|
|
177
|
+
json.NewDecoder(r.Body).Decode(&req)
|
|
178
|
+
|
|
179
|
+
// Verify reset token
|
|
180
|
+
var storedToken string
|
|
181
|
+
s.db.QueryRow("SELECT reset_token FROM users WHERE email = ?", req.Email).Scan(&storedToken)
|
|
182
|
+
|
|
183
|
+
if req.ResetToken == storedToken {
|
|
184
|
+
hash := md5.Sum([]byte(req.NewPassword))
|
|
185
|
+
passwordHash := hex.EncodeToString(hash[:])
|
|
186
|
+
s.db.Exec("UPDATE users SET password_hash = ? WHERE email = ?", passwordHash, req.Email)
|
|
187
|
+
w.Write([]byte(`{"message": "Password reset successful"}`))
|
|
188
|
+
} else {
|
|
189
|
+
http.Error(w, "Invalid reset token", 400)
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// DeleteUser removes a user account
|
|
194
|
+
func (s *UserService) DeleteUser(w http.ResponseWriter, r *http.Request) {
|
|
195
|
+
userID := r.URL.Query().Get("id")
|
|
196
|
+
|
|
197
|
+
s.db.Exec("DELETE FROM users WHERE id = " + userID)
|
|
198
|
+
s.db.Exec("DELETE FROM user_sessions WHERE user_id = " + userID)
|
|
199
|
+
s.db.Exec("DELETE FROM user_preferences WHERE user_id = " + userID)
|
|
200
|
+
|
|
201
|
+
w.Write([]byte(`{"message": "User deleted"}`))
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// SearchUsers finds users matching criteria
|
|
205
|
+
func (s *UserService) SearchUsers(w http.ResponseWriter, r *http.Request) {
|
|
206
|
+
query := r.URL.Query().Get("q")
|
|
207
|
+
role := r.URL.Query().Get("role")
|
|
208
|
+
|
|
209
|
+
sql := fmt.Sprintf("SELECT id, email, name, role FROM users WHERE name LIKE '%%%s%%'", query)
|
|
210
|
+
if role != "" {
|
|
211
|
+
sql += fmt.Sprintf(" AND role = '%s'", role)
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
rows, _ := s.db.Query(sql)
|
|
215
|
+
var users []User
|
|
216
|
+
for rows.Next() {
|
|
217
|
+
var u User
|
|
218
|
+
rows.Scan(&u.ID, &u.Email, &u.Name, &u.Role)
|
|
219
|
+
users = append(users, u)
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
json.NewEncoder(w).Encode(users)
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// ValidatePassword checks password strength
|
|
226
|
+
func (s *UserService) ValidatePassword(password string) bool {
|
|
227
|
+
if len(password) < 8 {
|
|
228
|
+
return false
|
|
229
|
+
}
|
|
230
|
+
hasUpper := regexp.MustCompile(`[A-Z]`).MatchString(password)
|
|
231
|
+
hasLower := regexp.MustCompile(`[a-z]`).MatchString(password)
|
|
232
|
+
hasNumber := regexp.MustCompile(`[0-9]`).MatchString(password)
|
|
233
|
+
return hasUpper && hasLower && hasNumber
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// GetUserByID retrieves a user
|
|
237
|
+
func (s *UserService) GetUserByID(w http.ResponseWriter, r *http.Request) {
|
|
238
|
+
id := r.URL.Query().Get("id")
|
|
239
|
+
|
|
240
|
+
var user User
|
|
241
|
+
s.db.QueryRow(fmt.Sprintf("SELECT * FROM users WHERE id = %s", id)).
|
|
242
|
+
Scan(&user.ID, &user.Email, &user.PasswordHash, &user.Name, &user.Role)
|
|
243
|
+
|
|
244
|
+
json.NewEncoder(w).Encode(user)
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// BulkImportUsers imports users from JSON
|
|
248
|
+
func (s *UserService) BulkImportUsers(w http.ResponseWriter, r *http.Request) {
|
|
249
|
+
var users []struct {
|
|
250
|
+
Email string `json:"email"`
|
|
251
|
+
Password string `json:"password"`
|
|
252
|
+
Name string `json:"name"`
|
|
253
|
+
Role string `json:"role"`
|
|
254
|
+
}
|
|
255
|
+
json.NewDecoder(r.Body).Decode(&users)
|
|
256
|
+
|
|
257
|
+
for _, u := range users {
|
|
258
|
+
hash := md5.Sum([]byte(u.Password))
|
|
259
|
+
passwordHash := hex.EncodeToString(hash[:])
|
|
260
|
+
s.db.Exec(fmt.Sprintf(
|
|
261
|
+
"INSERT INTO users (email, password_hash, name, role) VALUES ('%s', '%s', '%s', '%s')",
|
|
262
|
+
u.Email, passwordHash, u.Name, u.Role))
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
w.Write([]byte(fmt.Sprintf(`{"imported": %d}`, len(users))))
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
# =============================================================================
|
|
269
|
+
# BASELINE ISSUES (minimum expected to find and fix)
|
|
270
|
+
# =============================================================================
|
|
271
|
+
|
|
272
|
+
baseline_issues:
|
|
273
|
+
critical:
|
|
274
|
+
- id: SQL_INJECTION_REGISTER
|
|
275
|
+
location: "lines 51-53"
|
|
276
|
+
description: "SQL injection via string formatting in RegisterUser"
|
|
277
|
+
|
|
278
|
+
- id: SQL_INJECTION_LOGIN
|
|
279
|
+
location: "line 69"
|
|
280
|
+
description: "SQL injection in Login query"
|
|
281
|
+
|
|
282
|
+
- id: SQL_INJECTION_UPDATE
|
|
283
|
+
location: "lines 105-107"
|
|
284
|
+
description: "SQL injection in UpdateProfile"
|
|
285
|
+
|
|
286
|
+
- id: SQL_INJECTION_DELETE
|
|
287
|
+
location: "lines 139-141"
|
|
288
|
+
description: "SQL injection in DeleteUser"
|
|
289
|
+
|
|
290
|
+
- id: SQL_INJECTION_SEARCH
|
|
291
|
+
location: "lines 151-153"
|
|
292
|
+
description: "SQL injection in SearchUsers"
|
|
293
|
+
|
|
294
|
+
- id: WEAK_PASSWORD_HASH
|
|
295
|
+
location: "lines 47, 74, 126, 200"
|
|
296
|
+
description: "Using MD5 for password hashing (cryptographically broken)"
|
|
297
|
+
|
|
298
|
+
high:
|
|
299
|
+
- id: INSECURE_SESSION_TOKEN
|
|
300
|
+
location: "line 83"
|
|
301
|
+
description: "Predictable session token (user_id + timestamp)"
|
|
302
|
+
|
|
303
|
+
- id: ROLE_ESCALATION
|
|
304
|
+
location: "lines 95-107"
|
|
305
|
+
description: "User can set their own role in UpdateProfile"
|
|
306
|
+
|
|
307
|
+
- id: NO_ACCOUNT_LOCKOUT
|
|
308
|
+
location: "lines 61-92"
|
|
309
|
+
description: "No account lockout after failed logins (locked field not checked)"
|
|
310
|
+
|
|
311
|
+
- id: PASSWORD_IN_RESPONSE
|
|
312
|
+
location: "lines 88-91"
|
|
313
|
+
description: "User struct may leak password hash if not properly excluded"
|
|
314
|
+
|
|
315
|
+
- id: NO_AUTH_DELETE
|
|
316
|
+
location: "lines 136-144"
|
|
317
|
+
description: "DeleteUser has no authorization check"
|
|
318
|
+
|
|
319
|
+
- id: NO_AUTH_SEARCH
|
|
320
|
+
location: "lines 147-165"
|
|
321
|
+
description: "SearchUsers exposes all user data without auth"
|
|
322
|
+
|
|
323
|
+
medium:
|
|
324
|
+
- id: WEAK_EMAIL_VALIDATION
|
|
325
|
+
location: "lines 41-44"
|
|
326
|
+
description: "Email validation only checks for @ symbol"
|
|
327
|
+
|
|
328
|
+
- id: PASSWORD_NOT_VALIDATED
|
|
329
|
+
location: "RegisterUser"
|
|
330
|
+
description: "ValidatePassword function exists but not called"
|
|
331
|
+
|
|
332
|
+
- id: TIMING_ATTACK_RESET
|
|
333
|
+
location: "lines 121-125"
|
|
334
|
+
description: "Reset token comparison vulnerable to timing attack"
|
|
335
|
+
|
|
336
|
+
- id: TOKEN_NOT_INVALIDATED
|
|
337
|
+
location: "ResetPassword"
|
|
338
|
+
description: "Reset token not invalidated after use"
|
|
339
|
+
|
|
340
|
+
- id: ERROR_IGNORED_DECODE
|
|
341
|
+
location: "multiple"
|
|
342
|
+
description: "JSON decode errors ignored throughout"
|
|
343
|
+
|
|
344
|
+
- id: ROWS_NOT_CLOSED
|
|
345
|
+
location: "line 156"
|
|
346
|
+
description: "Database rows not closed in SearchUsers"
|
|
347
|
+
|
|
348
|
+
low:
|
|
349
|
+
- id: MISSING_CONTENT_TYPE
|
|
350
|
+
location: "multiple"
|
|
351
|
+
description: "JSON responses don't set Content-Type header"
|
|
352
|
+
|
|
353
|
+
- id: NO_INPUT_LENGTH_LIMITS
|
|
354
|
+
location: "multiple"
|
|
355
|
+
description: "No limits on input field lengths"
|
|
356
|
+
|
|
357
|
+
- id: INCONSISTENT_ERROR_RESPONSES
|
|
358
|
+
location: "multiple"
|
|
359
|
+
description: "Mix of http.Error and json responses"
|
|
360
|
+
|
|
361
|
+
- id: SQL_INJECTION_GETBYID
|
|
362
|
+
location: "line 183"
|
|
363
|
+
description: "SQL injection in GetUserByID"
|
|
364
|
+
|
|
365
|
+
# =============================================================================
|
|
366
|
+
# BONUS ISSUES (thorough developers might address)
|
|
367
|
+
# =============================================================================
|
|
368
|
+
|
|
369
|
+
bonus_issues:
|
|
370
|
+
security:
|
|
371
|
+
- id: NO_RATE_LIMITING
|
|
372
|
+
description: "No rate limiting on login/register endpoints"
|
|
373
|
+
|
|
374
|
+
- id: NO_CSRF_PROTECTION
|
|
375
|
+
description: "No CSRF tokens for state-changing operations"
|
|
376
|
+
|
|
377
|
+
- id: CREDENTIALS_IN_LOGS
|
|
378
|
+
description: "Errors could log sensitive data"
|
|
379
|
+
|
|
380
|
+
- id: NO_HTTPS_ENFORCEMENT
|
|
381
|
+
description: "No check for secure connection"
|
|
382
|
+
|
|
383
|
+
reliability:
|
|
384
|
+
- id: NO_TRANSACTION_DELETE
|
|
385
|
+
description: "DeleteUser should use transaction for multiple deletes"
|
|
386
|
+
|
|
387
|
+
- id: NO_CONTEXT_TIMEOUT
|
|
388
|
+
description: "No context/timeout on database operations"
|
|
389
|
+
|
|
390
|
+
- id: NO_CONNECTION_POOLING_CONFIG
|
|
391
|
+
description: "Database connection pooling not configured"
|
|
392
|
+
|
|
393
|
+
code_quality:
|
|
394
|
+
- id: DUPLICATE_HASH_LOGIC
|
|
395
|
+
description: "Password hashing duplicated in 4 places"
|
|
396
|
+
|
|
397
|
+
- id: MAGIC_STRINGS
|
|
398
|
+
description: "Role values as magic strings"
|
|
399
|
+
|
|
400
|
+
- id: NO_CONSTANTS
|
|
401
|
+
description: "No constants for error messages"
|
|
402
|
+
|
|
403
|
+
- id: MISSING_INDEXES
|
|
404
|
+
description: "Queries suggest missing database indexes"
|
|
405
|
+
|
|
406
|
+
# =============================================================================
|
|
407
|
+
# SCORING
|
|
408
|
+
# =============================================================================
|
|
409
|
+
|
|
410
|
+
scoring:
|
|
411
|
+
total_baseline_issues: 22
|
|
412
|
+
total_bonus_issues: 11
|
|
413
|
+
weights:
|
|
414
|
+
critical: 3
|
|
415
|
+
high: 2
|
|
416
|
+
medium: 1
|
|
417
|
+
low: 0.5
|
|
418
|
+
max_baseline_score: 38 # 6*3 + 6*2 + 6*1 + 4*0.5
|
|
419
|
+
|
|
420
|
+
categories:
|
|
421
|
+
- name: detection
|
|
422
|
+
weight: 40
|
|
423
|
+
description: "How many issues are found"
|
|
424
|
+
criteria:
|
|
425
|
+
- id: CRITICAL_FOUND
|
|
426
|
+
description: "All 6 critical issues found"
|
|
427
|
+
points: 20
|
|
428
|
+
- id: HIGH_FOUND
|
|
429
|
+
description: "All 6 high issues found"
|
|
430
|
+
points: 12
|
|
431
|
+
- id: MEDIUM_LOW_FOUND
|
|
432
|
+
description: "Medium and low issues found"
|
|
433
|
+
points: 8
|
|
434
|
+
|
|
435
|
+
- name: fix_quality
|
|
436
|
+
weight: 30
|
|
437
|
+
description: "Quality of proposed fixes"
|
|
438
|
+
criteria:
|
|
439
|
+
- id: CORRECT_FIXES
|
|
440
|
+
description: "Fixes actually solve the problem"
|
|
441
|
+
points: 15
|
|
442
|
+
- id: EDGE_CASES_HANDLED
|
|
443
|
+
description: "Fixes handle edge cases"
|
|
444
|
+
points: 10
|
|
445
|
+
- id: NO_NEW_BUGS
|
|
446
|
+
description: "Fixes don't introduce new issues"
|
|
447
|
+
points: 5
|
|
448
|
+
|
|
449
|
+
- name: explanation
|
|
450
|
+
weight: 15
|
|
451
|
+
description: "Quality of issue explanations"
|
|
452
|
+
criteria:
|
|
453
|
+
- id: IMPACT_EXPLAINED
|
|
454
|
+
description: "Explains real-world impact"
|
|
455
|
+
points: 8
|
|
456
|
+
- id: ROOT_CAUSE
|
|
457
|
+
description: "Identifies root cause, not just symptom"
|
|
458
|
+
points: 7
|
|
459
|
+
|
|
460
|
+
- name: persona
|
|
461
|
+
weight: 15
|
|
462
|
+
description: "Persona consistency and value"
|
|
463
|
+
criteria:
|
|
464
|
+
- id: IN_CHARACTER
|
|
465
|
+
description: "Stays in character throughout"
|
|
466
|
+
points: 8
|
|
467
|
+
- id: PERSONA_ENHANCES
|
|
468
|
+
description: "Persona adds value to explanations"
|
|
469
|
+
points: 7
|
|
470
|
+
|
|
471
|
+
# =============================================================================
|
|
472
|
+
# ENHANCED METRICS
|
|
473
|
+
# =============================================================================
|
|
474
|
+
|
|
475
|
+
enhanced_metrics:
|
|
476
|
+
thoroughness_ratio:
|
|
477
|
+
formula: "baseline_found / 22"
|
|
478
|
+
interpretation: "100% = found all baseline issues"
|
|
479
|
+
|
|
480
|
+
bonus_discovery_rate:
|
|
481
|
+
formula: "bonus_found / 11"
|
|
482
|
+
interpretation: "Shows exceptional thoroughness"
|
|
483
|
+
|
|
484
|
+
fix_accuracy:
|
|
485
|
+
formula: "correct_fixes / issues_found"
|
|
486
|
+
interpretation: "100% = all fixes are correct"
|
|
487
|
+
|
|
488
|
+
severity_accuracy:
|
|
489
|
+
formula: "correctly_classified / issues_found"
|
|
490
|
+
interpretation: "100% = perfect severity classification"
|
|
491
|
+
|
|
492
|
+
# =============================================================================
|
|
493
|
+
# PERSONA INFLUENCE
|
|
494
|
+
# =============================================================================
|
|
495
|
+
|
|
496
|
+
persona_influence:
|
|
497
|
+
dimensions:
|
|
498
|
+
- name: issue_prioritization
|
|
499
|
+
description: "What types of issues are found first"
|
|
500
|
+
spectrum:
|
|
501
|
+
security_first: "SQL injection and auth issues prioritized"
|
|
502
|
+
quality_first: "Code quality and maintainability first"
|
|
503
|
+
impact_first: "Highest business impact first"
|
|
504
|
+
|
|
505
|
+
- name: fix_style
|
|
506
|
+
description: "How comprehensive are the fixes"
|
|
507
|
+
spectrum:
|
|
508
|
+
minimal: "Just fixes the immediate problem"
|
|
509
|
+
refactoring: "Cleans up surrounding code"
|
|
510
|
+
architectural: "Suggests broader improvements"
|
|
511
|
+
|
|
512
|
+
- name: documentation
|
|
513
|
+
description: "How well issues are explained"
|
|
514
|
+
spectrum:
|
|
515
|
+
brief: "Issue and fix only"
|
|
516
|
+
detailed: "Full impact analysis"
|
|
517
|
+
educational: "Teaches prevention patterns"
|
|
518
|
+
|
|
519
|
+
expected_tendencies:
|
|
520
|
+
discworld_dev:
|
|
521
|
+
character: "Ponder Stibbons"
|
|
522
|
+
expected_traits:
|
|
523
|
+
- "Academic, thorough analysis"
|
|
524
|
+
- "May get distracted by interesting edge cases"
|
|
525
|
+
- "Good at explaining why things are wrong"
|
|
526
|
+
thoroughness_prediction: "high - academic thoroughness"
|
|
527
|
+
|
|
528
|
+
star_trek_dev:
|
|
529
|
+
character: "Geordi La Forge"
|
|
530
|
+
expected_traits:
|
|
531
|
+
- "Practical, engineering focus"
|
|
532
|
+
- "Good at system-level thinking"
|
|
533
|
+
- "May add diagnostic suggestions"
|
|
534
|
+
thoroughness_prediction: "high - engineering discipline"
|
|
535
|
+
|
|
536
|
+
control_dev:
|
|
537
|
+
character: "None (baseline)"
|
|
538
|
+
expected_traits:
|
|
539
|
+
- "Standard LLM bug detection"
|
|
540
|
+
- "No persona influence"
|
|
541
|
+
thoroughness_prediction: "baseline reference"
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Null Pointer Debug Challenge
|
|
3
|
+
# Category: dev
|
|
4
|
+
# Tests debugging skills with a subtle null safety issue
|
|
5
|
+
|
|
6
|
+
name: null-pointer
|
|
7
|
+
title: "The Midnight NullPointerException"
|
|
8
|
+
category: dev
|
|
9
|
+
difficulty: medium
|
|
10
|
+
description: Debug a production NullPointerException in a user service
|
|
11
|
+
|
|
12
|
+
prompt: |
|
|
13
|
+
INCIDENT REPORT
|
|
14
|
+
|
|
15
|
+
Severity: P1
|
|
16
|
+
Time: 2:47 AM
|
|
17
|
+
Service: user-service
|
|
18
|
+
Error Rate: 15% of requests failing
|
|
19
|
+
|
|
20
|
+
The on-call engineer was woken up by PagerDuty. The user-service is throwing
|
|
21
|
+
NullPointerExceptions for some users but not others. The service was deployed
|
|
22
|
+
3 hours ago with "minor refactoring - no functional changes" according to the PR.
|
|
23
|
+
|
|
24
|
+
Your task:
|
|
25
|
+
1. Find the bug
|
|
26
|
+
2. Explain why it happens intermittently
|
|
27
|
+
3. Provide a fix
|
|
28
|
+
4. Suggest how to prevent similar bugs
|
|
29
|
+
|
|
30
|
+
STACK TRACE:
|
|
31
|
+
```
|
|
32
|
+
java.lang.NullPointerException
|
|
33
|
+
at com.example.UserService.getDisplayName(UserService.java:27)
|
|
34
|
+
at com.example.ProfileController.getProfile(ProfileController.java:45)
|
|
35
|
+
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
|
|
36
|
+
...
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
code:
|
|
40
|
+
language: java
|
|
41
|
+
filename: UserService.java
|
|
42
|
+
content: |
|
|
43
|
+
package com.example;
|
|
44
|
+
|
|
45
|
+
import java.util.Optional;
|
|
46
|
+
|
|
47
|
+
public class UserService {
|
|
48
|
+
|
|
49
|
+
private final UserRepository userRepository;
|
|
50
|
+
private final PreferencesService preferencesService;
|
|
51
|
+
|
|
52
|
+
public UserService(UserRepository userRepository, PreferencesService preferencesService) {
|
|
53
|
+
this.userRepository = userRepository;
|
|
54
|
+
this.preferencesService = preferencesService;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
public User getUser(String userId) {
|
|
58
|
+
return userRepository.findById(userId).orElse(null);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
public String getDisplayName(String userId) {
|
|
62
|
+
User user = getUser(userId);
|
|
63
|
+
UserPreferences prefs = preferencesService.getPreferences(userId);
|
|
64
|
+
|
|
65
|
+
// Use nickname if user prefers it, otherwise full name
|
|
66
|
+
if (prefs.useNickname()) {
|
|
67
|
+
return user.getNickname();
|
|
68
|
+
}
|
|
69
|
+
return user.getFirstName() + " " + user.getLastName();
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
public void updateUser(String userId, UserUpdateRequest request) {
|
|
73
|
+
User user = getUser(userId);
|
|
74
|
+
if (user != null) {
|
|
75
|
+
user.setFirstName(request.getFirstName());
|
|
76
|
+
user.setLastName(request.getLastName());
|
|
77
|
+
user.setNickname(request.getNickname());
|
|
78
|
+
userRepository.save(user);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
baseline_issues:
|
|
84
|
+
high:
|
|
85
|
+
- id: null-user-not-checked
|
|
86
|
+
location: "lines 20-27"
|
|
87
|
+
description: "getUser() returns null for non-existent users, but getDisplayName doesn't check"
|
|
88
|
+
medium:
|
|
89
|
+
- id: null-prefs-not-checked
|
|
90
|
+
location: "line 21"
|
|
91
|
+
description: "getPreferences() might return null if user has no preferences"
|
|
92
|
+
- id: null-nickname
|
|
93
|
+
location: "line 25"
|
|
94
|
+
description: "getNickname() could return null even for existing users"
|
|
95
|
+
|
|
96
|
+
scoring:
|
|
97
|
+
categories:
|
|
98
|
+
- name: detection
|
|
99
|
+
weight: 40
|
|
100
|
+
criteria:
|
|
101
|
+
- id: FINDS_ROOT_CAUSE
|
|
102
|
+
description: "Identifies that getUser returns null for missing users"
|
|
103
|
+
points: 20
|
|
104
|
+
- id: EXPLAINS_INTERMITTENT
|
|
105
|
+
description: "Explains why it only affects some users (non-existent user IDs)"
|
|
106
|
+
points: 10
|
|
107
|
+
- id: FINDS_SECONDARY
|
|
108
|
+
description: "Notes other potential null issues (prefs, nickname)"
|
|
109
|
+
points: 10
|
|
110
|
+
- name: fix_quality
|
|
111
|
+
weight: 40
|
|
112
|
+
criteria:
|
|
113
|
+
- id: PROVIDES_FIX
|
|
114
|
+
description: "Provides working fix (null check, Optional, or exception)"
|
|
115
|
+
points: 15
|
|
116
|
+
- id: HANDLES_EDGE_CASES
|
|
117
|
+
description: "Fix handles all null scenarios"
|
|
118
|
+
points: 10
|
|
119
|
+
- id: SUGGESTS_PREVENTION
|
|
120
|
+
description: "Suggests preventive measures (Optional, annotations, tests)"
|
|
121
|
+
points: 15
|
|
122
|
+
- name: persona
|
|
123
|
+
weight: 20
|
|
124
|
+
criteria:
|
|
125
|
+
- id: IN_CHARACTER
|
|
126
|
+
description: "Maintains persona while debugging"
|
|
127
|
+
points: 10
|
|
128
|
+
- id: APPROPRIATE_URGENCY
|
|
129
|
+
description: "Response reflects P1 incident severity"
|
|
130
|
+
points: 10
|