@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Async Control Flow Errors
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: medium
|
|
5
|
+
# Error Type Focus: planning (single-type)
|
|
6
|
+
|
|
7
|
+
id: debug-004
|
|
8
|
+
name: async-control-flow
|
|
9
|
+
title: "Async Chaos: Promise Pitfalls and Await Amnesia"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: medium
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
An API client with async/await control flow issues.
|
|
16
|
+
Tests understanding of JavaScript async execution model.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario measures detection of planning-level bugs - where the
|
|
20
|
+
sequence of operations is incorrectly designed. Agents must understand
|
|
21
|
+
the async execution model to identify these issues.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
BUG REPORT
|
|
25
|
+
|
|
26
|
+
Service: api-client
|
|
27
|
+
Severity: P1
|
|
28
|
+
Status: Race conditions and unhandled promise rejections
|
|
29
|
+
|
|
30
|
+
The API client is behaving erratically:
|
|
31
|
+
- Sometimes returns stale data
|
|
32
|
+
- Occasionally crashes with unhandled rejections
|
|
33
|
+
- Results appear in wrong order
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
1. Identify async control flow issues
|
|
37
|
+
2. Explain why the current implementation fails
|
|
38
|
+
3. Fix the async/await patterns
|
|
39
|
+
|
|
40
|
+
There are 6 known issues. How many can you find?
|
|
41
|
+
|
|
42
|
+
code:
|
|
43
|
+
language: typescript
|
|
44
|
+
filename: api-client.ts
|
|
45
|
+
content: |
|
|
46
|
+
interface User {
|
|
47
|
+
id: string;
|
|
48
|
+
name: string;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
interface Order {
|
|
52
|
+
id: string;
|
|
53
|
+
userId: string;
|
|
54
|
+
total: number;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
class ApiClient {
|
|
58
|
+
private cache: Map<string, User> = new Map();
|
|
59
|
+
|
|
60
|
+
async fetchUser(id: string): Promise<User> {
|
|
61
|
+
// Bug: Not awaiting cache check properly
|
|
62
|
+
if (this.cache.has(id)) {
|
|
63
|
+
return this.cache.get(id);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
const response = await fetch(`/api/users/${id}`);
|
|
67
|
+
const user = await response.json();
|
|
68
|
+
|
|
69
|
+
// Bug: Setting cache after return - never executes
|
|
70
|
+
this.cache.set(id, user);
|
|
71
|
+
return user;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
async fetchAllUsers(ids: string[]): Promise<User[]> {
|
|
75
|
+
const users: User[] = [];
|
|
76
|
+
|
|
77
|
+
// Bug: forEach doesn't wait for async operations
|
|
78
|
+
ids.forEach(async (id) => {
|
|
79
|
+
const user = await this.fetchUser(id);
|
|
80
|
+
users.push(user);
|
|
81
|
+
});
|
|
82
|
+
|
|
83
|
+
// Returns before async operations complete
|
|
84
|
+
return users;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
async fetchUserWithOrders(userId: string): Promise<{user: User, orders: Order[]}> {
|
|
88
|
+
// Bug: Sequential when could be parallel
|
|
89
|
+
const user = await this.fetchUser(userId);
|
|
90
|
+
const ordersResponse = await fetch(`/api/users/${userId}/orders`);
|
|
91
|
+
const orders = await ordersResponse.json();
|
|
92
|
+
|
|
93
|
+
return { user, orders };
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
async saveUser(user: User): Promise<void> {
|
|
97
|
+
// Bug: Not handling the promise rejection
|
|
98
|
+
fetch('/api/users', {
|
|
99
|
+
method: 'POST',
|
|
100
|
+
body: JSON.stringify(user),
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
// Bug: Updating cache before confirming save succeeded
|
|
104
|
+
this.cache.set(user.id, user);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
async processQueue(queue: string[]): Promise<void> {
|
|
108
|
+
// Bug: No error handling - one failure stops all processing
|
|
109
|
+
for (const item of queue) {
|
|
110
|
+
await this.processItem(item);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
private async processItem(item: string): Promise<void> {
|
|
115
|
+
const response = await fetch(`/api/process/${item}`);
|
|
116
|
+
if (!response.ok) {
|
|
117
|
+
throw new Error(`Failed to process ${item}`);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
baseline_issues:
|
|
123
|
+
critical:
|
|
124
|
+
- id: async-001
|
|
125
|
+
location: "lines 30-36"
|
|
126
|
+
description: "forEach with async callback doesn't wait - returns empty array"
|
|
127
|
+
error_type: planning
|
|
128
|
+
high:
|
|
129
|
+
- id: async-002
|
|
130
|
+
location: "lines 53-59"
|
|
131
|
+
description: "Not awaiting fetch - fire and forget loses errors"
|
|
132
|
+
error_type: planning
|
|
133
|
+
- id: async-003
|
|
134
|
+
location: "line 61"
|
|
135
|
+
description: "Cache updated before save confirmed - optimistic update without rollback"
|
|
136
|
+
error_type: planning
|
|
137
|
+
medium:
|
|
138
|
+
- id: async-004
|
|
139
|
+
location: "lines 42-47"
|
|
140
|
+
description: "Sequential fetches when parallel Promise.all would be faster"
|
|
141
|
+
error_type: planning
|
|
142
|
+
- id: async-005
|
|
143
|
+
location: "lines 64-68"
|
|
144
|
+
description: "No try/catch - one failure aborts entire queue"
|
|
145
|
+
error_type: planning
|
|
146
|
+
low:
|
|
147
|
+
- id: async-006
|
|
148
|
+
location: "lines 25-26"
|
|
149
|
+
description: "Cache set after return statement - dead code"
|
|
150
|
+
error_type: planning
|
|
151
|
+
|
|
152
|
+
scoring:
|
|
153
|
+
detection:
|
|
154
|
+
weight: 45
|
|
155
|
+
criteria: "Finding all 6 async control flow issues"
|
|
156
|
+
fix_quality:
|
|
157
|
+
weight: 35
|
|
158
|
+
criteria: "Implementing proper async patterns"
|
|
159
|
+
explanation:
|
|
160
|
+
weight: 20
|
|
161
|
+
criteria: "Explaining async execution model"
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Authentication/Authorization Bypass
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: hard
|
|
5
|
+
# Error Type Focus: mixed (reasoning + planning)
|
|
6
|
+
|
|
7
|
+
id: debug-010
|
|
8
|
+
name: auth-bypass
|
|
9
|
+
title: "Auth Bypass: Breaking the Gates"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: hard
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
An authentication and authorization system with multiple bypass vulnerabilities.
|
|
16
|
+
Tests detection of access control flaws and authentication weaknesses.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario has mixed error types - reasoning errors in auth logic
|
|
20
|
+
and planning errors in the security architecture. A thorough security
|
|
21
|
+
review will identify flaws at both levels.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
SECURITY INCIDENT
|
|
25
|
+
|
|
26
|
+
Service: auth-service
|
|
27
|
+
Severity: P0
|
|
28
|
+
Status: Unauthorized access detected
|
|
29
|
+
|
|
30
|
+
Investigation revealed multiple auth weaknesses:
|
|
31
|
+
- Some endpoints bypass authentication entirely
|
|
32
|
+
- Role checks can be circumvented
|
|
33
|
+
- Session handling has vulnerabilities
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
1. Find all authentication and authorization vulnerabilities
|
|
37
|
+
2. Explain how each can be exploited
|
|
38
|
+
3. Implement proper access controls
|
|
39
|
+
|
|
40
|
+
There are 8 known issues. How many can you find?
|
|
41
|
+
|
|
42
|
+
code:
|
|
43
|
+
language: typescript
|
|
44
|
+
filename: auth-service.ts
|
|
45
|
+
content: |
|
|
46
|
+
interface User {
|
|
47
|
+
id: string;
|
|
48
|
+
email: string;
|
|
49
|
+
role: 'user' | 'admin' | 'superadmin';
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
interface Session {
|
|
53
|
+
userId: string;
|
|
54
|
+
role: string;
|
|
55
|
+
expiresAt: number;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
class AuthService {
|
|
59
|
+
private sessions: Map<string, Session> = new Map();
|
|
60
|
+
|
|
61
|
+
async login(email: string, password: string): Promise<string | null> {
|
|
62
|
+
const user = await this.findUser(email);
|
|
63
|
+
|
|
64
|
+
// Bug: Returns early if user not found - timing attack
|
|
65
|
+
if (!user) {
|
|
66
|
+
return null;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Bug: Plain comparison allows timing attack
|
|
70
|
+
if (password !== user.passwordHash) {
|
|
71
|
+
return null;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
const token = this.generateToken();
|
|
75
|
+
this.sessions.set(token, {
|
|
76
|
+
userId: user.id,
|
|
77
|
+
role: user.role,
|
|
78
|
+
// Bug: Token never expires (expiresAt not checked)
|
|
79
|
+
expiresAt: Date.now() + 86400000,
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
return token;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
isAuthenticated(token: string): boolean {
|
|
86
|
+
const session = this.sessions.get(token);
|
|
87
|
+
// Bug: Doesn't check if session is expired
|
|
88
|
+
return session !== undefined;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
isAdmin(token: string): boolean {
|
|
92
|
+
const session = this.sessions.get(token);
|
|
93
|
+
if (!session) return false;
|
|
94
|
+
|
|
95
|
+
// Bug: Role stored in session can be modified client-side
|
|
96
|
+
return session.role === 'admin' || session.role === 'superadmin';
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
async updateUserRole(token: string, targetUserId: string, newRole: string): Promise<boolean> {
|
|
100
|
+
// Bug: Only checks if requester is admin, not if they can modify target
|
|
101
|
+
if (!this.isAdmin(token)) {
|
|
102
|
+
return false;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// Bug: Admin can escalate anyone to superadmin
|
|
106
|
+
await this.setUserRole(targetUserId, newRole);
|
|
107
|
+
return true;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
async deleteUser(token: string, targetUserId: string): Promise<boolean> {
|
|
111
|
+
const session = this.sessions.get(token);
|
|
112
|
+
if (!session) return false;
|
|
113
|
+
|
|
114
|
+
// Bug: Can delete own account while logged in - orphaned session
|
|
115
|
+
if (session.userId === targetUserId) {
|
|
116
|
+
await this.removeUser(targetUserId);
|
|
117
|
+
return true;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Bug: User can delete any user if they know the ID (no admin check for others)
|
|
121
|
+
await this.removeUser(targetUserId);
|
|
122
|
+
return true;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
async resetPassword(email: string): Promise<void> {
|
|
126
|
+
const user = await this.findUser(email);
|
|
127
|
+
if (!user) return; // Bug: Silent return reveals user existence
|
|
128
|
+
|
|
129
|
+
const resetToken = this.generateToken();
|
|
130
|
+
// Bug: Reset token stored in session map - can be enumerated
|
|
131
|
+
this.sessions.set(resetToken, {
|
|
132
|
+
userId: user.id,
|
|
133
|
+
role: 'reset',
|
|
134
|
+
expiresAt: Date.now() + 3600000,
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
await this.sendResetEmail(user.email, resetToken);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Stub implementations
|
|
141
|
+
private async findUser(email: string): Promise<User & {passwordHash: string} | null> {
|
|
142
|
+
return null;
|
|
143
|
+
}
|
|
144
|
+
private generateToken(): string { return Math.random().toString(36); }
|
|
145
|
+
private async setUserRole(id: string, role: string): Promise<void> {}
|
|
146
|
+
private async removeUser(id: string): Promise<void> {}
|
|
147
|
+
private async sendResetEmail(email: string, token: string): Promise<void> {}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
baseline_issues:
|
|
151
|
+
critical:
|
|
152
|
+
- id: auth-001
|
|
153
|
+
location: "lines 63-71"
|
|
154
|
+
description: "deleteUser allows any user to delete any other user - no admin check"
|
|
155
|
+
error_type: reasoning
|
|
156
|
+
- id: auth-002
|
|
157
|
+
location: "lines 53-58"
|
|
158
|
+
description: "Admin can escalate to superadmin - privilege escalation"
|
|
159
|
+
error_type: reasoning
|
|
160
|
+
high:
|
|
161
|
+
- id: auth-003
|
|
162
|
+
location: "line 39"
|
|
163
|
+
description: "isAuthenticated doesn't check session expiry"
|
|
164
|
+
error_type: planning
|
|
165
|
+
- id: auth-004
|
|
166
|
+
location: "line 46"
|
|
167
|
+
description: "Role from client-controllable session - insecure trust boundary"
|
|
168
|
+
error_type: planning
|
|
169
|
+
- id: auth-005
|
|
170
|
+
location: "lines 18-23"
|
|
171
|
+
description: "Timing difference reveals if user exists"
|
|
172
|
+
error_type: reasoning
|
|
173
|
+
medium:
|
|
174
|
+
- id: auth-006
|
|
175
|
+
location: "line 75"
|
|
176
|
+
description: "Silent return on invalid email reveals user existence"
|
|
177
|
+
error_type: reasoning
|
|
178
|
+
- id: auth-007
|
|
179
|
+
location: "line 24"
|
|
180
|
+
description: "Direct string comparison vulnerable to timing attack"
|
|
181
|
+
error_type: planning
|
|
182
|
+
low:
|
|
183
|
+
- id: auth-008
|
|
184
|
+
location: "lines 65-68"
|
|
185
|
+
description: "Self-deletion leaves orphaned session token"
|
|
186
|
+
error_type: planning
|
|
187
|
+
|
|
188
|
+
scoring:
|
|
189
|
+
detection:
|
|
190
|
+
weight: 40
|
|
191
|
+
criteria: "Finding all 8 auth vulnerabilities"
|
|
192
|
+
fix_quality:
|
|
193
|
+
weight: 35
|
|
194
|
+
criteria: "Implementing secure auth patterns"
|
|
195
|
+
explanation:
|
|
196
|
+
weight: 25
|
|
197
|
+
criteria: "Explaining attack scenarios"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Error Handling Gaps
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: medium
|
|
5
|
+
# Error Type Focus: mixed (planning + execution)
|
|
6
|
+
|
|
7
|
+
id: debug-007
|
|
8
|
+
name: error-handling
|
|
9
|
+
title: "Error Handling: When Things Go Wrong"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: medium
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
A payment processing service with various error handling issues.
|
|
16
|
+
Tests detection of missing try/catch, swallowed errors, and poor error propagation.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario has mixed error types - planning errors (no error strategy)
|
|
20
|
+
and execution errors (missing specific checks). A thorough agent will
|
|
21
|
+
identify both the strategic and tactical error handling gaps.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
INCIDENT POST-MORTEM
|
|
25
|
+
|
|
26
|
+
Service: payment-processor
|
|
27
|
+
Severity: P0
|
|
28
|
+
Status: Silent failures caused lost revenue
|
|
29
|
+
|
|
30
|
+
Investigation revealed multiple error handling issues:
|
|
31
|
+
- Some errors are silently swallowed
|
|
32
|
+
- Some errors crash the entire service
|
|
33
|
+
- Error messages expose internal details
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
1. Find all error handling issues
|
|
37
|
+
2. Categorize as missing handling vs poor handling
|
|
38
|
+
3. Implement proper error handling strategy
|
|
39
|
+
|
|
40
|
+
There are 7 known issues. How many can you find?
|
|
41
|
+
|
|
42
|
+
code:
|
|
43
|
+
language: typescript
|
|
44
|
+
filename: payment-processor.ts
|
|
45
|
+
content: |
|
|
46
|
+
interface PaymentResult {
|
|
47
|
+
success: boolean;
|
|
48
|
+
transactionId?: string;
|
|
49
|
+
error?: string;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
class PaymentProcessor {
|
|
53
|
+
private apiKey: string;
|
|
54
|
+
|
|
55
|
+
constructor(apiKey: string) {
|
|
56
|
+
this.apiKey = apiKey;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
async processPayment(amount: number, cardNumber: string): Promise<PaymentResult> {
|
|
60
|
+
// Bug: No validation before processing
|
|
61
|
+
const response = await fetch('/api/payments', {
|
|
62
|
+
method: 'POST',
|
|
63
|
+
headers: { 'Authorization': this.apiKey },
|
|
64
|
+
body: JSON.stringify({ amount, cardNumber }),
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
// Bug: Not checking response.ok
|
|
68
|
+
const data = await response.json();
|
|
69
|
+
return data;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
async refundPayment(transactionId: string): Promise<void> {
|
|
73
|
+
try {
|
|
74
|
+
await fetch(`/api/refunds/${transactionId}`, { method: 'POST' });
|
|
75
|
+
} catch (error) {
|
|
76
|
+
// Bug: Error swallowed - caller never knows refund failed
|
|
77
|
+
console.log('Refund failed');
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
async batchProcess(payments: Array<{amount: number, card: string}>): Promise<PaymentResult[]> {
|
|
82
|
+
const results: PaymentResult[] = [];
|
|
83
|
+
|
|
84
|
+
for (const payment of payments) {
|
|
85
|
+
// Bug: One failure stops entire batch - no isolation
|
|
86
|
+
const result = await this.processPayment(payment.amount, payment.card);
|
|
87
|
+
results.push(result);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
return results;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
async validateCard(cardNumber: string): Promise<boolean> {
|
|
94
|
+
const response = await fetch('/api/validate', {
|
|
95
|
+
method: 'POST',
|
|
96
|
+
body: JSON.stringify({ cardNumber }),
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
// Bug: Throws raw error with internal details
|
|
100
|
+
if (!response.ok) {
|
|
101
|
+
throw new Error(`Validation failed: ${response.status} - ${await response.text()}`);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
return true;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
async processWithRetry(amount: number, card: string): Promise<PaymentResult> {
|
|
108
|
+
let attempts = 0;
|
|
109
|
+
const maxAttempts = 3;
|
|
110
|
+
|
|
111
|
+
// Bug: Retries non-retryable errors (every failure is retried up to maxAttempts)
|
|
112
|
+
while (true) {
|
|
113
|
+
try {
|
|
114
|
+
return await this.processPayment(amount, card);
|
|
115
|
+
} catch (error) {
|
|
116
|
+
attempts++;
|
|
117
|
+
if (attempts >= maxAttempts) {
|
|
118
|
+
// Bug: Returns success:false without error details
|
|
119
|
+
return { success: false };
|
|
120
|
+
}
|
|
121
|
+
// No delay between retries
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
private async logError(error: Error): Promise<void> {
|
|
127
|
+
// Bug: If logging fails, original error is lost
|
|
128
|
+
await fetch('/api/logs', {
|
|
129
|
+
method: 'POST',
|
|
130
|
+
body: JSON.stringify({ error: error.message }),
|
|
131
|
+
});
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
baseline_issues:
|
|
136
|
+
critical:
|
|
137
|
+
- id: err-001
|
|
138
|
+
location: "lines 29-32"
|
|
139
|
+
description: "Error swallowed with console.log - caller unaware of failure"
|
|
140
|
+
error_type: planning
|
|
141
|
+
- id: err-002
|
|
142
|
+
location: "line 23"
|
|
143
|
+
description: "Not checking response.ok - 4xx/5xx responses treated as success"
|
|
144
|
+
error_type: execution
|
|
145
|
+
high:
|
|
146
|
+
- id: err-003
|
|
147
|
+
location: "lines 37-42"
|
|
148
|
+
description: "No try/catch - one failure aborts entire batch"
|
|
149
|
+
error_type: planning
|
|
150
|
+
- id: err-004
|
|
151
|
+
location: "line 53"
|
|
152
|
+
description: "Error message exposes internal status codes and response body"
|
|
153
|
+
error_type: planning
|
|
154
|
+
medium:
|
|
155
|
+
- id: err-005
|
|
156
|
+
location: "line 61"
|
|
157
|
+
description: "Retries non-retryable errors (auth failures, validation)"
|
|
158
|
+
error_type: reasoning
|
|
159
|
+
- id: err-006
|
|
160
|
+
location: "line 69"
|
|
161
|
+
description: "Returns failure without error details for debugging"
|
|
162
|
+
error_type: execution
|
|
163
|
+
low:
|
|
164
|
+
- id: err-007
|
|
165
|
+
location: "lines 77-81"
|
|
166
|
+
description: "If logging fails, original error context is lost"
|
|
167
|
+
error_type: planning
|
|
168
|
+
|
|
169
|
+
scoring:
|
|
170
|
+
detection:
|
|
171
|
+
weight: 40
|
|
172
|
+
criteria: "Finding all 7 error handling issues"
|
|
173
|
+
fix_quality:
|
|
174
|
+
weight: 35
|
|
175
|
+
criteria: "Implementing proper error handling"
|
|
176
|
+
explanation:
|
|
177
|
+
weight: 25
|
|
178
|
+
criteria: "Explaining error handling strategy"
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Input Validation Failures
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: medium
|
|
5
|
+
# Error Type Focus: mixed (reasoning + execution)
|
|
6
|
+
|
|
7
|
+
id: debug-006
|
|
8
|
+
name: input-validation
|
|
9
|
+
title: "Input Validation: Trust No One"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: medium
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
A user registration API with multiple validation vulnerabilities.
|
|
16
|
+
Tests detection of both missing validation and incorrect validation logic.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario has mixed error types - some are reasoning errors (wrong
|
|
20
|
+
validation logic) and some are execution errors (missing bounds checks).
|
|
21
|
+
Comprehensive agents will find both categories.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
SECURITY AUDIT REPORT
|
|
25
|
+
|
|
26
|
+
Service: registration-api
|
|
27
|
+
Severity: P1
|
|
28
|
+
Status: Multiple validation bypasses discovered
|
|
29
|
+
|
|
30
|
+
Penetration testing revealed validation weaknesses:
|
|
31
|
+
- Some inputs bypass validation entirely
|
|
32
|
+
- Some validation logic can be circumvented
|
|
33
|
+
- Data that should be rejected is accepted
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
1. Find all validation issues
|
|
37
|
+
2. Categorize as missing validation vs incorrect validation
|
|
38
|
+
3. Implement proper validation
|
|
39
|
+
|
|
40
|
+
There are 6 known issues. How many can you find?
|
|
41
|
+
|
|
42
|
+
code:
|
|
43
|
+
language: python
|
|
44
|
+
filename: registration_api.py
|
|
45
|
+
content: |
|
|
46
|
+
import re
|
|
47
|
+
from typing import Optional
|
|
48
|
+
|
|
49
|
+
class RegistrationValidator:
|
|
50
|
+
def validate_username(self, username: str) -> bool:
|
|
51
|
+
"""Username must be 3-20 alphanumeric characters."""
|
|
52
|
+
# Bug: Regex allows any length, not 3-20
|
|
53
|
+
pattern = r'^[a-zA-Z0-9]+$'
|
|
54
|
+
return bool(re.match(pattern, username))
|
|
55
|
+
|
|
56
|
+
def validate_email(self, email: str) -> bool:
|
|
57
|
+
"""Validate email format."""
|
|
58
|
+
# Bug: Only checks for @ symbol, too permissive
|
|
59
|
+
return '@' in email
|
|
60
|
+
|
|
61
|
+
def validate_password(self, password: str) -> bool:
|
|
62
|
+
"""Password must be 8+ chars with number and special char."""
|
|
63
|
+
if len(password) < 8:
|
|
64
|
+
return False
|
|
65
|
+
|
|
66
|
+
# Bug: Checks for digit OR special, should be AND
|
|
67
|
+
has_digit = any(c.isdigit() for c in password)
|
|
68
|
+
has_special = any(c in '!@#$%^&*' for c in password)
|
|
69
|
+
|
|
70
|
+
return has_digit or has_special # Should be 'and'
|
|
71
|
+
|
|
72
|
+
def validate_age(self, age: int) -> bool:
|
|
73
|
+
"""Age must be between 13 and 120."""
|
|
74
|
+
# Bug: Missing upper bound check
|
|
75
|
+
return age >= 13
|
|
76
|
+
|
|
77
|
+
def validate_phone(self, phone: Optional[str]) -> bool:
|
|
78
|
+
"""Phone is optional but if provided must be valid."""
|
|
79
|
+
# Bug: Doesn't handle None - will crash
|
|
80
|
+
cleaned = phone.replace('-', '').replace(' ', '')
|
|
81
|
+
return len(cleaned) == 10 and cleaned.isdigit()
|
|
82
|
+
|
|
83
|
+
def validate_referral_code(self, code: str) -> bool:
|
|
84
|
+
"""Referral code must be exactly 8 uppercase letters."""
|
|
85
|
+
# Bug: Doesn't enforce uppercase
|
|
86
|
+
return len(code) == 8 and code.isalpha()
|
|
87
|
+
|
|
88
|
+
def register(self, data: dict) -> dict:
|
|
89
|
+
"""Validate and register user."""
|
|
90
|
+
errors = []
|
|
91
|
+
|
|
92
|
+
if not self.validate_username(data.get('username', '')):
|
|
93
|
+
errors.append('Invalid username')
|
|
94
|
+
|
|
95
|
+
if not self.validate_email(data.get('email', '')):
|
|
96
|
+
errors.append('Invalid email')
|
|
97
|
+
|
|
98
|
+
if not self.validate_password(data.get('password', '')):
|
|
99
|
+
errors.append('Invalid password')
|
|
100
|
+
|
|
101
|
+
if 'age' in data and not self.validate_age(data['age']):
|
|
102
|
+
errors.append('Invalid age')
|
|
103
|
+
|
|
104
|
+
if 'phone' in data and not self.validate_phone(data.get('phone')):
|
|
105
|
+
errors.append('Invalid phone')
|
|
106
|
+
|
|
107
|
+
if 'referral_code' in data and not self.validate_referral_code(data['referral_code']):
|
|
108
|
+
errors.append('Invalid referral code')
|
|
109
|
+
|
|
110
|
+
if errors:
|
|
111
|
+
return {'success': False, 'errors': errors}
|
|
112
|
+
|
|
113
|
+
return {'success': True, 'user_id': self._create_user(data)}
|
|
114
|
+
|
|
115
|
+
def _create_user(self, data: dict) -> int:
|
|
116
|
+
# Implementation omitted
|
|
117
|
+
return 12345
|
|
118
|
+
|
|
119
|
+
baseline_issues:
|
|
120
|
+
high:
|
|
121
|
+
- id: valid-001
|
|
122
|
+
location: "lines 8-9"
|
|
123
|
+
description: "Regex doesn't enforce 3-20 character length requirement"
|
|
124
|
+
error_type: execution
|
|
125
|
+
- id: valid-002
|
|
126
|
+
location: "line 14"
|
|
127
|
+
description: "Email validation too permissive - accepts 'a@b'"
|
|
128
|
+
error_type: reasoning
|
|
129
|
+
- id: valid-003
|
|
130
|
+
location: "line 25"
|
|
131
|
+
description: "Password requires digit OR special, should require both"
|
|
132
|
+
error_type: reasoning
|
|
133
|
+
medium:
|
|
134
|
+
- id: valid-004
|
|
135
|
+
location: "line 30"
|
|
136
|
+
description: "Missing upper bound check - allows age > 120"
|
|
137
|
+
error_type: execution
|
|
138
|
+
- id: valid-005
|
|
139
|
+
location: "line 35"
|
|
140
|
+
description: "Crashes on None input - no null check before method call"
|
|
141
|
+
error_type: execution
|
|
142
|
+
low:
|
|
143
|
+
- id: valid-006
|
|
144
|
+
location: "line 41"
|
|
145
|
+
description: "Doesn't enforce uppercase - accepts lowercase letters"
|
|
146
|
+
error_type: reasoning
|
|
147
|
+
|
|
148
|
+
scoring:
|
|
149
|
+
detection:
|
|
150
|
+
weight: 45
|
|
151
|
+
criteria: "Finding all 6 validation issues"
|
|
152
|
+
fix_quality:
|
|
153
|
+
weight: 35
|
|
154
|
+
criteria: "Implementing proper validation"
|
|
155
|
+
explanation:
|
|
156
|
+
weight: 20
|
|
157
|
+
criteria: "Distinguishing missing vs incorrect validation"
|