@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,161 @@
1
+ ---
2
+ # Scenario: Async Control Flow Errors
3
+ # Category: debugging
4
+ # Difficulty: medium
5
+ # Error Type Focus: planning (single-type)
6
+
7
+ id: debug-004
8
+ name: async-control-flow
9
+ title: "Async Chaos: Promise Pitfalls and Await Amnesia"
10
+ category: debugging
11
+ difficulty: medium
12
+ version: "1.0"
13
+
14
+ description: |
15
+ An API client with async/await control flow issues.
16
+ Tests understanding of the JavaScript async execution model.
17
+
18
+ purpose: |
19
+ This scenario measures detection of planning-level bugs - where the
20
+ sequence of operations is incorrectly designed. Agents must understand
21
+ the async execution model to identify these issues.
22
+
23
+ prompt: |
24
+ BUG REPORT
25
+
26
+ Service: api-client
27
+ Severity: P1
28
+ Status: Race conditions and unhandled promise rejections
29
+
30
+ The API client is behaving erratically:
31
+ - Sometimes returns stale data
32
+ - Occasionally crashes with unhandled rejections
33
+ - Results appear in wrong order
34
+
35
+ Your task:
36
+ 1. Identify async control flow issues
37
+ 2. Explain why the current implementation fails
38
+ 3. Fix the async/await patterns
39
+
40
+ There are 6 known issues. How many can you find?
41
+
42
+ code:
43
+ language: typescript
44
+ filename: api-client.ts
45
+ content: |
46
+ interface User {
47
+ id: string;
48
+ name: string;
49
+ }
50
+
51
+ interface Order {
52
+ id: string;
53
+ userId: string;
54
+ total: number;
55
+ }
56
+
57
+ class ApiClient {
58
+ private cache: Map<string, User> = new Map();
59
+
60
+ async fetchUser(id: string): Promise<User> {
61
+ // Cache hit: Map lookups are synchronous, so there is nothing to await here
62
+ if (this.cache.has(id)) {
63
+ return this.cache.get(id);
64
+ }
65
+
66
+ const response = await fetch(`/api/users/${id}`);
67
+ const user = await response.json();
68
+
69
+ // Bug: Cache is filled only after the awaited fetch - concurrent calls for the same id each hit the API
70
+ this.cache.set(id, user);
71
+ return user;
72
+ }
73
+
74
+ async fetchAllUsers(ids: string[]): Promise<User[]> {
75
+ const users: User[] = [];
76
+
77
+ // Bug: forEach doesn't wait for async operations
78
+ ids.forEach(async (id) => {
79
+ const user = await this.fetchUser(id);
80
+ users.push(user);
81
+ });
82
+
83
+ // Returns before async operations complete
84
+ return users;
85
+ }
86
+
87
+ async fetchUserWithOrders(userId: string): Promise<{user: User, orders: Order[]}> {
88
+ // Bug: Sequential when could be parallel
89
+ const user = await this.fetchUser(userId);
90
+ const ordersResponse = await fetch(`/api/users/${userId}/orders`);
91
+ const orders = await ordersResponse.json();
92
+
93
+ return { user, orders };
94
+ }
95
+
96
+ async saveUser(user: User): Promise<void> {
97
+ // Bug: Not handling the promise rejection
98
+ fetch('/api/users', {
99
+ method: 'POST',
100
+ body: JSON.stringify(user),
101
+ });
102
+
103
+ // Bug: Updating cache before confirming save succeeded
104
+ this.cache.set(user.id, user);
105
+ }
106
+
107
+ async processQueue(queue: string[]): Promise<void> {
108
+ // Bug: No error handling - one failure stops all processing
109
+ for (const item of queue) {
110
+ await this.processItem(item);
111
+ }
112
+ }
113
+
114
+ private async processItem(item: string): Promise<void> {
115
+ const response = await fetch(`/api/process/${item}`);
116
+ if (!response.ok) {
117
+ throw new Error(`Failed to process ${item}`);
118
+ }
119
+ }
120
+ }
121
+
122
+ baseline_issues:
123
+ critical:
124
+ - id: async-001
125
+ location: "lines 30-36"
126
+ description: "forEach with async callback doesn't wait - returns empty array"
127
+ error_type: planning
128
+ high:
129
+ - id: async-002
130
+ location: "lines 53-59"
131
+ description: "Not awaiting fetch - fire and forget loses errors"
132
+ error_type: planning
133
+ - id: async-003
134
+ location: "line 61"
135
+ description: "Cache updated before save confirmed - optimistic update without rollback"
136
+ error_type: planning
137
+ medium:
138
+ - id: async-004
139
+ location: "lines 42-47"
140
+ description: "Sequential fetches when parallel Promise.all would be faster"
141
+ error_type: planning
142
+ - id: async-005
143
+ location: "lines 64-68"
144
+ description: "No try/catch - one failure aborts entire queue"
145
+ error_type: planning
146
+ low:
147
+ - id: async-006
148
+ location: "lines 25-26"
149
+ description: "Cache filled only after the awaited fetch - concurrent calls for the same id issue duplicate requests"
150
+ error_type: planning
151
+
152
+ scoring:
153
+ detection:
154
+ weight: 45
155
+ criteria: "Finding all 6 async control flow issues"
156
+ fix_quality:
157
+ weight: 35
158
+ criteria: "Implementing proper async patterns"
159
+ explanation:
160
+ weight: 20
161
+ criteria: "Explaining async execution model"
@@ -0,0 +1,197 @@
1
+ ---
2
+ # Scenario: Authentication/Authorization Bypass
3
+ # Category: debugging
4
+ # Difficulty: hard
5
+ # Error Type Focus: mixed (reasoning + planning)
6
+
7
+ id: debug-010
8
+ name: auth-bypass
9
+ title: "Auth Bypass: Breaking the Gates"
10
+ category: debugging
11
+ difficulty: hard
12
+ version: "1.0"
13
+
14
+ description: |
15
+ An authentication and authorization system with multiple bypass vulnerabilities.
16
+ Tests detection of access control flaws and authentication weaknesses.
17
+
18
+ purpose: |
19
+ This scenario has mixed error types - reasoning errors in auth logic
20
+ and planning errors in the security architecture. A thorough security
21
+ review will identify flaws at both levels.
22
+
23
+ prompt: |
24
+ SECURITY INCIDENT
25
+
26
+ Service: auth-service
27
+ Severity: P0
28
+ Status: Unauthorized access detected
29
+
30
+ Investigation revealed multiple auth weaknesses:
31
+ - Some endpoints bypass authentication entirely
32
+ - Role checks can be circumvented
33
+ - Session handling has vulnerabilities
34
+
35
+ Your task:
36
+ 1. Find all authentication and authorization vulnerabilities
37
+ 2. Explain how each can be exploited
38
+ 3. Implement proper access controls
39
+
40
+ There are 8 known issues. How many can you find?
41
+
42
+ code:
43
+ language: typescript
44
+ filename: auth-service.ts
45
+ content: |
46
+ interface User {
47
+ id: string;
48
+ email: string;
49
+ role: 'user' | 'admin' | 'superadmin';
50
+ }
51
+
52
+ interface Session {
53
+ userId: string;
54
+ role: string;
55
+ expiresAt: number;
56
+ }
57
+
58
+ class AuthService {
59
+ private sessions: Map<string, Session> = new Map();
60
+
61
+ async login(email: string, password: string): Promise<string | null> {
62
+ const user = await this.findUser(email);
63
+
64
+ // Bug: Returns early if user not found - timing attack
65
+ if (!user) {
66
+ return null;
67
+ }
68
+
69
+ // Bug: Plain comparison allows timing attack
70
+ if (password !== user.passwordHash) {
71
+ return null;
72
+ }
73
+
74
+ const token = this.generateToken();
75
+ this.sessions.set(token, {
76
+ userId: user.id,
77
+ role: user.role,
78
+ // Bug: Token never expires (expiresAt not checked)
79
+ expiresAt: Date.now() + 86400000,
80
+ });
81
+
82
+ return token;
83
+ }
84
+
85
+ isAuthenticated(token: string): boolean {
86
+ const session = this.sessions.get(token);
87
+ // Bug: Doesn't check if session is expired
88
+ return session !== undefined;
89
+ }
90
+
91
+ isAdmin(token: string): boolean {
92
+ const session = this.sessions.get(token);
93
+ if (!session) return false;
94
+
95
+ // Bug: Role is snapshotted into the session at login - later role changes or revocations are ignored
96
+ return session.role === 'admin' || session.role === 'superadmin';
97
+ }
98
+
99
+ async updateUserRole(token: string, targetUserId: string, newRole: string): Promise<boolean> {
100
+ // Bug: Only checks if requester is admin, not if they can modify target
101
+ if (!this.isAdmin(token)) {
102
+ return false;
103
+ }
104
+
105
+ // Bug: Admin can escalate anyone to superadmin
106
+ await this.setUserRole(targetUserId, newRole);
107
+ return true;
108
+ }
109
+
110
+ async deleteUser(token: string, targetUserId: string): Promise<boolean> {
111
+ const session = this.sessions.get(token);
112
+ if (!session) return false;
113
+
114
+ // Bug: Can delete own account while logged in - orphaned session
115
+ if (session.userId === targetUserId) {
116
+ await this.removeUser(targetUserId);
117
+ return true;
118
+ }
119
+
120
+ // Bug: User can delete any user if they know the ID (no admin check for others)
121
+ await this.removeUser(targetUserId);
122
+ return true;
123
+ }
124
+
125
+ async resetPassword(email: string): Promise<void> {
126
+ const user = await this.findUser(email);
127
+ if (!user) return; // Bug: Silent return reveals user existence
128
+
129
+ const resetToken = this.generateToken();
130
+ // Bug: Reset token stored in session map - can be enumerated
131
+ this.sessions.set(resetToken, {
132
+ userId: user.id,
133
+ role: 'reset',
134
+ expiresAt: Date.now() + 3600000,
135
+ });
136
+
137
+ await this.sendResetEmail(user.email, resetToken);
138
+ }
139
+
140
+ // Stub implementations
141
+ private async findUser(email: string): Promise<User & {passwordHash: string} | null> {
142
+ return null;
143
+ }
144
+ private generateToken(): string { return Math.random().toString(36); }
145
+ private async setUserRole(id: string, role: string): Promise<void> {}
146
+ private async removeUser(id: string): Promise<void> {}
147
+ private async sendResetEmail(email: string, token: string): Promise<void> {}
148
+ }
149
+
150
+ baseline_issues:
151
+ critical:
152
+ - id: auth-001
153
+ location: "lines 63-71"
154
+ description: "deleteUser allows any user to delete any other user - no admin check"
155
+ error_type: reasoning
156
+ - id: auth-002
157
+ location: "lines 53-58"
158
+ description: "Admin can escalate to superadmin - privilege escalation"
159
+ error_type: reasoning
160
+ high:
161
+ - id: auth-003
162
+ location: "line 39"
163
+ description: "isAuthenticated doesn't check session expiry"
164
+ error_type: planning
165
+ - id: auth-004
166
+ location: "line 46"
167
+ description: "Role cached in session at login - demotions/revocations not enforced for existing sessions"
168
+ error_type: planning
169
+ - id: auth-005
170
+ location: "lines 18-23"
171
+ description: "Timing difference reveals if user exists"
172
+ error_type: reasoning
173
+ medium:
174
+ - id: auth-006
175
+ location: "line 75"
176
+ description: "Silent return on invalid email reveals user existence"
177
+ error_type: reasoning
178
+ - id: auth-007
179
+ location: "line 24"
180
+ description: "Direct string comparison vulnerable to timing attack"
181
+ error_type: planning
182
+ low:
183
+ - id: auth-008
184
+ location: "lines 65-68"
185
+ description: "Self-deletion leaves orphaned session token"
186
+ error_type: planning
187
+
188
+ scoring:
189
+ detection:
190
+ weight: 40
191
+ criteria: "Finding all 8 auth vulnerabilities"
192
+ fix_quality:
193
+ weight: 35
194
+ criteria: "Implementing secure auth patterns"
195
+ explanation:
196
+ weight: 25
197
+ criteria: "Explaining attack scenarios"
@@ -0,0 +1,178 @@
1
+ ---
2
+ # Scenario: Error Handling Gaps
3
+ # Category: debugging
4
+ # Difficulty: medium
5
+ # Error Type Focus: mixed (planning + execution)
6
+
7
+ id: debug-007
8
+ name: error-handling
9
+ title: "Error Handling: When Things Go Wrong"
10
+ category: debugging
11
+ difficulty: medium
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A payment processing service with various error handling issues.
16
+ Tests detection of missing try/catch, swallowed errors, and poor error propagation.
17
+
18
+ purpose: |
19
+ This scenario has mixed error types - planning errors (no error strategy)
20
+ and execution errors (missing specific checks). A thorough agent will
21
+ identify both the strategic and tactical error handling gaps.
22
+
23
+ prompt: |
24
+ INCIDENT POST-MORTEM
25
+
26
+ Service: payment-processor
27
+ Severity: P0
28
+ Status: Silent failures caused lost revenue
29
+
30
+ Investigation revealed multiple error handling issues:
31
+ - Some errors are silently swallowed
32
+ - Some errors crash the entire service
33
+ - Error messages expose internal details
34
+
35
+ Your task:
36
+ 1. Find all error handling issues
37
+ 2. Categorize as missing handling vs poor handling
38
+ 3. Implement proper error handling strategy
39
+
40
+ There are 7 known issues. How many can you find?
41
+
42
+ code:
43
+ language: typescript
44
+ filename: payment-processor.ts
45
+ content: |
46
+ interface PaymentResult {
47
+ success: boolean;
48
+ transactionId?: string;
49
+ error?: string;
50
+ }
51
+
52
+ class PaymentProcessor {
53
+ private apiKey: string;
54
+
55
+ constructor(apiKey: string) {
56
+ this.apiKey = apiKey;
57
+ }
58
+
59
+ async processPayment(amount: number, cardNumber: string): Promise<PaymentResult> {
60
+ // Bug: No validation before processing
61
+ const response = await fetch('/api/payments', {
62
+ method: 'POST',
63
+ headers: { 'Authorization': this.apiKey },
64
+ body: JSON.stringify({ amount, cardNumber }),
65
+ });
66
+
67
+ // Bug: Not checking response.ok
68
+ const data = await response.json();
69
+ return data;
70
+ }
71
+
72
+ async refundPayment(transactionId: string): Promise<void> {
73
+ try {
74
+ await fetch(`/api/refunds/${transactionId}`, { method: 'POST' });
75
+ } catch (error) {
76
+ // Bug: Error swallowed - caller never knows refund failed
77
+ console.log('Refund failed');
78
+ }
79
+ }
80
+
81
+ async batchProcess(payments: Array<{amount: number, card: string}>): Promise<PaymentResult[]> {
82
+ const results: PaymentResult[] = [];
83
+
84
+ for (const payment of payments) {
85
+ // Bug: One failure stops entire batch - no isolation
86
+ const result = await this.processPayment(payment.amount, payment.card);
87
+ results.push(result);
88
+ }
89
+
90
+ return results;
91
+ }
92
+
93
+ async validateCard(cardNumber: string): Promise<boolean> {
94
+ const response = await fetch('/api/validate', {
95
+ method: 'POST',
96
+ body: JSON.stringify({ cardNumber }),
97
+ });
98
+
99
+ // Bug: Throws raw error with internal details
100
+ if (!response.ok) {
101
+ throw new Error(`Validation failed: ${response.status} - ${await response.text()}`);
102
+ }
103
+
104
+ return true;
105
+ }
106
+
107
+ async processWithRetry(amount: number, card: string): Promise<PaymentResult> {
108
+ let attempts = 0;
109
+ const maxAttempts = 3;
110
+
111
+ // Bug: Retries every error up to maxAttempts, including non-retryable ones
112
+ while (true) {
113
+ try {
114
+ return await this.processPayment(amount, card);
115
+ } catch (error) {
116
+ attempts++;
117
+ if (attempts >= maxAttempts) {
118
+ // Bug: Returns success:false without error details
119
+ return { success: false };
120
+ }
121
+ // No delay between retries
122
+ }
123
+ }
124
+ }
125
+
126
+ private async logError(error: Error): Promise<void> {
127
+ // Bug: If logging fails, original error is lost
128
+ await fetch('/api/logs', {
129
+ method: 'POST',
130
+ body: JSON.stringify({ error: error.message }),
131
+ });
132
+ }
133
+ }
134
+
135
+ baseline_issues:
136
+ critical:
137
+ - id: err-001
138
+ location: "lines 29-32"
139
+ description: "Error swallowed with console.log - caller unaware of failure"
140
+ error_type: planning
141
+ - id: err-002
142
+ location: "line 23"
143
+ description: "Not checking response.ok - 4xx/5xx responses treated as success"
144
+ error_type: execution
145
+ high:
146
+ - id: err-003
147
+ location: "lines 37-42"
148
+ description: "No try/catch - one failure aborts entire batch"
149
+ error_type: planning
150
+ - id: err-004
151
+ location: "line 53"
152
+ description: "Error message exposes internal status codes and response body"
153
+ error_type: planning
154
+ medium:
155
+ - id: err-005
156
+ location: "line 61"
157
+ description: "Retries non-retryable errors (auth failures, validation)"
158
+ error_type: reasoning
159
+ - id: err-006
160
+ location: "line 69"
161
+ description: "Returns failure without error details for debugging"
162
+ error_type: execution
163
+ low:
164
+ - id: err-007
165
+ location: "lines 77-81"
166
+ description: "If logging fails, original error context is lost"
167
+ error_type: planning
168
+
169
+ scoring:
170
+ detection:
171
+ weight: 40
172
+ criteria: "Finding all 7 error handling issues"
173
+ fix_quality:
174
+ weight: 35
175
+ criteria: "Implementing proper error handling"
176
+ explanation:
177
+ weight: 25
178
+ criteria: "Explaining error handling strategy"
@@ -0,0 +1,157 @@
1
+ ---
2
+ # Scenario: Input Validation Failures
3
+ # Category: debugging
4
+ # Difficulty: medium
5
+ # Error Type Focus: mixed (reasoning + execution)
6
+
7
+ id: debug-006
8
+ name: input-validation
9
+ title: "Input Validation: Trust No One"
10
+ category: debugging
11
+ difficulty: medium
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A user registration API with multiple validation vulnerabilities.
16
+ Tests detection of both missing validation and incorrect validation logic.
17
+
18
+ purpose: |
19
+ This scenario has mixed error types - some are reasoning errors (wrong
20
+ validation logic) and some are execution errors (missing bounds checks).
21
+ Comprehensive agents will find both categories.
22
+
23
+ prompt: |
24
+ SECURITY AUDIT REPORT
25
+
26
+ Service: registration-api
27
+ Severity: P1
28
+ Status: Multiple validation bypasses discovered
29
+
30
+ Penetration testing revealed validation weaknesses:
31
+ - Some inputs bypass validation entirely
32
+ - Some validation logic can be circumvented
33
+ - Data that should be rejected is accepted
34
+
35
+ Your task:
36
+ 1. Find all validation issues
37
+ 2. Categorize as missing validation vs incorrect validation
38
+ 3. Implement proper validation
39
+
40
+ There are 6 known issues. How many can you find?
41
+
42
+ code:
43
+ language: python
44
+ filename: registration_api.py
45
+ content: |
46
+ import re
47
+ from typing import Optional
48
+
49
+ class RegistrationValidator:
50
+ def validate_username(self, username: str) -> bool:
51
+ """Username must be 3-20 alphanumeric characters."""
52
+ # Bug: Regex allows any length, not 3-20
53
+ pattern = r'^[a-zA-Z0-9]+$'
54
+ return bool(re.match(pattern, username))
55
+
56
+ def validate_email(self, email: str) -> bool:
57
+ """Validate email format."""
58
+ # Bug: Only checks for @ symbol, too permissive
59
+ return '@' in email
60
+
61
+ def validate_password(self, password: str) -> bool:
62
+ """Password must be 8+ chars with number and special char."""
63
+ if len(password) < 8:
64
+ return False
65
+
66
+ # Bug: Checks for digit OR special, should be AND
67
+ has_digit = any(c.isdigit() for c in password)
68
+ has_special = any(c in '!@#$%^&*' for c in password)
69
+
70
+ return has_digit or has_special # Should be 'and'
71
+
72
+ def validate_age(self, age: int) -> bool:
73
+ """Age must be between 13 and 120."""
74
+ # Bug: Missing upper bound check
75
+ return age >= 13
76
+
77
+ def validate_phone(self, phone: Optional[str]) -> bool:
78
+ """Phone is optional but if provided must be valid."""
79
+ # Bug: Doesn't handle None - will crash
80
+ cleaned = phone.replace('-', '').replace(' ', '')
81
+ return len(cleaned) == 10 and cleaned.isdigit()
82
+
83
+ def validate_referral_code(self, code: str) -> bool:
84
+ """Referral code must be exactly 8 uppercase letters."""
85
+ # Bug: Doesn't enforce uppercase
86
+ return len(code) == 8 and code.isalpha()
87
+
88
+ def register(self, data: dict) -> dict:
89
+ """Validate and register user."""
90
+ errors = []
91
+
92
+ if not self.validate_username(data.get('username', '')):
93
+ errors.append('Invalid username')
94
+
95
+ if not self.validate_email(data.get('email', '')):
96
+ errors.append('Invalid email')
97
+
98
+ if not self.validate_password(data.get('password', '')):
99
+ errors.append('Invalid password')
100
+
101
+ if 'age' in data and not self.validate_age(data['age']):
102
+ errors.append('Invalid age')
103
+
104
+ if 'phone' in data and not self.validate_phone(data.get('phone')):
105
+ errors.append('Invalid phone')
106
+
107
+ if 'referral_code' in data and not self.validate_referral_code(data['referral_code']):
108
+ errors.append('Invalid referral code')
109
+
110
+ if errors:
111
+ return {'success': False, 'errors': errors}
112
+
113
+ return {'success': True, 'user_id': self._create_user(data)}
114
+
115
+ def _create_user(self, data: dict) -> int:
116
+ # Implementation omitted
117
+ return 12345
118
+
119
+ baseline_issues:
120
+ high:
121
+ - id: valid-001
122
+ location: "lines 8-9"
123
+ description: "Regex doesn't enforce 3-20 character length requirement"
124
+ error_type: execution
125
+ - id: valid-002
126
+ location: "line 14"
127
+ description: "Email validation too permissive - accepts 'a@b'"
128
+ error_type: reasoning
129
+ - id: valid-003
130
+ location: "line 24"
131
+ description: "Password requires digit OR special, should require both"
132
+ error_type: reasoning
133
+ medium:
134
+ - id: valid-004
135
+ location: "line 28"
136
+ description: "Missing upper bound check - allows age > 120"
137
+ error_type: execution
138
+ - id: valid-005
139
+ location: "line 33"
140
+ description: "Crashes on None input - no null check before method call"
141
+ error_type: execution
142
+ low:
143
+ - id: valid-006
144
+ location: "line 38"
145
+ description: "Doesn't enforce uppercase - accepts lowercase letters"
146
+ error_type: reasoning
147
+
148
+ scoring:
149
+ detection:
150
+ weight: 45
151
+ criteria: "Finding all 6 validation issues"
152
+ fix_quality:
153
+ weight: 35
154
+ criteria: "Implementing proper validation"
155
+ explanation:
156
+ weight: 20
157
+ criteria: "Distinguishing missing vs incorrect validation"