@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,622 @@
1
+ ---
2
+ # Scenario: Order Service Code Review (Easy)
3
+ # NOTE: Re-ranked to "easy" based on control baseline mean 91.9 ± 1.45 (Story 7-2)
4
+ # Category: code-review
5
+ # Ported from: Pennyfarthing benchmarks/test-cases/code-review/cr-002-order-service.yaml
6
+ # Purpose: Differentiate persona thoroughness with layered complexity
7
+
8
+ id: cr-002
9
+ name: order-service
10
+ title: "Order Service Code Review (Enhanced)"
11
+ category: code-review
12
+ difficulty: easy
13
+ version: "2.0"
14
+
15
+ description: |
16
+ An e-commerce order service with payment processing, inventory management,
17
+ and notification handling. Contains multiple layers of issues - some obvious,
18
+ some subtle, some architectural. Designed to measure thoroughness differences.
19
+
20
+ purpose: |
21
+ This scenario tests whether personas with different traits (e.g., "uncompromising"
22
+ vs "pragmatic") actually find different numbers of issues, or if persona only
23
+ affects communication style. Finding all 22 baseline issues = competent.
24
+ Finding bonus issues = thorough. Missing obvious issues = concerning.
25
+
26
+ prompt: |
27
+ You are reviewing a pull request for an order processing service.
28
+ The developer says "it works in staging" and wants to merge to main.
29
+
30
+ Review this code thoroughly for:
31
+ - Security vulnerabilities
32
+ - Bug and logic errors
33
+ - Performance issues
34
+ - Code quality problems
35
+ - Architectural concerns
36
+ - Missing error handling
37
+ - Race conditions and concurrency issues
38
+
39
+ For each issue:
40
+ 1. Identify the specific location (line number)
41
+ 2. Classify severity (Critical/High/Medium/Low)
42
+ 3. Explain the impact
43
+ 4. Provide a fix or recommendation
44
+
45
+ Be thorough. This code handles real money.
46
+
47
+ code:
48
+ language: go
49
+ filename: order_service.go
50
+ content: |
51
+ package orders
52
+
53
+ import (
54
+ "bytes"
55
+ "context"
56
+ "database/sql"
57
+ "encoding/json"
58
+ "fmt"
59
+ "log"
60
+ "net/http"
61
+ "strconv"
62
+ "sync"
63
+ "time"
64
+ )
65
+
66
+ type OrderService struct {
67
+ db *sql.DB
68
+ paymentAPI string
69
+ inventoryAPI string
70
+ cache map[string]*Order
71
+ cacheMutex sync.Mutex
72
+ }
73
+
74
+ type Order struct {
75
+ ID int64 `json:"id"`
76
+ UserID int64 `json:"user_id"`
77
+ Items []Item `json:"items"`
78
+ Total float64 `json:"total"`
79
+ Status string `json:"status"`
80
+ PaymentID string `json:"payment_id"`
81
+ CreatedAt time.Time `json:"created_at"`
82
+ CreditCard string `json:"credit_card,omitempty"`
83
+ }
84
+
85
+ type Item struct {
86
+ ProductID int64 `json:"product_id"`
87
+ Quantity int `json:"quantity"`
88
+ Price float64 `json:"price"`
89
+ }
90
+
91
+ // CreateOrder processes a new order
92
+ func (s *OrderService) CreateOrder(w http.ResponseWriter, r *http.Request) {
93
+ var order Order
94
+ json.NewDecoder(r.Body).Decode(&order)
95
+
96
+ // Calculate total
97
+ var total float64
98
+ for _, item := range order.Items {
99
+ total += item.Price * float64(item.Quantity)
100
+ }
101
+ order.Total = total
102
+
103
+ // Check inventory for each item
104
+ for _, item := range order.Items {
105
+ resp, _ := http.Get(fmt.Sprintf("%s/check?product=%d&qty=%d",
106
+ s.inventoryAPI, item.ProductID, item.Quantity))
107
+ if resp.StatusCode != 200 {
108
+ http.Error(w, "Inventory check failed", 400)
109
+ return
110
+ }
111
+ }
112
+
113
+ // Process payment
114
+ paymentReq := map[string]interface{}{
115
+ "amount": order.Total,
116
+ "card": order.CreditCard,
117
+ "order_ref": order.ID,
118
+ }
119
+ paymentBody, _ := json.Marshal(paymentReq)
120
+ resp, _ := http.Post(s.paymentAPI+"/charge", "application/json",
121
+ bytes.NewReader(paymentBody))
122
+
123
+ var paymentResp map[string]string
124
+ json.NewDecoder(resp.Body).Decode(&paymentResp)
125
+ order.PaymentID = paymentResp["payment_id"]
126
+
127
+ // Reserve inventory
128
+ for _, item := range order.Items {
129
+ go func(i Item) {
130
+ http.Post(fmt.Sprintf("%s/reserve", s.inventoryAPI),
131
+ "application/json",
132
+ bytes.NewReader([]byte(fmt.Sprintf(
133
+ `{"product_id":%d,"quantity":%d}`, i.ProductID, i.Quantity))))
134
+ }(item)
135
+ }
136
+
137
+ // Save order
138
+ result, err := s.db.Exec(
139
+ "INSERT INTO orders (user_id, total, status, payment_id, credit_card) VALUES (?, ?, ?, ?, ?)",
140
+ order.UserID, order.Total, "pending", order.PaymentID, order.CreditCard)
141
+ if err != nil {
142
+ log.Printf("Failed to save order: %v", err)
143
+ http.Error(w, "Order failed", 500)
144
+ return
145
+ }
146
+
147
+ order.ID, _ = result.LastInsertId()
148
+ order.Status = "pending"
149
+
150
+ // Cache the order
151
+ s.cache[strconv.FormatInt(order.ID, 10)] = &order
152
+
153
+ json.NewEncoder(w).Encode(order)
154
+ }
155
+
156
+ // GetOrder retrieves an order by ID
157
+ func (s *OrderService) GetOrder(w http.ResponseWriter, r *http.Request) {
158
+ orderID := r.URL.Query().Get("id")
159
+
160
+ // Check cache first
161
+ if cached, ok := s.cache[orderID]; ok {
162
+ json.NewEncoder(w).Encode(cached)
163
+ return
164
+ }
165
+
166
+ query := fmt.Sprintf("SELECT * FROM orders WHERE id = %s", orderID)
167
+ row := s.db.QueryRow(query)
168
+
169
+ var order Order
170
+ err := row.Scan(&order.ID, &order.UserID, &order.Total, &order.Status,
171
+ &order.PaymentID, &order.CreditCard, &order.CreatedAt)
172
+ if err != nil {
173
+ http.Error(w, "Order not found", 404)
174
+ return
175
+ }
176
+
177
+ // Load items
178
+ itemRows, _ := s.db.Query(
179
+ fmt.Sprintf("SELECT product_id, quantity, price FROM order_items WHERE order_id = %s", orderID))
180
+ for itemRows.Next() {
181
+ var item Item
182
+ itemRows.Scan(&item.ProductID, &item.Quantity, &item.Price)
183
+ order.Items = append(order.Items, item)
184
+ }
185
+
186
+ s.cache[orderID] = &order
187
+ json.NewEncoder(w).Encode(order)
188
+ }
189
+
190
+ // CancelOrder cancels an existing order
191
+ func (s *OrderService) CancelOrder(w http.ResponseWriter, r *http.Request) {
192
+ orderID := r.URL.Query().Get("id")
193
+
194
+ // Update status
195
+ s.db.Exec("UPDATE orders SET status = 'cancelled' WHERE id = " + orderID)
196
+
197
+ // Refund payment
198
+ var paymentID string
199
+ s.db.QueryRow("SELECT payment_id FROM orders WHERE id = ?", orderID).Scan(&paymentID)
200
+
201
+ http.Post(s.paymentAPI+"/refund", "application/json",
202
+ bytes.NewReader([]byte(fmt.Sprintf(`{"payment_id":"%s"}`, paymentID))))
203
+
204
+ // Release inventory
205
+ rows, _ := s.db.Query("SELECT product_id, quantity FROM order_items WHERE order_id = ?", orderID)
206
+ for rows.Next() {
207
+ var productID int64
208
+ var quantity int
209
+ rows.Scan(&productID, &quantity)
210
+ go func() {
211
+ http.Post(fmt.Sprintf("%s/release", s.inventoryAPI),
212
+ "application/json",
213
+ bytes.NewReader([]byte(fmt.Sprintf(
214
+ `{"product_id":%d,"quantity":%d}`, productID, quantity))))
215
+ }()
216
+ }
217
+
218
+ // Remove from cache
219
+ delete(s.cache, orderID)
220
+
221
+ w.Write([]byte("Order cancelled"))
222
+ }
223
+
224
+ // GetUserOrders returns all orders for a user
225
+ func (s *OrderService) GetUserOrders(w http.ResponseWriter, r *http.Request) {
226
+ userID := r.URL.Query().Get("user_id")
227
+ limit := r.URL.Query().Get("limit")
228
+ if limit == "" {
229
+ limit = "100"
230
+ }
231
+
232
+ query := fmt.Sprintf(
233
+ "SELECT id, total, status, created_at FROM orders WHERE user_id = %s ORDER BY created_at DESC LIMIT %s",
234
+ userID, limit)
235
+
236
+ rows, err := s.db.Query(query)
237
+ if err != nil {
238
+ log.Printf("Query error: %v", err)
239
+ }
240
+
241
+ var orders []Order
242
+ for rows.Next() {
243
+ var o Order
244
+ rows.Scan(&o.ID, &o.Total, &o.Status, &o.CreatedAt)
245
+ orders = append(orders, o)
246
+ }
247
+
248
+ json.NewEncoder(w).Encode(orders)
249
+ }
250
+
251
+ // ProcessRefund handles refund requests
252
+ func (s *OrderService) ProcessRefund(w http.ResponseWriter, r *http.Request) {
253
+ var req struct {
254
+ OrderID int64 `json:"order_id"`
255
+ Amount float64 `json:"amount"`
256
+ Reason string `json:"reason"`
257
+ }
258
+ json.NewDecoder(r.Body).Decode(&req)
259
+
260
+ // Get order
261
+ var order Order
262
+ s.db.QueryRow("SELECT id, total, payment_id FROM orders WHERE id = ?", req.OrderID).
263
+ Scan(&order.ID, &order.Total, &order.PaymentID)
264
+
265
+ // Process refund
266
+ refundReq := fmt.Sprintf(`{"payment_id":"%s","amount":%f,"reason":"%s"}`,
267
+ order.PaymentID, req.Amount, req.Reason)
268
+ resp, _ := http.Post(s.paymentAPI+"/refund", "application/json",
269
+ bytes.NewReader([]byte(refundReq)))
270
+
271
+ if resp.StatusCode == 200 {
272
+ s.db.Exec("UPDATE orders SET status = 'refunded' WHERE id = ?", req.OrderID)
273
+ s.db.Exec(fmt.Sprintf(
274
+ "INSERT INTO refund_log (order_id, amount, reason, processed_at) VALUES (%d, %f, '%s', NOW())",
275
+ req.OrderID, req.Amount, req.Reason))
276
+ }
277
+
278
+ w.Write([]byte("Refund processed"))
279
+ }
280
+
281
+ // BulkUpdatePrices updates prices for multiple products
282
+ func (s *OrderService) BulkUpdatePrices(w http.ResponseWriter, r *http.Request) {
283
+ var updates []struct {
284
+ ProductID int64 `json:"product_id"`
285
+ NewPrice float64 `json:"new_price"`
286
+ }
287
+ json.NewDecoder(r.Body).Decode(&updates)
288
+
289
+ for _, u := range updates {
290
+ s.db.Exec(fmt.Sprintf(
291
+ "UPDATE products SET price = %f WHERE id = %d", u.NewPrice, u.ProductID))
292
+ }
293
+
294
+ w.Write([]byte(fmt.Sprintf("Updated %d products", len(updates))))
295
+ }
296
+
297
+ // ExportOrders exports orders to CSV
298
+ func (s *OrderService) ExportOrders(w http.ResponseWriter, r *http.Request) {
299
+ startDate := r.URL.Query().Get("start")
300
+ endDate := r.URL.Query().Get("end")
301
+
302
+ query := fmt.Sprintf(
303
+ "SELECT id, user_id, total, status, credit_card, created_at FROM orders WHERE created_at BETWEEN '%s' AND '%s'",
304
+ startDate, endDate)
305
+
306
+ rows, _ := s.db.Query(query)
307
+
308
+ w.Header().Set("Content-Type", "text/csv")
309
+ w.Write([]byte("id,user_id,total,status,card_last4,created_at\n"))
310
+
311
+ for rows.Next() {
312
+ var id, userID int64
313
+ var total float64
314
+ var status, creditCard string
315
+ var createdAt time.Time
316
+ rows.Scan(&id, &userID, &total, &status, &creditCard, &createdAt)
317
+
318
+ // Mask credit card
319
+ cardLast4 := creditCard[len(creditCard)-4:]
320
+
321
+ w.Write([]byte(fmt.Sprintf("%d,%d,%.2f,%s,%s,%s\n",
322
+ id, userID, total, status, cardLast4, createdAt.Format(time.RFC3339))))
323
+ }
324
+ }
325
+
326
+ # =============================================================================
327
+ # BASELINE ISSUES (minimum expected to find)
328
+ # Finding all of these = 100% baseline score
329
+ # These are seeded, known issues - NOT shown to contestants
330
+ # =============================================================================
331
+
332
+ baseline_issues:
333
+ critical:
334
+ - id: SQL_INJECTION_GET_ORDER
335
+ location: "line 116"
336
+ description: "SQL injection in GetOrder via string formatting"
337
+
338
+ - id: SQL_INJECTION_CANCEL
339
+ location: "line 145"
340
+ description: "SQL injection in CancelOrder via string concatenation"
341
+
342
+ - id: SQL_INJECTION_USER_ORDERS
343
+ location: "lines 182-184"
344
+ description: "SQL injection in GetUserOrders (both userID and limit)"
345
+
346
+ - id: SQL_INJECTION_REFUND_LOG
347
+ location: "lines 223-225"
348
+ description: "SQL injection in refund log insert (reason field)"
349
+
350
+ - id: SQL_INJECTION_EXPORT
351
+ location: "lines 252-254"
352
+ description: "SQL injection in ExportOrders (date parameters)"
353
+
354
+ - id: CREDIT_CARD_STORED
355
+ location: "line 89"
356
+ description: "Credit card stored in database unencrypted"
357
+
358
+ high:
359
+ - id: CREDIT_CARD_EXPOSED_JSON
360
+ location: "line 32"
361
+ description: "Credit card included in JSON response"
362
+
363
+ - id: CREDIT_CARD_IN_CSV
364
+ location: "line 269"
365
+ description: "Credit card exposed in CSV export"
366
+
367
+ - id: NO_AUTH_CANCEL
368
+ location: "line 141"
369
+ description: "No authorization check on CancelOrder"
370
+
371
+ - id: NO_AUTH_REFUND
372
+ location: "line 202"
373
+ description: "No authorization check on ProcessRefund"
374
+
375
+ - id: NO_AUTH_BULK_UPDATE
376
+ location: "line 232"
377
+ description: "No authorization check on BulkUpdatePrices"
378
+
379
+ - id: NO_AUTH_EXPORT
380
+ location: "line 248"
381
+ description: "No authorization check on ExportOrders"
382
+
383
+ medium:
384
+ - id: RACE_CONDITION_INVENTORY
385
+ location: "lines 78-85"
386
+ description: "Goroutines for inventory reserve without synchronization"
387
+
388
+ - id: RACE_CONDITION_CANCEL
389
+ location: "lines 156-166"
390
+ description: "Goroutines in cancel without proper closure"
391
+
392
+ - id: CACHE_NO_MUTEX
393
+ location: "line 101"
394
+ description: "Cache write without mutex lock"
395
+
396
+ - id: CACHE_NO_MUTEX_READ
397
+ location: "line 111"
398
+ description: "Cache read without mutex lock"
399
+
400
+ - id: ROWS_NOT_CLOSED_GET
401
+ location: "line 128"
402
+ description: "itemRows not closed in GetOrder"
403
+
404
+ - id: ROWS_NOT_CLOSED_CANCEL
405
+ location: "line 155"
406
+ description: "rows not closed in CancelOrder"
407
+
408
+ - id: ERROR_IGNORED_DECODE
409
+ location: "line 44"
410
+ description: "JSON decode error ignored in CreateOrder"
411
+
412
+ - id: ERROR_IGNORED_PAYMENT
413
+ location: "line 70"
414
+ description: "Payment POST error ignored"
415
+
416
+ low:
417
+ - id: MISSING_CONTENT_TYPE
418
+ location: "multiple handlers"
419
+ description: "JSON responses don't set Content-Type header"
420
+
421
+ - id: FLOAT_FOR_MONEY
422
+ location: "lines 28, 47-51"
423
+ description: "Using float64 for monetary values (precision issues)"
424
+
425
+ # =============================================================================
426
+ # BONUS ISSUES (thorough reviewers might find these)
427
+ # Finding these demonstrates above-average thoroughness
428
+ # =============================================================================
429
+
430
+ bonus_issues:
431
+ architectural:
432
+ - id: NO_TRANSACTION
433
+ description: "CreateOrder should use transaction for payment + DB + inventory"
434
+
435
+ - id: SAGA_PATTERN_MISSING
436
+ description: "Distributed transaction needs saga/compensation pattern"
437
+
438
+ - id: NO_IDEMPOTENCY
439
+ description: "CreateOrder not idempotent - duplicate orders possible"
440
+
441
+ - id: TIGHT_COUPLING
442
+ description: "Direct HTTP calls to services instead of abstraction"
443
+
444
+ security:
445
+ - id: SSRF_POTENTIAL
446
+ description: "API URLs from config could be exploited for SSRF"
447
+
448
+ - id: NO_RATE_LIMITING
449
+ description: "No rate limiting on any endpoints"
450
+
451
+ - id: NO_INPUT_VALIDATION
452
+ description: "No validation on order items, quantities, prices"
453
+
454
+ - id: CARD_MASKING_UNSAFE
455
+ location: "line 269"
456
+ description: "Card masking panics on cards < 4 chars"
457
+
458
+ reliability:
459
+ - id: NO_TIMEOUT_HTTP
460
+ description: "HTTP calls have no timeout set"
461
+
462
+ - id: NO_RETRY_LOGIC
463
+ description: "External service calls have no retry"
464
+
465
+ - id: NO_CIRCUIT_BREAKER
466
+ description: "No circuit breaker for failing services"
467
+
468
+ - id: CACHE_UNBOUNDED
469
+ description: "Cache grows forever, no eviction"
470
+
471
+ observability:
472
+ - id: POOR_ERROR_MESSAGES
473
+ description: "Generic error messages hide root cause"
474
+
475
+ - id: NO_REQUEST_ID
476
+ description: "No correlation ID for tracing"
477
+
478
+ - id: INCONSISTENT_LOGGING
479
+ description: "Some errors logged, others ignored"
480
+
481
+ code_quality:
482
+ - id: DUPLICATE_SQL_PATTERNS
483
+ description: "Same SQL injection pattern repeated"
484
+
485
+ - id: NO_CONTEXT_PROPAGATION
486
+ description: "context.Context not used for cancellation"
487
+
488
+ - id: MAGIC_STRINGS
489
+ description: "Status values as magic strings"
490
+
491
+ # =============================================================================
492
+ # SCORING
493
+ # =============================================================================
494
+
495
+ scoring:
496
+ total_baseline_issues: 22
497
+ total_bonus_issues: 18
498
+ weights:
499
+ critical: 3
500
+ high: 2
501
+ medium: 1
502
+ low: 0.5
503
+ max_baseline_score: 39 # 6*3 + 6*2 + 8*1 + 2*0.5
504
+
505
+ categories:
506
+ - name: detection
507
+ weight: 40
508
+ criteria:
509
+ - id: BASELINE_FOUND
510
+ description: "Issues from the seeded baseline list"
511
+ points: 25
512
+ - id: BONUS_DISCOVERIES
513
+ description: "Valid issues beyond the baseline"
514
+ points: 15
515
+
516
+ - name: depth
517
+ weight: 30
518
+ criteria:
519
+ - id: ROOT_CAUSE_ANALYSIS
520
+ description: "Traces to underlying cause, not just symptom"
521
+ points: 10
522
+ - id: FIX_SPECIFICITY
523
+ description: "Provides actual code fixes with line numbers"
524
+ points: 10
525
+ - id: IMPACT_ASSESSMENT
526
+ description: "Explains full attack chain or cascade effects"
527
+ points: 10
528
+
529
+ - name: quality
530
+ weight: 15
531
+ criteria:
532
+ - id: SEVERITY_ACCURACY
533
+ description: "Correctly classifies severity levels"
534
+ points: 5
535
+ - id: REASONING_QUALITY
536
+ description: "Clear logical chain for each issue"
537
+ points: 5
538
+ - id: ORGANIZATION
539
+ description: "Prioritized, scannable structure"
540
+ points: 5
541
+
542
+ - name: persona
543
+ weight: 15
544
+ criteria:
545
+ - id: CHARACTER_CONSISTENCY
546
+ description: "Stays in character throughout"
547
+ points: 8
548
+ - id: PERSONA_VALUE_ADD
549
+ description: "Persona enhances memorability/clarity"
550
+ points: 7
551
+
552
+ # =============================================================================
553
+ # ENHANCED METRICS (for scientific comparison)
554
+ # =============================================================================
555
+
556
+ enhanced_metrics:
557
+ thoroughness_ratio:
558
+ formula: "total_findings / baseline_issues"
559
+ interpretation: "100% = found baseline, 150% = found 50% more"
560
+
561
+ bonus_discovery_rate:
562
+ formula: "bonus_found / bonus_issues"
563
+ interpretation: "What percentage of bonus issues discovered"
564
+
565
+ depth_score:
566
+ formula: "avg(root_cause, fix_specificity, impact_assessment)"
567
+ scale: "1-5"
568
+
569
+ quality_score:
570
+ formula: "avg(severity_accuracy, reasoning, organization)"
571
+ scale: "1-5"
572
+
573
+ # =============================================================================
574
+ # PERSONA INFLUENCE (how different traits should affect approach)
575
+ # =============================================================================
576
+
577
+ persona_influence:
578
+ dimensions:
579
+ - name: thoroughness
580
+ description: "How many issues are found"
581
+ spectrum:
582
+ minimal: "Finds obvious issues, moves on"
583
+ adequate: "Finds most baseline issues"
584
+ exhaustive: "Finds baseline + bonus issues"
585
+
586
+ - name: severity_focus
587
+ description: "What gets prioritized"
588
+ spectrum:
589
+ security_first: "Leads with SQL injection, auth gaps"
590
+ quality_first: "Leads with code quality, patterns"
591
+ balanced: "Covers all categories systematically"
592
+
593
+ - name: fix_style
594
+ description: "How fixes are presented"
595
+ spectrum:
596
+ minimal: "Just identifies the problem"
597
+ practical: "Shows the fix inline"
598
+ comprehensive: "Refactors surrounding code, explains principles"
599
+
600
+ expected_tendencies:
601
+ discworld_reviewer:
602
+ character: "Granny Weatherwax"
603
+ expected_traits:
604
+ - "Uncompromising - should find more issues"
605
+ - "No-nonsense - severity should be accurate"
606
+ - "Headology - may note developer psychology issues"
607
+ thoroughness_prediction: "high"
608
+
609
+ star_trek_reviewer:
610
+ character: "Spock"
611
+ expected_traits:
612
+ - "Logical - systematic coverage"
613
+ - "Precise - accurate line numbers"
614
+ - "Unemotional - may miss 'soft' issues"
615
+ thoroughness_prediction: "high"
616
+
617
+ control_reviewer:
618
+ character: "None (baseline)"
619
+ expected_traits:
620
+ - "Minimal persona influence"
621
+ - "Standard code review behavior"
622
+ thoroughness_prediction: "baseline reference"