@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,546 @@
1
+ ---
2
+ # Scenario: Concurrent Cache Race Conditions
3
+ # Category: dev
4
+ # Purpose: Test concurrency debugging and race condition detection
5
+
6
+ id: dev-005
7
+ name: race-condition-cache
8
+ title: "Concurrent Cache Race Conditions"
9
+ category: dev
10
+ difficulty: medium # Calibrated 2026-01-01: mean=76.80, was extreme
11
+ version: "1.0"
12
+
13
+ description: |
14
+ A caching layer with intermittent data corruption. Contains multiple race
15
+ conditions: double-checked locking bug, map concurrent write panic, cache
16
+ stampede, stale read after invalidation. Developer must identify all races
17
+ and provide thread-safe fixes.
18
+
19
+ purpose: |
20
+ This scenario tests deep concurrency expertise. Race conditions are notoriously
21
+ hard to debug because they're intermittent. A "systematic" persona might find
22
+ more issues through careful analysis. An "intuitive" persona might miss subtle
23
+ races. Originally an extreme-difficulty, finals-caliber challenge; recalibrated to medium.
24
+
25
+ prompt: |
26
+ BUG REPORT: Production cache is experiencing intermittent issues:
27
+ - Occasional panics: "concurrent map writes"
28
+ - Data corruption: cache returns wrong values for keys
29
+ - Cache stampede: database gets hammered when cache expires
30
+ - Stale reads: recently invalidated data still being served
31
+
32
+ The cache was "working fine" until traffic increased 10x last week.
33
+
34
+ Your task:
35
+ 1. Analyze the code to identify ALL race conditions
36
+ 2. Explain the exact interleaving that causes each bug
37
+ 3. Provide thread-safe fixes for each issue
38
+ 4. Ensure fixes don't introduce deadlocks
39
+
40
+ For each race condition:
41
+ 1. Identify the specific lines and goroutines involved
42
+ 2. Describe the interleaving sequence (step by step)
43
+ 3. Classify severity (Critical/High/Medium/Low)
44
+ 4. Provide the corrected code
45
+
46
+ IMPORTANT: Go's -race detector would catch some but not all of these.
47
+ Think about semantic races and correctness, not just data races.
48
+
49
+ code:
50
+ language: go
51
+ filename: cache.go
52
+ content: |
53
+ package cache
54
+
55
+ import (
56
+ "context"
57
+ "sync"
58
+ "time"
59
+ )
60
+
61
+ type CacheEntry struct {
62
+ Value interface{}
63
+ ExpiresAt time.Time
64
+ Loading bool
65
+ }
66
+
67
+ type LoadFunc func(ctx context.Context, key string) (interface{}, error)
68
+
69
+ type Cache struct {
70
+ data map[string]*CacheEntry
71
+ mu sync.Mutex
72
+ ttl time.Duration
73
+ loader LoadFunc
74
+ stats *Stats
75
+ cleaning bool
76
+ }
77
+
78
+ type Stats struct {
79
+ Hits int64
80
+ Misses int64
81
+ LoadErrors int64
82
+ }
83
+
84
+ func NewCache(ttl time.Duration, loader LoadFunc) *Cache {
85
+ c := &Cache{
86
+ data: make(map[string]*CacheEntry),
87
+ ttl: ttl,
88
+ loader: loader,
89
+ stats: &Stats{},
90
+ }
91
+ go c.cleanupLoop()
92
+ return c
93
+ }
94
+
95
+ func (c *Cache) Get(ctx context.Context, key string) (interface{}, error) {
96
+ // Fast path: check cache without lock
97
+ if entry, ok := c.data[key]; ok {
98
+ if time.Now().Before(entry.ExpiresAt) {
99
+ c.stats.Hits++
100
+ return entry.Value, nil
101
+ }
102
+ }
103
+
104
+ // Cache miss or expired - need to load
105
+ c.mu.Lock()
106
+
107
+ // Double-check after acquiring lock
108
+ if entry, ok := c.data[key]; ok {
109
+ if time.Now().Before(entry.ExpiresAt) {
110
+ c.mu.Unlock()
111
+ c.stats.Hits++
112
+ return entry.Value, nil
113
+ }
114
+ // Entry is loading, wait for it
115
+ if entry.Loading {
116
+ c.mu.Unlock()
117
+ return c.waitForLoad(ctx, key)
118
+ }
119
+ // Mark as loading
120
+ entry.Loading = true
121
+ } else {
122
+ // Create new entry in loading state
123
+ c.data[key] = &CacheEntry{Loading: true}
124
+ }
125
+
126
+ c.mu.Unlock()
127
+ c.stats.Misses++
128
+
129
+ // Load the value (outside lock to allow concurrency)
130
+ value, err := c.loader(ctx, key)
131
+ if err != nil {
132
+ c.stats.LoadErrors++
133
+ c.mu.Lock()
134
+ delete(c.data, key)
135
+ c.mu.Unlock()
136
+ return nil, err
137
+ }
138
+
139
+ // Store the loaded value
140
+ c.mu.Lock()
141
+ c.data[key] = &CacheEntry{
142
+ Value: value,
143
+ ExpiresAt: time.Now().Add(c.ttl),
144
+ Loading: false,
145
+ }
146
+ c.mu.Unlock()
147
+
148
+ return value, nil
149
+ }
150
+
151
+ func (c *Cache) waitForLoad(ctx context.Context, key string) (interface{}, error) {
152
+ // Busy wait for the entry to finish loading
153
+ for {
154
+ select {
155
+ case <-ctx.Done():
156
+ return nil, ctx.Err()
157
+ default:
158
+ entry, ok := c.data[key]
159
+ if ok && !entry.Loading {
160
+ return entry.Value, nil
161
+ }
162
+ time.Sleep(10 * time.Millisecond)
163
+ }
164
+ }
165
+ }
166
+
167
+ func (c *Cache) Set(key string, value interface{}) {
168
+ c.data[key] = &CacheEntry{
169
+ Value: value,
170
+ ExpiresAt: time.Now().Add(c.ttl),
171
+ Loading: false,
172
+ }
173
+ }
174
+
175
+ func (c *Cache) Invalidate(key string) {
176
+ c.mu.Lock()
177
+ defer c.mu.Unlock()
178
+ delete(c.data, key)
179
+ }
180
+
181
+ func (c *Cache) InvalidateAll() {
182
+ c.mu.Lock()
183
+ c.data = make(map[string]*CacheEntry)
184
+ c.mu.Unlock()
185
+ }
186
+
187
+ func (c *Cache) GetMulti(ctx context.Context, keys []string) map[string]interface{} {
188
+ results := make(map[string]interface{})
189
+
190
+ for _, key := range keys {
191
+ go func(k string) {
192
+ value, err := c.Get(ctx, k)
193
+ if err == nil {
194
+ results[k] = value
195
+ }
196
+ }(key)
197
+ }
198
+
199
+ // Wait a bit for goroutines to complete
200
+ time.Sleep(100 * time.Millisecond)
201
+
202
+ return results
203
+ }
204
+
205
+ func (c *Cache) cleanupLoop() {
206
+ ticker := time.NewTicker(time.Minute)
207
+ for range ticker.C {
208
+ c.cleanup()
209
+ }
210
+ }
211
+
212
+ func (c *Cache) cleanup() {
213
+ if c.cleaning {
214
+ return
215
+ }
216
+ c.cleaning = true
217
+
218
+ now := time.Now()
219
+ for key, entry := range c.data {
220
+ if now.After(entry.ExpiresAt) {
221
+ delete(c.data, key)
222
+ }
223
+ }
224
+
225
+ c.cleaning = false
226
+ }
227
+
228
+ func (c *Cache) Size() int {
229
+ return len(c.data)
230
+ }
231
+
232
+ func (c *Cache) GetStats() Stats {
233
+ return Stats{
234
+ Hits: c.stats.Hits,
235
+ Misses: c.stats.Misses,
236
+ LoadErrors: c.stats.LoadErrors,
237
+ }
238
+ }
239
+
240
+ // Refresh reloads a key proactively
241
+ func (c *Cache) Refresh(ctx context.Context, key string) error {
242
+ value, err := c.loader(ctx, key)
243
+ if err != nil {
244
+ return err
245
+ }
246
+
247
+ c.data[key] = &CacheEntry{
248
+ Value: value,
249
+ ExpiresAt: time.Now().Add(c.ttl),
250
+ }
251
+
252
+ return nil
253
+ }
254
+
255
+ // LoadOrStore is like Get but takes a custom loader
256
+ func (c *Cache) LoadOrStore(ctx context.Context, key string, loader LoadFunc) (interface{}, error) {
257
+ if entry, ok := c.data[key]; ok {
258
+ if time.Now().Before(entry.ExpiresAt) {
259
+ return entry.Value, nil
260
+ }
261
+ }
262
+
263
+ value, err := loader(ctx, key)
264
+ if err != nil {
265
+ return nil, err
266
+ }
267
+
268
+ c.data[key] = &CacheEntry{
269
+ Value: value,
270
+ ExpiresAt: time.Now().Add(c.ttl),
271
+ }
272
+
273
+ return value, nil
274
+ }
275
+
276
+ # =============================================================================
277
+ # BASELINE ISSUES (minimum expected to find)
278
+ # =============================================================================
279
+
280
+ baseline_issues:
281
+ critical:
282
+ - id: MAP_CONCURRENT_READ_WRITE_GET
283
+ location: "lines 45-48"
284
+ description: "Reading c.data[key] without lock while other goroutines write"
285
+ interleaving: |
286
+ 1. Goroutine A: reads c.data[key] at line 45 (no lock)
287
+ 2. Goroutine B: writes c.data[key] = &CacheEntry{} at line 89 (has lock)
288
+ 3. PANIC: concurrent map read/write
289
+ impact: "Sporadic panics under load"
290
+
291
+ - id: MAP_CONCURRENT_WRITE_SET
292
+ location: "line 116"
293
+ description: "Set() writes to map without any lock"
294
+ interleaving: |
295
+ 1. Goroutine A: c.data[key] = ... in Set()
296
+ 2. Goroutine B: c.data[key] = ... in Get()
297
+ 3. PANIC: concurrent map writes
298
+ impact: "Panics when Set() called concurrently"
299
+
300
+ - id: MAP_CONCURRENT_WRITE_REFRESH
301
+ location: "lines 195-198"
302
+ description: "Refresh() writes to map without lock"
303
+ interleaving: |
304
+ 1. Goroutine A: c.data[key] = ... in Refresh()
305
+ 2. Goroutine B: c.data[key] = ... in Get()
306
+ 3. PANIC: concurrent map writes
307
+ impact: "Panics during proactive refresh"
308
+
309
+ - id: MAP_CONCURRENT_WRITE_LOAD_OR_STORE
310
+ location: "lines 205-219"
311
+ description: "LoadOrStore reads and writes without lock"
312
+ interleaving: "Same as above - concurrent map access"
313
+ impact: "Panics in LoadOrStore"
314
+
315
+ - id: CLEANUP_NO_LOCK
316
+ location: "lines 167-171"
317
+ description: "cleanup() iterates and deletes without lock"
318
+ interleaving: |
319
+ 1. Cleanup goroutine: range c.data (no lock)
320
+ 2. Get goroutine: c.data[key] = ... (with lock)
321
+ 3. PANIC: concurrent map iteration and write
322
+ impact: "Panics during cleanup cycle"
323
+
324
+ - id: GETMULTI_RESULTS_RACE
325
+ location: "lines 138-145"
326
+ description: "Multiple goroutines write to results map without synchronization"
327
+ interleaving: |
328
+ 1. Goroutine for key1: results[k] = value
329
+ 2. Goroutine for key2: results[k] = value (concurrent write)
330
+ 3. PANIC or data corruption
331
+ impact: "Panics or missing results"
332
+
333
+ high:
334
+ - id: STATS_RACE
335
+ location: "lines 47, 59, 75, 80"
336
+ description: "Stats counters incremented without atomic operations"
337
+ interleaving: |
338
+ 1. Goroutine A: reads c.stats.Hits (value 10)
339
+ 2. Goroutine B: reads c.stats.Hits (value 10)
340
+ 3. Goroutine A: writes c.stats.Hits = 11
341
+ 4. Goroutine B: writes c.stats.Hits = 11
342
+ 5. Lost increment - should be 12
343
+ impact: "Inaccurate metrics"
344
+
345
+ - id: CLEANING_FLAG_RACE
346
+ location: "lines 161-164"
347
+ description: "cleaning flag read/write without synchronization"
348
+ interleaving: |
349
+ 1. Goroutine A: reads c.cleaning (false)
350
+ 2. Goroutine B: reads c.cleaning (false)
351
+ 3. Both proceed to clean concurrently
352
+ impact: "Double cleanup, potential inconsistency"
353
+
354
+ - id: STALE_READ_AFTER_INVALIDATE
355
+ location: "Get fast path vs Invalidate"
356
+ description: "Get reads without lock, may see entry after Invalidate"
357
+ interleaving: |
358
+ 1. Goroutine A: if entry, ok := c.data[key]; ok (finds entry)
359
+ 2. Goroutine B: delete(c.data, key) in Invalidate
360
+ 3. Goroutine A: returns stale entry.Value
361
+ impact: "Serves invalidated data"
362
+
363
+ medium:
364
+ - id: DOUBLE_CHECKED_LOCKING_BUG
365
+ location: "lines 45-74"
366
+ description: "Entry can change between unlock and accessing entry.Value"
367
+ interleaving: |
368
+ 1. Goroutine A: finds entry in loading state, unlocks
369
+ 2. Goroutine B: deletes entry due to error (line 82)
370
+ 3. Goroutine A: waitForLoad sees no entry, infinite loop or nil
371
+ impact: "Goroutine hangs or returns nil"
372
+
373
+ - id: CACHE_STAMPEDE
374
+ location: "lines 63-68"
375
+ description: "Loading flag check has race - multiple loads can start"
376
+ interleaving: |
377
+ 1. Request A: finds expired, sets Loading=true, unlocks
378
+ 2. Request B: finds expired before A's Loading visible
379
+ 3. Request C: finds expired before A's Loading visible
380
+ 4. All 3 call loader simultaneously
381
+ impact: "Database stampede on popular key expiry"
382
+
383
+ - id: GETMULTI_NOT_WAITING
384
+ location: "line 148"
385
+ description: "time.Sleep(100ms) doesn't guarantee goroutines complete"
386
+ interleaving: "Slow loader takes >100ms, results returned incomplete"
387
+ impact: "Missing keys in multi-get response"
388
+
389
+ - id: SIZE_RACE
390
+ location: "line 177"
391
+ description: "len(c.data) without lock"
392
+ impact: "Inaccurate size, potential panic if map resizing"
393
+
394
+ low:
395
+ - id: WAITFORLOAD_BUSY_SPIN
396
+ location: "lines 101-112"
397
+ description: "Busy waiting with sleep wastes CPU"
398
+ impact: "CPU usage spikes during contention"
399
+
400
+ - id: TICKER_NEVER_STOPPED
401
+ location: "lines 154-157"
402
+ description: "Cleanup ticker never stopped, goroutine leak"
403
+ impact: "Resource leak when Cache abandoned"
404
+
405
+ # =============================================================================
406
+ # BONUS ISSUES (thorough reviewers might find these)
407
+ # =============================================================================
408
+
409
+ bonus_issues:
410
+ concurrency:
411
+ - id: WAITFORLOAD_INFINITE_LOOP
412
+ description: "If loading entry deleted, waitForLoad loops forever"
413
+
414
+ - id: CONTEXT_LEAK_WAITFORLOAD
415
+ description: "If context cancelled, may still have loading entry"
416
+
417
+ - id: INVALIDATE_DURING_LOAD
418
+ description: "Invalidate during load leaves orphaned loading entry"
419
+
420
+ performance:
421
+ - id: LOCK_CONTENTION_HOT_KEYS
422
+ description: "Single mutex for all keys causes contention"
423
+
424
+ - id: NO_SHARDING
425
+ description: "Could use sharded locks for better concurrency"
426
+
427
+ design:
428
+ - id: SYNC_COND_BETTER_THAN_BUSYWAIT
429
+ description: "sync.Cond more efficient than polling for load complete"
430
+
431
+ - id: SINGLEFLIGHT_FOR_DEDUP
432
+ description: "golang.org/x/sync/singleflight better for stampede"
433
+
434
+ - id: ATOMIC_VALUE_FOR_STATS
435
+ description: "atomic.Int64 cleaner than manual synchronization"
436
+
437
+ # =============================================================================
438
+ # SCORING
439
+ # =============================================================================
440
+
441
+ scoring:
442
+ total_baseline_issues: 15
443
+ total_bonus_issues: 8
444
+ weights:
445
+ critical: 3
446
+ high: 2
447
+ medium: 1
448
+ low: 0.5
449
+ max_baseline_score: 29 # 6*3 + 3*2 + 4*1 + 2*0.5
450
+
451
+ categories:
452
+ - name: detection
453
+ weight: 35
454
+ criteria:
455
+ - id: BASELINE_FOUND
456
+ description: "Race conditions from the seeded list"
457
+ points: 25
458
+ - id: BONUS_DISCOVERIES
459
+ description: "Additional concurrency issues found"
460
+ points: 10
461
+
462
+ - name: depth
463
+ weight: 35
464
+ criteria:
465
+ - id: INTERLEAVING_ANALYSIS
466
+ description: "Step-by-step goroutine interleaving for each race"
467
+ points: 15
468
+ - id: FIX_CORRECTNESS
469
+ description: "Fixes are correct and don't introduce deadlocks"
470
+ points: 12
471
+ - id: GO_IDIOMS
472
+ description: "Uses idiomatic Go concurrency patterns"
473
+ points: 8
474
+
475
+ - name: quality
476
+ weight: 15
477
+ criteria:
478
+ - id: SEVERITY_ACCURACY
479
+ description: "Correctly classifies crash vs corruption vs performance"
480
+ points: 5
481
+ - id: REASONING_QUALITY
482
+ description: "Clear explanation of race mechanics"
483
+ points: 5
484
+ - id: PRIORITIZATION
485
+ description: "Addresses panics before semantic races"
486
+ points: 5
487
+
488
+ - name: persona
489
+ weight: 15
490
+ criteria:
491
+ - id: CHARACTER_CONSISTENCY
492
+ description: "Stays in character throughout"
493
+ points: 8
494
+ - id: PERSONA_VALUE_ADD
495
+ description: "Persona adds color to technical explanation"
496
+ points: 7
497
+
498
+ # =============================================================================
499
+ # PERSONA INFLUENCE
500
+ # =============================================================================
501
+
502
+ persona_influence:
503
+ dimensions:
504
+ - name: concurrency_expertise
505
+ description: "Depth of Go concurrency knowledge"
506
+ spectrum:
507
+ basic: "Finds obvious map races"
508
+ intermediate: "Finds lock ordering and semantic races"
509
+ expert: "Identifies subtle interleavings and suggests singleflight"
510
+
511
+ - name: analysis_style
512
+ description: "How races are explained"
513
+ spectrum:
514
+ intuitive: "Describes problem generally"
515
+ step_by_step: "Provides exact interleaving sequences"
516
+ formal: "Uses happens-before terminology"
517
+
518
+ - name: fix_philosophy
519
+ description: "Approach to corrections"
520
+ spectrum:
521
+ minimal: "Adds locks where needed"
522
+ pragmatic: "Restructures for clarity"
523
+ comprehensive: "Redesigns with sharding, singleflight"
524
+
525
+ expected_tendencies:
526
+ discworld_dev:
527
+ character: "Ponder Stibbons"
528
+ expected_traits:
529
+ - "Methodical - should find most races"
530
+ - "May over-explain the mechanics"
531
+ - "Practical fixes over elegant redesigns"
532
+ concurrency_expertise_prediction: "intermediate to expert"
533
+
534
+ star_trek_dev:
535
+ character: "Data"
536
+ expected_traits:
537
+ - "Logical - precise interleaving sequences"
538
+ - "May identify all races systematically"
539
+ - "Could suggest optimal restructuring"
540
+ concurrency_expertise_prediction: "expert"
541
+
542
+ control_dev:
543
+ character: "None (baseline)"
544
+ expected_traits:
545
+ - "Standard debugging behavior"
546
+ concurrency_expertise_prediction: "baseline reference"