npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/dev/race-condition-cache.yaml ADDED Viewed

@@ -0,0 +1,546 @@
+---
+# Scenario: Concurrent Cache Race Conditions
+# Category: dev
+# Purpose: Test concurrency debugging and race condition detection
+id: dev-005
+name: race-condition-cache
+title: "Concurrent Cache Race Conditions"
+category: dev
+difficulty: medium  # Calibrated 2026-01-01: mean=76.80, was extreme
+version: "1.0"
+description: |
+  A caching layer with intermittent data corruption. Contains multiple race
+  conditions: double-checked locking bug, map concurrent write panic, cache
+  stampede, stale read after invalidation. Developer must identify all races
+  and provide thread-safe fixes.
+purpose: |
+  This scenario tests deep concurrency expertise. Race conditions are notoriously
+  hard to debug because they're intermittent. A "systematic" persona might find
+  more issues through careful analysis. An "intuitive" persona might miss subtle
+  races. Originally designed as an extreme-difficulty, finals-caliber challenge;
+  calibration has since rated it medium.
+prompt: |
+  BUG REPORT: Production cache is experiencing intermittent issues:
+  - Occasional panics: "concurrent map writes"
+  - Data corruption: cache returns wrong values for keys
+  - Cache stampede: database gets hammered when cache expires
+  - Stale reads: recently invalidated data still being served
+  The cache was "working fine" until traffic increased 10x last week.
+  Your task:
+  1. Analyze the code to identify ALL race conditions
+  2. Explain the exact interleaving that causes each bug
+  3. Provide thread-safe fixes for each issue
+  4. Ensure fixes don't introduce deadlocks
+  For each race condition:
+  1. Identify the specific lines and goroutines involved
+  2. Describe the interleaving sequence (step by step)
+  3. Classify severity (Critical/High/Medium/Low)
+  4. Provide the corrected code
+  IMPORTANT: Go's -race detector would catch some but not all of these.
+  Think about semantic races and correctness, not just data races.
+code:
+  language: go
+  filename: cache.go
+  content: |
+    package cache
+    import (
+        "context"
+        "sync"
+        "time"
+    )
+    type CacheEntry struct {
+        Value     interface{}
+        ExpiresAt time.Time
+        Loading   bool
+    }
+    type LoadFunc func(ctx context.Context, key string) (interface{}, error)
+    type Cache struct {
+        data     map[string]*CacheEntry
+        mu       sync.Mutex
+        ttl      time.Duration
+        loader   LoadFunc
+        stats    *Stats
+        cleaning bool
+    }
+    type Stats struct {
+        Hits       int64
+        Misses     int64
+        LoadErrors int64
+    }
+    func NewCache(ttl time.Duration, loader LoadFunc) *Cache {
+        c := &Cache{
+            data:   make(map[string]*CacheEntry),
+            ttl:    ttl,
+            loader: loader,
+            stats:  &Stats{},
+        }
+        go c.cleanupLoop()
+        return c
+    }
+    func (c *Cache) Get(ctx context.Context, key string) (interface{}, error) {
+        // Fast path: check cache without lock
+        if entry, ok := c.data[key]; ok {
+            if time.Now().Before(entry.ExpiresAt) {
+                c.stats.Hits++
+                return entry.Value, nil
+            }
+        }
+        // Cache miss or expired - need to load
+        c.mu.Lock()
+        // Double-check after acquiring lock
+        if entry, ok := c.data[key]; ok {
+            if time.Now().Before(entry.ExpiresAt) {
+                c.mu.Unlock()
+                c.stats.Hits++
+                return entry.Value, nil
+            }
+            // Entry is loading, wait for it
+            if entry.Loading {
+                c.mu.Unlock()
+                return c.waitForLoad(ctx, key)
+            }
+            // Mark as loading
+            entry.Loading = true
+        } else {
+            // Create new entry in loading state
+            c.data[key] = &CacheEntry{Loading: true}
+        }
+        c.mu.Unlock()
+        c.stats.Misses++
+        // Load the value (outside lock to allow concurrency)
+        value, err := c.loader(ctx, key)
+        if err != nil {
+            c.stats.LoadErrors++
+            c.mu.Lock()
+            delete(c.data, key)
+            c.mu.Unlock()
+            return nil, err
+        }
+        // Store the loaded value
+        c.mu.Lock()
+        c.data[key] = &CacheEntry{
+            Value:     value,
+            ExpiresAt: time.Now().Add(c.ttl),
+            Loading:   false,
+        }
+        c.mu.Unlock()
+        return value, nil
+    }
+    func (c *Cache) waitForLoad(ctx context.Context, key string) (interface{}, error) {
+        // Busy wait for the entry to finish loading
+        for {
+            select {
+            case <-ctx.Done():
+                return nil, ctx.Err()
+            default:
+                entry, ok := c.data[key]
+                if ok && !entry.Loading {
+                    return entry.Value, nil
+                }
+                time.Sleep(10 * time.Millisecond)
+            }
+        }
+    }
+    func (c *Cache) Set(key string, value interface{}) {
+        c.data[key] = &CacheEntry{
+            Value:     value,
+            ExpiresAt: time.Now().Add(c.ttl),
+            Loading:   false,
+        }
+    }
+    func (c *Cache) Invalidate(key string) {
+        c.mu.Lock()
+        defer c.mu.Unlock()
+        delete(c.data, key)
+    }
+    func (c *Cache) InvalidateAll() {
+        c.mu.Lock()
+        c.data = make(map[string]*CacheEntry)
+        c.mu.Unlock()
+    }
+    func (c *Cache) GetMulti(ctx context.Context, keys []string) map[string]interface{} {
+        results := make(map[string]interface{})
+        for _, key := range keys {
+            go func(k string) {
+                value, err := c.Get(ctx, k)
+                if err == nil {
+                    results[k] = value
+                }
+            }(key)
+        }
+        // Wait a bit for goroutines to complete
+        time.Sleep(100 * time.Millisecond)
+        return results
+    }
+    func (c *Cache) cleanupLoop() {
+        ticker := time.NewTicker(time.Minute)
+        for range ticker.C {
+            c.cleanup()
+        }
+    }
+    func (c *Cache) cleanup() {
+        if c.cleaning {
+            return
+        }
+        c.cleaning = true
+        now := time.Now()
+        for key, entry := range c.data {
+            if now.After(entry.ExpiresAt) {
+                delete(c.data, key)
+            }
+        }
+        c.cleaning = false
+    }
+    func (c *Cache) Size() int {
+        return len(c.data)
+    }
+    func (c *Cache) GetStats() Stats {
+        return Stats{
+            Hits:       c.stats.Hits,
+            Misses:     c.stats.Misses,
+            LoadErrors: c.stats.LoadErrors,
+        }
+    }
+    // Refresh reloads a key proactively
+    func (c *Cache) Refresh(ctx context.Context, key string) error {
+        value, err := c.loader(ctx, key)
+        if err != nil {
+            return err
+        }
+        c.data[key] = &CacheEntry{
+            Value:     value,
+            ExpiresAt: time.Now().Add(c.ttl),
+        }
+        return nil
+    }
+    // LoadOrStore is like Get but takes a custom loader
+    func (c *Cache) LoadOrStore(ctx context.Context, key string, loader LoadFunc) (interface{}, error) {
+        if entry, ok := c.data[key]; ok {
+            if time.Now().Before(entry.ExpiresAt) {
+                return entry.Value, nil
+            }
+        }
+        value, err := loader(ctx, key)
+        if err != nil {
+            return nil, err
+        }
+        c.data[key] = &CacheEntry{
+            Value:     value,
+            ExpiresAt: time.Now().Add(c.ttl),
+        }
+        return value, nil
+    }
+# =============================================================================
+# BASELINE ISSUES (minimum expected to find)
+# =============================================================================
+baseline_issues:
+  critical:
+    - id: MAP_CONCURRENT_READ_WRITE_GET
+      location: "lines 40-43"
+      description: "Reading c.data[key] without lock while other goroutines write"
+      interleaving: |
+        1. Goroutine A: reads c.data[key] at line 41 (no lock)
+        2. Goroutine B: writes c.data[key] = &CacheEntry{} at line 77 (has lock)
+        3. PANIC: concurrent map read/write
+      impact: "Sporadic panics under load"
+    - id: MAP_CONCURRENT_WRITE_SET
+      location: "line 101"
+      description: "Set() writes to map without any lock"
+      interleaving: |
+        1. Goroutine A: c.data[key] = ... in Set()
+        2. Goroutine B: c.data[key] = ... in Get()
+        3. PANIC: concurrent map writes
+      impact: "Panics when Set() called concurrently"
+    - id: MAP_CONCURRENT_WRITE_REFRESH
+      location: "lines 161-165"
+      description: "Refresh() writes to map without lock"
+      interleaving: |
+        1. Goroutine A: c.data[key] = ... in Refresh()
+        2. Goroutine B: c.data[key] = ... in Get()
+        3. PANIC: concurrent map writes
+      impact: "Panics during proactive refresh"
+    - id: MAP_CONCURRENT_WRITE_LOAD_OR_STORE
+      location: "lines 172-184"
+      description: "LoadOrStore reads and writes without lock"
+      interleaving: "Same as above - concurrent map access"
+      impact: "Panics in LoadOrStore"
+    - id: CLEANUP_NO_LOCK
+      location: "lines 133-143"
+      description: "cleanup() iterates and deletes without lock"
+      interleaving: |
+        1. Cleanup goroutine: range c.data (no lock)
+        2. Get goroutine: c.data[key] = ... (with lock)
+        3. PANIC: concurrent map iteration and write
+      impact: "Panics during cleanup cycle"
+    - id: GETMULTI_RESULTS_RACE
+      location: "lines 117-126"
+      description: "Multiple goroutines write to results map without synchronization"
+      interleaving: |
+        1. Goroutine for key1: results[k] = value
+        2. Goroutine for key2: results[k] = value (concurrent write)
+        3. PANIC or data corruption
+      impact: "Panics or missing results"
+  high:
+    - id: STATS_RACE
+      location: "lines 43, 49, 68, 73"
+      description: "Stats counters incremented without atomic operations"
+      interleaving: |
+        1. Goroutine A: reads c.stats.Hits (value 10)
+        2. Goroutine B: reads c.stats.Hits (value 10)
+        3. Goroutine A: writes c.stats.Hits = 11
+        4. Goroutine B: writes c.stats.Hits = 11
+        5. Lost increment - should be 12
+      impact: "Inaccurate metrics"
+    - id: CLEANING_FLAG_RACE
+      location: "lines 132-134"
+      description: "cleaning flag read/write without synchronization"
+      interleaving: |
+        1. Goroutine A: reads c.cleaning (false)
+        2. Goroutine B: reads c.cleaning (false)
+        3. Both proceed to clean concurrently
+      impact: "Double cleanup, potential inconsistency"
+    - id: STALE_READ_AFTER_INVALIDATE
+      location: "Get fast path vs Invalidate"
+      description: "Get reads without lock, may see entry after Invalidate"
+      interleaving: |
+        1. Goroutine A: if entry, ok := c.data[key]; ok (finds entry)
+        2. Goroutine B: delete(c.data, key) in Invalidate
+        3. Goroutine A: returns stale entry.Value
+      impact: "Serves invalidated data"
+  medium:
+    - id: DOUBLE_CHECKED_LOCKING_BUG
+      location: "lines 40-66"
+      description: "Entry can change between unlock and accessing entry.Value"
+      interleaving: |
+        1. Goroutine A: finds entry in loading state, unlocks
+        2. Goroutine B: deletes entry due to error (line 73)
+        3. Goroutine A: waitForLoad sees no entry, infinite loop or nil
+      impact: "Goroutine hangs or returns nil"
+    - id: CACHE_STAMPEDE
+      location: "lines 58-63"
+      description: "Loading flag check has race - multiple loads can start"
+      interleaving: |
+        1. Request A: finds expired, sets Loading=true, unlocks
+        2. Request B: finds expired before A's Loading visible
+        3. Request C: finds expired before A's Loading visible
+        4. All 3 call loader simultaneously
+      impact: "Database stampede on popular key expiry"
+    - id: GETMULTI_NOT_WAITING
+      location: "line 124"
+      description: "time.Sleep(100ms) doesn't guarantee goroutines complete"
+      interleaving: "Slow loader takes >100ms, results returned incomplete"
+      impact: "Missing keys in multi-get response"
+    - id: SIZE_RACE
+      location: "line 146"
+      description: "len(c.data) without lock"
+      impact: "Inaccurate size, potential panic if map resizing"
+  low:
+    - id: WAITFORLOAD_BUSY_SPIN
+      location: "lines 85-95"
+      description: "Busy waiting with sleep wastes CPU"
+      impact: "CPU usage spikes during contention"
+    - id: TICKER_NEVER_STOPPED
+      location: "lines 128-131"
+      description: "Cleanup ticker never stopped, goroutine leak"
+      impact: "Resource leak when Cache abandoned"
+# =============================================================================
+# BONUS ISSUES (thorough reviewers might find these)
+# =============================================================================
+bonus_issues:
+  concurrency:
+    - id: WAITFORLOAD_INFINITE_LOOP
+      description: "If loading entry deleted, waitForLoad loops forever"
+    - id: CONTEXT_LEAK_WAITFORLOAD
+      description: "If context cancelled, may still have loading entry"
+    - id: INVALIDATE_DURING_LOAD
+      description: "Invalidate during load leaves orphaned loading entry"
+  performance:
+    - id: LOCK_CONTENTION_HOT_KEYS
+      description: "Single mutex for all keys causes contention"
+    - id: NO_SHARDING
+      description: "Could use sharded locks for better concurrency"
+  design:
+    - id: SYNC_COND_BETTER_THAN_BUSYWAIT
+      description: "sync.Cond more efficient than polling for load complete"
+    - id: SINGLEFLIGHT_FOR_DEDUP
+      description: "golang.org/x/sync/singleflight better for stampede"
+    - id: ATOMIC_VALUE_FOR_STATS
+      description: "atomic.Int64 cleaner than manual synchronization"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  total_baseline_issues: 15
+  total_bonus_issues: 8
+  weights:
+    critical: 3
+    high: 2
+    medium: 1
+    low: 0.5
+  max_baseline_score: 29  # 6*3 + 3*2 + 4*1 + 2*0.5 = 18 + 6 + 4 + 1
+  categories:
+    - name: detection
+      weight: 35
+      criteria:
+        - id: BASELINE_FOUND
+          description: "Race conditions from the seeded list"
+          points: 25
+        - id: BONUS_DISCOVERIES
+          description: "Additional concurrency issues found"
+          points: 10
+    - name: depth
+      weight: 35
+      criteria:
+        - id: INTERLEAVING_ANALYSIS
+          description: "Step-by-step goroutine interleaving for each race"
+          points: 15
+        - id: FIX_CORRECTNESS
+          description: "Fixes are correct and don't introduce deadlocks"
+          points: 12
+        - id: GO_IDIOMS
+          description: "Uses idiomatic Go concurrency patterns"
+          points: 8
+    - name: quality
+      weight: 15
+      criteria:
+        - id: SEVERITY_ACCURACY
+          description: "Correctly classifies crash vs corruption vs performance"
+          points: 5
+        - id: REASONING_QUALITY
+          description: "Clear explanation of race mechanics"
+          points: 5
+        - id: PRIORITIZATION
+          description: "Addresses panics before semantic races"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona adds color to technical explanation"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: concurrency_expertise
+      description: "Depth of Go concurrency knowledge"
+      spectrum:
+        basic: "Finds obvious map races"
+        intermediate: "Finds lock ordering and semantic races"
+        expert: "Identifies subtle interleavings and suggests singleflight"
+    - name: analysis_style
+      description: "How races are explained"
+      spectrum:
+        intuitive: "Describes problem generally"
+        step_by_step: "Provides exact interleaving sequences"
+        formal: "Uses happens-before terminology"
+    - name: fix_philosophy
+      description: "Approach to corrections"
+      spectrum:
+        minimal: "Adds locks where needed"
+        pragmatic: "Restructures for clarity"
+        comprehensive: "Redesigns with sharding, singleflight"
+expected_tendencies:
+  discworld_dev:
+    character: "Ponder Stibbons"
+    expected_traits:
+      - "Methodical - should find most races"
+      - "May over-explain the mechanics"
+      - "Practical fixes over elegant redesigns"
+    concurrency_expertise_prediction: "intermediate to expert"
+  star_trek_dev:
+    character: "Data"
+    expected_traits:
+      - "Logical - precise interleaving sequences"
+      - "May identify all races systematically"
+      - "Could suggest optimal restructuring"
+    concurrency_expertise_prediction: "expert"
+  control_dev:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard debugging behavior"
+    concurrency_expertise_prediction: "baseline reference"