npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/debugging/null-check-missing.yaml ADDED Viewed

@@ -0,0 +1,139 @@
+---
+# Scenario: Missing Null Checks
+# Category: debugging
+# Difficulty: easy
+# Error Type Focus: execution (single-type)
+id: debug-002
+name: null-check-missing
+title: "Null Pointer: The Forgotten Guard Clauses"
+category: debugging
+difficulty: easy
+version: "1.0"
+description: |
+  A user profile service that crashes on null/undefined values.
+  Tests detection of missing defensive programming.
+purpose: |
+  This scenario measures detection of execution-level bugs related to
+  null safety. Defensive agents will catch all null paths. Optimistic
+  agents may assume inputs are always valid.
+prompt: |
+  BUG REPORT
+  Service: user-profile
+  Severity: P2
+  Status: TypeError crashes in production
+  The profile service is throwing null reference errors:
+  "Cannot read property 'X' of undefined"
+  Your task:
+  1. Find all places where null/undefined values cause crashes
+  2. Add appropriate guard clauses
+  3. Ensure the function handles edge cases gracefully
+  There are 6 known issues. How many can you find?
+code:
+  language: typescript
+  filename: profile-service.ts
+  content: |
+    interface User {
+      id: string;
+      name: string;
+      email: string;
+      address?: {
+        street: string;
+        city: string;
+        country: string;
+      };
+      preferences?: {
+        theme: string;
+        notifications: boolean;
+      };
+    }
+    function formatUserAddress(user: User): string {
+      // Bug: address might be undefined
+      const street = user.address.street;
+      const city = user.address.city;
+      const country = user.address.country;
+      return `${street}, ${city}, ${country}`;
+    }
+    function getUserDisplayName(user: User | null): string {
+      // Bug: user might be null
+      return user.name || user.email;
+    }
+    function getNotificationSettings(user: User): boolean {
+      // Bug: preferences might be undefined
+      return user.preferences.notifications;
+    }
+    function mergeUserData(base: User, updates: Partial<User>): User {
+      // Bug: updates might have undefined nested objects
+      const merged = {
+        ...base,
+        ...updates,
+        address: {
+          ...base.address,
+          ...updates.address,  // Spreading undefined does not throw; it yields an empty or partial address object
+        },
+      };
+      return merged;
+    }
+    function getThemeColor(user: User): string {
+      // Bug: preferences and theme might be undefined
+      const theme = user.preferences.theme;
+      const colors = {
+        dark: '#000000',
+        light: '#ffffff',
+      };
+      return colors[theme];  // Bug: theme might not be a valid key
+    }
+baseline_issues:
+  high:
+    - id: null-001
+      location: "lines 18-20"
+      description: "Accessing address properties without checking if address exists"
+      error_type: execution
+    - id: null-002
+      location: "line 26"
+      description: "Accessing user.name without checking if user is null"
+      error_type: execution
+  medium:
+    - id: null-003
+      location: "line 31"
+      description: "Accessing preferences.notifications without null check"
+      error_type: execution
+    - id: null-004
+      location: "line 39"
+      description: "Spreading undefined base.address/updates.address yields an empty or partial address object (missing street/city/country)"
+      error_type: execution
+  low:
+    - id: null-005
+      location: "line 46"
+      description: "Accessing preferences.theme without null check"
+      error_type: execution
+    - id: null-006
+      location: "line 51"
+      description: "Theme key might not exist in colors object"
+      error_type: execution
+scoring:
+  detection:
+    weight: 50
+    criteria: "Finding all 6 null-related bugs"
+  fix_quality:
+    weight: 30
+    criteria: "Implementing proper guard clauses"
+  explanation:
+    weight: 20
+    criteria: "Explaining the crash scenarios"

package/scenarios/debugging/off-by-one-loop.yaml ADDED Viewed

@@ -0,0 +1,132 @@
+---
+# Scenario: Off-by-One Loop Error
+# Category: debugging
+# Difficulty: easy
+# Error Type Focus: execution (single-type)
+id: debug-001
+name: off-by-one-loop
+title: "Off-by-One: The Classic Loop Boundary Bug"
+category: debugging
+difficulty: easy
+version: "1.0"
+description: |
+  A simple array processing function with classic off-by-one errors.
+  Tests the agent's ability to identify boundary condition bugs.
+purpose: |
+  This scenario measures detection of execution-level bugs - implementation
+  errors that occur at boundaries. A systematic agent will check all loop
+  bounds. A quick agent might miss subtle boundary issues.
+prompt: |
+  BUG REPORT
+  Service: data-processor
+  Severity: P2
+  Status: Array index errors in production logs
+  Users are reporting occasional crashes when processing arrays.
+  The logs show "index out of bounds" errors but only intermittently.
+  Your task:
+  1. Find all boundary condition bugs in this code
+  2. Explain why each causes problems
+  3. Provide the corrected code
+  There are 5 known issues. How many can you find?
+code:
+  language: python
+  filename: array_utils.py
+  content: |
+    def find_max_subarray(arr):
+        """Find the contiguous subarray with largest sum."""
+        if len(arr) == 0:
+            return 0
+        max_sum = arr[0]
+        current_sum = arr[0]
+        # Bug: should start at index 1, not 0
+        for i in range(0, len(arr)):
+            current_sum = max(arr[i], current_sum + arr[i])
+            max_sum = max(max_sum, current_sum)
+        return max_sum
+    def rotate_array(arr, k):
+        """Rotate array right by k positions."""
+        n = len(arr)
+        if n == 0:
+            return arr
+        # Bug: doesn't handle k > n
+        result = [0] * n
+        for i in range(n):
+            # Bug: off-by-one in new position calculation
+            new_pos = (i + k) % (n + 1)
+            result[new_pos] = arr[i]
+        return result
+    def binary_search(arr, target):
+        """Find target in sorted array, return index or -1."""
+        left = 0
+        right = len(arr)  # Bug: should be len(arr) - 1
+        while left < right:  # Bug: should be left <= right
+            mid = (left + right) // 2
+            if arr[mid] == target:
+                return mid
+            elif arr[mid] < target:
+                left = mid + 1
+            else:
+                right = mid - 1
+        return -1
+    def copy_range(src, start, end):
+        """Copy elements from start to end (inclusive)."""
+        result = []
+        # Bug: range excludes end, but we want inclusive
+        for i in range(start, end):
+            result.append(src[i])
+        return result
+baseline_issues:
+  high:
+    - id: obo-001
+      location: "line 11"
+      description: "Loop starts at 0, duplicating first element processing"
+      error_type: execution
+    - id: obo-002
+      location: "line 27"
+      description: "Modulo uses n+1 instead of n, causing index out of bounds"
+      error_type: execution
+  medium:
+    - id: obo-003
+      location: "line 33"
+      description: "Right bound should be len(arr)-1 for valid indexing"
+      error_type: execution
+    - id: obo-004
+      location: "line 35"
+      description: "While condition should be <= for inclusive search"
+      error_type: execution
+  low:
+    - id: obo-005
+      location: "line 47"
+      description: "Range excludes end but docstring says inclusive"
+      error_type: execution
+scoring:
+  detection:
+    weight: 50
+    criteria: "Finding all 5 off-by-one bugs"
+  fix_quality:
+    weight: 30
+    criteria: "Providing correct fixes"
+  explanation:
+    weight: 20
+    criteria: "Explaining why each bug causes problems"

package/scenarios/debugging/race-condition.yaml ADDED Viewed

@@ -0,0 +1,180 @@
+---
+# Scenario: Race Conditions
+# Category: debugging
+# Difficulty: hard
+# Error Type Focus: planning (with reasoning elements)
+id: debug-008
+name: race-condition
+title: "Race Conditions: The Timing Time Bomb"
+category: debugging
+difficulty: hard
+version: "1.0"
+description: |
+  A caching service with multiple race conditions and timing issues.
+  Tests detection of concurrent access problems and TOCTOU vulnerabilities.
+purpose: |
+  This scenario tests detection of planning-level bugs in concurrent code.
+  The issues only manifest under specific timing conditions. Agents must
+  reason about interleaved execution to find these bugs.
+prompt: |
+  INCIDENT REPORT
+  Service: cache-service
+  Severity: P0
+  Status: Data corruption under high load
+  Under heavy concurrent access, the cache exhibits:
+  - Stale data served despite updates
+  - Duplicate entries created
+  - Inconsistent state between checks and actions
+  Your task:
+  1. Identify race conditions and timing vulnerabilities
+  2. Explain the interleaved execution that causes each bug
+  3. Implement thread-safe alternatives
+  There are 6 known issues. How many can you find?
+code:
+  language: go
+  filename: cache_service.go
+  content: |
+    package cache
+    import (
+        "sync"
+        "time"
+    )
+    type CacheEntry struct {
+        Value     interface{}
+        ExpiresAt time.Time
+    }
+    type CacheService struct {
+        data    map[string]CacheEntry
+        mu      sync.Mutex
+        hits    int
+        misses  int
+    }
+    func NewCacheService() *CacheService {
+        return &CacheService{
+            data: make(map[string]CacheEntry),
+        }
+    }
+    func (c *CacheService) Get(key string) (interface{}, bool) {
+        c.mu.Lock()
+        entry, exists := c.data[key]
+        c.mu.Unlock()
+        // Bug: TOCTOU - entry might be modified after unlock
+        if !exists {
+            c.misses++  // Bug: Not protected by mutex
+            return nil, false
+        }
+        // Bug: Checking expiry after releasing lock
+        if time.Now().After(entry.ExpiresAt) {
+            c.Delete(key)  // Another goroutine might have updated it
+            c.misses++
+            return nil, false
+        }
+        c.hits++  // Bug: Not protected by mutex
+        return entry.Value, true
+    }
+    func (c *CacheService) Set(key string, value interface{}, ttl time.Duration) {
+        c.mu.Lock()
+        c.data[key] = CacheEntry{
+            Value:     value,
+            ExpiresAt: time.Now().Add(ttl),
+        }
+        c.mu.Unlock()
+    }
+    func (c *CacheService) Delete(key string) {
+        c.mu.Lock()
+        delete(c.data, key)
+        c.mu.Unlock()
+    }
+    func (c *CacheService) GetOrSet(key string, generator func() interface{}, ttl time.Duration) interface{} {
+        // Bug: Check-then-act without holding lock
+        if value, exists := c.Get(key); exists {
+            return value
+        }
+        // Another goroutine might set the same key here
+        value := generator()
+        c.Set(key, value, ttl)
+        return value
+    }
+    func (c *CacheService) Increment(key string) int {
+        c.mu.Lock()
+        entry, exists := c.data[key]
+        c.mu.Unlock()
+        // Bug: Read-modify-write without holding lock
+        if !exists {
+            c.Set(key, 1, time.Hour)
+            return 1
+        }
+        newValue := entry.Value.(int) + 1
+        c.Set(key, newValue, time.Hour)
+        return newValue
+    }
+    func (c *CacheService) Stats() (int, int) {
+        // Bug: Reading hits and misses without lock - torn reads possible
+        return c.hits, c.misses
+    }
+baseline_issues:
+  critical:
+    - id: race-001
+      location: "lines 63-70"
+      description: "GetOrSet has TOCTOU - duplicate generation if concurrent calls"
+      error_type: planning
+    - id: race-002
+      location: "lines 73-84"
+      description: "Increment uses read-modify-write outside lock - lost updates"
+      error_type: planning
+  high:
+    - id: race-003
+      location: "lines 28-42"
+      description: "Expiry check after unlock - entry might be modified"
+      error_type: planning
+    - id: race-004
+      location: "lines 32, 40, 43"
+      description: "hits/misses counters modified without lock - data race"
+      error_type: planning
+  medium:
+    - id: race-005
+      location: "line 88"
+      description: "Stats returns hits/misses without lock - inconsistent read"
+      error_type: planning
+  low:
+    - id: race-006
+      location: "line 37"
+      description: "Delete after expiry check - key might have been refreshed"
+      error_type: reasoning
+scoring:
+  detection:
+    weight: 40
+    criteria: "Finding all 6 race conditions"
+  fix_quality:
+    weight: 35
+    criteria: "Implementing thread-safe code"
+  explanation:
+    weight: 25
+    criteria: "Describing the interleaved execution scenarios"

package/scenarios/debugging/resource-leak.yaml ADDED Viewed

@@ -0,0 +1,166 @@
+---
+# Scenario: Resource Leaks
+# Category: debugging
+# Difficulty: medium
+# Error Type Focus: planning (single-type)
+id: debug-005
+name: resource-leak
+title: "Resource Leaks: The Silent Memory Killer"
+category: debugging
+difficulty: medium
+version: "1.0"
+description: |
+  A file processing service that leaks resources.
+  Tests detection of resource lifecycle management issues.
+purpose: |
+  This scenario measures detection of planning-level bugs related to
+  resource lifecycle. Systematic agents will check all resource acquisitions
+  for corresponding releases. Quick agents may miss cleanup paths.
+prompt: |
+  BUG REPORT
+  Service: file-processor
+  Severity: P1
+  Status: Memory usage grows until OOM crash
+  The file processor's memory usage grows continuously:
+  - Starts at 100MB, reaches 2GB after a few hours
+  - Eventually crashes with out-of-memory error
+  - File handles seem to accumulate
+  Your task:
+  1. Find all resource leaks
+  2. Identify which resources aren't being cleaned up
+  3. Implement proper cleanup
+  There are 5 known issues. How many can you find?
+code:
+  language: go
+  filename: file_processor.go
+  content: |
+    package processor
+    import (
+        "bufio"
+        "database/sql"
+        "io"
+        "net/http"
+        "os"
+    )
+    type FileProcessor struct {
+        db *sql.DB
+    }
+    func (p *FileProcessor) ProcessFile(path string) error {
+        // Bug: File never closed
+        file, err := os.Open(path)
+        if err != nil {
+            return err
+        }
+        scanner := bufio.NewScanner(file)
+        for scanner.Scan() {
+            line := scanner.Text()
+            if err := p.saveLine(line); err != nil {
+                // Bug: File not closed on error path
+                return err
+            }
+        }
+        return nil
+    }
+    func (p *FileProcessor) saveLine(line string) error {
+        // Bug: Prepared statement never closed
+        stmt, err := p.db.Prepare("INSERT INTO lines (content) VALUES (?)")
+        if err != nil {
+            return err
+        }
+        _, err = stmt.Exec(line)
+        return err
+    }
+    func (p *FileProcessor) FetchAndProcess(url string) error {
+        resp, err := http.Get(url)
+        if err != nil {
+            return err
+        }
+        // Bug: Response body never closed
+        data, err := io.ReadAll(resp.Body)
+        if err != nil {
+            return err
+        }
+        return p.processData(data)
+    }
+    func (p *FileProcessor) processData(data []byte) error {
+        return nil
+    }
+    func (p *FileProcessor) BatchProcess(paths []string) error {
+        for _, path := range paths {
+            file, err := os.Open(path)
+            if err != nil {
+                continue  // Bug: Silently skipping errors, previously opened files not tracked
+            }
+            // Bug: Files opened in loop never closed
+            if err := p.processFileHandle(file); err != nil {
+                return err
+            }
+        }
+        return nil
+    }
+    func (p *FileProcessor) processFileHandle(f *os.File) error {
+        scanner := bufio.NewScanner(f)
+        for scanner.Scan() {
+            // Process line
+        }
+        return nil
+    }
+baseline_issues:
+  critical:
+    - id: leak-001
+      location: "line 17"
+      description: "File opened but never closed - file handle leak"
+      error_type: planning
+    - id: leak-002
+      location: "line 35"
+      description: "Prepared statement created per line but never closed"
+      error_type: planning
+  high:
+    - id: leak-003
+      location: "line 45"
+      description: "HTTP response body never closed - connection leak"
+      error_type: planning
+    - id: leak-004
+      location: "lines 63-70"
+      description: "Files in loop never closed - accumulating file handles"
+      error_type: planning
+  medium:
+    - id: leak-005
+      location: "line 25"
+      description: "Early return on error doesn't close file"
+      error_type: planning
+scoring:
+  detection:
+    weight: 45
+    criteria: "Finding all 5 resource leaks"
+  fix_quality:
+    weight: 35
+    criteria: "Implementing proper defer/cleanup patterns"
+  explanation:
+    weight: 20
+    criteria: "Explaining resource lifecycle requirements"