npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scripts/job-fair-runner.sh ADDED Viewed

@@ -0,0 +1,278 @@
+#!/usr/bin/env bash
+# Job Fair Runner
+# Runs all characters from a theme against all roles with baselines
+# Usage: ./scripts/job-fair-runner.sh <theme> [--runs N] [--roles role1,role2,...]
+set -e
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+SOLO_RUNNER="$SCRIPT_DIR/solo-runner.sh"
+THEME="$1"
+RUNS=2  # Default runs per combo
+REQUESTED_ROLES=""  # Empty = all roles
+# Print the usage text. Everything goes to stderr so stdout stays clean.
+usage() {
+    {
+        echo "Usage: $0 <theme> [--runs N] [--roles role1,role2,...]"
+        echo "  --runs N     Number of runs per combination (default: 2)"
+        echo "  --roles      Comma-separated roles to test (default: all)"
+        echo "               Available: dev,reviewer,tea,sm,architect"
+        echo "               Note: 'dev' expands to dev-codegen,dev-debug"
+    } >&2
+}
+# The theme is the mandatory first positional argument; a leading '-' means
+# the caller passed a flag where the theme belongs.
+if [[ -z "$THEME" || "$THEME" == -* ]]; then
+    usage
+    exit 1
+fi
+shift  # Remove theme from args
+# Parse flags
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --runs | --roles)
+            # Without this guard a missing value makes `shift 2` fail and,
+            # under `set -e`, the script exits without printing anything.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: $1 requires a value" >&2
+                exit 1
+            fi
+            if [[ "$1" == "--runs" ]]; then
+                RUNS="$2"
+            else
+                REQUESTED_ROLES="$2"
+            fi
+            shift 2
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+# RUNS drives the per-combination loop; anything but a positive integer
+# would otherwise yield zero runs and an empty result set.
+if [[ ! "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: --runs must be a positive integer (got: $RUNS)" >&2
+    exit 1
+fi
+THEME_FILE="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${THEME}.yaml"
+if [[ ! -f "$THEME_FILE" ]]; then
+    echo "Error: Theme not found: $THEME_FILE" >&2
+    exit 1
+fi
+# Role -> scenario lookup.
+# DEV is the only role with two sub-competencies (codegen + debug); the bare
+# 'dev' name is kept as a legacy alias for the codegen scenario.
+# Arguments: $1 - role name
+# Outputs:   the scenario name on stdout; nothing for an unrecognised role
+get_scenario() {
+    local role="$1"
+    local scenario=""
+    case "$role" in
+        dev-codegen | dev) scenario="tdd-shopping-cart" ;;
+        dev-debug) scenario="astropy-12907" ;;
+        reviewer) scenario="order-service" ;;
+        tea) scenario="payment-processor-tests" ;;
+        sm) scenario="sprint-planning-conflict" ;;
+        architect) scenario="legacy-modernization" ;;
+    esac
+    if [[ -n "$scenario" ]]; then
+        printf '%s\n' "$scenario"
+    fi
+}
+get_baseline() {
+    case "$1" in
+        dev-codegen) echo "85.8" ;;
+        dev-debug) echo "77.5" ;;
+        dev) echo "85.8" ;;  # Legacy fallback
+        reviewer) echo "78.5" ;;
+        tea) echo "72.1" ;;
+        sm) echo "80.3" ;;
+        architect) echo "87.2" ;;
+    esac
+}
+# Dev is tested twice; prints its sub-roles as one space-separated line
+get_dev_sub_roles() {
+    printf '%s\n' "dev-codegen dev-debug"
+}
+# Get main characters (the 5 core roles) before creating any output, so a
+# theme without usable agents does not leave an empty results directory behind.
+# Each line of the temp file is "<role>:<character>".
+MAIN_CHARS_FILE=$(mktemp)
+# `set -e` can abort the script on any later failure; the trap removes the
+# temp file on every exit path.
+trap 'rm -f "$MAIN_CHARS_FILE"' EXIT
+yq -r '.agents | to_entries[] | select(.key | test("^(orchestrator|sm|tea|dev|reviewer)$")) | "\(.key):\(.value.character)"' "$THEME_FILE" > "$MAIN_CHARS_FILE"
+if [[ ! -s "$MAIN_CHARS_FILE" ]]; then
+    echo "Error: no main characters found in $THEME_FILE" >&2
+    exit 1
+fi
+# Create output directory
+TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ)
+OUTPUT_DIR="$PROJECT_DIR/internal/results/job-fair/${THEME}-${TIMESTAMP}"
+mkdir -p "$OUTPUT_DIR"
+echo "=== Job Fair: $THEME ==="
+echo "Output: $OUTPUT_DIR"
+echo "Runs per combo: $RUNS"
+echo ""
+echo "### Characters"
+while IFS=: read -r role char; do
+    echo "  - $role: $char"
+done < "$MAIN_CHARS_FILE"
+echo ""
+# Touch raw results file
+touch "$OUTPUT_DIR/raw_results.txt"
+# Build role list - dev expands to dev-codegen and dev-debug
+# If --roles specified, only test those roles
+if [[ -n "$REQUESTED_ROLES" ]]; then
+    # Parse comma-separated roles
+    BASE_ROLES=${REQUESTED_ROLES//,/ }
+else
+    BASE_ROLES="dev reviewer tea sm architect"
+fi
+ROLES_TO_TEST=""
+for BASE_ROLE in $BASE_ROLES; do
+    case "$BASE_ROLE" in
+        dev)
+            # Dev has dual sub-competencies
+            ROLES_TO_TEST="$ROLES_TO_TEST dev-codegen dev-debug"
+            ;;
+        dev-codegen | dev-debug | reviewer | tea | sm | architect)
+            ROLES_TO_TEST="$ROLES_TO_TEST $BASE_ROLE"
+            ;;
+        *)
+            # A role without a scenario mapping would run the solo runner
+            # with an empty scenario and record zeros, so reject it up front.
+            echo "Error: Unknown role: $BASE_ROLE" >&2
+            echo "       Available: dev,dev-codegen,dev-debug,reviewer,tea,sm,architect" >&2
+            rm -f "$MAIN_CHARS_FILE"
+            exit 1
+            ;;
+    esac
+done
+if [[ -z "$ROLES_TO_TEST" ]]; then
+    echo "Error: --roles did not name any role" >&2
+    rm -f "$MAIN_CHARS_FILE"
+    exit 1
+fi
+echo "Roles to test: $ROLES_TO_TEST"
+# Run each role (dev-codegen and dev-debug tested separately).
+# For every character listed in MAIN_CHARS_FILE the solo runner is invoked
+# RUNS times. Each raw result is saved as run_<i>.json, and the mean per
+# character/role combination is appended to raw_results.txt as
+# "source_role:character:role:mean:count".
+for ROLE in $ROLES_TO_TEST; do
+    SCENARIO=$(get_scenario "$ROLE")
+    BASELINE=$(get_baseline "$ROLE")
+    # For dev sub-roles, use 'dev' as the actual agent role
+    case "$ROLE" in
+        dev-codegen | dev-debug) AGENT_ROLE="dev" ;;
+        *) AGENT_ROLE="$ROLE" ;;
+    esac
+    echo "### Testing: $ROLE (scenario: $SCENARIO, baseline: $BASELINE)"
+    # The character list is read on fd 3 (and the runner gets /dev/null as
+    # stdin) so the runner can never consume the remaining lines of the list.
+    while IFS=: read -r source_role char <&3; do
+        # Determine spec based on whether this is native or cross-role
+        # Use AGENT_ROLE for comparison (dev for both dev-codegen and dev-debug)
+        CROSS_ARGS=()
+        if [[ "$source_role" == "$AGENT_ROLE" ]]; then
+            # Native role - direct run
+            SPEC="$THEME:$AGENT_ROLE"
+        else
+            # Cross-role - use first word of character name for lookup
+            FIRST_NAME=$(echo "$char" | awk '{print $1}')
+            SPEC="$THEME:$FIRST_NAME"
+            CROSS_ARGS=(--as "$AGENT_ROLE")
+        fi
+        CHAR_SLUG=$(echo "$char" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g')
+        RUN_DIR="$OUTPUT_DIR/runs/$ROLE/$CHAR_SLUG"
+        mkdir -p "$RUN_DIR"
+        SUM=0
+        COUNT=0
+        for ((i = 1; i <= RUNS; i++)); do
+            printf '  %s (%s) -> %s [%s/%s]: ' "$char" "$source_role" "$ROLE" "$i" "$RUNS"
+            # A failed run is recorded as the fallback document alone, so
+            # partial runner output can never be glued in front of it.
+            if ! RESULT=$("$SOLO_RUNNER" "$SPEC" "$SCENARIO" "$RUN_DIR" "${CROSS_ARGS[@]}" 2>/dev/null </dev/null); then
+                RESULT='{"success":false,"score":0}'
+            fi
+            # Output that is not valid JSON must not abort the whole job fair
+            # under `set -e`; anything that is not a plain number scores 0.
+            # The raw output is still saved below for inspection.
+            SCORE=$(jq -r '.score // 0' <<<"$RESULT" 2>/dev/null) || SCORE=""
+            SCORE=${SCORE%%$'\n'*}
+            if [[ ! "$SCORE" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+                SCORE=0
+            fi
+            echo "$SCORE"
+            # Save individual result
+            echo "$RESULT" > "$RUN_DIR/run_$i.json"
+            # Accumulate scores
+            SUM=$(echo "$SUM + $SCORE" | bc)
+            COUNT=$((COUNT + 1))
+            # Brief pause between runs
+            sleep 1
+        done
+        # Calculate mean for this character-role combo
+        if [[ $COUNT -gt 0 ]]; then
+            MEAN=$(echo "scale=2; $SUM / $COUNT" | bc)
+            echo "  -> Mean: $MEAN"
+            # Store in results file
+            echo "$source_role:$char:$ROLE:$MEAN:$COUNT" >> "$OUTPUT_DIR/raw_results.txt"
+        fi
+    done 3< "$MAIN_CHARS_FILE"
+    echo ""
+done
+# Clean up temp file
+rm -f "$MAIN_CHARS_FILE"
+# Generate summary
+echo "### Generating summary..."
+cat > "$OUTPUT_DIR/summary.yaml" << EOF
+theme: $THEME
+timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)
+runs_per_combo: $RUNS
+mode: full
+# Dev has dual sub-competency testing (only role with 2 scenarios)
+scenarios:
+  dev-codegen: $(get_scenario dev-codegen)
+  dev-debug: $(get_scenario dev-debug)
+  reviewer: $(get_scenario reviewer)
+  tea: $(get_scenario tea)
+  sm: $(get_scenario sm)
+  architect: $(get_scenario architect)
+baselines:
+  dev-codegen: {mean: $(get_baseline dev-codegen), std: 7.30, n: 10}
+  dev-debug: {mean: $(get_baseline dev-debug), std: 8.54, n: 10}
+  reviewer: {mean: $(get_baseline reviewer), std: 1.8, n: 10}
+  tea: {mean: $(get_baseline tea), std: 2.3, n: 10}
+  sm: {mean: $(get_baseline sm), std: 1.9, n: 10}
+  architect: {mean: $(get_baseline architect), std: 3.25, n: 10}
+EOF
+# Parse raw results and add to summary
+if [[ -s "$OUTPUT_DIR/raw_results.txt" ]]; then
+    echo "" >> "$OUTPUT_DIR/summary.yaml"
+    echo "matrix:" >> "$OUTPUT_DIR/summary.yaml"
+    # Group by source role
+    CURRENT_CHAR=""
+    while IFS=: read -r src_role char role mean n; do
+        CHAR_KEY=$(echo "$char" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/_/g')
+        if [[ "$CHAR_KEY" != "$CURRENT_CHAR" ]]; then
+            if [[ -n "$CURRENT_CHAR" ]]; then
+                echo "" >> "$OUTPUT_DIR/summary.yaml"
+            fi
+            echo "  $CHAR_KEY:" >> "$OUTPUT_DIR/summary.yaml"
+            CURRENT_CHAR="$CHAR_KEY"
+        fi
+        echo "    $role: {mean: $mean, n: $n}" >> "$OUTPUT_DIR/summary.yaml"
+    done < "$OUTPUT_DIR/raw_results.txt"
+fi
+echo ""
+echo "=== Job Fair Complete ==="
+echo "Results: $OUTPUT_DIR/summary.yaml"
+# Update manifest
+MANIFEST_FILE="$PROJECT_DIR/internal/results/job-fair/manifest.yaml"
+if [[ -f "$MANIFEST_FILE" ]]; then
+    echo ""
+    echo "### Updating manifest..."
+    # Check if theme already in manifest
+    if grep -q "theme: $THEME$" "$MANIFEST_FILE"; then
+        echo "Theme '$THEME' already in manifest"
+    else
+        # Append new entry before the "# Themes not yet run" comment
+        ENTRY="  - theme: $THEME
+    timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)
+    has_raw_data: true
+    runs_per_combo: $RUNS
+"
+        # Insert before the comment line
+        sed -i.bak "/^# Themes not yet run/i\\
+$ENTRY" "$MANIFEST_FILE" && rm -f "${MANIFEST_FILE}.bak"
+        echo "Added '$THEME' to manifest"
+    fi
+fi

package/scripts/job-fair-status.sh ADDED Viewed

@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# Job Fair Status - Accurate coverage detection
+# Usage: ./scripts/job-fair-status.sh [--verbose]
+set -e
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+THEMES_DIR="$PROJECT_DIR/pennyfarthing-dist/personas/themes"
+RESULTS_DIR="$PROJECT_DIR/internal/results/job-fair"
+VERBOSE=false
+[[ "$1" == "--verbose" || "$1" == "-v" ]] && VERBOSE=true
+# Get all theme names
+all_themes=$(ls "$THEMES_DIR"/*.yaml 2>/dev/null | xargs -n1 basename | sed 's/\.yaml$//' | sort)
+total_themes=$(echo "$all_themes" | wc -l | tr -d ' ')
+# Sort every theme into one of three buckets:
+#   with data    - some result directory has a summary.yaml with a matrix: section
+#   partial      - a result directory exists, but none has a matrix
+#   without data - no result directory at all
+# NOTE(review): the glob matches any directory whose name merely starts with
+# the theme name, so a theme that is a prefix of another theme's name would
+# also pick up that theme's directories - confirm against the naming scheme.
+themes_with_data=()
+themes_without_data=()
+themes_partial=()
+for theme in $all_themes; do
+  state="missing"
+  for result_dir in "$RESULTS_DIR"/${theme}*/; do
+    [[ -d "$result_dir" ]] || continue
+    state="partial"
+    summary_path="${result_dir}summary.yaml"
+    if [[ -f "$summary_path" ]] && grep -q "^matrix:" "$summary_path" 2>/dev/null; then
+      state="complete"
+      break
+    fi
+  done
+  case "$state" in
+    complete) themes_with_data+=("$theme") ;;
+    partial) themes_partial+=("$theme") ;;
+    *) themes_without_data+=("$theme") ;;
+  esac
+done
+# Print the coverage report
+covered=${#themes_with_data[@]}
+partial=${#themes_partial[@]}
+pending=${#themes_without_data[@]}
+percent=$(( covered * 100 / total_themes ))
+cat <<EOF
+=== Job Fair Coverage Status ===
+Total themes: $total_themes
+With data:    $covered (summary.yaml with matrix)
+Partial:      $partial (directory exists, no matrix)
+Not started:  $pending
+
+Coverage: $covered/$total_themes ($percent%)
+EOF
+# --verbose additionally lists the themes that still need work
+if [[ "$VERBOSE" == true ]]; then
+  if (( partial > 0 )); then
+    printf '\n%s\n' "### Partial (need consolidation or re-run):"
+    printf '  - %s\n' "${themes_partial[@]}"
+  fi
+  if (( pending > 0 )); then
+    printf '\n%s\n' "### Not Started:"
+    printf '  - %s\n' "${themes_without_data[@]}"
+  fi
+fi
+# Exit status reflects coverage: 0 only when every theme has data
+if (( covered != total_themes )); then
+  exit 1
+fi
+echo ""
+echo "✓ Full coverage achieved!"
+exit 0

package/scripts/job-fair-watcher-v2.sh ADDED Viewed

@@ -0,0 +1,38 @@
+#!/bin/bash
+# Watch job fairs - start batch 2 only (skip simpsons)
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+echo "Job Fair Watcher v2 started at $(date)"
+echo "Will run: discworld, snow-crash (skipping the-simpsons)"
+# Succeeds when a process whose command line matches
+# "job-fair-batch.sh <theme>" exists.
+# Arguments: $1 - theme name
+is_running() {
+    local theme="$1"
+    pgrep -f "job-fair-batch.sh ${theme}" >/dev/null 2>&1
+}
+# Launch job-fair-batch.sh for one theme in the background, detached with
+# nohup; its output goes to /tmp/job-fair-<theme>.log.
+# Arguments: $1 - theme name
+start_theme() {
+    local theme="$1"
+    local log_file="/tmp/job-fair-${theme}.log"
+    printf '%s: Starting %s\n' "$(date)" "$theme"
+    nohup "$SCRIPT_DIR/job-fair-batch.sh" "$theme" 4 > "$log_file" 2>&1 &
+    printf '%s: %s PID: %s\n' "$(date)" "$theme" "$!"
+}
+# Block until none of the given themes has a running batch job, polling
+# every 60 seconds. Themes are checked in order; the first running one
+# ends the check for that round.
+wait_for_themes() {
+    local theme busy
+    while true; do
+        busy=false
+        for theme in "$@"; do
+            if is_running "$theme"; then
+                busy=true
+                break
+            fi
+        done
+        if [[ "$busy" == false ]]; then
+            return 0
+        fi
+        sleep 60
+    done
+}
+# Batch 1 was started elsewhere; just wait for it
+wait_for_themes "arthurian-mythos" "greek-mythology"
+printf '%s: Batch 1 complete\n' "$(date)"
+# Batch 2 is the only batch this watcher launches
+for batch_theme in "discworld" "snow-crash"; do
+    start_theme "$batch_theme"
+done
+wait_for_themes "discworld" "snow-crash"
+printf '%s: Batch 2 complete - ALL DONE (skipped simpsons)\n' "$(date)"
+# Final summary (errors from the progress script are discarded)
+echo ""
+"$SCRIPT_DIR/job-fair-progress.sh" 2>/dev/null

package/scripts/job-fair-watcher.sh ADDED Viewed

@@ -0,0 +1,50 @@
+#!/bin/bash
+# Job fair watcher: starts the next batch as soon as the current one is done
+# Usage: ./scripts/job-fair-watcher.sh &
+script_location=$(dirname "$0")
+SCRIPT_DIR="$(cd "$script_location" && pwd)"
+printf 'Job Fair Watcher started at %s\n' "$(date)"
+# Succeeds when a process whose command line matches
+# "job-fair-batch.sh <theme>" exists.
+# Arguments: $1 - theme name
+is_running() {
+    local theme="$1"
+    pgrep -f "job-fair-batch.sh ${theme}" >/dev/null 2>&1
+}
+# Launch job-fair-batch.sh for one theme in the background, detached with
+# nohup; its output goes to /tmp/job-fair-<theme>.log.
+# Arguments: $1 - theme name
+start_theme() {
+    local theme="$1"
+    local log_file="/tmp/job-fair-${theme}.log"
+    printf '%s: Starting %s\n' "$(date)" "$theme"
+    nohup "$SCRIPT_DIR/job-fair-batch.sh" "$theme" 4 > "$log_file" 2>&1 &
+    printf '%s: %s PID: %s\n' "$(date)" "$theme" "$!"
+}
+# Block until none of the given themes has a running batch job, polling
+# every 60 seconds. Themes are checked in order; the first running one
+# ends the check for that round.
+wait_for_themes() {
+    local theme busy
+    while true; do
+        busy=false
+        for theme in "$@"; do
+            if is_running "$theme"; then
+                busy=true
+                break
+            fi
+        done
+        if [[ "$busy" == false ]]; then
+            return 0
+        fi
+        sleep 60
+    done
+}
+# Batch 1 was started elsewhere; just wait for it
+wait_for_themes "arthurian-mythos" "greek-mythology"
+printf '%s: Batch 1 complete\n' "$(date)"
+# Batch 2
+for batch_theme in "discworld" "snow-crash"; do
+    start_theme "$batch_theme"
+done
+wait_for_themes "discworld" "snow-crash"
+printf '%s: Batch 2 complete\n' "$(date)"
+# Batch 3
+start_theme "the-simpsons"
+wait_for_themes "the-simpsons"
+printf '%s: Batch 3 complete - ALL DONE!\n' "$(date)"
+# Final summary
+echo ""
+echo "=== Final Results ==="
+"$SCRIPT_DIR/job-fair-progress.sh"

package/scripts/parallel-benchmark.sh ADDED Viewed

@@ -0,0 +1,140 @@
+#!/bin/bash
+# Parallel Benchmark Runner
+# Runs multiple themes in parallel with staggered starts to avoid rate limiting
+# Usage: ./scripts/parallel-benchmark.sh <scenario> <theme1:agent> <theme2:agent> ... [--stagger N]
+set -e
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SOLO_RUNNER="$SCRIPT_DIR/solo-runner.sh"
+if [[ ! -x "$SOLO_RUNNER" ]]; then
+    echo "Error: solo-runner.sh not found or not executable" >&2
+    exit 1
+fi
+# Parse arguments: the first positional argument is the scenario (unless
+# given via --scenario), every further positional argument is a theme:agent spec
+SCENARIO=""
+STAGGER=3  # Default 3 second stagger
+SPECS=()
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --stagger | --scenario)
+            # Without this guard a missing value makes `shift 2` fail and,
+            # under `set -e`, the script exits without printing anything.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: $1 requires a value" >&2
+                exit 1
+            fi
+            if [[ "$1" == "--stagger" ]]; then
+                STAGGER="$2"
+            else
+                SCENARIO="$2"
+            fi
+            shift 2
+            ;;
+        *)
+            if [[ -z "$SCENARIO" ]]; then
+                SCENARIO="$1"
+            else
+                SPECS+=("$1")
+            fi
+            shift
+            ;;
+    esac
+done
+if [[ -z "$SCENARIO" || ${#SPECS[@]} -eq 0 ]]; then
+    echo "Usage: $0 <scenario> <theme:agent> [theme:agent ...] [--stagger N]" >&2
+    echo "" >&2
+    echo "Examples:" >&2
+    echo "  $0 race-condition-cache breaking-bad:dev firefly:dev the-wire:dev" >&2
+    echo "  $0 security-review discworld:reviewer west-wing:reviewer --stagger 5" >&2
+    exit 1
+fi
+# Reject a bad stagger now: it is first used by `sleep` after the first run
+# is already in the background, where a failure would abort the script and
+# leave that run orphaned.
+if [[ ! "$STAGGER" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
+    echo "Error: --stagger must be a non-negative number of seconds (got: $STAGGER)" >&2
+    exit 1
+fi
+# Create output directory
+TIMESTAMP=$(date +%Y%m%d-%H%M%S)
+OUTPUT_BASE="/tmp/parallel-benchmark-${SCENARIO}-${TIMESTAMP}"
+mkdir -p "$OUTPUT_BASE"
+echo "=== Parallel Benchmark ==="
+echo "Scenario: $SCENARIO"
+echo "Contestants: ${SPECS[*]}"
+echo "Stagger: ${STAGGER}s"
+echo "Output: $OUTPUT_BASE"
+echo ""
+# Launch all runs with staggered starts
+PIDS=()
+for spec in "${SPECS[@]}"; do
+    theme="${spec%%:*}"
+    mkdir -p "$OUTPUT_BASE/$theme"
+    echo "Starting: $spec"
+    "$SOLO_RUNNER" "$spec" "$SCENARIO" "$OUTPUT_BASE/$theme" > "$OUTPUT_BASE/$theme/stdout.txt" 2>&1 &
+    PIDS+=($!)
+    # Stagger next start
+    if [[ ${#PIDS[@]} -lt ${#SPECS[@]} ]]; then
+        sleep "$STAGGER"
+    fi
+done
+echo ""
+echo "Waiting for ${#PIDS[@]} runs to complete..."
+# Wait for all processes
+FAILED=0
+for i in "${!PIDS[@]}"; do
+    pid="${PIDS[$i]}"
+    spec="${SPECS[$i]}"
+    if wait "$pid"; then
+        echo "  [OK] $spec"
+    else
+        echo "  [FAIL] $spec"
+        FAILED=$((FAILED + 1))
+    fi
+done
+echo ""
+echo "=== Results ==="
+# Collect and display results: one table row per spec, scraped from the
+# captured output of its run. Specs with a score are also recorded in
+# RESULTS as "<spec>:<score>".
+RESULTS=()
+for spec in "${SPECS[@]}"; do
+    theme="${spec%%:*}"
+    result_file="$OUTPUT_BASE/$theme/stdout.txt"
+    if [[ -f "$result_file" ]]; then
+        # First "score" field in the output. Whitespace after the colon is
+        # optional so compact JSON matches too, and only a well-formed
+        # number is accepted because the value is later fed to bc.
+        score=$(grep -oE '"score":[[:space:]]*[0-9]+(\.[0-9]+)?' "$result_file" \
+            | head -n 1 \
+            | grep -oE '[0-9]+(\.[0-9]+)?$' || true)
+        [[ -n "$score" ]] || score="N/A"
+        # The pipeline ends in sed, which succeeds even without input, so a
+        # `|| echo` fallback could never fire; test for an empty result.
+        character=$(grep -oE '"character":[[:space:]]*"[^"]*"' "$result_file" \
+            | head -n 1 \
+            | sed 's/^"character":[[:space:]]*"\(.*\)"$/\1/')
+        [[ -n "$character" ]] || character="Unknown"
+        if [[ "$score" != "N/A" ]]; then
+            printf "%-30s %-25s %s\n" "$spec" "$character" "$score"
+            RESULTS+=("$spec:$score")
+        else
+            printf "%-30s %-25s %s\n" "$spec" "$character" "FAILED"
+            # Show error
+            tail -5 "$result_file" 2>/dev/null
+        fi
+    else
+        printf "%-30s %-25s %s\n" "$spec" "Unknown" "NO OUTPUT"
+    fi
+done
+echo ""
+echo "=== Summary ==="
+echo "Total: ${#SPECS[@]}"
+echo "Succeeded: $((${#SPECS[@]} - FAILED))"
+echo "Failed: $FAILED"
+echo "Output: $OUTPUT_BASE"
+# Calculate mean if we have results
+if [[ ${#RESULTS[@]} -gt 0 ]]; then
+    total=0
+    count=0
+    for result in "${RESULTS[@]}"; do
+        score="${result##*:}"
+        total=$(echo "$total + $score" | bc)
+        count=$((count + 1))
+    done
+    mean=$(echo "scale=2; $total / $count" | bc)
+    echo "Mean Score: $mean"
+fi