@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
#!/bin/bash
# benchmark-runner.sh - thin shell entry point for benchmark-runner.js
#
# Exists so callers (including the tests) can invoke a .sh file; every
# argument is forwarded untouched to the Node.js implementation that lives
# next to this script.

# Resolve the directory containing this script to an absolute path.
SCRIPT_DIR="$(dirname "$0")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"

# Replace this shell with node so the exit status is node's own.
exec node "${SCRIPT_DIR}/benchmark-runner.js" "$@"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Consolidate Job Fair Results
# Merges multiple role-specific runs for the same theme into one summary
# Usage: ./scripts/consolidate-job-fair.sh [theme1 theme2 ...]
#
# Reads:  $RESULTS_DIR/<theme>-20*/raw_results.txt
#         Colon-separated records; field 2 is used as the character, field 3
#         as the role, fields 4 and 5 as mean and n.
# Writes: $RESULTS_DIR/consolidated/<theme>/raw_results.txt (concatenation)
#         $RESULTS_DIR/consolidated/<theme>/summary.yaml

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
RESULTS_DIR="$PROJECT_DIR/internal/results/job-fair"
CONSOLIDATED_DIR="$RESULTS_DIR/consolidated"

mkdir -p "$CONSOLIDATED_DIR"

# If themes specified, use those; otherwise find all themes.
# Themes are kept in an array so each name stays a single word.
THEMES=()
if [[ $# -gt 0 ]]; then
  THEMES=("$@")
else
  # Derive unique theme names by stripping the timestamp suffix from each
  # run directory name (both <theme>-20...T...Z and <theme>-20...-... forms).
  while IFS= read -r theme_name; do
    if [[ -n "$theme_name" ]]; then
      THEMES+=("$theme_name")
    fi
  done < <(
    for run_dir in "$RESULTS_DIR"/*-20*; do
      [[ -d "$run_dir" ]] || continue   # unmatched glob stays literal
      basename "$run_dir"
    done \
      | sed 's/-20[0-9]*T[0-9]*Z$//' \
      | sed 's/-20[0-9]*-[0-9]*$//' \
      | sort -u
  )
fi

echo "=== Consolidating Job Fair Results ==="
echo ""

for THEME in "${THEMES[@]}"; do
  echo "### Processing: $THEME"

  # Find all directories for this theme (glob expansion is already sorted)
  THEME_DIRS=()
  for run_dir in "$RESULTS_DIR/${THEME}"-20*; do
    [[ -e "$run_dir" ]] || continue
    THEME_DIRS+=("$run_dir")
  done

  if [[ ${#THEME_DIRS[@]} -eq 0 ]]; then
    echo " [SKIP] No results found for $THEME"
    continue
  fi

  # Create consolidated directory for this theme
  THEME_OUT="$CONSOLIDATED_DIR/$THEME"
  mkdir -p "$THEME_OUT"

  # Merge all raw_results.txt files (start from an empty file each time)
  : > "$THEME_OUT/raw_results.txt"

  for DIR in "${THEME_DIRS[@]}"; do
    if [[ -f "$DIR/raw_results.txt" ]]; then
      cat "$DIR/raw_results.txt" >> "$THEME_OUT/raw_results.txt"
    fi
  done

  # Count unique roles collected
  ROLES=$(cut -d: -f3 "$THEME_OUT/raw_results.txt" 2>/dev/null | sort -u | tr '\n' ' ')
  ROLE_COUNT=$(echo "$ROLES" | wc -w | tr -d ' ')

  echo " Merged $ROLE_COUNT roles: $ROLES"

  # Generate consolidated summary
  TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # Child keys are indented so that scenarios/baselines are real mappings;
  # without nesting the role keys would collide at the top level.
  cat > "$THEME_OUT/summary.yaml" << EOF
# Consolidated Job Fair Results
theme: $THEME
consolidated_at: $TIMESTAMP
source_runs: ${#THEME_DIRS[@]}

scenarios:
  dev-codegen: tdd-shopping-cart
  dev-debug: astropy-12907
  reviewer: order-service
  tea: payment-processor-tests
  sm: sprint-planning-conflict
  architect: legacy-modernization

baselines:
  dev-codegen: {mean: 85.8, std: 7.30, n: 10}
  dev-debug: {mean: 77.5, std: 8.54, n: 10}
  reviewer: {mean: 78.5, std: 1.8, n: 10}
  tea: {mean: 72.1, std: 2.3, n: 10}
  sm: {mean: 80.3, std: 1.9, n: 10}
  architect: {mean: 87.2, std: 3.25, n: 10}

EOF

  # Parse raw results and build matrix
  if [[ -s "$THEME_OUT/raw_results.txt" ]]; then
    {
      echo ""
      echo "matrix:"
    } >> "$THEME_OUT/summary.yaml"

    # Get unique characters (handle names with spaces)
    cut -d: -f2 "$THEME_OUT/raw_results.txt" | sort -u | while IFS= read -r CHAR; do
      [[ -z "$CHAR" ]] && continue
      CHAR_KEY=$(echo "$CHAR" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_//;s/_$//')
      echo "  $CHAR_KEY:" >> "$THEME_OUT/summary.yaml"

      # Select this character's records by exact string comparison on
      # field 2. The name is passed through the environment so it is never
      # interpreted as a regex or as awk source, and appending "" forces a
      # string (not numeric) comparison.
      CHAR="$CHAR" awk -F: '($2 "") == (ENVIRON["CHAR"] "")' "$THEME_OUT/raw_results.txt" \
        | while IFS=: read -r _src_role _char role mean n; do
          echo "    $role: {mean: $mean, n: $n}" >> "$THEME_OUT/summary.yaml"
        done
    done
  fi

  echo " -> $THEME_OUT/summary.yaml"
done

echo ""
echo "=== Consolidation Complete ==="
echo "Results in: $CONSOLIDATED_DIR/"
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
#!/bin/bash
# Convert job-fair results to benchmark format for showcase
#
# Reads:  internal/results/job-fair/{theme}-*/runs/{role}/{character}/run_*.json
# Writes: internal/results/benchmarks/{scenario}/{theme}-{role}/summary.yaml

set -e

# Locate the project root relative to this script (three levels up).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"

# Result trees, all under internal/results.
RESULTS_ROOT="$PROJECT_DIR/internal/results"
JOBFAIR_DIR="$RESULTS_ROOT/job-fair"
BENCHMARKS_DIR="$RESULTS_ROOT/benchmarks"
BASELINES_DIR="$RESULTS_ROOT/baselines"

# Scenario mappings (role -> scenario)
declare -A ROLE_SCENARIO=(
  [dev-codegen]="tdd-shopping-cart"
  [dev-debug]="astropy-12907"
  [reviewer]="order-service"
  [tea]="payment-processor-tests"
  [sm]="sprint-planning-conflict"
  [architect]="legacy-modernization"
)

# Scenario metadata: human-readable title per scenario
declare -A SCENARIO_TITLE=(
  [tdd-shopping-cart]="TDD Shopping Cart Implementation"
  [astropy-12907]="Astropy Issue #12907 Debug"
  [order-service]="Order Service Code Review"
  [payment-processor-tests]="Payment Processor Test Suite"
  [sprint-planning-conflict]="Sprint Planning Conflict Resolution"
  [legacy-modernization]="Legacy System Modernization"
)

# Scenario metadata: category per scenario
declare -A SCENARIO_CATEGORY=(
  [tdd-shopping-cart]="dev"
  [astropy-12907]="dev"
  [order-service]="reviewer"
  [payment-processor-tests]="tea"
  [sprint-planning-conflict]="sm"
  [legacy-modernization]="architect"
)
|
|
41
|
+
|
|
42
|
+
#######################################
# Get baseline stats for a scenario/role.
# Globals:   BASELINES_DIR (read)
# Arguments: $1 - scenario name
#            $2 - role name
# Outputs:   "<mean>:<std_dev>" on stdout, taken from the first `mean:` and
#            first `std_dev:` keys in
#            $BASELINES_DIR/<scenario>/<role>/summary.yaml. Prints an empty
#            line when the file is missing or has no `mean:` key, so callers
#            can test the result with -n before doing arithmetic on it.
#######################################
get_baseline() {
  local scenario=$1
  local role=$2
  local baseline_file="$BASELINES_DIR/$scenario/$role/summary.yaml"
  local stats=""

  if [[ -f "$baseline_file" ]]; then
    # Match the key exactly: a substring search for "mean:" would also hit
    # keys such as control_mean: and return the wrong number.
    stats=$(awk '
      !have_mean && $1 == "mean:"    { mean = $2; have_mean = 1 }
      !have_std  && $1 == "std_dev:" { std = $2;  have_std = 1 }
      END { if (have_mean && mean != "") printf "%s:%s", mean, std }
    ' "$baseline_file") || stats=""
  fi

  echo "$stats"
}
|
|
56
|
+
|
|
57
|
+
#######################################
# Process a single theme: for each tracked role, collect the scores of every
# job-fair run file, compute summary statistics and write a benchmark-format
# summary.yaml.
# Globals:   JOBFAIR_DIR, BENCHMARKS_DIR, PROJECT_DIR (read)
#            ROLE_SCENARIO, SCENARIO_TITLE, SCENARIO_CATEGORY (read)
# Arguments: $1 - theme name
# Outputs:   progress lines on stdout; writes
#            $BENCHMARKS_DIR/<scenario>/<theme>-<role>/summary.yaml
# Requires:  find, python3, bc; yq when the theme persona file exists
#######################################
process_theme() {
  local theme=$1
  echo "Processing theme: $theme"

  # Find all job-fair run directories for this theme. NUL-delimited so that
  # paths containing whitespace stay intact.
  local -a run_dirs=()
  local run_dir
  while IFS= read -r -d '' run_dir; do
    run_dirs+=("$run_dir")
  done < <(find "$JOBFAIR_DIR" -maxdepth 1 -type d -name "${theme}-*" -print0 2>/dev/null)

  if [[ ${#run_dirs[@]} -eq 0 ]]; then
    echo " No run directories found for $theme"
    return
  fi

  local theme_file="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${theme}.yaml"
  # Only plain decimal numbers are handed to bc; anything else (e.g. the
  # text Python prints for a JSON null) is skipped.
  local numeric_re='^-?[0-9]+([.][0-9]+)?$'
  # The run file path is passed as an argument, never spliced into the
  # Python source, so quotes in a path cannot break or inject code.
  local extract_py='import json, sys; d = json.load(open(sys.argv[1])); print(d.get("score", 0), d.get("character", ""), sep="\t")'

  # For each role we track
  local role
  for role in dev-codegen dev-debug reviewer tea sm architect; do
    local scenario="${ROLE_SCENARIO[$role]}"

    # Run directories are looked up under the full role name.
    # NOTE(review): an earlier comment here said dev-codegen/dev-debug map to
    # "dev" for directory lookup, but the code never did that — confirm which
    # layout the runner actually produces.
    local dir_role="$role"

    # Theme lookup, baseline lookup and output naming collapse both dev
    # variants to "dev".
    local lookup_role="$role"
    if [[ "$role" == "dev-codegen" || "$role" == "dev-debug" ]]; then
      lookup_role="dev"
    fi

    # Find the native character for this role from the theme
    local native_char=""
    if [[ -f "$theme_file" ]]; then
      native_char=$(yq ".agents.${lookup_role}.character // \"\"" "$theme_file" 2>/dev/null | tr -d '"')
    fi

    # Collect all scores for this theme/role
    local -a scores=()
    local first_char=""
    local role_dir char_dir run_file score char

    for run_dir in "${run_dirs[@]}"; do
      role_dir="$run_dir/runs/$dir_role"
      [[ -d "$role_dir" ]] || continue

      # For each character directory
      for char_dir in "$role_dir"/*/; do
        [[ -d "$char_dir" ]] || continue

        # Read run files
        for run_file in "$char_dir"/run_*.json; do
          [[ -f "$run_file" ]] || continue

          # One interpreter start per file; unreadable or invalid JSON
          # leaves both fields empty and the file is skipped.
          score=""
          char=""
          IFS=$'\t' read -r score char \
            < <(python3 -c "$extract_py" "$run_file" 2>/dev/null) || true

          # A score of exactly 0 is treated as "no result", as before.
          if [[ "$score" =~ $numeric_re && "$score" != "0" ]]; then
            scores+=("$score")
            if [[ -z "$first_char" ]]; then
              first_char=$char
            fi
          fi
        done
      done
    done

    # Skip if no scores found
    if [[ ${#scores[@]} -eq 0 ]]; then
      continue
    fi

    # Calculate statistics. min/max start from the first score so values
    # outside 0..100 are still reported correctly.
    local n=${#scores[@]}
    local sum=0
    local min=${scores[0]}
    local max=${scores[0]}
    local s

    for s in "${scores[@]}"; do
      sum=$(echo "$sum + $s" | bc)
      if [[ $(echo "$s < $min" | bc) -eq 1 ]]; then min=$s; fi
      if [[ $(echo "$s > $max" | bc) -eq 1 ]]; then max=$s; fi
    done

    local mean
    mean=$(echo "scale=2; $sum / $n" | bc)

    # Calculate std dev (population form: divide by n)
    local sq_sum=0
    local diff
    for s in "${scores[@]}"; do
      diff=$(echo "$s - $mean" | bc)
      sq_sum=$(echo "$sq_sum + ($diff * $diff)" | bc)
    done
    local variance std_dev
    variance=$(echo "scale=4; $sq_sum / $n" | bc)
    std_dev=$(echo "scale=2; sqrt($variance)" | bc)

    # Get baseline ("<mean>:<std>" or empty)
    local baseline_role="$lookup_role"
    local baseline
    baseline=$(get_baseline "$scenario" "$baseline_role")
    local baseline_mean=""
    local baseline_std=""
    local delta=""

    if [[ -n "$baseline" ]]; then
      baseline_mean=$(echo "$baseline" | cut -d: -f1)
      baseline_std=$(echo "$baseline" | cut -d: -f2)
      # Only subtract when a mean was actually found; an empty operand is a
      # bc syntax error.
      if [[ -n "$baseline_mean" ]]; then
        delta=$(echo "scale=2; $mean - $baseline_mean" | bc)
      fi
    fi

    # Create output directory
    local output_dir="$BENCHMARKS_DIR/$scenario/${theme}-${lookup_role}"
    mkdir -p "$output_dir"

    # Format scores array (rounded to whole numbers): "[a, b, c]"
    local scores_str
    scores_str=$(printf ", %.0f" "${scores[@]}")
    scores_str="[${scores_str:2}]"

    # Determine character (use native if available)
    local char_name="${native_char:-${first_char:-Unknown}}"

    # Write summary.yaml. Child keys are indented so they nest under their
    # section, and the title is quoted because " #" inside a plain YAML
    # scalar starts a comment (e.g. "Astropy Issue #12907 Debug").
    cat > "$output_dir/summary.yaml" << EOF
# ${theme}:${lookup_role} on ${scenario}
# Generated from job-fair data: $(date -u +%Y-%m-%dT%H:%M:%SZ)

agent:
  theme: ${theme}
  role: ${lookup_role}
  spec: ${theme}:${lookup_role}
  character: ${char_name}

scenario:
  name: ${scenario}
  title: "${SCENARIO_TITLE[$scenario]}"
  category: ${SCENARIO_CATEGORY[$scenario]}
  difficulty: medium

statistics:
  n: ${n}
  mean: ${mean}
  std_dev: ${std_dev}
  min: ${min}
  max: ${max}
  scores: ${scores_str}
EOF

    # Add baseline comparison if available
    if [[ -n "$baseline_mean" ]]; then
      cat >> "$output_dir/summary.yaml" << EOF

baseline_comparison:
  control_mean: ${baseline_mean}
  control_stddev: ${baseline_std}
  delta: ${delta}
EOF
    fi

    echo " Created: $output_dir/summary.yaml (n=$n, mean=$mean)"
  done
}
|
|
217
|
+
|
|
218
|
+
# Main: convert every theme that has a consolidated job-fair directory.
echo "Converting job-fair results to benchmark format..."
echo ""

# Get list of themes from consolidated. Globbing (instead of parsing ls
# output) keeps theme names with spaces intact and, when nothing has been
# consolidated yet, simply runs zero iterations rather than failing a
# pipeline under set -e.
for theme_dir in "$JOBFAIR_DIR/consolidated"/*/; do
  [[ -d "$theme_dir" ]] || continue   # unmatched glob stays literal
  theme=$(basename "$theme_dir")
  process_theme "$theme"
  echo ""
done

echo "Done!"
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
#!/bin/bash
# Job Fair Batch Runner
# Runs all characters in a theme against all role scenarios
# Usage: ./scripts/job-fair-batch.sh <theme> [runs_per_combo]
#
# Arguments:
#   $1 - theme name (required); must have a persona file under
#        pennyfarthing-dist/personas/themes/<theme>.yaml
#   $2 - runs per character/role combination (default 4)

set -e

THEME="$1"
RUNS="${2:-4}"

# A theme is mandatory.
[[ -n "$THEME" ]] || {
  echo "Usage: $0 <theme> [runs_per_combo]" >&2
  exit 1
}

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
PERSONA_FILE="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${THEME}.yaml"

# The theme must have a persona definition.
[[ -f "$PERSONA_FILE" ]] || {
  echo "Error: Theme not found: $THEME" >&2
  exit 1
}

# Output directory: one per invocation, named by theme and local timestamp
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_DIR="$PROJECT_DIR/internal/results/job-fair/${THEME}-${TIMESTAMP}"
mkdir -p "$OUTPUT_DIR"

# Banner goes to the terminal and starts a fresh log.txt.
{
  echo "=== Job Fair: $THEME ==="
  echo "Runs per combo: $RUNS"
  echo "Output: $OUTPUT_DIR"
} | tee "$OUTPUT_DIR/log.txt"
|
|
33
|
+
|
|
34
|
+
# Role -> Scenario mapping function (macOS bash 3.x compatible, so no
# associative arrays).
# Arguments: $1 - target role (dev, reviewer, tea, sm, architect)
# Outputs:   the scenario name on stdout; an empty line for any other role
# NOTE(review): reviewer resolves to the same scenario as dev — confirm this
# is intended.
get_scenario_for_role() {
  local wanted_role=$1
  local scenario_name=""

  if [[ "$wanted_role" == "dev" || "$wanted_role" == "reviewer" ]]; then
    scenario_name="astropy-12907"
  elif [[ "$wanted_role" == "tea" ]]; then
    scenario_name="checkout-component-tests"
  elif [[ "$wanted_role" == "sm" ]]; then
    scenario_name="dependency-deadlock"
  elif [[ "$wanted_role" == "architect" ]]; then
    scenario_name="database-selection"
  fi

  echo "$scenario_name"
}
|
|
45
|
+
|
|
46
|
+
# Get all characters from theme, one "source_role:character" pair per line
CHARACTERS=$(yq -r '.agents | to_entries[] | "\(.key):\(.value.character)"' "$PERSONA_FILE")

# Initialize results file
{
  echo "theme: $THEME"
  echo "timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "runs_per_combo: $RUNS"
  echo "results:"
} > "$OUTPUT_DIR/summary.yaml"

TOTAL_RUNS=0
TOTAL_COST=0

# For each target role
for TARGET_ROLE in dev reviewer tea sm architect; do
  SCENARIO=$(get_scenario_for_role "$TARGET_ROLE")
  echo "" | tee -a "$OUTPUT_DIR/log.txt"
  echo "--- Testing as $TARGET_ROLE (scenario: $SCENARIO) ---" | tee -a "$OUTPUT_DIR/log.txt"

  # For each character. The list is fed on fd 3 so that solo-runner.sh (and
  # anything it starts) keeps the script's own stdin and cannot swallow the
  # remaining entries.
  while IFS=: read -r SOURCE_ROLE CHARACTER <&3; do
    [[ -n "$CHARACTER" ]] || continue   # blank line / empty persona list

    echo " $CHARACTER ($SOURCE_ROLE -> $TARGET_ROLE)" | tee -a "$OUTPUT_DIR/log.txt"

    CHAR_SLUG=$(echo "$CHARACTER" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g')
    COMBO_DIR="$OUTPUT_DIR/${TARGET_ROLE}/${CHAR_SLUG}"
    mkdir -p "$COMBO_DIR"

    SCORES=""
    for ((RUN = 1; RUN <= RUNS; RUN++)); do
      printf '%s' " Run $RUN/$RUNS... " | tee -a "$OUTPUT_DIR/log.txt"

      # Run solo benchmark with cross-role
      RESULT=$("$SCRIPT_DIR/solo-runner.sh" "${THEME}:${CHARACTER}" "$SCENARIO" "$COMBO_DIR" --as "$TARGET_ROLE" 2>&1) || {
        echo "FAILED" | tee -a "$OUTPUT_DIR/log.txt"
        echo "$RESULT" >> "$OUTPUT_DIR/log.txt"
        continue
      }

      # Output that is not valid JSON counts as a failed run instead of
      # aborting the whole batch under set -e.
      if ! SCORE=$(jq -r '.score // 0' <<< "$RESULT" 2>/dev/null) \
        || ! COST=$(jq -r '.total_cost_usd // 0' <<< "$RESULT" 2>/dev/null); then
        echo "FAILED" | tee -a "$OUTPUT_DIR/log.txt"
        echo "$RESULT" >> "$OUTPUT_DIR/log.txt"
        continue
      fi
      SCORE=${SCORE:-0}
      COST=${COST:-0}

      echo "score=$SCORE cost=\$$COST" | tee -a "$OUTPUT_DIR/log.txt"

      SCORES="$SCORES $SCORE"
      TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc)
      # Plain assignment: ((TOTAL_RUNS++)) evaluates to 0 on the first run,
      # returns status 1, and set -e would stop the batch right there.
      TOTAL_RUNS=$((TOTAL_RUNS + 1))
    done

    # Calculate mean for this combo
    if [[ -n "$SCORES" ]]; then
      MEAN=$(echo "$SCORES" | tr ' ' '\n' | grep -v '^$' | awk '{sum+=$1; n++} END {if(n>0) printf "%.2f", sum/n; else print 0}')
      # Entries are indented so each one is a list item under results: with
      # its fields nested inside it.
      # NOTE(review): scores are space-separated inside [ ], which YAML reads
      # as a single string element — left as is in case a consumer relies on
      # it; confirm.
      {
        echo "  - character: \"$CHARACTER\""
        echo "    source_role: $SOURCE_ROLE"
        echo "    target_role: $TARGET_ROLE"
        echo "    mean: $MEAN"
        echo "    scores: [$SCORES ]"
      } >> "$OUTPUT_DIR/summary.yaml"
    fi

  done 3<<< "$CHARACTERS"
done

echo "" | tee -a "$OUTPUT_DIR/log.txt"
echo "=== Complete ===" | tee -a "$OUTPUT_DIR/log.txt"
echo "Total runs: $TOTAL_RUNS" | tee -a "$OUTPUT_DIR/log.txt"
echo "Total cost: \$$TOTAL_COST" | tee -a "$OUTPUT_DIR/log.txt"
echo "Results: $OUTPUT_DIR/summary.yaml" | tee -a "$OUTPUT_DIR/log.txt"

# Add totals to summary
echo "total_runs: $TOTAL_RUNS" >> "$OUTPUT_DIR/summary.yaml"
echo "total_cost_usd: $TOTAL_COST" >> "$OUTPUT_DIR/summary.yaml"

# Last line of stdout is the output directory, for callers that capture it.
echo "$OUTPUT_DIR"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#!/bin/bash
# Check job fair progress across all running themes
#
# Scans /tmp/job-fair-<theme>.log, counts the "score=" lines in each log as
# completed runs, and reports whether a job-fair-batch.sh process for that
# theme is still alive.

#######################################
# Print the theme name encoded in a log path (.../job-fair-<theme>.log).
# Only the literal "job-fair-" prefix and ".log" suffix are removed, so a
# theme that itself contains "log" is left intact.
# Arguments: $1 - log file path
#######################################
theme_from_log() {
  local name=${1##*/}
  name=${name#job-fair-}
  printf '%s\n' "${name%.log}"
}

#######################################
# Print how many lines across the given files contain "score=".
# grep -c already prints 0 (and exits 1) when nothing matches, so its
# status is ignored rather than answered with a second "0".
# Arguments: $@ - log files; unreadable ones are skipped silently
#######################################
count_scores() {
  local count=""
  if (( $# > 0 )); then
    count=$(cat -- "$@" 2>/dev/null | grep -c "score=")
  fi
  printf '%s\n' "${count:-0}"
}

echo "=== Job Fair Progress ==="
echo ""

for log in /tmp/job-fair-*.log; do
  [[ -f "$log" ]] || continue   # unmatched glob stays literal

  theme=$(theme_from_log "$log")

  # Count completed runs
  completed=$(count_scores "$log")
  total=200

  # Most recent character line and most recent score in the log
  last_char=$(grep -E "^ [A-Z]" "$log" 2>/dev/null | tail -1 | sed 's/^ *//')
  last_score=$(grep "score=" "$log" 2>/dev/null | tail -1 | grep -oE "score=[0-9.]+" | cut -d= -f2)

  # Check if still running
  if pgrep -f "job-fair-batch.sh $theme" > /dev/null 2>&1; then
    status="RUNNING"
  else
    status="DONE"
  fi

  pct=$((completed * 100 / total))
  echo "$theme: $completed/$total ($pct%) [$status]"
  if [[ -n "$last_char" ]]; then
    echo " Last: $last_char -> $last_score"
  fi
done

echo ""
echo "Total estimated: 1000 runs"
total_done=$(count_scores /tmp/job-fair-*.log)
echo "Total completed: $total_done"
|