@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,8 @@
#!/bin/bash
# benchmark-runner.sh - thin shell entry point for benchmark-runner.js
#
# Exists so that tests can invoke benchmark-runner.sh as they expect.
# No logic lives here: every argument is forwarded untouched to the
# Node.js implementation that sits next to this file.

# Resolve the directory containing this wrapper to an absolute path.
launcher_dir="$(dirname "$0")"
SCRIPT_DIR="$(cd "$launcher_dir" && pwd)"

# exec replaces this shell with node, so node's exit status is the
# wrapper's exit status.
exec node "${SCRIPT_DIR}/benchmark-runner.js" "$@"
@@ -0,0 +1,107 @@
#!/usr/bin/env bash
# Consolidate Job Fair Results
# Merges multiple role-specific runs for the same theme into one summary
# Usage: ./scripts/consolidate-job-fair.sh [theme1 theme2 ...]
#
# Reads:  $RESULTS_DIR/<theme>-20*/raw_results.txt
# Writes: $RESULTS_DIR/consolidated/<theme>/raw_results.txt
#         $RESULTS_DIR/consolidated/<theme>/summary.yaml
#
# Each raw_results.txt record is consumed as colon-separated fields in the
# order  src_role:character:role:mean:n  (positions taken from the cut/read
# calls below).

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
RESULTS_DIR="$PROJECT_DIR/internal/results/job-fair"
CONSOLIDATED_DIR="$RESULTS_DIR/consolidated"

mkdir -p "$CONSOLIDATED_DIR"

# If themes specified, use those; otherwise find all themes.
# An array keeps each argument intact instead of flattening "$@" to a string.
THEMES=()
if [[ $# -gt 0 ]]; then
  THEMES=("$@")
else
  # Derive unique theme names by stripping the timestamp suffix from every
  # "<theme>-20..." entry. Globbing replaces the former `ls` parsing.
  while IFS= read -r theme_name; do
    if [[ -n "$theme_name" ]]; then
      THEMES+=("$theme_name")
    fi
  done < <(
    for run_dir in "$RESULTS_DIR"/*-20*; do
      [[ -e "$run_dir" ]] || continue   # unmatched glob stays literal
      printf '%s\n' "${run_dir##*/}"
    done |
      sed -e 's/-20[0-9]*T[0-9]*Z$//' -e 's/-20[0-9]*-[0-9]*$//' |
      sort -u
  )
fi

echo "=== Consolidating Job Fair Results ==="
echo ""

for THEME in "${THEMES[@]}"; do
  echo "### Processing: $THEME"

  # Find all directories for this theme (glob results are already sorted)
  THEME_DIRS=()
  for run_dir in "$RESULTS_DIR/${THEME}"-20*; do
    if [[ -e "$run_dir" ]]; then
      THEME_DIRS+=("$run_dir")
    fi
  done

  if [[ ${#THEME_DIRS[@]} -eq 0 ]]; then
    echo " [SKIP] No results found for $THEME"
    continue
  fi

  # Create consolidated directory for this theme
  THEME_OUT="$CONSOLIDATED_DIR/$THEME"
  mkdir -p "$THEME_OUT"

  # Merge all raw_results.txt files (start from an empty file)
  : > "$THEME_OUT/raw_results.txt"

  for DIR in "${THEME_DIRS[@]}"; do
    if [[ -f "$DIR/raw_results.txt" ]]; then
      cat "$DIR/raw_results.txt" >> "$THEME_OUT/raw_results.txt"
    fi
  done

  # Count unique roles collected (field 3 of each record)
  ROLES=$(cut -d: -f3 "$THEME_OUT/raw_results.txt" 2>/dev/null | sort -u | tr '\n' ' ')
  ROLE_COUNT=$(echo "$ROLES" | wc -w | tr -d ' ')

  echo " Merged $ROLE_COUNT roles: $ROLES"

  # Generate consolidated summary
  TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # Nested keys are indented so that scenarios/baselines parse as mappings.
  cat > "$THEME_OUT/summary.yaml" << EOF
# Consolidated Job Fair Results
theme: $THEME
consolidated_at: $TIMESTAMP
source_runs: ${#THEME_DIRS[@]}

scenarios:
  dev-codegen: tdd-shopping-cart
  dev-debug: astropy-12907
  reviewer: order-service
  tea: payment-processor-tests
  sm: sprint-planning-conflict
  architect: legacy-modernization

baselines:
  dev-codegen: {mean: 85.8, std: 7.30, n: 10}
  dev-debug: {mean: 77.5, std: 8.54, n: 10}
  reviewer: {mean: 78.5, std: 1.8, n: 10}
  tea: {mean: 72.1, std: 2.3, n: 10}
  sm: {mean: 80.3, std: 1.9, n: 10}
  architect: {mean: 87.2, std: 3.25, n: 10}

EOF

  # Parse raw results and build matrix
  if [[ -s "$THEME_OUT/raw_results.txt" ]]; then
    {
      echo ""
      echo "matrix:"

      # Get unique characters (handle names with spaces)
      cut -d: -f2 "$THEME_OUT/raw_results.txt" | sort -u | while IFS= read -r CHAR; do
        if [[ -z "$CHAR" ]]; then
          continue
        fi
        CHAR_KEY=$(echo "$CHAR" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/_/g' | sed 's/__*/_/g' | sed 's/^_//;s/_$//')
        echo "  $CHAR_KEY:"

        # Emit every record whose character field equals this name exactly.
        # A literal comparison on field 2 replaces the earlier regex grep,
        # which interpreted the name as a pattern and matched it anywhere
        # in the line.
        while IFS=: read -r _src_role char role mean n; do
          [[ "$char" == "$CHAR" ]] || continue
          echo "    $role: {mean: $mean, n: $n}"
        done < "$THEME_OUT/raw_results.txt"
      done
    } >> "$THEME_OUT/summary.yaml"
  fi

  echo " -> $THEME_OUT/summary.yaml"
done

echo ""
echo "=== Consolidation Complete ==="
echo "Results in: $CONSOLIDATED_DIR/"
@@ -0,0 +1,230 @@
#!/bin/bash
# Convert job-fair results to benchmark format for showcase
#
# Reads: internal/results/job-fair/{theme}-*/runs/{role}/{character}/run_*.json
# Writes: internal/results/benchmarks/{scenario}/{theme}-{role}/summary.yaml

set -e

# The lookup tables below are associative arrays, which need bash 4 or newer.
# Fail with an explicit message instead of a cryptic "declare: -A: invalid
# option" when run under an older bash (e.g. the 3.x /bin/bash on macOS).
if [[ -z "${BASH_VERSINFO[0]:-}" || "${BASH_VERSINFO[0]}" -lt 4 ]]; then
  echo "Error: $0 requires bash 4 or newer (found ${BASH_VERSION:-unknown})" >&2
  exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"

JOBFAIR_DIR="$PROJECT_DIR/internal/results/job-fair"
BENCHMARKS_DIR="$PROJECT_DIR/internal/results/benchmarks"
BASELINES_DIR="$PROJECT_DIR/internal/results/baselines"

# Scenario mappings (role -> scenario)
declare -A ROLE_SCENARIO=(
  [dev-codegen]="tdd-shopping-cart"
  [dev-debug]="astropy-12907"
  [reviewer]="order-service"
  [tea]="payment-processor-tests"
  [sm]="sprint-planning-conflict"
  [architect]="legacy-modernization"
)

# Scenario metadata (scenario -> human-readable title)
declare -A SCENARIO_TITLE=(
  [tdd-shopping-cart]="TDD Shopping Cart Implementation"
  [astropy-12907]="Astropy Issue #12907 Debug"
  [order-service]="Order Service Code Review"
  [payment-processor-tests]="Payment Processor Test Suite"
  [sprint-planning-conflict]="Sprint Planning Conflict Resolution"
  [legacy-modernization]="Legacy System Modernization"
)

# Scenario metadata (scenario -> category)
declare -A SCENARIO_CATEGORY=(
  [tdd-shopping-cart]="dev"
  [astropy-12907]="dev"
  [order-service]="reviewer"
  [payment-processor-tests]="tea"
  [sprint-planning-conflict]="sm"
  [legacy-modernization]="architect"
)
41
+
#######################################
# Get baseline stats for a scenario/role.
# Globals:   BASELINES_DIR (read)
# Arguments: $1 - scenario name
#            $2 - role name
# Outputs:   "<mean>:<std_dev>" on stdout, taken from the second field of
#            the first line containing "mean:" / "std_dev:" in
#            $BASELINES_DIR/<scenario>/<role>/summary.yaml.
#            An empty line when the file is missing or holds no mean, so
#            callers testing for a non-empty result never receive ":".
# Returns:   0
#######################################
get_baseline() {
  local scenario=$1
  local role=$2
  local baseline_file="$BASELINES_DIR/$scenario/$role/summary.yaml"
  local mean=""
  local std=""

  if [[ -f "$baseline_file" ]]; then
    # Declaration and assignment are separate so a failing awk is not
    # masked by `local`; a read failure simply yields an empty value.
    mean=$(awk '/mean:/ { print $2; exit }' "$baseline_file") || mean=""
    std=$(awk '/std_dev:/ { print $2; exit }' "$baseline_file") || std=""
  fi

  if [[ -n "$mean" ]]; then
    echo "$mean:$std"
  else
    echo ""
  fi
}
56
+
# Wrap a value in double quotes for use as a YAML scalar, escaping
# backslashes and embedded double quotes.
_yaml_dq() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '"%s"' "$text"
}

#######################################
# Process a single theme: turn its job-fair run files into one benchmark
# summary.yaml per tracked role.
#
# For every role in (dev-codegen dev-debug reviewer tea sm architect) the
# function reads $JOBFAIR_DIR/<theme>-20*/runs/<role>/<character>/run_*.json,
# collects the numeric, non-"0" "score" values, computes n / mean /
# population std-dev / min / max with bc, and writes
# $BENCHMARKS_DIR/<scenario>/<theme>-<role>/summary.yaml, where dev-codegen
# and dev-debug are both written under the role name "dev". Roles without
# any usable score are skipped.
#
# Globals:   JOBFAIR_DIR, BENCHMARKS_DIR, PROJECT_DIR, ROLE_SCENARIO,
#            SCENARIO_TITLE, SCENARIO_CATEGORY (all read)
# Arguments: $1 - theme name
# Outputs:   progress messages on stdout; summary.yaml files on disk
# Requires:  python3, bc, yq (yq only when the theme file exists),
#            get_baseline
#######################################
process_theme() {
  local theme=$1
  echo "Processing theme: $theme"

  # Find all job-fair run directories for this theme. Matching on the
  # "<theme>-20<timestamp>" form keeps a theme from picking up the runs of
  # another theme whose name merely starts with the same prefix; the array
  # keeps paths containing whitespace intact.
  local -a run_dirs=()
  local candidate
  for candidate in "$JOBFAIR_DIR/${theme}"-20*/; do
    if [[ -d "$candidate" ]]; then
      run_dirs+=("${candidate%/}")
    fi
  done

  if [[ ${#run_dirs[@]} -eq 0 ]]; then
    echo " No run directories found for $theme"
    return
  fi

  # For each role we track
  local role
  for role in dev-codegen dev-debug reviewer tea sm architect; do
    local scenario="${ROLE_SCENARIO[$role]}"

    # dev-codegen and dev-debug share the "dev" persona, baseline and
    # output role; run directories are still looked up by the full role.
    local lookup_role="$role"
    if [[ "$role" == "dev-codegen" || "$role" == "dev-debug" ]]; then
      lookup_role="dev"
    fi

    # Find the native character for this role from the theme
    local theme_file="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${theme}.yaml"
    local native_char=""
    if [[ -f "$theme_file" ]]; then
      native_char=$(yq ".agents.${lookup_role}.character // \"\"" "$theme_file" 2>/dev/null | tr -d '"')
    fi

    # Collect all scores for this theme/role
    local -a scores=()
    local first_char=""
    local run_dir role_dir char_dir run_file parsed score char

    for run_dir in "${run_dirs[@]}"; do
      role_dir="$run_dir/runs/$role"
      [[ -d "$role_dir" ]] || continue

      # For each character directory
      for char_dir in "$role_dir"/*/; do
        [[ -d "$char_dir" ]] || continue

        # Read run files
        for run_file in "${char_dir%/}"/run_*.json; do
          [[ -f "$run_file" ]] || continue

          # One python call per file; the path travels as argv so quotes
          # or other special characters in it cannot break the snippet.
          parsed=$(python3 -c 'import json, sys; d = json.load(open(sys.argv[1])); print(str(d.get("score", 0)) + "\t" + str(d.get("character", "")))' "$run_file" 2>/dev/null) || continue
          IFS=$'\t' read -r score char <<< "$parsed"

          # Keep only plain decimal scores other than "0"; values such as
          # "None" would otherwise reach bc and corrupt the statistics.
          if [[ ! "$score" =~ ^-?[0-9]+([.][0-9]+)?$ || "$score" == "0" ]]; then
            continue
          fi

          scores+=("$score")
          # Remember the first non-empty character name as a fallback.
          if [[ -z "$first_char" ]]; then
            first_char=$char
          fi
        done
      done
    done

    # Skip if no scores found
    if [[ ${#scores[@]} -eq 0 ]]; then
      continue
    fi

    # Calculate statistics; min/max start from the first score so they are
    # correct for any value range.
    local n=${#scores[@]}
    local sum=0
    local min=${scores[0]}
    local max=${scores[0]}
    local s

    for s in "${scores[@]}"; do
      sum=$(echo "$sum + $s" | bc)
      if [[ $(echo "$s < $min" | bc) -eq 1 ]]; then min=$s; fi
      if [[ $(echo "$s > $max" | bc) -eq 1 ]]; then max=$s; fi
    done

    local mean
    mean=$(echo "scale=2; $sum / $n" | bc)

    # Calculate std dev (population form: divide by n)
    local sq_sum=0
    local diff
    for s in "${scores[@]}"; do
      diff=$(echo "$s - $mean" | bc)
      sq_sum=$(echo "$sq_sum + ($diff * $diff)" | bc)
    done
    local variance std_dev
    variance=$(echo "scale=4; $sq_sum / $n" | bc)
    std_dev=$(echo "scale=2; sqrt($variance)" | bc)

    # Get baseline ("<mean>:<std>" or empty)
    local baseline
    local baseline_mean=""
    local baseline_std=""
    local delta=""
    baseline=$(get_baseline "$scenario" "$lookup_role")

    if [[ -n "$baseline" ]]; then
      baseline_mean=${baseline%%:*}
      baseline_std=${baseline#*:}
    fi
    # Only compute the delta when a baseline mean is actually present.
    if [[ -n "$baseline_mean" ]]; then
      delta=$(echo "scale=2; $mean - $baseline_mean" | bc)
    fi

    # Create output directory
    local output_dir="$BENCHMARKS_DIR/$scenario/${theme}-${lookup_role}"
    mkdir -p "$output_dir"

    # Format scores array (scores are rounded to whole numbers)
    local scores_str
    scores_str=$(printf ", %.0f" "${scores[@]}")
    scores_str="[${scores_str:2}]"

    # Determine character (use native if available)
    local char_name="${native_char:-${first_char:-Unknown}}"

    # Write summary.yaml. Title and character are double-quoted because a
    # bare " #" (as in "Astropy Issue #12907 Debug") starts a YAML comment.
    cat > "$output_dir/summary.yaml" << EOF
# ${theme}:${lookup_role} on ${scenario}
# Generated from job-fair data: $(date -u +%Y-%m-%dT%H:%M:%SZ)

agent:
  theme: ${theme}
  role: ${lookup_role}
  spec: ${theme}:${lookup_role}
  character: $(_yaml_dq "$char_name")

scenario:
  name: ${scenario}
  title: $(_yaml_dq "${SCENARIO_TITLE[$scenario]}")
  category: ${SCENARIO_CATEGORY[$scenario]}
  difficulty: medium

statistics:
  n: ${n}
  mean: ${mean}
  std_dev: ${std_dev}
  min: ${min}
  max: ${max}
  scores: ${scores_str}
EOF

    # Add baseline comparison if available
    if [[ -n "$baseline_mean" ]]; then
      cat >> "$output_dir/summary.yaml" << EOF

baseline_comparison:
  control_mean: ${baseline_mean}
  control_stddev: ${baseline_std}
  delta: ${delta}
EOF
    fi

    echo " Created: $output_dir/summary.yaml (n=$n, mean=$mean)"
  done
}
217
+
# Main
echo "Converting job-fair results to benchmark format..."
echo ""

# Get list of themes from consolidated: every sub-directory of
# $JOBFAIR_DIR/consolidated is one theme. Globbing (instead of piping `ls`
# through `xargs basename`) keeps names with whitespace intact and does not
# abort the script under `set -e` when no consolidated theme exists yet.
for theme_dir in "$JOBFAIR_DIR/consolidated"/*/; do
  [[ -d "$theme_dir" ]] || continue   # unmatched glob stays literal
  theme_dir=${theme_dir%/}
  theme=${theme_dir##*/}
  process_theme "$theme"
  echo ""
done

echo "Done!"
@@ -0,0 +1,116 @@
#!/bin/bash
# Job Fair Batch Runner
# Runs all characters in a theme against all role scenarios
# Usage: ./scripts/job-fair-batch.sh <theme> [runs_per_combo]

set -e

# Positional arguments: theme name (required), runs per combination
# (optional, defaults to 4).
THEME="$1"
RUNS="${2:-4}"

[[ -n "$THEME" ]] || {
  echo "Usage: $0 <theme> [runs_per_combo]" >&2
  exit 1
}

# Locate the project root (three levels above this script) and the
# persona definition for the requested theme.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
PERSONA_FILE="${PROJECT_DIR}/pennyfarthing-dist/personas/themes/${THEME}.yaml"

if ! [[ -f "$PERSONA_FILE" ]]; then
  echo "Error: Theme not found: $THEME" >&2
  exit 1
fi

# Output directory: one timestamped directory per invocation
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_DIR="${PROJECT_DIR}/internal/results/job-fair/${THEME}-${TIMESTAMP}"
mkdir -p "$OUTPUT_DIR"

# Banner goes to the terminal and starts a fresh log.txt.
{
  echo "=== Job Fair: $THEME ==="
  echo "Runs per combo: $RUNS"
  echo "Output: $OUTPUT_DIR"
} | tee "$OUTPUT_DIR/log.txt"
33
+
# Role -> Scenario mapping function (macOS bash 3.x compatible)
#
# Arguments: $1 - target role (dev, reviewer, tea, sm or architect)
# Outputs:   the scenario name for that role on stdout; an empty line for
#            any other role
get_scenario_for_role() {
  local target_role=$1
  local scenario_id=""

  case "$target_role" in
    dev | reviewer) scenario_id="astropy-12907" ;;
    tea) scenario_id="checkout-component-tests" ;;
    sm) scenario_id="dependency-deadlock" ;;
    architect) scenario_id="database-selection" ;;
  esac

  echo "$scenario_id"
}
45
+
# Get all characters from theme, one "source_role:character" line each
CHARACTERS=$(yq -r '.agents | to_entries[] | "\(.key):\(.value.character)"' "$PERSONA_FILE")

# Initialize results file
{
  echo "theme: $THEME"
  echo "timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "runs_per_combo: $RUNS"
  echo "results:"
} > "$OUTPUT_DIR/summary.yaml"

TOTAL_RUNS=0
TOTAL_COST=0

# For each target role
for TARGET_ROLE in dev reviewer tea sm architect; do
  SCENARIO=$(get_scenario_for_role "$TARGET_ROLE")
  echo "" | tee -a "$OUTPUT_DIR/log.txt"
  echo "--- Testing as $TARGET_ROLE (scenario: $SCENARIO) ---" | tee -a "$OUTPUT_DIR/log.txt"

  # For each character
  while IFS=: read -r SOURCE_ROLE CHARACTER; do
    # Skip blank lines (e.g. when the theme lists no agents).
    [[ -n "$SOURCE_ROLE" ]] || continue

    echo " $CHARACTER ($SOURCE_ROLE -> $TARGET_ROLE)" | tee -a "$OUTPUT_DIR/log.txt"

    CHAR_SLUG=$(echo "$CHARACTER" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g' | sed 's/--*/-/g')
    COMBO_DIR="$OUTPUT_DIR/${TARGET_ROLE}/${CHAR_SLUG}"
    mkdir -p "$COMBO_DIR"

    SCORES=()
    for ((RUN = 1; RUN <= RUNS; RUN++)); do
      echo -n " Run $RUN/$RUNS... " | tee -a "$OUTPUT_DIR/log.txt"

      # Run solo benchmark with cross-role. stdin is pointed at /dev/null
      # so the runner cannot consume the character list that feeds the
      # enclosing `while read` loop.
      RESULT=$("$SCRIPT_DIR/solo-runner.sh" "${THEME}:${CHARACTER}" "$SCENARIO" "$COMBO_DIR" --as "$TARGET_ROLE" 2>&1 < /dev/null) || {
        echo "FAILED" | tee -a "$OUTPUT_DIR/log.txt"
        echo "$RESULT" >> "$OUTPUT_DIR/log.txt"
        continue
      }

      # Output that jq cannot parse counts as a failed run rather than
      # aborting the whole batch through `set -e`.
      if ! SCORE=$(echo "$RESULT" | jq -r '.score // 0' 2>/dev/null) ||
        ! COST=$(echo "$RESULT" | jq -r '.total_cost_usd // 0' 2>/dev/null); then
        echo "FAILED (unparseable runner output)" | tee -a "$OUTPUT_DIR/log.txt"
        echo "$RESULT" >> "$OUTPUT_DIR/log.txt"
        continue
      fi

      echo "score=$SCORE cost=\$$COST" | tee -a "$OUTPUT_DIR/log.txt"

      SCORES+=("$SCORE")
      TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc)
      # Not ((TOTAL_RUNS++)): that returns status 1 while the counter is 0,
      # which `set -e` treats as a failure and ends the script after the
      # first successful run.
      TOTAL_RUNS=$((TOTAL_RUNS + 1))
    done

    # Calculate mean for this combo
    if [[ ${#SCORES[@]} -gt 0 ]]; then
      MEAN=$(printf '%s\n' "${SCORES[@]}" | awk '{sum+=$1; n++} END {if(n>0) printf "%.2f", sum/n; else print 0}')
      # Comma-separated so the value is a YAML list of numbers.
      SCORE_LIST=$(printf ', %s' "${SCORES[@]}")
      SCORE_LIST=${SCORE_LIST:2}
      {
        echo "  - character: \"$CHARACTER\""
        echo "    source_role: $SOURCE_ROLE"
        echo "    target_role: $TARGET_ROLE"
        echo "    mean: $MEAN"
        echo "    scores: [$SCORE_LIST]"
      } >> "$OUTPUT_DIR/summary.yaml"
    fi

  done <<< "$CHARACTERS"
done

echo "" | tee -a "$OUTPUT_DIR/log.txt"
echo "=== Complete ===" | tee -a "$OUTPUT_DIR/log.txt"
echo "Total runs: $TOTAL_RUNS" | tee -a "$OUTPUT_DIR/log.txt"
echo "Total cost: \$$TOTAL_COST" | tee -a "$OUTPUT_DIR/log.txt"
echo "Results: $OUTPUT_DIR/summary.yaml" | tee -a "$OUTPUT_DIR/log.txt"

# Add totals to summary
echo "total_runs: $TOTAL_RUNS" >> "$OUTPUT_DIR/summary.yaml"
echo "total_cost_usd: $TOTAL_COST" >> "$OUTPUT_DIR/summary.yaml"

# Last line of stdout is the output directory
echo "$OUTPUT_DIR"
@@ -0,0 +1,35 @@
#!/bin/bash
# Check job fair progress across all running themes
#
# Scans /tmp/job-fair-<theme>.log, counts the lines containing "score="
# in each log as completed runs, and reports per-theme and overall totals.

# Print the theme name encoded in a log path:
# /tmp/job-fair-<theme>.log -> <theme>. Only the literal "job-fair-" prefix
# and ".log" suffix are removed, so names such as "catalog" stay intact.
theme_from_log() {
  local name=${1##*/}
  name=${name#job-fair-}
  printf '%s\n' "${name%.log}"
}

# Print the number of lines containing "score=" in the given file.
# grep -c already prints 0 (with exit status 1) when nothing matches, so the
# fallback only supplies a value when grep printed nothing at all.
count_scored_runs() {
  local count
  count=$(grep -c "score=" "$1" 2>/dev/null) || count=${count:-0}
  printf '%s\n' "${count:-0}"
}

echo "=== Job Fair Progress ==="
echo ""

total_done=0

for log in /tmp/job-fair-*.log; do
  # Also skips the literal pattern left behind when no log exists.
  [[ -f "$log" ]] || continue

  theme=$(theme_from_log "$log")

  # Count completed runs
  completed=$(count_scored_runs "$log")
  total=200

  # Get last few lines
  last_char=$(grep -E "^ [A-Z]" "$log" 2>/dev/null | tail -1 | sed 's/^ *//')
  last_score=$(grep "score=" "$log" 2>/dev/null | tail -1 | grep -oE "score=[0-9.]+" | cut -d= -f2)

  # Check if still running; the trailing "( |$)" stops a theme from matching
  # a batch started for a longer theme name with the same prefix.
  if pgrep -f "job-fair-batch.sh $theme( |\$)" > /dev/null 2>&1; then
    status="RUNNING"
  else
    status="DONE"
  fi

  pct=$((completed * 100 / total))
  echo "$theme: $completed/$total ($pct%) [$status]"
  if [[ -n "$last_char" ]]; then
    echo " Last: $last_char -> $last_score"
  fi

  total_done=$((total_done + completed))
done

echo ""
echo "Total estimated: 1000 runs"
echo "Total completed: $total_done"