@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,278 @@
#!/usr/bin/env bash
# Job Fair Runner
# Runs all characters from a theme against all roles with baselines
# Usage: ./scripts/job-fair-runner.sh <theme> [--runs N] [--roles role1,role2,...]

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
SOLO_RUNNER="$SCRIPT_DIR/solo-runner.sh"

THEME="$1"
RUNS=2             # Default runs per combo
REQUESTED_ROLES="" # Empty = all roles

# Print the command-line help. Everything goes to stderr so that stdout only
# ever carries benchmark progress.
usage() {
  {
    echo "Usage: $0 <theme> [--runs N] [--roles role1,role2,...]"
    echo "  --runs N    Number of runs per combination (default: 2)"
    echo "  --roles     Comma-separated roles to test (default: all)"
    echo "              Available: dev,reviewer,tea,sm,architect"
    echo "              Note: 'dev' expands to dev-codegen,dev-debug"
  } >&2
}

# Parse flags
if [[ $# -gt 0 ]]; then
  shift # Remove theme from args
fi
while [[ $# -gt 0 ]]; do
  case "$1" in
    --runs | --roles)
      # Without this guard a trailing flag makes "shift 2" fail, and set -e
      # then ends the script without any explanation.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == "--runs" ]]; then
        RUNS="$2"
      else
        REQUESTED_ROLES="$2"
      fi
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# A first argument that is missing or looks like a flag (e.g. --help) is not a theme.
if [[ -z "$THEME" || "$THEME" == -* ]]; then
  usage
  exit 1
fi

# RUNS is later used as a loop bound and as a divisor for the mean.
if ! [[ "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: --runs must be a positive integer (got '$RUNS')" >&2
  exit 1
fi

THEME_FILE="$PROJECT_DIR/pennyfarthing-dist/personas/themes/${THEME}.yaml"
if [[ ! -f "$THEME_FILE" ]]; then
  echo "Error: Theme not found: $THEME_FILE" >&2
  exit 1
fi

# Runner failures are later replaced by a score of 0, so a missing runner
# would otherwise produce a full matrix of zeros instead of an error.
if [[ ! -x "$SOLO_RUNNER" ]]; then
  echo "Error: solo-runner.sh not found or not executable" >&2
  exit 1
fi

# Define role -> scenario mappings
# DEV is special: has TWO sub-competencies (codegen + debug)
#
# Arguments: $1 - role name (dev-codegen, dev-debug, dev, reviewer, tea, sm, architect)
# Outputs:   the scenario name for that role on stdout
# Returns:   0 for a known role; 1 with a message on stderr for anything else,
#            so a mistyped --roles value cannot turn into an empty scenario
get_scenario() {
  case "$1" in
    dev-codegen) echo "tdd-shopping-cart" ;;
    dev-debug) echo "astropy-12907" ;;
    dev) echo "tdd-shopping-cart" ;; # Legacy fallback
    reviewer) echo "order-service" ;;
    tea) echo "payment-processor-tests" ;;
    sm) echo "sprint-planning-conflict" ;;
    architect) echo "legacy-modernization" ;;
    *)
      echo "Error: no scenario defined for role '$1'" >&2
      return 1
      ;;
  esac
}

# Baseline mean score for a role.
#
# Arguments: $1 - role name (dev-codegen, dev-debug, dev, reviewer, tea, sm, architect)
# Outputs:   the baseline mean for that role on stdout
# Returns:   0 for a known role; 1 with a message on stderr for anything else
get_baseline() {
  case "$1" in
    dev-codegen) echo "85.8" ;;
    dev-debug) echo "77.5" ;;
    dev) echo "85.8" ;; # Legacy fallback
    reviewer) echo "78.5" ;;
    tea) echo "72.1" ;;
    sm) echo "80.3" ;;
    architect) echo "87.2" ;;
    *)
      echo "Error: no baseline defined for role '$1'" >&2
      return 1
      ;;
  esac
}

# The dev role is benchmarked twice; this prints its two sub-roles as one
# space-separated line.
get_dev_sub_roles() {
  printf '%s\n' "dev-codegen dev-debug"
}

# Create output directory
TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ)
OUTPUT_DIR="$PROJECT_DIR/internal/results/job-fair/${THEME}-${TIMESTAMP}"
mkdir -p "$OUTPUT_DIR"

echo "=== Job Fair: $THEME ==="
echo "Output: $OUTPUT_DIR"
echo "Runs per combo: $RUNS"
echo ""

# Get main characters: one "agent-key:character" line for each theme agent
# keyed orchestrator, sm, tea, dev or reviewer.
MAIN_CHARS_FILE=$(mktemp)
# Remove the temp file on every exit path, including aborts caused by set -e.
trap 'rm -f "$MAIN_CHARS_FILE"' EXIT
yq -r '.agents | to_entries[] | select(.key | test("^(orchestrator|sm|tea|dev|reviewer)$")) | "\(.key):\(.value.character)"' "$THEME_FILE" > "$MAIN_CHARS_FILE"

# With no characters there is nothing to run; stop here rather than produce
# an empty result set.
if [[ ! -s "$MAIN_CHARS_FILE" ]]; then
  echo "Error: no main characters found in $THEME_FILE" >&2
  exit 1
fi

echo "### Characters"
while IFS=: read -r role char; do
  echo "  - $role: $char"
done < "$MAIN_CHARS_FILE"
echo ""

# Touch raw results file
touch "$OUTPUT_DIR/raw_results.txt"

# Work out which roles to benchmark.
# --roles (comma-separated) narrows the list; otherwise every role is used.
if [[ -z "$REQUESTED_ROLES" ]]; then
  BASE_ROLES="dev reviewer tea sm architect"
else
  BASE_ROLES=${REQUESTED_ROLES//,/ }
fi

# "dev" stands for both of its sub-competencies; every other name, including
# an explicit dev-codegen or dev-debug, is taken as given.
ROLES_TO_TEST=""
for BASE_ROLE in $BASE_ROLES; do
  case "$BASE_ROLE" in
    dev) ROLES_TO_TEST="$ROLES_TO_TEST dev-codegen dev-debug" ;;
    *) ROLES_TO_TEST="$ROLES_TO_TEST $BASE_ROLE" ;;
  esac
done

echo "Roles to test: $ROLES_TO_TEST"

# Run each role (dev-codegen and dev-debug tested separately)

# jq extracts scores and bc accumulates them; fail early if either is missing
# instead of recording zeros for every run.
for tool in jq bc; do
  if ! command -v "$tool" > /dev/null 2>&1; then
    echo "Error: required tool not found: $tool" >&2
    exit 1
  fi
done

for ROLE in $ROLES_TO_TEST; do
  SCENARIO=$(get_scenario "$ROLE")
  BASELINE=$(get_baseline "$ROLE")

  # For dev sub-roles, use 'dev' as the actual agent role
  case "$ROLE" in
    dev-codegen | dev-debug) AGENT_ROLE="dev" ;;
    *) AGENT_ROLE="$ROLE" ;;
  esac

  echo "### Testing: $ROLE (scenario: $SCENARIO, baseline: $BASELINE)"

  # Run each character for this role. The character list is read on fd 3 so
  # that the runner (or anything it spawns) cannot consume the remaining
  # lines through an inherited stdin.
  while IFS=: read -r -u 3 source_role char; do
    # Use AGENT_ROLE for comparison (dev for both dev-codegen and dev-debug)
    CROSS_ARGS=()
    if [[ "$source_role" == "$AGENT_ROLE" ]]; then
      # Native role - direct run
      SPEC="$THEME:$AGENT_ROLE"
    else
      # Cross-role - look the character up by the first word of its name
      FIRST_NAME=$(echo "$char" | awk '{print $1}')
      SPEC="$THEME:$FIRST_NAME"
      CROSS_ARGS=(--as "$AGENT_ROLE")
    fi

    CHAR_SLUG=$(echo "$char" | tr '[:upper:]' '[:lower:]' | sed -e 's/[^a-z0-9]/-/g' -e 's/--*/-/g')
    RUN_DIR="$OUTPUT_DIR/runs/$ROLE/$CHAR_SLUG"
    mkdir -p "$RUN_DIR"

    SUM=0
    COUNT=0
    for ((i = 1; i <= RUNS; i++)); do
      echo -n "    $char ($source_role) -> $ROLE [$i/$RUNS]: "

      # Runner stderr is discarded, as before; only stdout is recorded.
      # On failure any partial stdout is dropped so the stored result is
      # always a single JSON document.
      if ! RESULT=$("$SOLO_RUNNER" "$SPEC" "$SCENARIO" "$RUN_DIR" "${CROSS_ARGS[@]}" 2> /dev/null); then
        RESULT='{"success":false,"score":0}'
      fi

      # Output that is not JSON, or a score that is not a plain number, counts
      # as 0 rather than aborting the whole job fair.
      SCORE=$(printf '%s\n' "$RESULT" | jq -r '.score // 0' 2> /dev/null | head -n 1)
      if ! [[ "$SCORE" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
        SCORE=0
      fi
      echo "$SCORE"

      # Save individual result
      printf '%s\n' "$RESULT" > "$RUN_DIR/run_$i.json"

      # Accumulate scores
      SUM=$(echo "$SUM + $SCORE" | bc)
      COUNT=$((COUNT + 1))

      # Brief pause between runs
      sleep 1
    done

    # Calculate mean for this character-role combo
    if [[ $COUNT -gt 0 ]]; then
      MEAN=$(echo "scale=2; $SUM / $COUNT" | bc)
      echo "    -> Mean: $MEAN"

      # Store in results file
      echo "$source_role:$char:$ROLE:$MEAN:$COUNT" >> "$OUTPUT_DIR/raw_results.txt"
    fi
  done 3< "$MAIN_CHARS_FILE"

  echo ""
done

# Clean up temp file
rm -f "$MAIN_CHARS_FILE"

# Generate summary
echo "### Generating summary..."

SUMMARY_FILE="$OUTPUT_DIR/summary.yaml"

cat > "$SUMMARY_FILE" << EOF
theme: $THEME
timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)
runs_per_combo: $RUNS
mode: full

# Dev has dual sub-competency testing (only role with 2 scenarios)
scenarios:
  dev-codegen: $(get_scenario dev-codegen)
  dev-debug: $(get_scenario dev-debug)
  reviewer: $(get_scenario reviewer)
  tea: $(get_scenario tea)
  sm: $(get_scenario sm)
  architect: $(get_scenario architect)

baselines:
  dev-codegen: {mean: $(get_baseline dev-codegen), std: 7.30, n: 10}
  dev-debug: {mean: $(get_baseline dev-debug), std: 8.54, n: 10}
  reviewer: {mean: $(get_baseline reviewer), std: 1.8, n: 10}
  tea: {mean: $(get_baseline tea), std: 2.3, n: 10}
  sm: {mean: $(get_baseline sm), std: 1.9, n: 10}
  architect: {mean: $(get_baseline architect), std: 3.25, n: 10}
EOF

# Parse raw results and add to summary.
# raw_results.txt holds "source_role:character:role:mean:n" lines in
# role-major order, so one character's lines are not adjacent. All roles of a
# character are collected under a single key (characters kept in order of
# first appearance); emitting a key whenever the character changes would
# repeat the same YAML key once per role.
if [[ -s "$OUTPUT_DIR/raw_results.txt" ]]; then
  {
    echo ""
    echo "matrix:"
    awk -F: '
      {
        key = tolower($2)
        gsub(/[^a-z0-9]/, "_", key)
        if (!(key in seen)) {
          seen[key] = 1
          order[++total] = key
        }
        rows[key] = rows[key] sprintf("    %s: {mean: %s, n: %s}\n", $3, $4, $5)
      }
      END {
        for (i = 1; i <= total; i++) {
          if (i > 1) {
            print ""
          }
          printf "  %s:\n%s", order[i], rows[order[i]]
        }
      }
    ' "$OUTPUT_DIR/raw_results.txt"
  } >> "$SUMMARY_FILE"
fi

echo ""
echo "=== Job Fair Complete ==="
echo "Results: $OUTPUT_DIR/summary.yaml"

# Update manifest: add an entry for this theme ahead of the
# "# Themes not yet run" comment, unless the theme is already listed.
MANIFEST_FILE="$PROJECT_DIR/internal/results/job-fair/manifest.yaml"
if [[ -f "$MANIFEST_FILE" ]]; then
  echo ""
  echo "### Updating manifest..."

  # Check if theme already in manifest: some line ends with "theme: <name>".
  # The comparison is literal, so characters in the theme name are never
  # interpreted as a pattern.
  if awk -v want="theme: $THEME" '
       {
         line = $0
         sub(/[[:space:]]+$/, "", line)
         if (length(line) >= length(want) &&
             substr(line, length(line) - length(want) + 1) == want) {
           found = 1
           exit
         }
       }
       END { exit found ? 0 : 1 }
     ' "$MANIFEST_FILE"; then
    echo "Theme '$THEME' already in manifest"
  else
    # Build the updated manifest in a temp file and copy it back only when
    # the marker was found. (A multi-line "sed i\" insertion needs a trailing
    # backslash on every text line; without them sed treats the second line
    # as a command and fails.)
    # NOTE(review): entry indentation assumes list items in manifest.yaml are
    # indented two spaces - confirm against the real file.
    MANIFEST_TMP=$(mktemp)
    if awk -v theme="$THEME" -v ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" -v runs="$RUNS" '
         !inserted && /^# Themes not yet run/ {
           printf "  - theme: %s\n", theme
           printf "    timestamp: %s\n", ts
           printf "    has_raw_data: true\n"
           printf "    runs_per_combo: %s\n\n", runs
           inserted = 1
         }
         { print }
         END { exit inserted ? 0 : 3 }
       ' "$MANIFEST_FILE" > "$MANIFEST_TMP"; then
      # cat keeps the manifest's own permissions and inode.
      cat "$MANIFEST_TMP" > "$MANIFEST_FILE"
      echo "Added '$THEME' to manifest"
    else
      echo "Warning: '# Themes not yet run' marker not found; manifest left unchanged" >&2
    fi
    rm -f "$MANIFEST_TMP"
  fi
fi
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Job Fair Status - Accurate coverage detection
# Usage: ./scripts/job-fair-status.sh [--verbose]
#
# Exit status: 0 when every theme has job fair data, 1 otherwise (including
# when no themes can be found).

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../../.." && pwd)"
THEMES_DIR="$PROJECT_DIR/pennyfarthing-dist/personas/themes"
RESULTS_DIR="$PROJECT_DIR/internal/results/job-fair"

VERBOSE=false
if [[ "$1" == "--verbose" || "$1" == "-v" ]]; then
  VERBOSE=true
fi

# Get all theme names: the basename of every *.yaml file, extension removed.
# Globbing (rather than parsing ls) makes an empty directory yield zero
# themes instead of a phantom count of one.
all_themes=()
for theme_file in "$THEMES_DIR"/*.yaml; do
  [[ -e "$theme_file" ]] || continue
  theme_name=${theme_file##*/}
  all_themes+=("${theme_name%.yaml}")
done
total_themes=${#all_themes[@]}

if [[ $total_themes -eq 0 ]]; then
  echo "Error: no themes found in $THEMES_DIR" >&2
  exit 1
fi

# Decide whether a result directory belongs to a theme.
# Arguments: $1 - theme name, $2 - directory name already matched by "$1"*
# Returns:   1 when a longer theme name is also a prefix of the directory
#            name (the directory is then credited to that theme), else 0
dir_belongs_to_theme() {
  local theme=$1 dir_name=$2 other
  for other in "${all_themes[@]}"; do
    if [[ ${#other} -gt ${#theme} && "$dir_name" == "$other"* ]]; then
      return 1
    fi
  done
  return 0
}

# Find themes with valid job fair data (summary.yaml with matrix: section)
themes_with_data=()
themes_without_data=()
themes_partial=()

for theme in "${all_themes[@]}"; do
  # Look for any directory matching this theme
  found=false
  has_matrix=false

  for dir in "$RESULTS_DIR"/"$theme"*/; do
    [[ -d "$dir" ]] || continue
    dir_name=${dir%/}
    dir_name=${dir_name##*/}
    dir_belongs_to_theme "$theme" "$dir_name" || continue
    found=true

    if [[ -f "${dir}summary.yaml" ]] && grep -q "^matrix:" "${dir}summary.yaml" 2> /dev/null; then
      has_matrix=true
      break
    fi
  done

  if $has_matrix; then
    themes_with_data+=("$theme")
  elif $found; then
    themes_partial+=("$theme")
  else
    themes_without_data+=("$theme")
  fi
done

# Output summary
echo "=== Job Fair Coverage Status ==="
echo "Total themes: $total_themes"
echo "With data: ${#themes_with_data[@]} (summary.yaml with matrix)"
echo "Partial: ${#themes_partial[@]} (directory exists, no matrix)"
echo "Not started: ${#themes_without_data[@]}"
echo ""
echo "Coverage: ${#themes_with_data[@]}/$total_themes ($((${#themes_with_data[@]} * 100 / total_themes))%)"

if $VERBOSE; then
  if [[ ${#themes_partial[@]} -gt 0 ]]; then
    echo ""
    echo "### Partial (need consolidation or re-run):"
    printf '  - %s\n' "${themes_partial[@]}"
  fi

  if [[ ${#themes_without_data[@]} -gt 0 ]]; then
    echo ""
    echo "### Not Started:"
    printf '  - %s\n' "${themes_without_data[@]}"
  fi
fi

# Exit with status based on coverage
if [[ ${#themes_with_data[@]} -eq $total_themes ]]; then
  echo ""
  echo "✓ Full coverage achieved!"
  exit 0
else
  exit 1
fi
@@ -0,0 +1,38 @@
#!/bin/bash
# Watch job fairs - start batch 2 only (skip simpsons)

# Directory holding this script; the sibling job-fair-*.sh scripts are
# resolved relative to it.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)

started_at=$(date)
echo "Job Fair Watcher v2 started at ${started_at}"
echo "Will run: discworld, snow-crash (skipping the-simpsons)"

# Report whether a job-fair-batch.sh process for the given theme is alive.
# Arguments: $1 - theme name; it is placed inside an extended regular
#                 expression, so it is expected to be a plain slug
# Returns:   pgrep's status - 0 when at least one process matches
is_running() {
  # The dot is escaped and the theme must be followed by a space or the end
  # of the command line, so "discworld" does not match "discworld-extras".
  pgrep -f "job-fair-batch\.sh $1( |\$)" > /dev/null 2>&1
}

# Launch job-fair-batch.sh for one theme as a background job.
# Arguments: $1 - theme name
# Outputs:   two timestamped status lines on stdout; the batch's stdout and
#            stderr go to /tmp/job-fair-<theme>.log
start_theme() {
  local theme="$1"
  local log_file="/tmp/job-fair-${theme}.log"

  echo "$(date): Starting $theme"
  nohup "$SCRIPT_DIR/job-fair-batch.sh" "$theme" 4 > "$log_file" 2>&1 &
  echo "$(date): $theme PID: $!"
}

# Wait for batch 1 to complete (it was started outside this script, so it
# can only be detected by polling the process table).
while is_running "arthurian-mythos" || is_running "greek-mythology"; do
  sleep 60
done
echo "$(date): Batch 1 complete"

# Start batch 2 only
start_theme "discworld"
start_theme "snow-crash"

# Wait for batch 2 to complete.
# The two batches are background jobs of this shell, so block on them
# directly first: polling straight after launch can run before a freshly
# forked child shows job-fair-batch.sh in its command line, which would end
# the wait immediately. The polling loop afterwards still covers runs of the
# same themes started elsewhere.
wait
while is_running "discworld" || is_running "snow-crash"; do
  sleep 60
done
echo "$(date): Batch 2 complete - ALL DONE (skipped simpsons)"

# Final summary
echo ""
"$SCRIPT_DIR/job-fair-progress.sh" 2> /dev/null
@@ -0,0 +1,50 @@
#!/bin/bash
# Watch job fairs and start next batch when current finishes
# Usage: ./scripts/job-fair-watcher.sh &

# Directory holding this script; the sibling job-fair-*.sh scripts are
# resolved relative to it.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)

started_at=$(date)
echo "Job Fair Watcher started at ${started_at}"

# Function to check if a theme is running
# Arguments: $1 - theme name; it is placed inside an extended regular
#                 expression, so it is expected to be a plain slug
# Returns:   pgrep's status - 0 when at least one process matches
is_running() {
  # The dot is escaped and the theme must be followed by a space or the end
  # of the command line, so "discworld" does not match "discworld-extras".
  pgrep -f "job-fair-batch\.sh $1( |\$)" > /dev/null 2>&1
}

# Function to start a theme: launches job-fair-batch.sh for it as a
# background job.
# Arguments: $1 - theme name
# Outputs:   two timestamped status lines on stdout; the batch's stdout and
#            stderr go to /tmp/job-fair-<theme>.log
start_theme() {
  local theme="$1"
  local log_file="/tmp/job-fair-${theme}.log"

  echo "$(date): Starting $theme"
  nohup "$SCRIPT_DIR/job-fair-batch.sh" "$theme" 4 > "$log_file" 2>&1 &
  echo "$(date): $theme PID: $!"
}

# Wait for batch 1 to complete (it was started outside this script, so it
# can only be detected by polling the process table).
while is_running "arthurian-mythos" || is_running "greek-mythology"; do
  sleep 60
done
echo "$(date): Batch 1 complete"

# Start batch 2
start_theme "discworld"
start_theme "snow-crash"

# Wait for batch 2 to complete.
# The batches are background jobs of this shell, so block on them directly
# first: polling straight after launch can run before a freshly forked child
# shows job-fair-batch.sh in its command line, which would end the wait
# immediately. The polling loop afterwards still covers runs of the same
# themes started elsewhere.
wait
while is_running "discworld" || is_running "snow-crash"; do
  sleep 60
done
echo "$(date): Batch 2 complete"

# Start batch 3
start_theme "the-simpsons"

# Wait for batch 3 to complete (same wait-then-poll approach as batch 2)
wait
while is_running "the-simpsons"; do
  sleep 60
done
echo "$(date): Batch 3 complete - ALL DONE!"

# Final summary
echo ""
echo "=== Final Results ==="
"$SCRIPT_DIR/job-fair-progress.sh"
@@ -0,0 +1,140 @@
#!/bin/bash
# Parallel Benchmark Runner
# Runs multiple themes in parallel with staggered starts to avoid rate limiting
# Usage: ./scripts/parallel-benchmark.sh <scenario> <theme1:agent> <theme2:agent> ... [--stagger N]
#        The scenario may also be passed as --scenario NAME.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SOLO_RUNNER="$SCRIPT_DIR/solo-runner.sh"

if [[ ! -x "$SOLO_RUNNER" ]]; then
  echo "Error: solo-runner.sh not found or not executable" >&2
  exit 1
fi

# Parse arguments
SCENARIO=""
STAGGER=3 # Default 3 second stagger
SPECS=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --stagger | --scenario)
      # Without this guard a trailing flag makes "shift 2" fail, and set -e
      # then ends the script without any explanation.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == "--stagger" ]]; then
        STAGGER="$2"
      else
        SCENARIO="$2"
      fi
      shift 2
      ;;
    *)
      # The first positional argument is the scenario (unless one was
      # already set); every later one is a theme:agent spec.
      if [[ -z "$SCENARIO" ]]; then
        SCENARIO="$1"
      else
        SPECS+=("$1")
      fi
      shift
      ;;
  esac
done

if [[ -z "$SCENARIO" || ${#SPECS[@]} -eq 0 ]]; then
  echo "Usage: $0 <scenario> <theme:agent> [theme:agent ...] [--stagger N]" >&2
  echo "" >&2
  echo "Examples:" >&2
  echo "  $0 race-condition-cache breaking-bad:dev firefly:dev the-wire:dev" >&2
  echo "  $0 security-review discworld:reviewer west-wing:reviewer --stagger 5" >&2
  exit 1
fi

# STAGGER is handed to sleep between launches; reject anything that is not a
# non-negative number before any run has been started.
if ! [[ "$STAGGER" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  echo "Error: --stagger must be a non-negative number of seconds (got '$STAGGER')" >&2
  exit 1
fi

# Per-invocation output directory under /tmp, named after the scenario and
# the local start time.
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
OUTPUT_BASE="/tmp/parallel-benchmark-${SCENARIO}-${TIMESTAMP}"
mkdir -p "$OUTPUT_BASE"

# Run banner, followed by a blank line.
printf '%s\n' \
  "=== Parallel Benchmark ===" \
  "Scenario: $SCENARIO" \
  "Contestants: ${SPECS[*]}" \
  "Stagger: ${STAGGER}s" \
  "Output: $OUTPUT_BASE" \
  ""

# Launch all runs with staggered starts
PIDS=()
RUN_DIRS=()
for spec in "${SPECS[@]}"; do
  theme="${spec%%:*}"

  # Each run gets its own directory. It is named after the theme; when that
  # name is already taken (same theme with another agent, or a repeated spec)
  # a numeric suffix keeps the runs from sharing one stdout.txt.
  run_dir="$OUTPUT_BASE/$theme"
  suffix=2
  while [[ -e "$run_dir" ]]; do
    run_dir="$OUTPUT_BASE/${theme}-${suffix}"
    suffix=$((suffix + 1))
  done
  mkdir -p "$run_dir"
  RUN_DIRS+=("$run_dir")

  echo "Starting: $spec"
  "$SOLO_RUNNER" "$spec" "$SCENARIO" "$run_dir" > "$run_dir/stdout.txt" 2>&1 &
  PIDS+=($!)

  # Stagger next start
  if [[ ${#PIDS[@]} -lt ${#SPECS[@]} ]]; then
    sleep "$STAGGER"
  fi
done

echo ""
echo "Waiting for ${#PIDS[@]} runs to complete..."

# Wait for all processes. "wait" sits in an if condition, so a failed run is
# counted instead of tripping set -e.
FAILED=0
for i in "${!PIDS[@]}"; do
  pid="${PIDS[$i]}"
  spec="${SPECS[$i]}"
  if wait "$pid"; then
    echo "  [OK] $spec"
  else
    echo "  [FAIL] $spec"
    FAILED=$((FAILED + 1))
  fi
done

echo ""
echo "=== Results ==="

# Collect and display results
RESULTS=()
for i in "${!SPECS[@]}"; do
  spec="${SPECS[$i]}"
  result_file="${RUN_DIRS[$i]}/stdout.txt"

  if [[ -f "$result_file" ]]; then
    # stdout.txt mixes the runner's stdout and stderr, so the first "score"
    # and "character" fields are picked out by pattern. Whitespace after the
    # colon is optional so that compact JSON is recognised as well.
    score=$(grep -Eo '"score":[[:space:]]*[0-9]+(\.[0-9]+)?' "$result_file" | head -n 1 | grep -Eo '[0-9]+(\.[0-9]+)?$' || echo "N/A")
    character=$(grep -Eo '"character":[[:space:]]*"[^"]*"' "$result_file" | head -n 1 | sed 's/.*:[[:space:]]*"\([^"]*\)"$/\1/')
    # sed succeeds even on empty input, so the fallback is applied explicitly.
    if [[ -z "$character" ]]; then
      character="Unknown"
    fi

    if [[ "$score" != "N/A" ]]; then
      printf "%-30s %-25s %s\n" "$spec" "$character" "$score"
      RESULTS+=("$spec:$score")
    else
      printf "%-30s %-25s %s\n" "$spec" "$character" "FAILED"
      # Show error
      tail -n 5 "$result_file" 2> /dev/null
    fi
  else
    printf "%-30s %-25s %s\n" "$spec" "Unknown" "NO OUTPUT"
  fi
done

# Closing summary: run counts by exit status, then the mean of every score
# that could be extracted.
spec_count=${#SPECS[@]}
printf '%s\n' \
  "" \
  "=== Summary ===" \
  "Total: $spec_count" \
  "Succeeded: $((spec_count - FAILED))" \
  "Failed: $FAILED" \
  "Output: $OUTPUT_BASE"

# Calculate mean if we have results. Each entry is "spec:score", so the score
# is whatever follows the last colon.
if ((${#RESULTS[@]} > 0)); then
  total=0
  count=0
  for result in "${RESULTS[@]}"; do
    score="${result##*:}"
    total=$(echo "$total + $score" | bc)
    count=$((count + 1))
  done
  mean=$(echo "scale=2; $total / $count" | bc)
  echo "Mean Score: $mean"
fi