@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,337 @@
1
+ #!/usr/bin/env zsh
2
+ # Pennyfarthing Test Setup Utilities
3
+ # Config-driven test utilities that work with any project structure
4
+ #
5
+ # Usage: source scripts/test/test-setup.sh
6
+ #
7
+ # Configuration is read from .pennyfarthing/repos.yaml
8
+ # See repos.yaml for schema documentation
9
+ #
10
+ # Functions:
11
+ # generate_run_id - Create timestamp-based unique run ID
12
+ # get_log_path TYPE RUN_ID - Return log file path for a test type
13
+ # ensure_test_containers - Start test containers if configured
14
+ # setup_repo_test_env REPO - Export test env vars for a repo
15
+ # check_skip_violations REPO - Check for forbidden skip patterns
16
+ # show_skip_violations REPO - Display skip violation details
17
+ # cleanup_test_logs - Remove old test/lint log files
18
+
19
# This file is meant to be sourced; keep errexit off so that a failing
# command is handled by the functions below instead of aborting the shell.
set +e

# Directory containing this script, and the project root (four levels up
# unless the caller already provided PROJECT_ROOT).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
: "${PROJECT_ROOT:=$(cd "$SCRIPT_DIR/../../../.." && pwd)}"

# Load the repo-utils helpers used for config access. Every function in this
# file checks for those helpers with `declare -f` and falls back to defaults
# when they are missing, so a failed load is only a warning.
if ! source "$CLAUDE_PROJECT_DIR/scripts/repo-utils.sh" 2>/dev/null; then
  echo "Warning: repo-utils.sh not found, using defaults" >&2
fi
29
+
30
+ # ============================================================================
31
+ # Run ID and Logging
32
+ # ============================================================================
33
+
34
# Print a run identifier derived from the current local time.
# Format: YYYYMMDD-HHMMSS (one-second resolution).
# Usage: RUN_ID=$(generate_run_id)
generate_run_id() {
  local stamp
  stamp=$(date +%Y%m%d-%H%M%S) || return
  echo "$stamp"
}
39
+
40
# Print the log file path for a log type and run ID.
# Usage: LOG_PATH=$(get_log_path "test-myrepo" "$RUN_ID")
# Arguments:
#   $1 - log type prefix (test-{repo}, lint-{repo}, or any custom prefix)
#   $2 - run ID (optional; a fresh one is generated when omitted or empty)
# Output: <log_dir>/<type>-results-<run_id>.log on stdout
get_log_path() {
  local log_type="$1"
  local run_id="$2"
  # Default location; replaced below when repo-utils provides a log dir.
  local log_dir="$CLAUDE_PROJECT_DIR/.session"

  if [[ -z "$run_id" ]]; then
    run_id=$(generate_run_id)
  fi

  if declare -f get_test_log_dir &>/dev/null; then
    log_dir=$(get_test_log_dir)
  fi

  echo "${log_dir}/${log_type}-results-${run_id}.log"
}
57
+
58
+ # ============================================================================
59
+ # Container Management
60
+ # ============================================================================
61
+
62
# Start test containers when a container command is configured.
# The command comes from get_container_command (repo-utils) and is run with
# eval in the current shell.
# Returns: 0 when nothing is configured, otherwise the command's exit status.
ensure_test_containers() {
  local container_cmd=""

  if declare -f get_container_command &>/dev/null; then
    container_cmd=$(get_container_command)
  fi

  # Nothing configured means nothing to start.
  [[ -n "$container_cmd" ]] || return 0

  echo "Starting test containers via: $container_cmd"
  eval "$container_cmd"
}
80
+
81
+ # ============================================================================
82
+ # Environment Setup
83
+ # ============================================================================
84
+
85
# Apply the test environment for one repo to the current shell.
# Usage: setup_repo_test_env "Pennyfarthing-api"
# Arguments:
#   $1 - repo name passed to get_test_env
# The text printed by get_test_env is evaluated as shell code, so it is
# expected to be a series of export statements. Does nothing when
# get_test_env is not defined or prints nothing.
setup_repo_test_env() {
  local repo="$1"

  declare -f get_test_env &>/dev/null || return 0

  local env_exports
  env_exports=$(get_test_env "$repo")
  [[ -z "$env_exports" ]] || eval "$env_exports"
}
98
+
99
# Apply the test environment of every configured repo to the current shell.
# Usage: setup_all_test_env
# Repos are listed by get_repos (whitespace-separated); each one is passed to
# setup_repo_test_env. Does nothing when get_repos is not defined.
setup_all_test_env() {
  declare -f get_repos &>/dev/null || return 0

  # This file is sourced, so an undeclared loop variable would overwrite any
  # `repo` variable the calling shell already has.
  local repo
  for repo in $(get_repos); do
    setup_repo_test_env "$repo"
  done
}
110
+
111
+ # ============================================================================
112
+ # Skip Violation Checks
113
+ # ============================================================================
114
+
115
# Count forbidden skip patterns in a repo's test files.
# Usage: VIOLATIONS=$(check_skip_violations "Pennyfarthing-api")
# Arguments:
#   $1 - repo name
# Output: number of matching lines on stdout ("0" when the repo path is
#         missing, the language has no patterns, or nothing matches).
# Returns: always 0; the count is reported on stdout only.
check_skip_violations() {
  local repo="$1"
  local repo_path language

  # Resolve repo location and language via repo-utils when available.
  if declare -f get_repo_full_path &>/dev/null; then
    repo_path=$(get_repo_full_path "$repo")
    language=$(get_repo_language "$repo")
  else
    repo_path="$CLAUDE_PROJECT_DIR/$repo"
    language="unknown"
  fi

  if [[ ! -d "$repo_path" ]]; then
    echo "0"
    return 0
  fi

  # Regexes (grep -E) and test-file glob for this language.
  local skip_patterns skip_exceptions file_pattern
  if declare -f get_skip_patterns &>/dev/null; then
    skip_patterns=$(get_skip_patterns "$language")
    skip_exceptions=$(get_skip_exceptions "$language")
    file_pattern=$(get_test_file_pattern "$language")
  else
    # Built-in defaults for when the config helpers are missing.
    case "$language" in
      go)
        skip_patterns='t\.Skip'
        skip_exceptions='LocalStack|not available'
        file_pattern='*_test.go'
        ;;
      typescript|javascript)
        skip_patterns='it\.skip|describe\.skip|test\.skip'
        skip_exceptions=''
        file_pattern='*.test.*'
        ;;
      *)
        echo "0"
        return 0
        ;;
    esac
  fi

  if [[ -z "$skip_patterns" ]]; then
    echo "0"
    return 0
  fi

  # Collect matching lines; `|| true` keeps "no match" from being an error.
  local matches
  matches=$(grep -r -E "$skip_patterns" "$repo_path" --include="$file_pattern" 2>/dev/null || true)

  # Drop lines that match an allowed exception.
  if [[ -n "$skip_exceptions" && -n "$matches" ]]; then
    matches=$(echo "$matches" | grep -v -E "$skip_exceptions" || true)
  fi

  if [[ -z "$matches" ]]; then
    echo "0"
    return 0
  fi

  # One violation per remaining line; tr strips the padding some wc
  # implementations add.
  echo "$matches" | wc -l | tr -d ' '
}
184
+
185
# Print the skip-pattern violations of a repo with their file locations.
# Usage: show_skip_violations "Pennyfarthing-api" [max_lines]
# Arguments:
#   $1 - repo name
#   $2 - maximum number of lines to print (default 10)
# Output: grep-style "path:line content" rows, at most max_lines of them.
# Prints nothing when the repo path is missing or no patterns apply.
show_skip_violations() {
  local repo="$1"
  local max_lines="${2:-10}"

  # Resolve repo location and language via repo-utils when available.
  local repo_path language
  if declare -f get_repo_full_path &>/dev/null; then
    repo_path=$(get_repo_full_path "$repo")
    language=$(get_repo_language "$repo")
  else
    repo_path="$CLAUDE_PROJECT_DIR/$repo"
    language="unknown"
  fi

  [[ -d "$repo_path" ]] || return 0

  local skip_patterns skip_exceptions file_pattern
  if declare -f get_skip_patterns &>/dev/null; then
    skip_patterns=$(get_skip_patterns "$language")
    skip_exceptions=$(get_skip_exceptions "$language")
    file_pattern=$(get_test_file_pattern "$language")
  else
    # Same built-in defaults as check_skip_violations, so that every
    # violation it counts can also be displayed here.
    case "$language" in
      go)
        skip_patterns='t\.Skip'
        skip_exceptions='LocalStack|not available'
        file_pattern='*_test.go'
        ;;
      typescript|javascript)
        skip_patterns='it\.skip|describe\.skip|test\.skip'
        skip_exceptions=''
        file_pattern='*.test.*'
        ;;
      *)
        return 0
        ;;
    esac
  fi

  [[ -n "$skip_patterns" ]] || return 0

  # Collect matching lines; `|| true` keeps "no match" from being an error.
  local grep_result
  grep_result=$(grep -r -E "$skip_patterns" "$repo_path" --include="$file_pattern" 2>/dev/null || true)

  # Drop lines that match an allowed exception.
  if [[ -n "$skip_exceptions" && -n "$grep_result" ]]; then
    grep_result=$(echo "$grep_result" | grep -v -E "$skip_exceptions" || true)
  fi

  if [[ -n "$grep_result" ]]; then
    # `head -n N` is the standard spelling; the bare `-N` form is obsolescent.
    echo "$grep_result" | head -n "$max_lines"
  fi
}
231
+
232
# Sum the skip violations of every configured repo.
# Usage: TOTAL=$(check_all_skip_violations)
# Output: the total count on stdout ("0" when get_repos is not defined).
check_all_skip_violations() {
  # All locals are declared once, before the loop. In zsh, re-declaring an
  # existing local without a value prints "name=value" to stdout (unless
  # TYPESET_SILENT is set), which would pollute the captured total. Keeping
  # `repo` local also avoids clobbering a caller's variable of that name.
  local total=0 repo count

  if ! declare -f get_repos &>/dev/null; then
    echo "0"
    return 0
  fi

  for repo in $(get_repos); do
    count=$(check_skip_violations "$repo")
    # Treat an empty result as zero so the arithmetic stays valid.
    total=$((total + ${count:-0}))
  done

  echo "$total"
}
250
+
251
+ # ============================================================================
252
+ # Cleanup
253
+ # ============================================================================
254
+
255
# Remove test and lint log files from the log directory.
# Deletes entries named test-*-results-*.log and lint-*-results-*.log that
# sit directly in the log directory (get_test_log_dir when available,
# otherwise $CLAUDE_PROJECT_DIR/.session). Other files are left alone.
# Returns: 0 when the directory is missing; otherwise the status of find.
cleanup_test_logs() {
  local log_dir
  if declare -f get_test_log_dir &>/dev/null; then
    log_dir=$(get_test_log_dir)
  else
    log_dir="$CLAUDE_PROJECT_DIR/.session"
  fi

  # No log directory means there is nothing to clean.
  [[ -d "$log_dir" ]] || return 0

  # find does the name matching instead of a shell glob: under zsh's default
  # NOMATCH option an unmatched glob is an error that aborts the command,
  # so a clean directory would fail with "no matches found".
  find "$log_dir" -maxdepth 1 ! -type d \
    \( -name 'test-*-results-*.log' -o -name 'lint-*-results-*.log' \) \
    -exec rm -f -- {} + 2>/dev/null
}
267
+
268
+ # ============================================================================
269
+ # High-Level Test Running
270
+ # ============================================================================
271
+
272
# Run the configured test command for one repo and log its output.
# Usage: run_repo_tests "Pennyfarthing-api" "$RUN_ID"
# Arguments:
#   $1 - repo name
#   $2 - run ID (optional; a fresh one is generated when omitted)
# Output: test output (also written to the log file), followed by a
#         PASS/FAIL line; a SKIP line when there is nothing to run.
# Returns: 0 on pass or skip, otherwise the test command's exit status.
run_repo_tests() {
  local repo="$1"
  local run_id="${2:-$(generate_run_id)}"

  local repo_path test_cmd log_path
  if declare -f get_repo_full_path &>/dev/null; then
    repo_path=$(get_repo_full_path "$repo")
    test_cmd=$(get_test_command "$repo")
  else
    repo_path="$CLAUDE_PROJECT_DIR/$repo"
    test_cmd=""
  fi

  if [[ -z "$test_cmd" ]]; then
    echo "SKIP: $repo (no test command configured)"
    return 0
  fi

  if [[ ! -d "$repo_path" ]]; then
    echo "SKIP: $repo (path not found: $repo_path)"
    return 0
  fi

  log_path=$(get_log_path "test-$repo" "$run_id")
  # tee cannot create missing parent directories; make sure the log
  # directory exists so the log is not silently lost.
  mkdir -p "$(dirname "$log_path")" 2>/dev/null

  # Setup environment for this repo
  setup_repo_test_env "$repo"

  echo "=== Testing $repo ==="
  # Subshell keeps the cd (and anything the test command changes) out of
  # the calling shell.
  (cd "$repo_path" && eval "$test_cmd") 2>&1 | tee "$log_path"
  # Exit status of the test command, not of tee. zsh exposes it as
  # pipestatus[1]; bash as PIPESTATUS[0]. Must be read immediately after
  # the pipeline, before any other command runs.
  local exit_code=${pipestatus[1]:-${PIPESTATUS[0]}}

  if [[ $exit_code -eq 0 ]]; then
    echo "PASS: $repo"
  else
    echo "FAIL: $repo (exit code: $exit_code)"
  fi

  return $exit_code
}
314
+
315
# Run the tests of every repo and report how many failed.
# Usage: run_all_repo_tests "$RUN_ID"
# Arguments:
#   $1 - run ID shared by all repos (optional; generated when omitted)
# Repos are taken from get_build_order when it is defined, otherwise from
# get_repos. Container start-up is attempted first; a failure there is only
# a warning.
# Returns: number of repos whose tests failed (capped at 255), or 1 when no
#          repo listing helper is available.
run_all_repo_tests() {
  local run_id="${1:-$(generate_run_id)}"
  local failed=0
  # Declared local so the loop does not clobber a caller's `repo` variable.
  local repo list_repos

  ensure_test_containers || {
    echo "Warning: Container setup failed, continuing anyway"
  }

  # Check for the helper that is actually called. Guarding on a different
  # helper would let a missing get_build_order produce an empty loop and a
  # false "all passed" result.
  if declare -f get_build_order &>/dev/null; then
    list_repos=get_build_order
  elif declare -f get_repos &>/dev/null; then
    list_repos=get_repos
  else
    echo "Warning: repo-utils not loaded, cannot iterate repos"
    return 1
  fi

  for repo in $("$list_repos"); do
    if ! run_repo_tests "$repo" "$run_id"; then
      failed=$((failed + 1))
    fi
  done

  # Exit statuses wrap modulo 256; cap so many failures never look like 0.
  if [[ $failed -gt 255 ]]; then
    failed=255
  fi

  return $failed
}
@@ -0,0 +1,13 @@
1
#!/usr/bin/env bash
# compute-theme-tiers.sh - Compute tier rankings from job-fair results
#
# Thin wrapper: locates compute_theme_tiers.py next to this script and
# replaces the shell process with it, forwarding every argument unchanged.
#
# Per theme, the MOST COMPLETE run (most matrix entries) is used rather than
# the most recent one, so an incomplete run cannot override good data.
#
# Usage: compute-theme-tiers.sh [--dry-run] [--verbose] [--min-entries N]

set -euo pipefail

# Resolve the directory this script lives in, independent of the caller's cwd.
self_path="${BASH_SOURCE[0]}"
SCRIPT_DIR="$(cd "$(dirname "${self_path}")" && pwd)"

exec python3 "${SCRIPT_DIR}/compute_theme_tiers.py" "$@"