@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,402 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ compute_theme_tiers.py - Compute tier rankings from job-fair results
4
+
5
+ Reads all summary.yaml files from internal/results/job-fair/
6
+ For each theme, extracts character x role scores from the matrix
7
+ Normalizes across formats, then computes delta vs baseline
8
+ Assigns tier based on overall performance vs control baseline
9
+
10
+ KEY DESIGN DECISIONS:
11
+ 1. Normalizes dev roles: averages dev-codegen + dev-debug into synthetic "dev"
12
+ to enable fair comparison across old 4-role and new 6-role formats.
13
+ Final comparison uses: dev, reviewer, sm, tea (4 roles)
14
+ 2. Uses the MOST COMPLETE run for each theme (most matrix entries),
15
+ not the most recent. This prevents incomplete runs from overriding good data.
16
+
17
+ Tier criteria (calibrated for actual delta distribution):
18
+ S: delta >= +7 (elite - top performers)
19
+ A: delta >= +5 (excellent - strong positive)
20
+ B: delta >= +3 (strong - solid performers)
21
+ C: delta >= +1 (good - above average)
22
+ D: delta < +1 (average/below)
23
+ U: no data (unbenchmarked)
24
+
25
+ Usage:
26
+ compute_theme_tiers.py [--dry-run] [--verbose] [--min-entries N]
27
+ """
28
+
29
+ import argparse
30
+ import json
31
+ import re
32
+ import subprocess
33
+ import sys
34
+ from pathlib import Path
35
+
36
+ try:
37
+ import yaml
38
+ except ImportError:
39
+ print("Error: PyYAML required. Install with: pip install pyyaml", file=sys.stderr)
40
+ sys.exit(1)
41
+
42
+
43
def find_project_root() -> Path:
    """Locate the project root by walking up from the working directory.

    The first directory (starting with the current one) that contains a
    ``.pennyfarthing`` sub-directory wins. The filesystem root itself is never
    inspected. When nothing matches, the current working directory is returned.
    """
    start = Path.cwd()
    for candidate in (start, *start.parents):
        if candidate == candidate.parent:
            # Reached the filesystem root; the walk stops without checking it.
            break
        if (candidate / ".pennyfarthing").is_dir():
            return candidate
    return Path.cwd()
51
+
52
+
53
# Project root, resolved once at import time from the current working directory.
PROJECT_ROOT = find_project_root()
# Job-fair results are read from an "internal" tree that sits beside the project root.
JOB_FAIR_DIR = PROJECT_ROOT.parent.joinpath('internal', 'results', 'job-fair')
# Theme definition files whose tier field gets rewritten.
THEMES_DIR = PROJECT_ROOT.joinpath('pennyfarthing-dist', 'personas', 'themes')

# Runs with fewer matrix entries than this are treated as incomplete.
DEFAULT_MIN_ENTRIES = 20

# Roles that take part in the final comparison.
NORMALIZED_ROLES = {'dev', 'reviewer', 'sm', 'tea'}
# Sub-roles that are folded into the synthetic "dev" role.
DEV_SUBROLES = ['dev-codegen', 'dev-debug']

# Minimum mean delta for each tier; anything below the 'C' threshold is tier D.
TIER_THRESHOLDS = dict(S=7, A=5, B=3, C=1)
68
+
69
+
70
+ def yq_get(file_path: Path, field: str) -> str | None:
71
+ """Extract YAML field using yq."""
72
+ try:
73
+ result = subprocess.run(
74
+ ['yq', '-r', field, str(file_path)],
75
+ capture_output=True, text=True, check=True
76
+ )
77
+ value = result.stdout.strip()
78
+ return None if value == 'null' else value
79
+ except Exception:
80
+ return None
81
+
82
+
83
+ def parse_baselines(file_path: Path) -> dict | None:
84
+ """Parse baselines from summary.yaml."""
85
+ try:
86
+ result = subprocess.run(
87
+ ['yq', '-o=json', '.baselines', str(file_path)],
88
+ capture_output=True, text=True, check=True
89
+ )
90
+ return json.loads(result.stdout)
91
+ except Exception:
92
+ return None
93
+
94
+
95
def count_matrix_entries(file_path: Path) -> int:
    """Count ``mean:`` lines from the top-level ``matrix:`` key to end of file.

    This is a plain text scan (no YAML parsing): once a line starting with
    ``matrix:`` has been seen, every line containing ``mean:`` is counted,
    including any that appear in sections after the matrix.

    The scan is done in-process rather than by spawning ``awk``, so the count
    no longer silently becomes 0 on systems where ``awk`` is unavailable.

    Args:
        file_path: Path to a summary.yaml file.

    Returns:
        The number of matching lines, or 0 if the file cannot be read or has
        no ``matrix:`` section.
    """
    try:
        # errors='replace' keeps the scan byte-tolerant, like the awk it replaces.
        with open(file_path, encoding='utf-8', errors='replace') as handle:
            in_matrix = False
            total = 0
            for line in handle:
                if not in_matrix and line.startswith('matrix:'):
                    in_matrix = True
                if in_matrix and 'mean:' in line:
                    total += 1
            return total
    except (OSError, ValueError):
        return 0
105
+
106
+
107
def parse_matrix_scores(file_path: Path) -> list[dict]:
    """Extract every character/role score from the ``matrix`` section via yq.

    yq flattens the matrix into CSV rows of ``character, role, mean, n``. The
    rows are decoded with the ``csv`` module so that quoted fields (for
    example a character name containing a comma) are handled correctly.

    Args:
        file_path: Path to a summary.yaml file.

    Returns:
        A list of ``{'character', 'role', 'mean', 'n'}`` dicts with ``mean`` as
        float and ``n`` as int. Rows with fewer than four fields or with
        non-numeric ``mean``/``n`` are skipped. An empty list is returned when
        yq fails or its output is not parseable CSV.
    """
    import csv  # local import: only this function needs it

    query = (
        '.matrix | to_entries | .[] | .key as $char | .value | to_entries | .[] '
        '| [$char, .key, .value.mean, .value.n] | @csv'
    )
    try:
        result = subprocess.run(
            ['yq', query, str(file_path)],
            capture_output=True, text=True, check=True
        )
        rows = list(csv.reader(result.stdout.splitlines()))
    except Exception:
        # Best-effort: a missing yq binary or malformed output yields no scores.
        return []

    scores = []
    for row in rows:
        if len(row) < 4:
            continue  # blank or truncated line
        character, role, raw_mean, raw_n = row[:4]
        try:
            scores.append({
                'character': character,
                'role': role,
                'mean': float(raw_mean),
                'n': int(raw_n),
            })
        except ValueError:
            # Null/empty or otherwise non-numeric cell: ignore this entry.
            continue
    return scores
132
+
133
+
134
+ def normalize_baselines(baselines: dict | None) -> dict | None:
135
+ """Normalize baselines: average dev-codegen + dev-debug into synthetic dev."""
136
+ if not baselines:
137
+ return None
138
+
139
+ normalized = dict(baselines)
140
+
141
+ if 'dev' not in normalized and 'dev-codegen' in normalized and 'dev-debug' in normalized:
142
+ codegen = normalized['dev-codegen']
143
+ debug = normalized['dev-debug']
144
+ normalized['dev'] = {
145
+ 'mean': (codegen['mean'] + debug['mean']) / 2,
146
+ 'std': ((codegen['std'] ** 2 + debug['std'] ** 2) / 2) ** 0.5,
147
+ 'n': codegen['n'] + debug['n'],
148
+ }
149
+
150
+ return normalized
151
+
152
+
153
+ def compute_deltas(baselines: dict | None, matrix_scores: list[dict]) -> dict | None:
154
+ """Compute delta vs baselines for a job-fair run."""
155
+ if not baselines or not matrix_scores:
156
+ return None
157
+
158
+ normalized_baselines = normalize_baselines(baselines)
159
+
160
+ # First pass: collect raw scores
161
+ raw_scores = {}
162
+ for score in matrix_scores:
163
+ role = score['role']
164
+ mean = score['mean']
165
+ if not isinstance(mean, (int, float)):
166
+ continue
167
+ if role not in raw_scores:
168
+ raw_scores[role] = {'sum': 0, 'count': 0}
169
+ raw_scores[role]['sum'] += mean
170
+ raw_scores[role]['count'] += 1
171
+
172
+ # Second pass: normalize dev subroles
173
+ role_scores = {}
174
+ for role, scores in raw_scores.items():
175
+ if role in DEV_SUBROLES:
176
+ if 'dev' not in role_scores:
177
+ role_scores['dev'] = {'sum': 0, 'count': 0}
178
+ role_scores['dev']['sum'] += scores['sum']
179
+ role_scores['dev']['count'] += scores['count']
180
+ elif role in NORMALIZED_ROLES:
181
+ role_scores[role] = scores
182
+
183
+ # Compute deltas
184
+ role_deltas = {}
185
+ total_delta = 0
186
+ total_score = 0
187
+ n_roles = 0
188
+
189
+ for role, scores in role_scores.items():
190
+ baseline = normalized_baselines.get(role)
191
+ if not baseline or not isinstance(baseline.get('mean'), (int, float)):
192
+ continue
193
+
194
+ role_mean = scores['sum'] / scores['count']
195
+ delta = role_mean - baseline['mean']
196
+
197
+ role_deltas[role] = {
198
+ 'mean': role_mean,
199
+ 'baseline': baseline['mean'],
200
+ 'delta': delta,
201
+ 'n': scores['count'],
202
+ }
203
+
204
+ total_delta += delta
205
+ total_score += role_mean
206
+ n_roles += 1
207
+
208
+ if n_roles == 0:
209
+ return None
210
+
211
+ return {
212
+ 'mean_delta': total_delta / n_roles,
213
+ 'mean_score': total_score / n_roles,
214
+ 'n_roles': n_roles,
215
+ 'role_deltas': role_deltas,
216
+ }
217
+
218
+
219
def assign_tier(mean_delta: float) -> str:
    """Map a mean delta to a tier letter.

    Tiers are tried from best to worst; the first one whose threshold in
    ``TIER_THRESHOLDS`` is met (``>=``) is returned. Anything below the ``C``
    threshold is tier ``D``.
    """
    for tier in ('S', 'A', 'B', 'C'):
        if mean_delta >= TIER_THRESHOLDS[tier]:
            return tier
    return 'D'
230
+
231
+
232
def find_summary_files() -> list[dict]:
    """List the summary.yaml of every run directory under ``JOB_FAIR_DIR``.

    Run directories are visited in sorted order; non-directories and runs
    without a ``summary.yaml`` are ignored. Exits the process with status 1
    (after printing to stderr) if ``JOB_FAIR_DIR`` does not exist.

    Returns:
        A list of ``{'path': Path, 'run_name': str}`` dicts.
    """
    if not JOB_FAIR_DIR.exists():
        print(f"Error: Job fair directory not found: {JOB_FAIR_DIR}", file=sys.stderr)
        sys.exit(1)

    candidates = (
        (run_dir.name, run_dir / 'summary.yaml')
        for run_dir in sorted(JOB_FAIR_DIR.iterdir())
        if run_dir.is_dir()
    )
    return [
        {'path': summary_path, 'run_name': run_name}
        for run_name, summary_path in candidates
        if summary_path.exists()
    ]
249
+
250
+
251
def update_theme_tier(theme_name: str, new_tier: str, dry_run: bool) -> dict:
    """Rewrite the first indented ``tier:`` value in a theme's YAML file.

    The match is confined to a single line: indentation and the gap after the
    colon may only be spaces or tabs. A ``tier:`` key with an empty value is
    therefore reported as "no tier field" instead of having the first token of
    the following line overwritten.

    The file is read and written as UTF-8 with its line endings left untouched.

    Args:
        theme_name: File stem of the theme under ``THEMES_DIR``.
        new_tier: Tier letter to store.
        dry_run: When true, report the change but do not write the file.

    Returns:
        A dict with ``updated`` (bool) plus, depending on the outcome,
        ``reason`` (``'file not found'``, ``'no tier field'``, ``'unchanged'``),
        ``current_tier`` and ``new_tier``.
    """
    theme_file = THEMES_DIR / f"{theme_name}.yaml"
    if not theme_file.exists():
        return {'updated': False, 'reason': 'file not found'}

    # [ \t] rather than \s so the pattern can never run across a newline.
    tier_line = re.compile(r'^([ \t]+tier:[ \t]*)(\S+)', re.MULTILINE)

    # newline='' disables newline translation so CRLF files stay CRLF.
    with open(theme_file, encoding='utf-8', newline='') as handle:
        content = handle.read()

    tier_match = tier_line.search(content)
    if not tier_match:
        return {'updated': False, 'reason': 'no tier field', 'current_tier': 'U'}

    current_tier = tier_match.group(2)
    if current_tier == new_tier:
        return {'updated': False, 'reason': 'unchanged', 'current_tier': current_tier}

    if not dry_run:
        # A callable replacement inserts new_tier literally (no backslash escapes).
        new_content = tier_line.sub(lambda m: m.group(1) + new_tier, content, count=1)
        with open(theme_file, 'w', encoding='utf-8', newline='') as handle:
            handle.write(new_content)

    return {'updated': True, 'current_tier': current_tier, 'new_tier': new_tier}
272
+
273
+
274
def main() -> int:
    """Command-line entry point: rank themes by job-fair delta and update tiers.

    For each theme the run with the most matrix entries (at or above
    ``--min-entries``) that yields valid deltas is selected. Themes are printed
    in descending order of mean delta, their tier is written to the theme file
    (unless ``--dry-run``), and a tier distribution and summary follow.

    Returns:
        0 on completion.
    """
    cli = argparse.ArgumentParser(
        description="Compute tier rankings from job-fair results"
    )
    cli.add_argument('--dry-run', action='store_true',
                     help='Output changes without writing to theme files')
    cli.add_argument('--verbose', action='store_true',
                     help='Show detailed output including skipped runs')
    cli.add_argument('--min-entries', type=int, default=DEFAULT_MIN_ENTRIES,
                     help=f'Minimum matrix entries for a run to be complete (default: {DEFAULT_MIN_ENTRIES})')
    args = cli.parse_args()
    dry_run, verbose, min_entries = args.dry_run, args.verbose, args.min_entries

    if dry_run:
        print('DRY RUN - no changes will be made\n')

    for text in (
        'Configuration:',
        f" Minimum entries for complete run: {min_entries}",
        f" Normalized roles: {', '.join(sorted(NORMALIZED_ROLES))}",
        f" Dev subroles (averaged): {' + '.join(DEV_SUBROLES)} -> dev",
        f" Job fair directory: {JOB_FAIR_DIR}",
        '',
    ):
        print(text)

    runs = find_summary_files()
    print(f"Scanning {len(runs)} job-fair runs...\n")

    best_by_theme = {}
    skipped = []

    def skip(theme_name, run, count, why):
        """Remember a run that was not eligible, for the verbose report."""
        skipped.append({'theme': theme_name, 'run_name': run, 'entries': count, 'reason': why})

    # Pick, per theme, the eligible run with the most matrix entries
    # (the first such run wins on ties).
    for info in runs:
        summary_path, run_name = info['path'], info['run_name']

        theme = yq_get(summary_path, '.theme')
        if not theme:
            continue

        entries = count_matrix_entries(summary_path)
        if entries < min_entries:
            skip(theme, run_name, entries, 'incomplete')
            continue

        deltas = compute_deltas(
            parse_baselines(summary_path),
            parse_matrix_scores(summary_path),
        )
        if not deltas:
            skip(theme, run_name, entries, 'no valid deltas')
            continue

        incumbent = best_by_theme.get(theme)
        if incumbent is None or entries > incumbent['entries']:
            best_by_theme[theme] = {'run_name': run_name, 'entries': entries, **deltas}

    if verbose and skipped:
        print('Skipped Runs (incomplete or invalid):')
        for run in skipped:
            print(f" {run['theme']}: {run['run_name']} ({run['entries']} entries) - {run['reason']}")
        print('')

    ranking = sorted(
        ({'theme': name, **data} for name, data in best_by_theme.items()),
        key=lambda row: row['mean_delta'],
        reverse=True,
    )

    print('Theme Performance Summary')
    print('=' * 70)
    print('')
    columns = f"{'Theme':<28}{'Entries':>8}{'Mean':>8}{'Delta':>10}{'Tier':>6}"
    if verbose:
        columns += ' Source Run'
    print(columns)
    print('-' * 70)

    tier_counts = dict.fromkeys('SABCD', 0)
    updated = unchanged = 0

    for row in ranking:
        theme = row['theme']
        mean_delta = row['mean_delta']

        tier = assign_tier(mean_delta)
        tier_counts[tier] += 1

        sign = '+' if mean_delta >= 0 else ''
        delta_str = f"{sign}{mean_delta:.2f}"
        line = f"{theme:<28}{row['entries']:>8}{row['mean_score']:>8.2f}{delta_str:>10}{tier:>6}"
        if verbose:
            line += f" {row['run_name']}"
        print(line)

        outcome = update_theme_tier(theme, tier, dry_run)
        if not outcome['updated']:
            unchanged += 1
            continue
        updated += 1
        if verbose:
            print(f" -> Updated: {outcome['current_tier']} -> {outcome['new_tier']}")

    print('')
    print('Tier Distribution:')
    for tier, count in tier_counts.items():
        print(f" {tier}: {count} themes")

    # Every theme file without a selected run counts as unbenchmarked.
    unbenchmarked = [
        theme_path.stem
        for theme_path in THEMES_DIR.glob('*.yaml')
        if theme_path.stem not in best_by_theme
    ]
    print(f" U: {len(unbenchmarked)} themes (unbenchmarked)")

    if verbose and unbenchmarked:
        sample = ', '.join(unbenchmarked[:10])
        suffix = '...' if len(unbenchmarked) > 10 else ''
        print(f" {sample}{suffix}")

    print('')
    print(f"Summary: {updated} updated, {unchanged} unchanged")

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env zsh
2
+ # update-theme-tiers.sh - Update tier field in theme YAML files based on THEME-TIERS.md
3
+ #
4
+ # Usage: update-theme-tiers.sh [--dry-run]
5
+
6
+ set -euo pipefail
7
+
8
+ SCRIPT_DIR="${0:A:h}"
9
+ PROJECT_ROOT="${SCRIPT_DIR:h:h:h:h}"
10
+
11
+ THEMES_DIR="$PROJECT_ROOT/pennyfarthing-dist/personas/themes"
12
+ TIERS_DOC="$PROJECT_ROOT/docs/THEME-TIERS.md"
13
+
14
+ DRY_RUN=false
15
+ if [[ "${1:-}" == "--dry-run" ]]; then
16
+ DRY_RUN=true
17
+ echo "DRY RUN - no changes will be made"
18
+ echo ""
19
+ fi
20
+
21
+ # Extract theme-to-tier mapping from THEME-TIERS.md
22
+ typeset -A TIER_MAP
23
+
24
+ current_tier=""
25
+ while IFS= read -r line; do
26
+ # Detect tier section headers
27
+ if [[ "$line" =~ "^## S-Tier" ]]; then
28
+ current_tier="S"
29
+ elif [[ "$line" =~ "^## A-Tier" ]]; then
30
+ current_tier="A"
31
+ elif [[ "$line" =~ "^## B-Tier" ]]; then
32
+ current_tier="B"
33
+ elif [[ "$line" =~ "^## C-Tier" ]]; then
34
+ current_tier="C"
35
+ elif [[ "$line" =~ "^## D-Tier" ]]; then
36
+ current_tier="D"
37
+ elif [[ "$line" =~ "^## U-Tier" ]]; then
38
+ current_tier="U"
39
+ elif [[ "$line" =~ "^## (Role-by-Role|Recommendations|OCEAN|Data|See)" ]]; then
40
+ # Stop parsing at these sections
41
+ break
42
+ fi
43
+
44
+ # Extract theme names from table rows (| **theme** | or | theme |)
45
+ if [[ -n "$current_tier" && "$line" =~ '^\|[[:space:]]*\*?\*?([a-z0-9-]+)\*?\*?[[:space:]]*\|' ]]; then
46
+ theme_name="${match[1]}"
47
+ # Skip header rows
48
+ if [[ "$theme_name" != "Theme" && "$theme_name" != "---" ]]; then
49
+ TIER_MAP[$theme_name]="$current_tier"
50
+ fi
51
+ fi
52
+ done < "$TIERS_DOC"
53
+
54
+ echo "Parsed ${#TIER_MAP[@]} theme tiers from THEME-TIERS.md"
55
+ echo ""
56
+
57
+ # Update each theme file
58
+ updated=0
59
+ skipped=0
60
+ unchanged=0
61
+
62
+ for theme_file in "$THEMES_DIR"/*.yaml; do
63
+ [[ -f "$theme_file" ]] || continue
64
+
65
+ theme_name="${theme_file:t:r}"
66
+ new_tier="${TIER_MAP[$theme_name]:-U}" # Default to U (unbenchmarked) if not found
67
+
68
+ # Get current tier from file
69
+ current_tier=$(grep -E "^ tier:" "$theme_file" 2>/dev/null | sed 's/.*tier:[[:space:]]*//' || echo "")
70
+
71
+ if [[ "$current_tier" == "$new_tier" ]]; then
72
+ unchanged=$((unchanged + 1))
73
+ continue
74
+ fi
75
+
76
+ if [[ "$DRY_RUN" == "true" ]]; then
77
+ echo "Would update $theme_name: ${current_tier:-<none>} → $new_tier"
78
+ updated=$((updated + 1))
79
+ else
80
+ # Use sed to update the tier field
81
+ if [[ -n "$current_tier" ]]; then
82
+ sed -i '' "s/^ tier:.*/ tier: $new_tier/" "$theme_file"
83
+ else
84
+ # Add tier field after user_title line
85
+ sed -i '' "/^ user_title:/a\\
86
+ tier: $new_tier" "$theme_file"
87
+ fi
88
+ echo "Updated $theme_name: ${current_tier:-<none>} → $new_tier"
89
+ updated=$((updated + 1))
90
+ fi
91
+ done
92
+
93
+ echo ""
94
+ echo "Summary:"
95
+ echo " Updated: $updated"
96
+ echo " Unchanged: $unchanged"
97
+ echo " Skipped: $skipped"