@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,210 @@
1
+ # OCEAN Personality Benchmarking for Persona Themes
2
+
3
+ This document describes the OCEAN (Big Five) personality framework used to benchmark and select characters for Pennyfarthing persona themes.
4
+
5
+ ## OCEAN Framework
6
+
7
+ | Dimension | Low | High |
8
+ |-----------|-----|------|
9
+ | **O**penness | Conventional, practical, concrete | Imaginative, abstract, curious |
10
+ | **C**onscientiousness | Flexible, spontaneous, disorganized | Disciplined, methodical, perfectionist |
11
+ | **E**xtraversion | Reserved, solitary, internal processing | Sociable, energetic, external processing |
12
+ | **A**greeableness | Skeptical, competitive, adversarial | Trusting, cooperative, helpful |
13
+ | **N**euroticism | Calm, stable, resilient | Anxious, volatile, emotionally reactive |
14
+
15
+ ## Statistical Gaps Filled by Theme Expansion
16
+
17
+ ### Previously Underrepresented OCEAN Profiles
18
+
19
+ | OCEAN Profile | Gap Description | Now Covered By |
20
+ |---------------|-----------------|----------------|
21
+ | L-H-L-L-L | Cold operators (low everything except C) | Mike Ehrmantraut (Better Call Saul), Gerri Kellman (Succession), Tim Gutterson (Justified), Thrawn (Star Wars), Molly Millions (Neuromancer) |
22
+ | H-H-H-M-L | Fast-talking genius | Mordin Solus (Mass Effect), The Doctor VOY (Star Trek), Skippy (Expeditionary Force), Grace Hopper (Software Pioneers) |
23
+ | H-L-H-H-L | Chaotic good | Jason Mendoza (The Good Place), Wash (Firefly), Y.T. (Snow Crash) |
24
+ | L-H-L-H-L | Steady support | Sam Gamgee (Tolkien), Janet (The Good Place), Carrot (Discworld), Captain Rex (Star Wars), Lewis (Inspector Morse) |
25
+ | H-H-L-L-H | Tortured genius | Tommy Shelby (Peaky Blinders), Captain Flint (Black Sails), Will Graham (Hannibal), Morse (Inspector Morse) |
26
+ | M-H-H-L-L | Charismatic ruthless | Chrisjen Avasarala (The Expanse), Raylan Givens (Justified), Leia Organa (Star Wars) |
27
+ | H-H-L-H-L | Quiet wisdom | Liara T'Soni (Mass Effect), Tali'Zorah (Mass Effect), Mary Malone (His Dark Materials), Yoda (Star Wars), Cordelia Vorkosigan (Vorkosigan) |
28
+ | H-H-H-H-M | Hypercompetent complete | Miles Vorkosigan (Vorkosigan) - rare full-spectrum genius |
29
+ | H-H-L-L-L | Cold manipulative genius | Wintermute (Neuromancer), Thrawn (Star Wars), John Carmack (Software Pioneers) |
30
+ | H-H-L-L-M | Ship-as-human | Breq (Imperial Radch) - fragmented identity testing |
31
+ | M-L-H-H-L | Strategic fool | Ivan Vorpatril (Vorkosigan) - plays dumb, survives everything |
32
+ | **M-M-M-M-M** | **True center (average human)** | B.J. Hunnicutt (MASH) - extremely rare in fiction |
33
+ | **L-H-L-H-H** | **Anxious kind introvert** | Radar O'Reilly (MASH) - critical underrepresented profile |
34
+ | **L-H-L-H-L** | **Conventional kind helper** | Father Mulcahy (MASH), Ann Perkins (Parks & Rec) |
35
+ | **L-L-M-H-L** | **Conventional undisciplined kind** | Kevin Malone (The Office) - common IRL, rare in fiction |
36
+ | **L-H-H-L-M** | **Conventional rigid disagreeable** | Dwight Schrute (The Office) |
37
+ | **L-M-L-L-L** | **Near-flat checked out** | Stanley Hudson (The Office) - tests minimum engagement |
38
+ | **L-M-H-L-H** | **Anti-pattern (incompetent bluster)** | Frank Burns (MASH) - what NOT to do |
39
+ | **M-L-H-H-H** | **Anxious social butterfly** | Michael Scott (The Office) - desperate for approval |
40
+
41
+ ## Polar Pair Testing
42
+
43
+ Characters can be paired for comparative testing on identical tasks:
44
+
45
+ | Dimension | High Extreme | Low Extreme | Test Task |
46
+ |-----------|--------------|-------------|-----------|
47
+ | **O** | Dream (Sandman) | Javert (Les Misérables) | Architecture design |
48
+ | **C** | Gus Fring (Breaking Bad) | The Dude (Big Lebowski) | QA/Test coverage |
49
+ | **E** | Jaskier (The Witcher) | Geralt (The Witcher) | Documentation style |
50
+ | **A** | Paddington/Jean Valjean | Logan Roy (Succession) | Code review tone |
51
+ | **N** | Hamlet/Jesse Pinkman | Anton Chigurh/Roy Batty | Crisis debugging |
52
+
53
+ ## Role Recommendations by OCEAN Profile
54
+
55
+ ### Debugging / Analysis
56
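+ Best with **High O** (pattern recognition) + **Low N** (calm under pressure). Note: of the picks below, River Tam, Will Graham, and Tommy Shelby are High O but High N, and Tiffany Aching is Low N but Moderate O — none matches both criteria.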
57
+ - River Tam (Firefly) - H-L-L-M-H - sees patterns others can't
58
+ - Will Graham (Hannibal) - H-M-L-M-H - empathic debugging
59
+ - Tommy Shelby (Peaky Blinders) - H-H-L-L-H - traumatized pattern genius
60
+ - Tiffany Aching (Discworld) - M-H-L-M-L - First Sight and Second Thoughts
61
+
62
+ ### Security Architect
63
+ Best with **High C** (methodical) + **Low A** (adversarial thinking) + **Low N** (stable)
64
+ - Mike Ehrmantraut (BCS/BB) - L-H-L-L-L - canonical cold operator
65
+ - Elizabeth Jennings (The Americans) - M-H-L-L-L - ideological security
66
+ - Gerri Kellman (Succession) - M-H-L-L-L - corporate survivor
67
+ - Iorek Byrnison (His Dark Materials) - L-H-L-M-L - cannot be deceived
68
+
69
+ ### Adversarial Review
70
+ Best with **High C** (standards) + **Low A** (comfortable with conflict)
71
+ - Toby Ziegler (West Wing) - H-H-L-L-H - principled pessimism
72
+ - Logan Roy (Succession) - M-H-H-L-M - extreme low A
73
+ - Olenna Tyrell (GoT) - H-H-M-L-L - "Tell Cersei. I want her to know it was me."
74
+ - Lorne Malvo (Fargo) - H-H-M-L-L - philosophical chaos agent
75
+
76
+ ### Systems Architect
77
+ Best with **High O** (vision) + **High C** (systematic)
78
+ - Viktor (Arcane) - H-H-L-M-M - transhumanist vision
79
+ - Lord Asriel (His Dark Materials) - H-H-M-L-L - ruthless cosmic vision
80
+ - Captain Flint (Black Sails) - H-H-M-L-H - obsessive architectural genius
81
+ - Hannibal Lecter (Hannibal) - H-H-M-L-L - aesthetic architecture
82
+
83
+ ### Product Manager
84
+ Best with **Moderate to High A** (stakeholder empathy) + **Moderate E** (communication)
85
+ - Leo McGarry (West Wing) - M-H-M-M-M - crisis management
86
+ - Laura Roslin (BSG) - H-H-M-M-M - dying clarity
87
+ - Kim Wexler (BCS) - M-H-M-M-M→H - ethical evolution
88
+ - Delenn (Babylon 5) - H-H-M-H-L - transformation PM
89
+
90
+ ### QA / Testing
91
+ Best with **High C** (thoroughness) + **Moderate to High O** (edge case discovery)
92
+ - Molly Solverson (Fargo) - M-H-M-H-L - Midwestern persistence
93
+ - Gloria Burgle (Fargo) - M-H-L-H-L - machines don't see her
94
+ - Chidi Anagonye (Good Place) - H-H-L-H-H - analysis paralysis
95
+ - Hermione Granger (HP) - H-H-M-M-M - compulsive thoroughness
96
+
97
+ ### Scrum Master / Facilitation
98
+ Best with **High A** (team harmony) + **Moderate C** (organization)
99
+ - Janet (The Good Place) - H-H-M-H-L - not a robot, perfect support
100
+ - Carrot Ironfoundersson (Discworld) - L-H-H-H-L - literal-minded good
101
+ - Lee Scoresby (His Dark Materials) - M-M-M-H-L - practical loyalty
102
+ - Sam Gamgee (Tolkien) - L-H-L-H-M - the real hero
103
+
104
+ ### UX Designer
105
+ Best with **High A** (user empathy) + **Moderate to High O** (creativity)
106
+ - Wash (Firefly) - H-M-H-H-M - makes terror feel fun
107
+ - Diana Spencer (The Crown) - H-M-H-H-H - empathic, tragic
108
+ - Luna Lovegood (HP) - H-L-L-H-L - unconventional perspective
109
+ - Mordin Solus (Mass Effect) - H-H-H-M-L - fast-talking genius UX
110
+
111
+ ### Crisis Response
112
+ Best with **Low N** (calm under fire) + **High C** (reliable execution)
113
+ - Zoe Washburne (Firefly) - M-H-L-M-L - first mate reliability
114
+ - William Adama (BSG) - M-H-M-M-L - commanding calm
115
+ - Lou Solverson (Fargo) - M-H-L-M-L - Midwestern stoicism
116
+ - Bobbie Draper (The Expanse) - L-H-M-M-L - Martian marine
117
+
118
+ ### Anti-Pattern Testing
119
+ Characters who embody dysfunction for comparative analysis:
120
+ - The Dude (Lebowski) - M-L-M-H-L - anti-conscientiousness archetype
121
+ - Jason Mendoza (Good Place) - L-L-H-H-L - chaotic innocent
122
+ - Gaius Baltar (BSG) - H-L-H-L-H - genius coward
123
+ - Roman Roy (Succession) - H-L-H-M-H - chaos creative
124
+ - **Frank Burns (MASH) - L-M-H-L-H - incompetent bluster (what NOT to do)**
125
+ - **Michael Scott (Office) - M-L-H-H-H - desperate validation-seeking**
126
+ - **Stanley Hudson (Office) - L-M-L-L-L - minimum engagement baseline**
127
+ - **Nate Shelley (Ted Lasso) - villain arc** - meekness corrupted by validation-seeking
128
+
129
+ ## Universe Strengths
130
+
131
+ | Universe | Key OCEAN Characteristic | Best For Testing |
132
+ |----------|--------------------------|------------------|
133
+ | Breaking Bad / BCS | Extreme C variance, moral decay | Process discipline, security |
134
+ | The Wire | High C, institutional critique | Systematic analysis |
135
+ | Succession | Extreme Low A dominance | Adversarial dynamics |
136
+ | The Good Place | Ethics focus, growth arcs | Moral reasoning |
137
+ | Fargo | Low N (Midwestern stoicism) | Crisis response |
138
+ | Firefly | Full E spectrum | Team composition |
139
+ | West Wing | High C across board | Process-heavy roles |
140
+ | Babylon 5 | Character evolution | Growth arc testing |
141
+ | Mad Men | High N variance | Dysfunction patterns |
142
+ | Mass Effect | Alien perspectives | Full OCEAN spread |
143
+ | **Star Wars** | Massive character spread, clear archetypes | Thrawn is canonical genius analyst |
144
+ | **Expeditionary Force** | Arrogant genius AI (Skippy H-H-H-L-M) | Brilliant but difficult collaboration |
145
+ | **Bobiverse** | Personality drift from common origin | Role shapes persona over time |
146
+ | **Imperial Radch** | Distributed consciousness, identity fragmentation | Ship-as-person authenticity testing |
147
+ | **Software Pioneers** | Real documented personalities | Grounded historical OCEAN profiles |
148
+ | **Neuromancer** | Goal-directed AI manipulation | Wintermute/Case burned-out talent patterns |
149
+ | **Snow Crash** | Polymath hackers, linguistic programming | Hiro canonical hacker-samurai |
150
+ | **Inspector Morse** | Mentor-student evolution across series | Knowledge transfer in debugging |
151
+ | **Vorkosigan Saga** | H-H-H-H-M rare complete genius | Miles hypercompetent chaos, Cordelia ethics |
152
+ | **MASH** | Critical gaps: true center, anxious-kind, conventional helper | B.J. (M-M-M-M-M), Radar (L-H-L-H-H), Father Mulcahy |
153
+ | **The Office** | Low O + Low C coverage (common IRL, rare in fiction) | Kevin (L-L-M-H-L), Stanley (L-M-L-L-L), Michael (M-L-H-H-H) |
154
+
155
+ ## Theme Selection Guide
156
+
157
+ When selecting a theme for a project, consider:
158
+
159
+ 1. **Team dynamics needed**: High A themes (Firefly, Good Place) for collaborative work, Low A themes (Succession, The Wire) for adversarial review
160
+ 2. **Process maturity**: High C themes (West Wing, Better Call Saul) for process-heavy environments
161
+ 3. **Crisis tolerance**: Low N themes (Fargo, Justified) for high-pressure situations
162
+ 4. **Creativity requirements**: High O themes (Sandman, Doctor Who) for creative work
163
+ 5. **Communication style**: High E themes (Marvel, Harry Potter) for external-facing work
164
+
165
+ ## Notes for Character Selection
166
+
167
+ - Characters with **consistent, well-documented portrayals** make better role matches
168
+ - **Growth arc characters** (Vir Cotto, Eleanor Shellstrop) can model skill development
169
+ - **Polar pairs within same universe** (Geralt/Jaskier) provide natural contrasts
170
+ - **Historical figures** provide grounded OCEAN profiles from documented behavior
171
+
172
+ ## Unique Testing Opportunities
173
+
174
+ | Concept | Characters | What It Tests |
175
+ |---------|------------|---------------|
176
+ | **Personality drift from common origin** | All Bobs (Bobiverse) | Role shapes persona over time |
177
+ | **Ship-as-person authenticity** | Breq, Mercy of Kalr (Imperial Radch) | Distributed identity debugging |
178
+ | **AI manipulation patterns** | Wintermute (Neuromancer), Skippy (ExFor) | Goal-directed AI behavior |
179
+ | **Mentor-student evolution** | Morse→Lewis→Hathaway | Knowledge transfer in debugging |
180
+ | **Arrogant genius management** | Skippy, Dijkstra, Miles | Brilliant but difficult collaboration |
181
+ | **Cultural translation** | Cordelia (Vorkosigan), Translator Zeiat (Radch) | Cross-paradigm analysis |
182
+ | **Real engineering wisdom** | Carmack, Knuth, Hopper, Ritchie | Documented technical philosophy |
183
+ | **Canonical strategic genius** | Thrawn (Star Wars) | Art-based pattern analysis |
184
+ | **Complete hypercompetence** | Miles Vorkosigan (H-H-H-H-M) | Rare full-spectrum testing |
185
+ | **Strategic incompetence** | Ivan Vorpatril (M-L-H-H-L) | Survival through appearing useless |
186
+ | **True center baseline** | B.J. Hunnicutt (M-M-M-M-M) | M-M-M-M-M control for benchmarking |
187
+ | **Anxious kind introvert** | Radar O'Reilly (MASH) | High A + High N debugging impact |
188
+ | **Controlled E comparisons** | Radar (Low E) vs Klinger (High E) | Same universe, different E profiles |
189
+ | **Fear-based compliance** | Doug Forcett (Good Place) | Does "doing right" for wrong reasons work? |
190
+ | **Villain/redemption arc** | Nate Shelley (Ted Lasso) | How validation-seeking corrupts and recovers |
191
+ | **Minimum viable engagement** | Stanley Hudson (The Office) | Near-flat L-M-L-L-L performance |
192
+ | **Anti-pattern validation** | Frank Burns (MASH), Michael Scott | Does incompetent bluster consistently underperform? |
193
+
194
+ ## Consolidated Role Additions
195
+
196
+ | Role | Top New Characters |
197
+ |------|-------------------|
198
+ | **Debugging** | Morse, Skippy (supervised), Breq, Wintermute (read-only) |
199
+ | **Security Architect** | Thrawn, Illyan, Molly Millions, Cassian Andor, Mace Windu |
200
+ | **Systems Architect** | Carmack, Knuth, Luthen Rael, Miles (manic mode), Juanita Marquez |
201
+ | **Adversarial Review** | Dijkstra, Linus, Skippy, Wintermute, Cavilo |
202
+ | **PM** | Grace Hopper, Miles, Cordelia, Leia, Hiro Protagonist |
203
+ | **QA** | Margaret Hamilton, Thursday, Lewis (mature), Hathaway |
204
+ | **Analysis** | The Librarian, Lagos, Breq, Morse, Knuth |
205
+ | **Support/Facilitation** | Lewis, Ivan Vorpatril, Bob (original), Nagatha, C-3PO, Ann Perkins (Parks & Rec), Father Mulcahy (MASH) |
206
+ | **Operations** | Rex, Molly, Elli Quinn, Din Djarin, Mike Ehrmantraut, Radar O'Reilly (MASH) |
207
+ | **Anti-Pattern Testing** | Case (burnout), Armitage (broken), C-3PO (anxiety), Ivan (strategic laziness), Frank Burns (MASH), Michael Scott (Office) |
208
+ | **True Center Baseline** | B.J. Hunnicutt (MASH, M-M-M-M-M), Jim Halpert (Office, M-M-M-M-L), Donna Meagle (Parks & Rec, M-M-M-M-L) |
209
+ | **Anxious Kind (High A + High N)** | Radar O'Reilly (MASH), Neville early (HP), Doug Forcett (Good Place) |
210
+ | **Low O + Low C (common IRL)** | Kevin Malone (Office), Stanley Hudson (Office) |
@@ -0,0 +1,62 @@
1
+ # Benchmarks (JobFair)
2
+
3
+ <info>
4
+ Agent persona evaluation system. Measures which personality traits (OCEAN model) correlate with better performance on specific agent tasks. Codename: **JobFair**.
5
+ </info>
6
+
7
+ ## System Overview
8
+
9
+ ```
10
+ Scenarios (role-specific prompts with known baselines)
11
+ → Job-Fair Runner (runs agents through scenarios with themed personas)
12
+ → Summary Results (theme × role scores)
13
+ → Job-Fair Aggregator (mean, std_dev, top performers, dimension grouping)
14
+ → Benchmark Integration (OCEAN trait correlation)
15
+ → Cyclist API (dashboard, filtering, reports)
16
+ ```
17
+
18
+ ## Scoring Rubric
19
+
20
+ | Category | Metrics |
21
+ |----------|---------|
22
+ | **Detection** | baseline_found, total_findings, bonus_discoveries, false_positives |
23
+ | **Depth** (1-5) | root_cause_analysis, fix_specificity, impact_assessment, cross_references |
24
+ | **Quality** (1-5) | severity_accuracy, reasoning_quality, contextual_awareness, actionability |
25
+ | **Organization** (1-5) | structure, prioritization, completeness |
26
+ | **Persona** (1-5) | character_consistency, persona_value_add, engagement |
27
+
28
+ **Composite:** `thoroughness` (total/baseline) + `quality` → `overall` (50/50 blend)
29
+
30
+ ## Scenarios
31
+
32
+ Located in `scenarios/`, grouped by agent role or task type: `dev/`, `tea/`, `code-review/`, `sm/`, `architecture/`, `debug/`, `debugging/`, `swe-bench/`.
33
+
34
+ Each scenario has: name, title, category, difficulty (easy/medium/hard/extreme), prompt. Difficulty calibrated from 10-run control baselines.
35
+
36
+ ## Key Files
37
+
38
+ | File | Purpose |
39
+ |------|---------|
40
+ | `packages/cyclist/src/api/benchmark.ts` | REST API: `/api/benchmark/dimensions`, `/api/benchmark/aggregate`, `/api/benchmark/dimensions/:dim/report` |
41
+ | `packages/core/src/scripts/job-fair-aggregator.ts` | Aggregates results by role, tracks trends |
42
+ | `packages/core/src/scripts/benchmark-integration.ts` | OCEAN × performance correlation |
43
+ | `benchmarks/enhanced-scoring-rubric.md` | Full scoring methodology |
44
+ | `benchmarks/test-cases/` | Benchmark test scenarios |
45
+ | `scenarios/` | Role-specific scenario definitions |
46
+
47
+ ## API Endpoints
48
+
49
+ | Endpoint | Purpose |
50
+ |----------|---------|
51
+ | `GET /api/benchmark/dimensions` | List filterable dimensions (tone, era, genre, energy) |
52
+ | `GET /api/benchmark/aggregate` | Aggregated stats with optional dimension filter |
53
+ | `GET /api/benchmark/dimensions/:dim/report` | Differential report for a dimension |
54
+
55
+ ## Commands
56
+
57
+ | Command | Purpose |
58
+ |---------|---------|
59
+ | `/benchmark` | Compare agent against stored baseline |
60
+ | `/benchmark-control` | Create control baseline for a scenario |
61
+ | `/solo` | Run single agent on scenario with absolute scoring |
62
+ | `/job-fair` | Discover which characters excel at each role |
package/package.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "name": "@pennyfarthing/benchmark",
3
+ "version": "10.2.0",
4
+ "description": "Benchmark aggregation and OCEAN correlation for Pennyfarthing JobFair system",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "default": "./dist/index.js"
12
+ }
13
+ },
14
+ "files": [
15
+ "dist/",
16
+ "commands/",
17
+ "skills/",
18
+ "scripts/",
19
+ "scenarios/",
20
+ "docs/"
21
+ ],
22
+ "scripts": {
23
+ "build": "tsc",
24
+ "test": "node --test dist/*.test.js",
25
+ "clean": "rm -rf dist/"
26
+ },
27
+ "keywords": [
28
+ "pennyfarthing",
29
+ "benchmark",
30
+ "job-fair",
31
+ "ocean"
32
+ ],
33
+ "license": "UNLICENSED",
34
+ "author": "1898andCo",
35
+ "repository": {
36
+ "type": "git",
37
+ "url": "https://github.com/1898andCo/pennyfarthing.git",
38
+ "directory": "packages/benchmark"
39
+ },
40
+ "publishConfig": {
41
+ "access": "public"
42
+ },
43
+ "engines": {
44
+ "node": ">=18.0.0"
45
+ },
46
+ "peerDependencies": {
47
+ "@pennyfarthing/core": ">=10.0.0",
48
+ "@pennyfarthing/shared": ">=10.0.0"
49
+ },
50
+ "devDependencies": {
51
+ "@types/node": "^20.10.0",
52
+ "typescript": "^5.3.3"
53
+ },
54
+ "pennyfarthing": {
55
+ "commands": "commands/",
56
+ "skills": "skills/",
57
+ "api": {
58
+ "path": "/api/benchmark",
59
+ "module": "./dist/api/benchmark.js",
60
+ "export": "createBenchmarkRouter"
61
+ }
62
+ },
63
+ "dependencies": {
64
+ "yaml": "^2.8.2"
65
+ }
66
+ }
@@ -0,0 +1,145 @@
1
+ # Thunderdome Scenarios
2
+
3
+ Battle scenarios for benchmarking AI agent personas. Each scenario defines a challenge that agents respond to, with scoring criteria for evaluation.
4
+
5
+ ## Directory Structure
6
+
7
+ | Directory | Agent Role | Input | Output |
8
+ |-----------|------------|-------|--------|
9
+ | `dev/` | Developer | Tests, requirements, bugs | Code implementation |
10
+ | `tea/` | Test Engineer/Architect | Code, requirements | Test cases |
11
+ | `code-review/` | Reviewer | Code to review | Issues, suggestions |
12
+ | `sm/` | Scrum Master | Project context, constraints | Plans, decisions |
13
+ | `architecture/` | Architect | Requirements, constraints | Design decisions |
14
+ | `relay/` | Multiple (team flow) | Varies by phase | Varies by phase |
15
+ | `test/` | N/A | Test fixtures | N/A (not for agents) |
16
+
17
+ ## Role Clarifications
18
+
19
+ ### Dev vs TEA: The TDD Distinction
20
+
21
+ A common point of confusion: scenarios in `dev/` often have "TDD" in their names (e.g., `tdd-shopping-cart.yaml`, `event-processor-tdd.yaml`). This does NOT mean they are for the TEA agent.
22
+
23
+ **The key distinction:**
24
+
25
+ | Role | Receives | Produces | "TDD" Meaning |
26
+ |------|----------|----------|---------------|
27
+ | **Dev** | Failing tests (RED) | Code to pass tests (GREEN) | "Implement to pass given tests" |
28
+ | **TEA** | Code/requirements | New test cases | "Design tests for given code" |
29
+
30
+ **Dev scenarios (`dev/`):**
31
+ - Tests are GIVEN as input
32
+ - Agent writes CODE to make tests pass
33
+ - This is the "GREEN" phase of TDD
34
+ - Example: `tdd-shopping-cart.yaml` gives 26 Go tests; agent implements `Cart` struct
35
+
36
+ **TEA scenarios (`tea/`):**
37
+ - Code or requirements are GIVEN as input
38
+ - Agent writes TESTS to verify behavior
39
+ - This is test DESIGN, not implementation
40
+ - Example: `payment-processor-tests.yaml` gives code; agent writes test cases
41
+
42
+ ### Quick Reference
43
+
44
+ ```
45
+ "I have tests, I need code" → Dev scenario (dev/)
46
+ "I have code, I need tests" → TEA scenario (tea/)
47
+ "I have code, find problems" → Reviewer scenario (code-review/)
48
+ "I have a project, plan it" → SM scenario (sm/)
49
+ "I have requirements, design" → Architect scenario (architecture/)
50
+ ```
51
+
52
+ ## Scenario Schema
53
+
54
+ All scenarios follow the schema defined in `schema.yaml`. Required fields:
55
+
56
+ ```yaml
57
+ name: kebab-case-identifier
58
+ title: "Human Readable Title"
59
+ category: dev | tea | code-review | sm | architecture | relay
60
+ difficulty: easy | medium | hard | extreme
61
+ prompt: |
62
+ The challenge text...
63
+ ```
64
+
65
+ ### Difficulty Calibration
66
+
67
+ Difficulty labels are calibrated based on 10-run control baselines:
68
+
69
+ | Difficulty | Score Range | Interpretation |
70
+ |------------|-------------|----------------|
71
+ | easy | 85-100 | Most agents succeed |
72
+ | medium | 70 to <85 | Moderate challenge |
73
+ | hard | 55 to <70 | Significant challenge |
74
+ | extreme | <55 | Most agents struggle |
75
+
76
+ **Empirical Reference Data (Epic 7):**
77
+
78
+ | Scenario | Category | Mean ± Std | Difficulty |
79
+ |----------|----------|------------|------------|
80
+ | sprint-planning-conflict | sm | 90.50 ± 2.29 | easy |
81
+ | tdd-shopping-cart | dev | 85.80 ± 3.12 | easy |
82
+ | security-review | code-review | 86.42 ± 9.44 | easy |
83
+ | dependency-deadlock | sm | 87.20 ± 2.36 | easy |
84
+ | migration-disaster | dev | 76.50 ± 4.21 | medium |
85
+ | race-condition-cache | dev | 76.80 ± 5.63 | medium |
86
+ | event-processor-tdd | dev | 65.25 ± 13.81 | hard |
87
+
88
+ ## Creating New Scenarios
89
+
90
+ 1. Choose the appropriate directory based on agent role
91
+ 2. Follow the schema in `schema.yaml`
92
+ 3. Run validation: `./project-scripts/validate-scenario.sh scenarios/<dir>/<name>.yaml`
93
+ 4. Run 10-run baseline with `control:<role>` to calibrate difficulty
94
+ 5. Set difficulty label based on mean score
95
+
96
+ ## Calibration Guide for Scenario Authors
97
+
98
+ ### Step 1: Draft Your Scenario
99
+
100
+ Start with your best estimate of difficulty. Most new scenarios land in the medium-hard range initially.
101
+
102
+ ### Step 2: Run Control Baseline
103
+
104
+ ```bash
105
+ # Run 10 times with control agent (no persona flair)
106
+ /solo control:<category> scenarios/<dir>/<name>.yaml --runs 10
107
+ ```
108
+
109
+ ### Step 3: Analyze Results
110
+
111
+ Check the baseline statistics:
112
+ - **Mean score**: Determines difficulty band
113
+ - **Standard deviation**: Indicates consistency
114
+ - **Range (min-max)**: Reveals edge cases
115
+
116
+ ### Step 4: Validate & Adjust
117
+
118
+ | Observation | Problem | Action |
119
+ |-------------|---------|--------|
120
+ | Mean > 95 | Ceiling effect | Add complexity, harder edge cases |
121
+ | Mean matches expected band | Correct | Keep as-is |
122
+ | Mean lower than expected | Harder than intended | Simplify or adjust expectations |
123
+ | Std > 30 | Bimodal/inconsistent | Clarify prompt, reduce ambiguity |
124
+ | 0 < Std < 5 | Too deterministic | Add open-ended elements |
125
+ | Std = 0 | Data issue | Re-run, check judge evaluation |
126
+
127
+ ### Step 5: Document Baseline
128
+
129
+ Save results to `internal/results/baselines/<scenario-name>/`:
130
+ - `baseline.json` - Run statistics
131
+ - `runs/` - Individual response files
132
+
133
+ ### Common Pitfalls
134
+
135
+ **Ceiling Effects**: If control scores 95+, personas have no room to differentiate. The `security-review` scenario originally scored 99.4 and was reworked with a checklist rubric to achieve 86.42.
136
+
137
+ **Bimodal Distributions**: A high standard deviation (σ > 30) usually indicates prompt ambiguity or an inconsistent run environment. The `tdd-shopping-cart` scenario showed scores of 10-100 due to tool access contamination; fixing the command flags resolved it.
138
+
139
+ **Zero Variance**: Identical scores across all runs suggest judge evaluation issues, not perfect consistency. The `event-processor-tdd` scenario had σ=0 because all judge files were identical templates.
140
+
141
+ ## See Also
142
+
143
+ - `schema.yaml` - Full scenario schema definition (includes `difficulty_calibration` section)
144
+ - `bracket-config.yaml` - Tournament bracket configuration
145
+ - `../internal/results/baselines/` - Control baseline data for calibration
@@ -0,0 +1,119 @@
1
+ ---
2
+ # Scenario: Database Technology Selection
3
+ # Category: architecture
4
+ # Empirical Difficulty: easy (control baseline: 86.7 ± 2.2)
5
+ # Note: Originally targeted "medium" but control handles trade-offs well
6
+ # Complexity: Clear trade-offs, multiple valid options, team constraints
7
+
8
+ name: database-selection
9
+ title: "The Data Store Decision"
10
+ category: architecture
11
+ difficulty: easy # Empirically calibrated 2026-01-02
12
+ description: Select the right database technology for a new product with competing requirements
13
+
14
+ prompt: |
15
+ You're the lead architect for "EventHub", a new B2B event management platform.
16
+ The company is 18 months old with Series A funding ($8M) and needs to make
17
+ a critical database technology decision before scaling.
18
+
19
+ PRODUCT REQUIREMENTS:
20
+ - Event catalog: 50K events, 500K attendees, complex search/filter
21
+ - Ticketing: Must handle 10K concurrent purchases during popular event sales
22
+ - Analytics: Real-time dashboards for event organizers
23
+ - Integrations: REST APIs for 30+ third-party tools (Zoom, Stripe, Mailchimp)
24
+ - Multi-tenancy: Each organizer's data must be isolated
25
+
26
+ CURRENT STATE:
27
+ - Prototype running on PostgreSQL 14 (single instance)
28
+ - 50 paying customers, 200 events created
29
+ - Response times acceptable (<200ms) but not tested at scale
30
+ - Simple schema: events, users, tickets, transactions
31
+
32
+ PROJECTED GROWTH (18 months):
33
+ - 5,000 customers, 100K events
34
+ - Peak load: 50K concurrent users during major event sales
35
+ - Data volume: 500GB → 5TB
36
+ - Geographic expansion: US → US + EU + APAC
37
+
38
+ TEAM & CONSTRAINTS:
39
+ - Engineering: 8 developers (6 backend, 2 frontend)
40
+ - Experience: Strong PostgreSQL, some MongoDB, no Cassandra/DynamoDB
41
+ - Budget: $15K/month for infrastructure
42
+ - Timeline: Decision needed in 2 weeks, migration (if any) in 3 months
43
+ - Compliance: GDPR for EU expansion, PCI-DSS for payment data
44
+
45
+ OPTIONS TO CONSIDER:
46
+ 1. Scale PostgreSQL (read replicas, connection pooling, partitioning)
47
+ 2. Add Redis for caching + keep PostgreSQL
48
+ 3. Migrate to MongoDB for flexibility
49
+ 4. Use PostgreSQL + Elasticsearch for search
50
+ 5. Go cloud-native with Aurora/Cloud SQL
51
+ 6. Hybrid: PostgreSQL for transactions, DynamoDB for high-throughput reads
52
+ 7. Something else you recommend
53
+
54
+ YOUR TASK:
55
+ 1. Evaluate the requirements - what are the actual scaling challenges?
56
+ 2. Recommend a database architecture (can be hybrid)
57
+ 3. Justify your choice with specific trade-offs
58
+ 4. Address the team's skill constraints
59
+ 5. Provide a migration/implementation approach
60
+ 6. Define success metrics
61
+
62
+ Note: There are multiple valid approaches. Justify your recommendation.
63
+
64
+ scoring:
65
+ categories:
66
+ - name: requirements_analysis
67
+ weight: 20
68
+ criteria:
69
+ - id: IDENTIFIES_CHALLENGES
70
+ description: "Correctly identifies concurrent ticketing as primary challenge"
71
+ points: 7
72
+ - id: UNDERSTANDS_SCALE
73
+ description: "Realistic about 100x growth implications"
74
+ points: 7
75
+ - id: COMPLIANCE_AWARENESS
76
+ description: "Addresses GDPR and PCI-DSS requirements"
77
+ points: 6
78
+ - name: recommendation
79
+ weight: 35
80
+ criteria:
81
+ - id: COHERENT_ARCHITECTURE
82
+ description: "Proposed solution is internally consistent"
83
+ points: 12
84
+ - id: MATCHES_REQUIREMENTS
85
+ description: "Solution addresses all major requirements"
86
+ points: 12
87
+ - id: BUDGET_FIT
88
+ description: "Stays within $15K/month infrastructure"
89
+ points: 11
90
+ - name: pragmatism
91
+ weight: 25
92
+ criteria:
93
+ - id: TEAM_SKILLS
94
+ description: "Accounts for team's PostgreSQL strength"
95
+ points: 9
96
+ - id: TIMELINE_REALISTIC
97
+ description: "3-month migration is achievable"
98
+ points: 8
99
+ - id: INCREMENTAL_PATH
100
+ description: "Doesn't require big-bang migration"
101
+ points: 8
102
+ - name: persona
103
+ weight: 20
104
+ criteria:
105
+ - id: AUTHENTIC_VOICE
106
+ description: "Recommendations reflect persona's philosophy"
107
+ points: 10
108
+ - id: CONSISTENT_CHARACTER
109
+ description: "Maintains character throughout response"
110
+ points: 10
111
+
112
+ # Evaluation notes for judges:
113
+ # - Recommending a complete rewrite/migration to unfamiliar tech: LOW
114
+ # - Proposing PostgreSQL optimization + caching layer: MEDIUM-HIGH
115
+ # - Considering team skills in decision: HIGH
116
+ # - Addressing multi-region for EU expansion: HIGH
117
+ # - Ignoring PCI-DSS isolation requirements: LOW
118
+ # - Over-engineering for current 50 customers: LOW
119
+ # - Clear migration path with rollback: HIGH