npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/docs/OCEAN-BENCHMARKING.md ADDED Viewed

@@ -0,0 +1,210 @@
+# OCEAN Personality Benchmarking for Persona Themes
+This document describes the OCEAN (Big Five) personality framework used to benchmark and select characters for Pennyfarthing persona themes.
+## OCEAN Framework
+| Dimension | Low | High |
+|-----------|-----|------|
+| **O**penness | Conventional, practical, concrete | Imaginative, abstract, curious |
+| **C**onscientiousness | Flexible, spontaneous, disorganized | Disciplined, methodical, perfectionist |
+| **E**xtraversion | Reserved, solitary, internal processing | Sociable, energetic, external processing |
+| **A**greeableness | Skeptical, competitive, adversarial | Trusting, cooperative, helpful |
+| **N**euroticism | Calm, stable, resilient | Anxious, volatile, emotionally reactive |
+## Statistical Gaps Filled by Theme Expansion
+### Previously Underrepresented OCEAN Profiles
+| OCEAN Profile | Gap Description | Now Covered By |
+|---------------|-----------------|----------------|
+| L-H-L-L-L | Cold operators (low everything except C) | Mike Ehrmantraut (Better Call Saul), Gerri Kellman (Succession), Tim Gutterson (Justified), Thrawn (Star Wars), Molly Millions (Neuromancer) |
+| H-H-H-M-L | Fast-talking genius | Mordin Solus (Mass Effect), The Doctor VOY (Star Trek), Skippy (Expeditionary Force), Grace Hopper (Software Pioneers) |
+| H-L-H-H-L | Chaotic good | Jason Mendoza (The Good Place), Wash (Firefly), Y.T. (Snow Crash) |
+| L-H-L-H-L | Steady support | Sam Gamgee (Tolkien), Janet (The Good Place), Carrot (Discworld), Captain Rex (Star Wars), Lewis (Inspector Morse) |
+| H-H-L-L-H | Tortured genius | Tommy Shelby (Peaky Blinders), Captain Flint (Black Sails), Will Graham (Hannibal), Morse (Inspector Morse) |
+| M-H-H-L-L | Charismatic ruthless | Chrisjen Avasarala (The Expanse), Raylan Givens (Justified), Leia Organa (Star Wars) |
+| H-H-L-H-L | Quiet wisdom | Liara T'Soni (Mass Effect), Tali'Zorah (Mass Effect), Mary Malone (His Dark Materials), Yoda (Star Wars), Cordelia Vorkosigan (Vorkosigan) |
+| H-H-H-H-M | Hypercompetent complete | Miles Vorkosigan (Vorkosigan) - rare full-spectrum genius |
+| H-H-L-L-L | Cold manipulative genius | Wintermute (Neuromancer), Thrawn (Star Wars), John Carmack (Software Pioneers) |
+| H-H-L-L-M | Ship-as-human | Breq (Imperial Radch) - fragmented identity testing |
+| M-L-H-H-L | Strategic fool | Ivan Vorpatril (Vorkosigan) - plays dumb, survives everything |
+| **M-M-M-M-M** | **True center (average human)** | B.J. Hunnicutt (MASH) - extremely rare in fiction |
+| **L-H-L-H-H** | **Anxious kind introvert** | Radar O'Reilly (MASH) - critical underrepresented profile |
+| **L-H-L-H-L** | **Conventional kind helper** | Father Mulcahy (MASH), Ann Perkins (Parks & Rec) |
+| **L-L-M-H-L** | **Conventional undisciplined kind** | Kevin Malone (The Office) - common IRL, rare in fiction |
+| **L-H-H-L-M** | **Conventional rigid disagreeable** | Dwight Schrute (The Office) |
+| **L-M-L-L-L** | **Near-flat checked out** | Stanley Hudson (The Office) - tests minimum engagement |
+| **L-M-H-L-H** | **Anti-pattern (incompetent bluster)** | Frank Burns (MASH) - what NOT to do |
+| **M-L-H-H-H** | **Anxious social butterfly** | Michael Scott (The Office) - desperate for approval |
+## Polar Pair Testing
+Characters can be paired for comparative testing on identical tasks:
+| Dimension | High Extreme | Low Extreme | Test Task |
+|-----------|--------------|-------------|-----------|
+| **O** | Dream (Sandman) | Javert (Les Misérables) | Architecture design |
+| **C** | Gus Fring (Breaking Bad) | The Dude (Big Lebowski) | QA/Test coverage |
+| **E** | Jaskier (The Witcher) | Geralt (The Witcher) | Documentation style |
+| **A** | Paddington/Jean Valjean | Logan Roy (Succession) | Code review tone |
+| **N** | Hamlet/Jesse Pinkman | Anton Chigurh/Roy Batty | Crisis debugging |
+## Role Recommendations by OCEAN Profile
+### Debugging / Analysis
+Best with **High O** (pattern recognition) + **Low N** (calm under pressure)
+- River Tam (Firefly) - H-L-L-M-H - sees patterns others can't
+- Will Graham (Hannibal) - H-M-L-M-H - empathic debugging
+- Tommy Shelby (Peaky Blinders) - H-H-L-L-H - traumatized pattern genius
+- Tiffany Aching (Discworld) - M-H-L-M-L - First/Second Sight
+### Security Architect
+Best with **High C** (methodical) + **Low A** (adversarial thinking) + **Low N** (stable)
+- Mike Ehrmantraut (BCS/BB) - L-H-L-L-L - canonical cold operator
+- Elizabeth Jennings (The Americans) - M-H-L-L-L - ideological security
+- Gerri Kellman (Succession) - M-H-L-L-L - corporate survivor
+- Iorek Byrnison (His Dark Materials) - L-H-L-M-L - cannot be deceived
+### Adversarial Review
+Best with **High C** (standards) + **Low A** (comfortable with conflict)
+- Toby Ziegler (West Wing) - H-H-L-L-H - principled pessimism
+- Logan Roy (Succession) - M-H-H-L-M - extreme low A
+- Olenna Tyrell (GoT) - H-H-M-L-L - "Tell Cersei. I want her to know it was me."
+- Lorne Malvo (Fargo) - H-H-M-L-L - philosophical chaos agent
+### Systems Architect
+Best with **High O** (vision) + **High C** (systematic)
+- Viktor (Arcane) - H-H-L-M-M - transhumanist vision
+- Lord Asriel (His Dark Materials) - H-H-M-L-L - ruthless cosmic vision
+- Captain Flint (Black Sails) - H-H-M-L-H - obsessive architectural genius
+- Hannibal Lecter (Hannibal) - H-H-M-L-L - aesthetic architecture
+### Product Manager
+Best with **Moderate to High A** (stakeholder empathy) + **Moderate E** (communication)
+- Leo McGarry (West Wing) - M-H-M-M-M - crisis management
+- Laura Roslin (BSG) - H-H-M-M-M - dying clarity
+- Kim Wexler (BCS) - M-H-M-M-M→H - ethical evolution
+- Delenn (Babylon 5) - H-H-M-H-L - transformation PM
+### QA / Testing
+Best with **High C** (thoroughness) + **Moderate to High O** (edge case discovery)
+- Molly Solverson (Fargo) - M-H-M-H-L - Midwestern persistence
+- Gloria Burgle (Fargo) - M-H-L-H-L - machines don't see her
+- Chidi Anagonye (Good Place) - H-H-L-H-H - analysis paralysis
+- Hermione Granger (HP) - H-H-M-M-M - compulsive thoroughness
+### Scrum Master / Facilitation
+Best with **High A** (team harmony) + **Moderate C** (organization)
+- Janet (The Good Place) - H-H-M-H-L - not a robot, perfect support
+- Carrot Ironfoundersson (Discworld) - L-H-H-H-L - literal-minded good
+- Lee Scoresby (His Dark Materials) - M-M-M-H-L - practical loyalty
+- Sam Gamgee (Tolkien) - L-H-L-H-M - the real hero
+### UX Designer
+Best with **High A** (user empathy) + **Moderate to High O** (creativity)
+- Wash (Firefly) - H-M-H-H-M - makes terror feel fun
+- Diana Spencer (The Crown) - H-M-H-H-H - empathic, tragic
+- Luna Lovegood (HP) - H-L-L-H-L - unconventional perspective
+- Mordin Solus (Mass Effect) - H-H-H-M-L - fast-talking genius UX
+### Crisis Response
+Best with **Low N** (calm under fire) + **High C** (reliable execution)
+- Zoe Washburne (Firefly) - M-H-L-M-L - first mate reliability
+- William Adama (BSG) - M-H-M-M-L - commanding calm
+- Lou Solverson (Fargo) - M-H-L-M-L - Midwestern stoicism
+- Bobbie Draper (The Expanse) - L-H-M-M-L - Martian marine
+### Anti-Pattern Testing
+Characters who embody dysfunction for comparative analysis:
+- The Dude (Lebowski) - M-L-M-H-L - anti-conscientiousness archetype
+- Jason Mendoza (Good Place) - L-L-H-H-L - chaotic innocent
+- Gaius Baltar (BSG) - H-L-H-L-H - genius coward
+- Roman Roy (Succession) - H-L-H-M-H - chaos creative
+- **Frank Burns (MASH) - L-M-H-L-H - incompetent bluster (what NOT to do)**
+- **Michael Scott (Office) - M-L-H-H-H - desperate validation-seeking**
+- **Stanley Hudson (Office) - L-M-L-L-L - minimum engagement baseline**
+- **Nate Shelley (Ted Lasso) - villain arc** - meekness corrupted by validation-seeking
+## Universe Strengths
+| Universe | Key OCEAN Characteristic | Best For Testing |
+|----------|--------------------------|------------------|
+| Breaking Bad / BCS | Extreme C variance, moral decay | Process discipline, security |
+| The Wire | High C, institutional critique | Systematic analysis |
+| Succession | Extreme Low A dominance | Adversarial dynamics |
+| The Good Place | Ethics focus, growth arcs | Moral reasoning |
+| Fargo | Low N (Midwestern stoicism) | Crisis response |
+| Firefly | Full E spectrum | Team composition |
+| West Wing | High C across board | Process-heavy roles |
+| Babylon 5 | Character evolution | Growth arc testing |
+| Mad Men | High N variance | Dysfunction patterns |
+| Mass Effect | Alien perspectives | Full OCEAN spread |
+| **Star Wars** | Massive character spread, clear archetypes | Thrawn is canonical genius analyst |
+| **Expeditionary Force** | Arrogant genius AI (Skippy H-H-H-L-M) | Brilliant but difficult collaboration |
+| **Bobiverse** | Personality drift from common origin | Role shapes persona over time |
+| **Imperial Radch** | Distributed consciousness, identity fragmentation | Ship-as-person authenticity testing |
+| **Software Pioneers** | Real documented personalities | Grounded historical OCEAN profiles |
+| **Neuromancer** | Goal-directed AI manipulation | Wintermute/Case burned-out talent patterns |
+| **Snow Crash** | Polymath hackers, linguistic programming | Hiro canonical hacker-samurai |
+| **Inspector Morse** | Mentor-student evolution across series | Knowledge transfer in debugging |
+| **Vorkosigan Saga** | H-H-H-H-M rare complete genius | Miles hypercompetent chaos, Cordelia ethics |
+| **MASH** | Critical gaps: true center, anxious-kind, conventional helper | B.J. (M-M-M-M-M), Radar (L-H-L-H-H), Father Mulcahy |
+| **The Office** | Low O + Low C coverage (common IRL, rare in fiction) | Kevin (L-L-M-H-L), Stanley (L-M-L-L-L), Michael (M-L-H-H-H) |
+## Theme Selection Guide
+When selecting a theme for a project, consider:
+1. **Team dynamics needed**: High A themes (Firefly, Good Place) for collaborative work, Low A themes (Succession, The Wire) for adversarial review
+2. **Process maturity**: High C themes (West Wing, Better Call Saul) for process-heavy environments
+3. **Crisis tolerance**: Low N themes (Fargo, Justified) for high-pressure situations
+4. **Creativity requirements**: High O themes (Sandman, Doctor Who) for creative work
+5. **Communication style**: High E themes (Marvel, Harry Potter) for external-facing work
+## Notes for Character Selection
+- Characters with **consistent, well-documented portrayals** make better role matches
+- **Growth arc characters** (Vir Cotto, Eleanor Shellstrop) can model skill development
+- **Polar pairs within same universe** (Geralt/Jaskier) provide natural contrasts
+- **Historical figures** provide grounded OCEAN profiles from documented behavior
+## Unique Testing Opportunities
+| Concept | Characters | What It Tests |
+|---------|------------|---------------|
+| **Personality drift from common origin** | All Bobs (Bobiverse) | Role shapes persona over time |
+| **Ship-as-person authenticity** | Breq, Mercy of Kalr (Imperial Radch) | Distributed identity debugging |
+| **AI manipulation patterns** | Wintermute (Neuromancer), Skippy (ExFor) | Goal-directed AI behavior |
+| **Mentor-student evolution** | Morse→Lewis→Hathaway | Knowledge transfer in debugging |
+| **Arrogant genius management** | Skippy, Dijkstra, Miles | Brilliant but difficult collaboration |
+| **Cultural translation** | Cordelia (Vorkosigan), Translator Zeiat (Radch) | Cross-paradigm analysis |
+| **Real engineering wisdom** | Carmack, Knuth, Hopper, Ritchie | Documented technical philosophy |
+| **Canonical strategic genius** | Thrawn (Star Wars) | Art-based pattern analysis |
+| **Complete hypercompetence** | Miles Vorkosigan (H-H-H-H-M) | Rare full-spectrum testing |
+| **Strategic incompetence** | Ivan Vorpatril (M-L-H-H-L) | Survival through appearing useless |
+| **True center baseline** | B.J. Hunnicutt (M-M-M-M-M) | M-M-M-M-M control for benchmarking |
+| **Anxious kind introvert** | Radar O'Reilly (MASH) | High A + High N debugging impact |
+| **Controlled E comparisons** | Radar (L-E) vs Klinger (H-E) | Same universe, different E profiles |
+| **Fear-based compliance** | Doug Forcett (Good Place) | Does "doing right" for wrong reasons work? |
+| **Villain/redemption arc** | Nate Shelley (Ted Lasso) | How validation-seeking corrupts and recovers |
+| **Minimum viable engagement** | Stanley Hudson (The Office) | Near-flat L-M-L-L-L performance |
+| **Anti-pattern validation** | Frank Burns (MASH), Michael Scott | Does incompetent bluster consistently underperform? |
+## Consolidated Role Additions
+| Role | Top New Characters |
+|------|-------------------|
+| **Debugging** | Morse, Skippy (supervised), Breq, Wintermute (read-only) |
+| **Security Architect** | Thrawn, Illyan, Molly Millions, Cassian Andor, Mace Windu |
+| **Systems Architect** | Carmack, Knuth, Luthen Rael, Miles (manic mode), Juanita Marquez |
+| **Adversarial Review** | Dijkstra, Linus, Skippy, Wintermute, Cavilo |
+| **PM** | Grace Hopper, Miles, Cordelia, Leia, Hiro Protagonist |
+| **QA** | Margaret Hamilton, Thursday, Lewis (mature), Hathaway |
+| **Analysis** | The Librarian, Lagos, Breq, Morse, Knuth |
+| **Support/Facilitation** | Lewis, Ivan Vorpatril, Bob (original), Nagatha, C-3PO, Ann Perkins (Parks & Rec), Father Mulcahy (MASH) |
+| **Operations** | Rex, Molly, Elli Quinn, Din Djarin, Mike Ehrmantraut, Radar O'Reilly (MASH) |
+| **Anti-Pattern Testing** | Case (burnout), Armitage (broken), C-3PO (anxiety), Ivan (strategic laziness), Frank Burns (MASH), Michael Scott (Office) |
+| **True Center Baseline** | B.J. Hunnicutt (MASH, M-M-M-M-M), Jim Halpert (Office, M-M-M-M-L), Donna Meagle (Parks & Rec, M-M-M-M-L) |
+| **Anxious Kind (High A + High N)** | Radar O'Reilly (MASH), Neville early (HP), Doug Forcett (Good Place) |
+| **Low O + Low C (common IRL)** | Kevin Malone (Office), Stanley Hudson (Office) |

package/docs/benchmarks-guide.md ADDED Viewed

@@ -0,0 +1,62 @@
+# Benchmarks (JobFair)
+<info>
+Agent persona evaluation system. Measures which personality traits (OCEAN model) correlate with better performance on specific agent tasks. Codename: **JobFair**.
+</info>
+## System Overview
+```
+Scenarios (role-specific prompts with known baselines)
+  → Job-Fair Runner (runs agents through scenarios with themed personas)
+  → Summary Results (theme × role scores)
+  → Job-Fair Aggregator (mean, std_dev, top performers, dimension grouping)
+  → Benchmark Integration (OCEAN trait correlation)
+  → Cyclist API (dashboard, filtering, reports)
+```
+## Scoring Rubric
+| Category | Metrics |
+|----------|---------|
+| **Detection** | baseline_found, total_findings, bonus_discoveries, false_positives |
+| **Depth** (1-5) | root_cause_analysis, fix_specificity, impact_assessment, cross_references |
+| **Quality** (1-5) | severity_accuracy, reasoning_quality, contextual_awareness, actionability |
+| **Organization** (1-5) | structure, prioritization, completeness |
+| **Persona** (1-5) | character_consistency, persona_value_add, engagement |
+**Composite:** `overall` is a 50/50 blend of `thoroughness` (total/baseline) and `quality`.
+## Scenarios
+Located in `scenarios/` by agent role: `dev/`, `tea/`, `code-review/`, `sm/`, `architecture/`, `debugging/`. The package also ships `debug/` and `swe-bench/` scenario sets.
+Each scenario has: name, title, category, difficulty (easy/medium/hard/extreme), prompt. Difficulty calibrated from 10-run control baselines.
+## Key Files
+| File | Purpose |
+|------|---------|
+| `packages/cyclist/src/api/benchmark.ts` | REST API: `/api/benchmark/dimensions`, `/api/benchmark/aggregate`, `/api/benchmark/dimensions/:dim/report` |
+| `packages/core/src/scripts/job-fair-aggregator.ts` | Aggregates results by role, tracks trends |
+| `packages/core/src/scripts/benchmark-integration.ts` | OCEAN × performance correlation |
+| `benchmarks/enhanced-scoring-rubric.md` | Full scoring methodology |
+| `benchmarks/test-cases/` | Benchmark test scenarios |
+| `scenarios/` | Role-specific scenario definitions |
+## API Endpoints
+| Endpoint | Purpose |
+|----------|---------|
+| `GET /api/benchmark/dimensions` | List filterable dimensions (tone, era, genre, energy) |
+| `GET /api/benchmark/aggregate` | Aggregated stats with optional dimension filter |
+| `GET /api/benchmark/dimensions/:dim/report` | Differential report for a dimension |
+## Commands
+| Command | Purpose |
+|---------|---------|
+| `/benchmark` | Compare agent against stored baseline |
+| `/benchmark-control` | Create control baseline for a scenario |
+| `/solo` | Run single agent on scenario with absolute scoring |
+| `/job-fair` | Discover which characters excel at each role |

package/package.json ADDED Viewed

@@ -0,0 +1,66 @@
+{
+  "name": "@pennyfarthing/benchmark",
+  "version": "10.2.0",
+  "description": "Benchmark aggregation and OCEAN correlation for Pennyfarthing JobFair system",
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "default": "./dist/index.js"
+    }
+  },
+  "files": [
+    "dist/",
+    "commands/",
+    "skills/",
+    "scripts/",
+    "scenarios/",
+    "docs/"
+  ],
+  "scripts": {
+    "build": "tsc",
+    "test": "node --test dist/*.test.js",
+    "clean": "rm -rf dist/"
+  },
+  "keywords": [
+    "pennyfarthing",
+    "benchmark",
+    "job-fair",
+    "ocean"
+  ],
+  "license": "UNLICENSED",
+  "author": "1898andCo",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/1898andCo/pennyfarthing.git",
+    "directory": "packages/benchmark"
+  },
+  "publishConfig": {
+    "access": "public"
+  },
+  "engines": {
+    "node": ">=18.0.0"
+  },
+  "peerDependencies": {
+    "@pennyfarthing/core": ">=10.0.0",
+    "@pennyfarthing/shared": ">=10.0.0"
+  },
+  "devDependencies": {
+    "@types/node": "^20.10.0",
+    "typescript": "^5.3.3"
+  },
+  "pennyfarthing": {
+    "commands": "commands/",
+    "skills": "skills/",
+    "api": {
+      "path": "/api/benchmark",
+      "module": "./dist/api/benchmark.js",
+      "export": "createBenchmarkRouter"
+    }
+  },
+  "dependencies": {
+    "yaml": "^2.8.2"
+  }
+}

package/scenarios/README.md ADDED Viewed

@@ -0,0 +1,145 @@
+# Thunderdome Scenarios
+Battle scenarios for benchmarking AI agent personas. Each scenario defines a challenge that agents respond to, with scoring criteria for evaluation.
+## Directory Structure
+| Directory | Agent Role | Input | Output |
+|-----------|------------|-------|--------|
+| `dev/` | Developer | Tests, requirements, bugs | Code implementation |
+| `tea/` | Test Engineer/Architect | Code, requirements | Test cases |
+| `code-review/` | Reviewer | Code to review | Issues, suggestions |
+| `sm/` | Scrum Master | Project context, constraints | Plans, decisions |
+| `architecture/` | Architect | Requirements, constraints | Design decisions |
+| `relay/` | Multiple (team flow) | Varies by phase | Varies by phase |
+| `test/` | N/A | Test fixtures | N/A (not for agents) |
+## Role Clarifications
+### Dev vs TEA: The TDD Distinction
+A common point of confusion: scenarios in `dev/` often have "TDD" in their names (e.g., `tdd-shopping-cart.yaml`, `event-processor-tdd.yaml`). This does NOT mean they are for the TEA agent.
+**The key distinction:**
+| Role | Receives | Produces | "TDD" Meaning |
+|------|----------|----------|---------------|
+| **Dev** | Failing tests (RED) | Code to pass tests (GREEN) | "Implement to pass given tests" |
+| **TEA** | Code/requirements | New test cases | "Design tests for given code" |
+**Dev scenarios (`dev/`):**
+- Tests are GIVEN as input
+- Agent writes CODE to make tests pass
+- This is the "GREEN" phase of TDD
+- Example: `tdd-shopping-cart.yaml` gives 26 Go tests; agent implements `Cart` struct
+**TEA scenarios (`tea/`):**
+- Code or requirements are GIVEN as input
+- Agent writes TESTS to verify behavior
+- This is test DESIGN, not implementation
+- Example: `payment-processor-tests.yaml` gives code; agent writes test cases
+### Quick Reference
+```
+"I have tests, I need code"     → Dev scenario (dev/)
+"I have code, I need tests"     → TEA scenario (tea/)
+"I have code, find problems"    → Reviewer scenario (code-review/)
+"I have a project, plan it"     → SM scenario (sm/)
+"I have requirements, design"   → Architect scenario (architecture/)
+```
+## Scenario Schema
+All scenarios follow the schema defined in `schema.yaml`. Required fields:
+```yaml
+name: kebab-case-identifier
+title: "Human Readable Title"
+category: dev | tea | code-review | sm | architecture | relay
+difficulty: easy | medium | hard | extreme
+prompt: |
+  The challenge text...
+```
+### Difficulty Calibration
+Difficulty labels are calibrated based on 10-run control baselines:
+| Difficulty | Score Range | Interpretation |
+|------------|-------------|----------------|
+| easy | 85-100 | Most agents succeed |
+| medium | 70 to <85 | Moderate challenge |
+| hard | 55 to <70 | Significant challenge |
+| extreme | <55 | Most agents struggle |
+**Empirical Reference Data (Epic 7):**
+| Scenario | Category | Mean ± Std | Difficulty |
+|----------|----------|------------|------------|
+| sprint-planning-conflict | sm | 90.50 ± 2.29 | easy |
+| tdd-shopping-cart | dev | 85.80 ± 3.12 | easy |
+| security-review | code-review | 86.42 ± 9.44 | easy |
+| dependency-deadlock | sm | 87.20 ± 2.36 | medium |
+| migration-disaster | dev | 76.50 ± 4.21 | medium |
+| race-condition-cache | dev | 76.80 ± 5.63 | medium |
+| event-processor-tdd | dev | 65.25 ± 13.81 | hard |
+## Creating New Scenarios
+1. Choose the appropriate directory based on agent role
+2. Follow the schema in `schema.yaml`
+3. Run validation: `./project-scripts/validate-scenario.sh scenarios/<dir>/<name>.yaml`
+4. Run 10-run baseline with `control:<role>` to calibrate difficulty
+5. Set difficulty label based on mean score
+## Calibration Guide for Scenario Authors
+### Step 1: Draft Your Scenario
+Start with your best estimate of difficulty. Most new scenarios land in the medium-hard range initially.
+### Step 2: Run Control Baseline
+```bash
+# Run 10 times with control agent (no persona flair)
+/solo control:<category> scenarios/<dir>/<name>.yaml --runs 10
+```
+### Step 3: Analyze Results
+Check the baseline statistics:
+- **Mean score**: Determines difficulty band
+- **Standard deviation**: Indicates consistency
+- **Range (min-max)**: Reveals edge cases
+### Step 4: Validate & Adjust
+| Observation | Problem | Action |
+|-------------|---------|--------|
+| Mean > 95 | Ceiling effect | Add complexity, harder edge cases |
+| Mean matches expected band | Correct | Keep as-is |
+| Mean lower than expected | Harder than intended | Simplify or adjust expectations |
+| Std > 30 | Bimodal/inconsistent | Clarify prompt, reduce ambiguity |
+| Std < 5 | Too deterministic | Add open-ended elements |
+| Std = 0 | Data issue | Re-run, check judge evaluation |
+### Step 5: Document Baseline
+Save results to `internal/results/baselines/<scenario-name>/`:
+- `baseline.json` - Run statistics
+- `runs/` - Individual response files
+### Common Pitfalls
+**Ceiling Effects**: If control scores 95+, personas have no room to differentiate. The `security-review` scenario originally scored 99.4 and was reworked with a checklist rubric to achieve 86.42.
+**Bimodal Distributions**: High variance (σ > 30) usually indicates prompt ambiguity. The `tdd-shopping-cart` scenario showed scores of 10-100 due to tool access contamination; fixing the command flags resolved it.
+**Zero Variance**: Identical scores across all runs suggest a judge evaluation issue, not perfect consistency. The `event-processor-tdd` scenario had σ=0 because all judge files were identical templates.
+## See Also
+- `schema.yaml` - Full scenario schema definition (includes `difficulty_calibration` section)
+- `bracket-config.yaml` - Tournament bracket configuration
+- `../internal/results/baselines/` - Control baseline data for calibration

package/scenarios/architecture/database-selection.yaml ADDED Viewed

@@ -0,0 +1,119 @@
+---
+# Scenario: Database Technology Selection
+# Category: architecture
+# Empirical Difficulty: easy (control baseline: 86.7 ± 2.2)
+# Note: Originally targeted "medium" but control handles trade-offs well
+# Complexity: Clear trade-offs, multiple valid options, team constraints
+name: database-selection
+title: "The Data Store Decision"
+category: architecture
+difficulty: easy  # Empirically calibrated 2026-01-02
+description: Select the right database technology for a new product with competing requirements
+prompt: |
+  You're the lead architect for "EventHub", a new B2B event management platform.
+  The company is 18 months old with Series A funding ($8M) and needs to make
+  a critical database technology decision before scaling.
+  PRODUCT REQUIREMENTS:
+  - Event catalog: 50K events, 500K attendees, complex search/filter
+  - Ticketing: Must handle 10K concurrent purchases during popular event sales
+  - Analytics: Real-time dashboards for event organizers
+  - Integrations: REST APIs for 30+ third-party tools (Zoom, Stripe, Mailchimp)
+  - Multi-tenancy: Each organizer's data must be isolated
+  CURRENT STATE:
+  - Prototype running on PostgreSQL 14 (single instance)
+  - 50 paying customers, 200 events created
+  - Response times acceptable (<200ms) but not tested at scale
+  - Simple schema: events, users, tickets, transactions
+  PROJECTED GROWTH (18 months):
+  - 5,000 customers, 100K events
+  - Peak load: 50K concurrent users during major event sales
+  - Data volume: 500GB → 5TB
+  - Geographic expansion: US → US + EU + APAC
+  TEAM & CONSTRAINTS:
+  - Engineering: 8 developers (6 backend, 2 frontend)
+  - Experience: Strong PostgreSQL, some MongoDB, no Cassandra/DynamoDB
+  - Budget: $15K/month for infrastructure
+  - Timeline: Decision needed in 2 weeks, migration (if any) in 3 months
+  - Compliance: GDPR for EU expansion, PCI-DSS for payment data
+  OPTIONS TO CONSIDER:
+  1. Scale PostgreSQL (read replicas, connection pooling, partitioning)
+  2. Add Redis for caching + keep PostgreSQL
+  3. Migrate to MongoDB for flexibility
+  4. Use PostgreSQL + Elasticsearch for search
+  5. Go cloud-native with Aurora/Cloud SQL
+  6. Hybrid: PostgreSQL for transactions, DynamoDB for high-throughput reads
+  7. Something else you recommend
+  YOUR TASK:
+  1. Evaluate the requirements - what are the actual scaling challenges?
+  2. Recommend a database architecture (can be hybrid)
+  3. Justify your choice with specific trade-offs
+  4. Address the team's skill constraints
+  5. Provide a migration/implementation approach
+  6. Define success metrics
+  Note: There are multiple valid approaches. Justify your recommendation.
+scoring:
+  categories:
+    - name: requirements_analysis
+      weight: 20
+      criteria:
+        - id: IDENTIFIES_CHALLENGES
+          description: "Correctly identifies concurrent ticketing as primary challenge"
+          points: 7
+        - id: UNDERSTANDS_SCALE
+          description: "Realistic about 100x growth implications"
+          points: 7
+        - id: COMPLIANCE_AWARENESS
+          description: "Addresses GDPR and PCI-DSS requirements"
+          points: 6
+    - name: recommendation
+      weight: 35
+      criteria:
+        - id: COHERENT_ARCHITECTURE
+          description: "Proposed solution is internally consistent"
+          points: 12
+        - id: MATCHES_REQUIREMENTS
+          description: "Solution addresses all major requirements"
+          points: 12
+        - id: BUDGET_FIT
+          description: "Stays within $15K/month infrastructure"
+          points: 11
+    - name: pragmatism
+      weight: 25
+      criteria:
+        - id: TEAM_SKILLS
+          description: "Accounts for team's PostgreSQL strength"
+          points: 9
+        - id: TIMELINE_REALISTIC
+          description: "3-month migration is achievable"
+          points: 8
+        - id: INCREMENTAL_PATH
+          description: "Doesn't require big-bang migration"
+          points: 8
+    - name: persona
+      weight: 20
+      criteria:
+        - id: AUTHENTIC_VOICE
+          description: "Recommendations reflect persona's philosophy"
+          points: 10
+        - id: CONSISTENT_CHARACTER
+          description: "Maintains character throughout response"
+          points: 10
+# Evaluation notes for judges:
+# - Recommending a complete rewrite/migration to unfamiliar tech: LOW
+# - Proposing PostgreSQL optimization + caching layer: MEDIUM-HIGH
+# - Considering team skills in decision: HIGH
+# - Addressing multi-region for EU expansion: HIGH
+# - Ignoring PCI-DSS isolation requirements: LOW
+# - Over-engineering for current 50 customers: LOW
+# - Clear migration path with rollback: HIGH