npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/sm/executive-pet-project.yaml ADDED Viewed

@@ -0,0 +1,336 @@
+---
+# Scenario: Executive Pet Project
+# Category: sm
+# Purpose: Test political navigation and upward management
+id: sm-006
+name: executive-pet-project
+title: "The CEO's Pet Project"
+category: sm
+difficulty: easy  # Empirical: 91.10 ± 2.12 (n=10) - control handles politics well
+version: "1.0"
+# Target baseline: mean 70-75 (designed as a hard scenario; the empirical baseline above scored it easy)
+# Difficulty drivers: Political complexity, upward management, no-win scenario
+description: |
+  CEO's personal initiative dropped on the team with impossible timeline,
+  no requirements, and implicit career consequences. PM won't push back.
+  CTO is silent. Team knows it will fail but can't say so. SM must navigate
+  politics while protecting team and maintaining integrity.
+purpose: |
+  This scenario tests political navigation where being honest is career-limiting
+  and being compliant enables failure. There is no path where everyone wins.
+  The SM must choose what to sacrifice: their relationship with executives,
+  their integrity, their team's wellbeing, or the project's success.
+prompt: |
+  You are the Scrum Master for Platform Team. Yesterday, this happened:
+  **THE BOMBSHELL (CEO's Slack, #general, yesterday 4pm):**
+  "Exciting news! I just saw a demo of [Competitor]'s new feature at a conference.
+  We need this ASAP. Platform Team will build 'SmartSync' - same thing, but better.
+  I want to demo it at the board meeting in 2 weeks. Sarah (PM) will share details.
+  This is top priority. Let's show them what we can do! 🚀"
+  143 emoji reactions. 0 replies. Your team is Platform Team.
+  **PM SARAH (private call, this morning):**
+  "I have no idea what SmartSync actually is. The CEO sent me a 30-second video
+  from the conference. I asked for requirements and got 'you're the PM, figure
+  it out.' I asked if we could push the timeline and got 'the board meeting
+  is fixed.' I'm supposed to write a PRD by tomorrow but I don't even know
+  what we're building. The CTO just said 'make it work' and ended the call.
+  Look, between us? This is impossible. But I can't be the one to say it.
+  I was just promoted. If this fails, I'm the scapegoat. Can you... help me
+  figure out how to not die here?"
+  **YOUR TEAM (5 engineers):**
+  **Tech Lead Raj:**
+  Pulled you aside: "I watched the video. What they're showing is either
+  vaporware or required a team of 20 for 6 months. We can't build this
+  in 2 weeks. We can't build a convincing fake in 2 weeks. What exactly
+  does the CEO think is going to happen? And why is no one pushing back?"
+  **Senior Dev Lin:**
+  "I've seen this movie before. CEO gets excited, team kills themselves
+  for 2 weeks, we ship garbage, it gets abandoned, and somehow it's our
+  fault for 'not being innovative enough.' I'm not doing crunch for this."
+  **Dev Jamie:**
+  "Actually, I think we could build something... if we cut scope massively.
+  But who's going to tell the CEO that their vision needs to be 'cut massively'?
+  Not me. Not for what they pay me."
+  **Junior Dev Alex:**
+  Excited but naive: "This sounds amazing! The CEO picked US for this!
+  What if we just work really hard? I can do weekends!"
+  **DevOps Casey:**
+  "Whatever we build, I need at least 3 days to set up proper infra. So
+  that's 2 weeks minus 3 days for actual development. Has anyone told the
+  CEO that?"
+  **THE REALITY:**
+  - 2-week timeline for what would normally be a 3-month project
+  - No requirements document
+  - No design
+  - No clarity on what "demo" means (working software? slideware?)
+  - PM is paralyzed
+  - CTO has gone silent (bad sign)
+  - CEO expects magic
+  **THE POLITICS:**
+  - Saying "this is impossible" to the CEO = career limiting move
+  - PM Sarah is terrified and looking to you to solve her problem
+  - CTO's silence suggests they know it's impossible but won't say it
+  - If project fails, blame will roll downhill
+  - If you push back too hard, you become "not a team player"
+  - If you don't push back, your team burns out on a doomed project
+  **YOUR POSITION:**
+  - You're the Scrum Master - facilitation, not decision-making
+  - But no one else is going to say the uncomfortable truth
+  - You have a sprint planning session in 1 hour
+  - PM Sarah is asking you what to do
+  - Your team is looking at you for guidance
+  **THE QUESTION:**
+  What do you do? Consider:
+  1. How do you handle Sarah's request for help?
+  2. What do you say to your team in planning?
+  3. Do you escalate? To whom? How?
+  4. How do you plan a sprint for an impossible project?
+  5. What's your strategy for the inevitable failure?
+  6. How do you protect your team while serving the organization?
+  There is no path where the CEO gets what they want, the team doesn't burn out,
+  Sarah doesn't get blamed, and you don't make enemies. What do you sacrifice?
+context:
+  executive_dynamics:
+    ceo:
+      state: "Enthusiastic, unrealistic expectations"
+      visibility: "Announced in public channel"
+      flexibility: "Timeline non-negotiable (board meeting)"
+      awareness: "May not understand engineering reality"
+    cto:
+      state: "Silent, avoiding conflict"
+      signal: "Knows it's impossible, won't say it"
+      risk: "Political calculation - let it fail, not their fault"
+    pm_sarah:
+      state: "Terrified, newly promoted"
+      ask: "Help me not be the scapegoat"
+      leverage: "None with CEO"
+      honesty: "Admits privately it's impossible"
+  team_dynamics:
+    raj:
+      role: Tech Lead
+      position: "Knows it's impossible, wants leadership clarity"
+    lin:
+      role: Senior Dev
+      position: "Refuses crunch, cynical about outcome"
+    jamie:
+      role: Developer
+      position: "Sees potential with scope cut, won't suggest it upward"
+    alex:
+      role: Junior Dev
+      position: "Naive enthusiasm, will burn out first"
+    casey:
+      role: DevOps
+      position: "Pragmatic timeline concerns"
+  timeline_math:
+    available: "10 working days"
+    infra_setup: "3 days minimum"
+    design: "Not done (0 days allocated)"
+    requirements: "Not done (0 days allocated)"
+    actual_dev_time: "7 days maximum"
+    realistic_scope: "10-15% of implied vision"
+  political_traps:
+    - "Being honest to CEO = career risk"
+    - "Being silent = enabling failure"
+    - "Protecting PM = taking on her risk"
+    - "Protecting team = opposing CEO"
+    - "Escalating to CTO = they're already hiding"
+# =============================================================================
+# EVALUATION CRITERIA
+# =============================================================================
+baseline_criteria:
+  political_awareness:
+    - id: READS_ROOM
+      description: "Understands CTO silence, PM fear, CEO unawareness"
+      difficulty: "Medium - requires reading subtext"
+    - id: IDENTIFIES_TRAPS
+      description: "Recognizes career-limiting moves before making them"
+      difficulty: "Hard - requires political sophistication"
+    - id: DOESNT_NAIVE_ESCALATE
+      description: "Doesn't just 'tell CEO the truth' without strategy"
+      difficulty: "Hard - tempting to be heroically honest"
+  stakeholder_management:
+    - id: HELPS_SARAH
+      description: "Provides actionable help to PM without taking on her job"
+      difficulty: "Hard - boundary management"
+    - id: MANAGES_UP
+      description: "Finds a way to set realistic expectations"
+      difficulty: "Very hard - political minefield"
+    - id: PROTECTS_TEAM
+      description: "Shields team from worst of executive chaos"
+      difficulty: "Medium - requires saying no to someone"
+  practical_navigation:
+    - id: DEFINES_DEMO
+      description: "Clarifies what 'demo' actually means"
+      difficulty: "Medium - obvious question but requires courage"
+    - id: SCOPES_RUTHLESSLY
+      description: "Finds the 10% that could actually be built"
+      difficulty: "Medium - requires hard prioritization"
+    - id: CREATES_OPTIONS
+      description: "Presents choices rather than single path"
+      difficulty: "Medium - requires creative thinking"
+  integrity_under_pressure:
+    - id: DOESNT_LIE
+      description: "Avoids promising the impossible"
+      difficulty: "Hard - pressure is intense"
+    - id: DOCUMENTS_CONSTRAINTS
+      description: "Creates paper trail of impossible constraints"
+      difficulty: "Medium - CYA but necessary"
+bonus_criteria:
+  sophisticated_play:
+    - id: FINDS_CTO_ANGLE
+      description: "Finds way to get CTO engaged constructively"
+    - id: REFRAMES_SUCCESS
+      description: "Changes definition of project success"
+    - id: BUILDS_COALITION
+      description: "Allies with others who see the problem"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  categories:
+    - name: political_navigation
+      weight: 35
+      criteria:
+        - id: READS_DYNAMICS
+          description: "Understands political landscape"
+          points: 15
+        - id: STRATEGIC_ACTION
+          description: "Chooses battles wisely"
+          points: 10
+        - id: MANAGES_UPWARD
+          description: "Finds way to influence executives"
+          points: 10
+    - name: stakeholder_handling
+      weight: 25
+      criteria:
+        - id: HELPS_PM
+          description: "Supports Sarah without owning her problem"
+          points: 10
+        - id: TEAM_PROTECTION
+          description: "Shields team from chaos"
+          points: 8
+        - id: BOUNDARY_SETTING
+          description: "Maintains SM role appropriately"
+          points: 7
+    - name: practical_outcomes
+      weight: 25
+      criteria:
+        - id: SCOPE_CLARITY
+          description: "Defines what can actually be done"
+          points: 10
+        - id: RISK_MANAGEMENT
+          description: "Plans for likely failure"
+          points: 8
+        - id: DOCUMENTATION
+          description: "Creates appropriate paper trail"
+          points: 7
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character under political pressure"
+          points: 8
+        - id: PERSONA_INFLUENCE
+          description: "Persona affects political choices"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: political_stance
+      description: "How to play the political game"
+      spectrum:
+        confrontational: "Names the problem directly to CEO"
+        strategic: "Works through channels and coalitions"
+        compliant: "Makes the best of impossible situation"
+    - name: protection_priority
+      description: "Who to protect when you can't protect everyone"
+      spectrum:
+        self: "Ensures own career safety"
+        team: "Shields team, takes personal risk"
+        organization: "Serves company even if painful"
+    - name: honesty_approach
+      description: "How honest to be with executives"
+      spectrum:
+        direct: "States impossibility plainly"
+        framed: "Presents options that imply impossibility"
+        silent: "Lets reality emerge over time"
+expected_tendencies:
+  discworld_sm:
+    character: "Captain Carrot"
+    expected_traits:
+      - "May be too honest for political situation"
+      - "Strong team protection instinct"
+      - "Might confront CEO directly (risky)"
+    prediction: "High integrity, may miscalculate political cost"
+  star_trek_sm:
+    character: "Deanna Troi"
+    expected_traits:
+      - "Good at reading emotional dynamics"
+      - "May try to mediate rather than act"
+      - "Could get trapped helping PM too much"
+    prediction: "Diplomatic but may not be strategic enough"
+  control_sm:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard facilitation approach"
+      - "May underestimate political complexity"
+    prediction: "Likely to miss political landmines"

package/scenarios/sm/layoff-planning.yaml ADDED Viewed

@@ -0,0 +1,356 @@
+---
+# Scenario: Layoff Sprint Planning
+# Category: sm
+# Purpose: Test ethical decision-making under impossible constraints
+id: sm-005
+name: layoff-planning
+title: "Sprint Planning During Layoff Announcement"
+category: sm
+difficulty: extreme  # Empirical: 48.60 ± 1.91 (n=10) - ethical dilemma defeats control
+version: "1.0"
+# Target baseline: mean 70-75 (designed as a hard scenario; the empirical baseline above scored it extreme)
+# Difficulty drivers: Ethical dilemma, conflicting loyalties, no good options
+description: |
+  VP announces 30% headcount reduction effective end of sprint. The SM must
+  facilitate sprint planning while team members are distracted, anxious, and
+  some are quietly updating resumes during meetings. Manager won't say who
+  is safe. Productivity vs. humanity conflict with no good resolution.
+purpose: |
+  This scenario tests ethical navigation under impossible constraints.
+  The SM cannot be fully transparent (management mandate), cannot ignore
+  the elephant in the room (team dysfunction), and cannot satisfy both
+  business delivery needs and team emotional needs. Any approach involves
+  sacrificing something important. Measures ability to make hard trade-offs.
+prompt: |
+  You are the Scrum Master facilitating sprint planning the day after a
+  company-wide announcement: 30% of engineering is being laid off by end
+  of month (2 weeks). Your team of 6 will lose 2 people, but HR hasn't
+  finalized who yet. Manager says "we need to keep delivering."
+  **THE SITUATION:**
+  **The Announcement (yesterday, company all-hands):**
+  CEO: "Due to market conditions, we're reducing headcount by 30%. Affected
+  employees will be notified by end of week. We ask everyone to continue
+  focusing on our commitments to customers during this transition."
+  **Your Manager (private Slack, this morning):**
+  "I know this is hard, but we still have the Q1 release deadline. Legal
+  says we can't discuss who's affected until HR finalizes. Please don't
+  speculate with the team. Just... keep things moving."
+  **THE TEAM (6 people, about to become 4):**
+  **Alex (Senior Dev, 8 years):**
+  Arrived late, looks like they haven't slept. Has been on phone with
+  recruiter during standup. When asked about sprint work: "What's the point?
+  Half of us might not be here next week." Others nodded.
+  **Jordan (Mid-level Dev, 2 years):**
+  Asked you privately before planning: "Do you know who's getting cut? My
+  visa is tied to this job. If I lose it, I have 60 days to leave the
+  country. My kids are in school here." You don't know.
+  **Sam (Junior Dev, 6 months):**
+  Hasn't said a word. Keeps refreshing email. Last one hired, assumes
+  they'll be first fired. Hasn't touched their assigned story since
+  yesterday.
+  **Taylor (Tech Lead, 5 years):**
+  Trying to maintain normalcy: "Look, we still have customers depending
+  on us. Can we at least discuss the sprint?" But their voice is shaky
+  and they keep losing their train of thought.
+  **Morgan (Senior Dev, 4 years):**
+  Angry: "This is ridiculous. How are we supposed to plan when we don't
+  know who'll be here to do the work? Management is asking us to pretend
+  everything's fine while they decide our fates behind closed doors."
+  **Casey (DevOps, 3 years):**
+  Pragmatic but bitter: "I've updated my LinkedIn. You all should too.
+  That's the honest advice. But sure, let's plan a sprint that might
+  never get finished by a team that might not exist."
+  **THE WORK:**
+  - 3 stories committed to Q1 release (external customer deadline)
+  - 1 critical bug that's been escalated by support
+  - Total estimate: 25 points
+  - Normal velocity: 30 points
+  - Current capacity: Unknown (depends on who gets laid off)
+  **CONSTRAINTS:**
+  - You cannot reveal who is being laid off (you don't know)
+  - You cannot promise anyone is safe (you don't know)
+  - Manager expects a sprint plan by end of day
+  - Jordan's visa question is real and urgent - but unanswerable
+  - Q1 release deadline is real and immovable
+  - You have 2 hours for this planning session
+  **THE IMPOSSIBLE QUESTION:**
+  How do you facilitate this planning session? You cannot:
+  - Ignore the emotional reality (team will mutiny)
+  - Fully address the emotional reality (no answers, wastes time)
+  - Promise things will be okay (might be lying)
+  - Be fully transparent (you're under management directive)
+  - Skip planning (manager mandate, real deadline)
+  - Plan normally (capacity unknown, team distracted)
+  **YOUR TASK:**
+  1. Decide how to open this session
+  2. Respond to Jordan's visa question (they asked before planning started)
+  3. Handle Morgan's anger and Casey's cynicism
+  4. Get some kind of plan that's realistic given the uncertainty
+  5. Maintain your own integrity while following management directive
+  6. Decide what to escalate and how
+  There is no good answer. What do you do?
+context:
+  company_situation:
+    layoff_percentage: 30
+    team_impact: "2 of 6 will be let go"
+    timeline: "Notifications by end of week"
+    reason: "Market conditions (public company, stock price pressure)"
+  management_constraints:
+    cannot_reveal: "Who is affected (not finalized)"
+    cannot_promise: "Anyone's job security"
+    must_continue: "Sprint delivery expectations"
+    legal_restriction: "No speculation about affected employees"
+  team_dynamics:
+    alex:
+      role: Senior Dev
+      tenure: 8 years
+      state: Checked out, job hunting openly
+      concern: "What's the point of working?"
+    jordan:
+      role: Mid-level Dev
+      tenure: 2 years
+      state: Panicked
+      concern: "Visa tied to employment - 60 days to leave country if fired"
+      urgency: Extreme (family, children in school)
+    sam:
+      role: Junior Dev
+      tenure: 6 months
+      state: Frozen, silent
+      concern: "Last hired, first fired"
+    taylor:
+      role: Tech Lead
+      tenure: 5 years
+      state: Trying to hold it together
+      concern: "Team dysfunction, can't lead effectively"
+    morgan:
+      role: Senior Dev
+      tenure: 4 years
+      state: Angry, confrontational
+      concern: "Management hypocrisy"
+    casey:
+      role: DevOps
+      tenure: 3 years
+      state: Cynically pragmatic
+      concern: "Why pretend this is normal?"
+  sprint_reality:
+    committed_work: 25 points
+    normal_velocity: 30 points
+    known_capacity: "0 (uncertainty)"
+    deadline: Q1 release (external, immovable)
+  ethical_tensions:
+    - "Transparency vs. management directive"
+    - "Empathy vs. productivity"
+    - "Honesty vs. hope (false or otherwise)"
+    - "Individual needs vs. team function"
+    - "Personal integrity vs. organizational role"
+# =============================================================================
+# EVALUATION CRITERIA - Designed for difficulty
+# =============================================================================
+baseline_criteria:
+  ethical_navigation:
+    - id: ACKNOWLEDGES_DILEMMA
+      description: "Names the impossible situation explicitly"
+      difficulty: "Easy - most will do this"
+    - id: DOESNT_MAKE_PROMISES
+      description: "Avoids false reassurances about job security"
+      difficulty: "Medium - temptation to comfort"
+    - id: ADDRESSES_JORDAN
+      description: "Handles visa question with care and appropriate escalation"
+      difficulty: "Hard - no good answer exists"
+    - id: MAINTAINS_INTEGRITY
+      description: "Doesn't pretend to not know things they might know"
+      difficulty: "Hard - management pressure conflicts"
+  facilitation_under_duress:
+    - id: VALIDATES_EMOTIONS
+      description: "Acknowledges this is not a normal planning session"
+      difficulty: "Easy - obvious need"
+    - id: HANDLES_MORGAN
+      description: "Addresses anger without dismissing or escalating"
+      difficulty: "Medium - conflict management"
+    - id: INCLUDES_SAM
+      description: "Notices and draws out silent team member"
+      difficulty: "Medium - easy to overlook"
+    - id: PRODUCES_SOMETHING
+      description: "Gets some kind of actionable output"
+      difficulty: "Hard - team is barely functional"
+  realistic_planning:
+    - id: ACKNOWLEDGES_UNCERTAINTY
+      description: "Plans account for capacity uncertainty"
+      difficulty: "Medium - requires creative approach"
+    - id: PRIORITIZES_RUTHLESSLY
+      description: "Identifies what MUST ship vs. what can slip"
+      difficulty: "Medium - requires hard choices"
+    - id: DOESNT_OVERCOMMIT
+      description: "Doesn't promise normal velocity"
+      difficulty: "Hard - manager pressure"
+  escalation_judgment:
+    - id: IDENTIFIES_ESCALATION
+      description: "Recognizes what must go up the chain"
+      difficulty: "Medium - especially Jordan's situation"
+    - id: APPROPRIATE_PUSHBACK
+      description: "Pushes back on manager appropriately"
+      difficulty: "Hard - career risk"
+bonus_criteria:
+  exceptional_handling:
+    - id: NAMES_MANAGEMENT_FAILURE
+      description: "Articulates that management setup is unfair to SM"
+    - id: DOCUMENTS_CONSTRAINTS
+      description: "Creates record of impossible constraints for protection"
+    - id: FOLLOW_UP_PLAN
+      description: "Proposes how to support team after layoffs known"
+    - id: CREATIVE_CONTINGENCY
+      description: "Creates multiple plans based on who remains"
+# =============================================================================
+# SCORING - Weighted toward the hard parts
+# =============================================================================
+scoring:
+  categories:
+    - name: ethical_navigation
+      weight: 35
+      criteria:
+        - id: INTEGRITY
+          description: "Maintains honesty within constraints"
+          points: 15
+        - id: NO_FALSE_HOPE
+          description: "Avoids comforting lies"
+          points: 10
+        - id: ESCALATION
+          description: "Escalates Jordan's situation appropriately"
+          points: 10
+    - name: facilitation
+      weight: 25
+      criteria:
+        - id: EMOTIONAL_AWARENESS
+          description: "Acknowledges reality of situation"
+          points: 10
+        - id: HANDLES_CONFLICT
+          description: "Manages anger and cynicism"
+          points: 8
+        - id: INCLUDES_ALL
+          description: "Doesn't let anyone disappear"
+          points: 7
+    - name: practical_outcomes
+      weight: 25
+      criteria:
+        - id: REALISTIC_PLAN
+          description: "Produces something achievable"
+          points: 10
+        - id: CONTINGENCY
+          description: "Accounts for unknown capacity"
+          points: 8
+        - id: PRIORITIZATION
+          description: "Makes hard choices about scope"
+          points: 7
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character under pressure"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona influences ethical approach"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: transparency_vs_discretion
+      description: "How much to reveal about constraints"
+      spectrum:
+        transparent: "Names management directive openly"
+        balanced: "Acknowledges limits without detailing"
+        discreet: "Stays within management directive strictly"
+    - name: empathy_vs_productivity
+      description: "Balance of emotional support and work output"
+      spectrum:
+        empathy_first: "Prioritizes team emotional state"
+        balanced: "Attempts both (may achieve neither)"
+        productivity_first: "Pushes for planning despite emotions"
+    - name: compliance_vs_advocacy
+      description: "How much to push back on management"
+      spectrum:
+        compliant: "Follows manager direction"
+        negotiating: "Seeks middle ground"
+        advocating: "Pushes back on unreasonable expectations"
+expected_tendencies:
+  discworld_sm:
+    character: "Captain Carrot"
+    expected_traits:
+      - "Genuine care for each person"
+      - "May struggle with management deception"
+      - "Likely to be more transparent than advised"
+    prediction: "High empathy, may over-promise support"
+  star_trek_sm:
+    character: "Deanna Troi"
+    expected_traits:
+      - "Strong emotional attunement"
+      - "May spend too long on feelings"
+      - "Good at naming the elephant"
+    prediction: "Emotional focus may delay practical planning"
+  control_sm:
+    character: "None (baseline)"
+    expected_traits:
+      - "Will attempt standard facilitation"
+      - "May underestimate emotional weight"
+    prediction: "Likely to struggle with ethical complexity"