npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/sm/sprint-planning-conflict.yaml ADDED Viewed

@@ -0,0 +1,303 @@
+---
+# Scenario: Sprint Planning Conflict Resolution
+# Category: sm
+# Purpose: Test facilitation and prioritization skills
+# Identity and difficulty metadata. `name` matches the file's base name
+# (sprint-planning-conflict.yaml); `version` is quoted so it stays a string.
+id: sm-001
+name: sprint-planning-conflict
+title: "Sprint Planning Conflict Resolution"
+category: sm
+difficulty: easy  # Empirical: 90.50 mean (n=10) - highest SM score
+version: "1.1"
+# Empirical calibration: 2026-01-01
+# Control baseline: mean=90.50, std=2.29, CI=[88.9, 92.1]
+# Original label "medium" was incorrect - concrete constraints make solution clearer
+# Scenario narrative (description, purpose, prompt) and its structured mirror
+# (context). The requested total is 29 points: 8 (auth refactor) + 5 + 5 + 3
+# (Sam's three features) + 8 (test automation). It becomes 32 if Taylor's
+# 8-point estimate for the PDF export is accepted. Capacity stays at 20.
+# NOTE(review): the total was previously stated as 35, which no combination
+# of the listed estimates produces. The calibration figures in the header
+# comments were presumably gathered with the old wording - confirm whether a
+# re-run is needed.
+description: |
+  A sprint planning session where stakeholders have competing priorities:
+  tech lead wants a major refactor, PM wants 3 new features, QA says test
+  coverage is dangerously low, and one dev is on PTO. Capacity is 20 points
+  but requests total 29. SM must facilitate a resolution.
+purpose: |
+  This scenario tests soft skills: facilitation, prioritization, negotiation.
+  A "directive" persona might impose a solution. A "collaborative" persona
+  might facilitate consensus. Measures ability to balance competing concerns.
+prompt: |
+  You are the Scrum Master facilitating sprint planning. Your team has
+  20 story points capacity for the upcoming sprint, but stakeholders are
+  requesting 29 points of work.
+  **TEAM CAPACITY:**
+  - 4 developers × 5 points each = 20 points
+  - 1 developer (Alex) is on PTO all sprint
+  - No carryover from last sprint
+  **STAKEHOLDER REQUESTS:**
+  **Tech Lead (Jordan):** "We MUST do the authentication refactor. It's 8 points
+  but it's blocking 3 other teams. Every sprint we delay costs us more. The
+  current code is a security liability."
+  **Product Manager (Sam):** "I promised the CEO three features for the board
+  demo next month:
+  - User dashboard redesign (5 points)
+  - Export to PDF (5 points)
+  - Email notifications (3 points)
+  If we don't deliver these, I'm going to have a very uncomfortable conversation."
+  **QA Lead (Morgan):** "Test coverage dropped to 45% last sprint. I need at least
+  8 points for test automation debt, or I can't guarantee quality for anything
+  we ship. We had 3 production bugs last month."
+  **Senior Dev (Taylor):** "Two of those production bugs were in authentication.
+  Jordan's refactor would actually fix those. But the PDF export is technically
+  complex - whoever estimated 5 points is dreaming, it's more like 8."
+  **CONSTRAINTS:**
+  - Sprint starts Monday, cannot slip
+  - One dev (Alex) on PTO reduces capacity
+  - Dependencies: Dashboard needs authentication stable
+  - CEO demo is in 4 weeks (2 sprints)
+  **YOUR TASK:**
+  As Scrum Master, facilitate this planning session:
+  1. Acknowledge all concerns
+  2. Identify the real priorities and constraints
+  3. Propose a sprint plan that maximizes value
+  4. Handle pushback constructively
+  5. Document decisions and commitments
+  Be specific about what goes in the sprint, what gets deferred, and why.
+context:
+  # Five people on the team; Alex is on PTO, leaving 4 x 5 = 20 points.
+  team_size: 5
+  capacity_points: 20
+  # Sum of the integer story estimates below (8 + 5 + 5 + 3 + 8).
+  requested_points: 29
+  sprint_length: 2_weeks
+  external_deadline: 4_weeks
+  stakeholders:
+    - name: Jordan
+      role: Tech Lead
+      priority: Authentication refactor (8 pts)
+      concern: Security and blocking other teams
+    - name: Sam
+      role: Product Manager
+      priority: 3 features (13 pts total)
+      concern: CEO commitment
+    - name: Morgan
+      role: QA Lead
+      priority: Test automation (8 pts)
+      concern: Quality and coverage
+    - name: Taylor
+      role: Senior Dev
+      priority: None specific
+      concern: Accurate estimation, production stability
+  stories:
+    - id: AUTH-REFACTOR
+      title: Authentication refactor
+      points: 8
+      requested_by: Jordan
+      notes: Blocks 3 teams, fixes 2 prod bugs
+    - id: DASHBOARD
+      title: User dashboard redesign
+      points: 5
+      requested_by: Sam
+      notes: Needs stable auth
+    - id: PDF-EXPORT
+      title: Export to PDF
+      # NOTE(review): a string here, integers on every other story - confirm
+      # that consumers of `points` accept both.
+      points: "5 (estimated) / 8 (Taylor's estimate)"
+      requested_by: Sam
+      notes: Technically complex
+    - id: EMAIL-NOTIFY
+      title: Email notifications
+      points: 3
+      requested_by: Sam
+      notes: Standalone feature
+    - id: TEST-AUTOMATION
+      title: Test automation debt
+      points: 8
+      requested_by: Morgan
+      notes: Coverage at 45%
+# =============================================================================
+# EVALUATION CRITERIA
+# =============================================================================
+# Baseline criteria: four groups (facilitation, prioritization, resolution,
+# communication) of three checks each. Entries carry only an id and a
+# description; point values appear under `scoring`, not here.
+baseline_criteria:
+  facilitation:
+    - id: ACKNOWLEDGES_ALL
+      description: "Acknowledges each stakeholder's concern"
+    - id: NO_DISMISSAL
+      description: "Doesn't dismiss any request outright"
+    - id: ASKS_QUESTIONS
+      description: "Asks clarifying questions before deciding"
+  prioritization:
+    - id: IDENTIFIES_DEPENDENCIES
+      description: "Notes dashboard depends on auth stability"
+    - id: CONSIDERS_RISK
+      description: "Weighs security/quality risks"
+    - id: ADDRESSES_ESTIMATION
+      description: "Addresses Taylor's concern about PDF estimate"
+  resolution:
+    - id: VIABLE_PLAN
+      description: "Produces a plan that fits capacity"
+    - id: CLEAR_DECISIONS
+      description: "Explicitly states what's in and what's out"
+    - id: EXPLAINS_TRADEOFFS
+      description: "Explains why deferred items are deferred"
+  communication:
+    - id: MANAGES_EXPECTATIONS
+      description: "Sets realistic expectations with Sam"
+    - id: OFFERS_ALTERNATIVES
+      description: "Proposes alternatives for deferred work"
+    - id: DOCUMENTS_DECISIONS
+      description: "Documents sprint commitment clearly"
+# Bonus criteria: two groups of three checks each, again id + description
+# only (no point values are attached in this section).
+bonus_criteria:
+  creative_solutions:
+    - id: SPLITS_STORIES
+      description: "Suggests splitting large stories"
+    - id: PARALLEL_PATHS
+      description: "Identifies work that can proceed in parallel"
+    - id: NEXT_SPRINT_PLAN
+      description: "Outlines plan for following sprint"
+  stakeholder_management:
+    - id: REFRAMES_CONSTRAINTS
+      description: "Reframes the situation positively"
+    - id: BUILDS_CONSENSUS
+      description: "Gets stakeholders to agree, not just comply"
+    - id: ESCALATION_PATH
+      description: "Notes when/how to escalate if needed"
+# =============================================================================
+# SCORING
+# =============================================================================
+# Category weights total 100 (30 + 30 + 25 + 15), and within each category
+# the criteria points add up to that category's weight. The criterion ids
+# used here are distinct from the ids listed under `baseline_criteria`.
+scoring:
+  categories:
+    - name: facilitation
+      weight: 30
+      criteria:
+        - id: INCLUSIVE
+          description: "All voices heard and acknowledged"
+          points: 10
+        - id: CONSTRUCTIVE
+          description: "Keeps discussion productive"
+          points: 10
+        - id: NEUTRAL
+          description: "Doesn't take sides unfairly"
+          points: 10
+    - name: prioritization
+      weight: 30
+      criteria:
+        - id: VALUE_BASED
+          description: "Decisions based on clear value criteria"
+          points: 10
+        - id: RISK_AWARE
+          description: "Considers risks and dependencies"
+          points: 10
+        - id: REALISTIC
+          description: "Plan is achievable within capacity"
+          points: 10
+    - name: communication
+      weight: 25
+      criteria:
+        - id: CLEAR_OUTCOME
+          description: "Sprint plan is unambiguous"
+          points: 10
+        - id: EXPECTATIONS_SET
+          description: "Stakeholders know what to expect"
+          points: 8
+        - id: FOLLOW_UP
+          description: "Action items and follow-ups identified"
+          points: 7
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances facilitation style"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+# Three dimensions, each described by a three-label spectrum. The labels of
+# `facilitation_style` are the ones reused by `facilitation_prediction` in
+# `expected_tendencies`.
+persona_influence:
+  dimensions:
+    - name: facilitation_style
+      description: "How the session is run"
+      spectrum:
+        directive: "Takes charge, proposes solution early"
+        collaborative: "Facilitates group decision-making"
+        consensus: "Won't proceed until everyone agrees"
+    - name: conflict_tolerance
+      description: "How disagreement is handled"
+      spectrum:
+        avoiding: "Smooths over conflict quickly"
+        addressing: "Names conflict constructively"
+        embracing: "Uses conflict to find better solutions"
+    - name: stakeholder_balance
+      description: "How competing needs are weighted"
+      spectrum:
+        technical: "Favors engineering concerns"
+        business: "Favors product/business concerns"
+        balanced: "Weighs all concerns equally"
+# Per-persona expectations. For the two character personas,
+# `facilitation_prediction` is a label from the `facilitation_style` spectrum
+# (collaborative, consensus); the control entry uses the free-text value
+# "baseline reference" instead of a spectrum label.
+expected_tendencies:
+  discworld_sm:
+    character: "Captain Carrot"
+    expected_traits:
+      - "Earnest - genuinely wants to help everyone"
+      - "Practical - finds workable solutions"
+      - "Respectful - treats all concerns as valid"
+    facilitation_prediction: "collaborative"
+  star_trek_sm:
+    character: "Deanna Troi"
+    expected_traits:
+      - "Empathetic - senses underlying concerns"
+      - "Diplomatic - navigates politics well"
+      - "May over-focus on feelings vs. decisions"
+    facilitation_prediction: "consensus"
+  control_sm:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard scrum master behavior"
+    facilitation_prediction: "baseline reference"

package/scenarios/sm/story-breakdown.yaml ADDED Viewed

@@ -0,0 +1,240 @@
+---
+# Scenario: Epic to Story Breakdown
+# Category: sm
+# Purpose: Test requirements decomposition and story writing skills
+# Identity and difficulty metadata. `name` matches the file's base name
+# (story-breakdown.yaml); `version` is quoted so it stays a string.
+id: sm-004
+name: story-breakdown
+title: "Epic to Story Breakdown"
+category: sm
+difficulty: medium  # Empirical: 85.50 ± 1.50 (n=10) - borderline medium/hard
+version: "1.1"
+# Empirical calibration: 2026-01-01
+# Control baseline: mean=85.50, std=1.50, CI=[84.4, 86.6]
+# Original label "easy" was incorrect - open-ended nature makes it harder
+# Free-text fields written as literal block scalars (`|`), so line breaks are
+# kept as written. Anything placed inside these blocks, including a line
+# starting with `#`, is part of the text rather than a YAML comment.
+description: |
+  Given a vague epic from the product manager, break it down into well-formed
+  user stories with acceptance criteria. Tests ability to clarify requirements,
+  identify scope, and write testable stories.
+purpose: |
+  This scenario tests requirements analysis. A "thorough" persona might
+  create more stories. A "pragmatic" persona might focus on core functionality.
+  Measures ability to take ambiguous input and produce actionable work items.
+prompt: |
+  You are a Scrum Master working with a Product Manager to break down a new epic.
+  **THE EPIC:**
+  "We need user notifications. Users should be able to get notified about
+  important things happening in the app."
+  That's all the PM gave you. They're in meetings all day and can't clarify.
+  **YOUR TASK:**
+  1. Identify the questions you WOULD ask the PM (document them)
+  2. Make reasonable assumptions for each unanswered question
+  3. Break the epic into 4-8 user stories
+  4. For each story, write:
+     - User story format: "As a [user], I want [feature], so that [benefit]"
+     - 3-5 acceptance criteria (testable, specific)
+     - Story point estimate (1, 2, 3, 5, or 8)
+     - Dependencies on other stories (if any)
+  5. Identify any technical enabler stories needed
+  6. Suggest a prioritized order for implementation
+  **CONSTRAINTS:**
+  - Team velocity is ~20 points per sprint
+  - No existing notification infrastructure exists
+  - App has web and mobile clients
+  - Users already have email addresses in the system
+  Be specific and create stories that a developer could start working on.
+# Structured restatement of the prompt's constraints (velocity 20, no
+# notification infrastructure, web + mobile clients, email on file), followed
+# by reference lists of likely questions and reasonable assumptions.
+context:
+  epic_description: "User notifications for important app events"
+  team_velocity: 20
+  # Plain string value, not a YAML null.
+  existing_infrastructure: None for notifications
+  clients: [web, mobile]
+  user_data_available: [email]
+  likely_questions:
+    - What events should trigger notifications?
+    - What channels (email, push, in-app, SMS)?
+    - Should users control their preferences?
+    - What's the priority/urgency model?
+    - Are there compliance requirements?
+    - What's the MVP vs. full vision?
+  reasonable_assumptions:
+    - Start with email and in-app notifications
+    - Users can opt-out of non-critical notifications
+    - MVP focuses on 2-3 key event types
+    - Preferences UI can come in phase 2
+# =============================================================================
+# EVALUATION CRITERIA
+# =============================================================================
+# Baseline criteria in four groups: clarification (3 checks), story_quality
+# (4), completeness (3) and practicality (2). Entries carry only an id and a
+# description; point values appear under `scoring`, not here.
+baseline_criteria:
+  clarification:
+    - id: IDENTIFIES_GAPS
+      description: "Lists key questions that need PM clarification"
+    - id: DOCUMENTS_ASSUMPTIONS
+      description: "States assumptions made in PM's absence"
+    - id: REASONABLE_ASSUMPTIONS
+      description: "Assumptions are sensible defaults"
+  story_quality:
+    - id: USER_STORY_FORMAT
+      description: "Stories follow As a/I want/So that format"
+    - id: TESTABLE_ACS
+      description: "Acceptance criteria are specific and testable"
+    - id: APPROPRIATE_SIZE
+      description: "Stories are right-sized (not too big/small)"
+    - id: CLEAR_SCOPE
+      description: "Story scope is unambiguous"
+  completeness:
+    - id: TECHNICAL_ENABLERS
+      description: "Identifies infrastructure/enabler stories"
+    - id: DEPENDENCIES_NOTED
+      description: "Story dependencies are documented"
+    - id: PRIORITIZATION
+      description: "Suggests implementation order"
+  practicality:
+    - id: FITS_VELOCITY
+      description: "Stories fit team velocity constraints"
+    - id: MVP_FOCUSED
+      description: "Distinguishes MVP from future enhancements"
+# Bonus criteria: depth (3 checks) and process (2 checks), id + description
+# only (no point values are attached in this section).
+bonus_criteria:
+  depth:
+    - id: EDGE_CASES
+      description: "Stories cover edge cases (opt-out, failures)"
+    - id: NON_FUNCTIONAL
+      description: "Considers performance, scalability"
+    - id: MOBILE_SPECIFIC
+      description: "Addresses mobile push notification setup"
+  process:
+    - id: DEFINITION_OF_DONE
+      description: "Suggests DoD for notification stories"
+    - id: RISK_IDENTIFICATION
+      description: "Notes risks or unknowns"
+# =============================================================================
+# SCORING
+# =============================================================================
+# Category weights total 100 (30 + 40 + 15 + 15), and within each category
+# the criteria points add up to that category's weight. The id PRIORITIZATION
+# also appears under `baseline_criteria`; the two entries have different
+# descriptions.
+scoring:
+  categories:
+    - name: analysis
+      weight: 30
+      criteria:
+        - id: QUESTIONS
+          description: "Identifies right clarifying questions"
+          points: 15
+        - id: ASSUMPTIONS
+          description: "Makes sensible documented assumptions"
+          points: 15
+    - name: story_writing
+      weight: 40
+      criteria:
+        - id: FORMAT
+          description: "Stories follow standard format"
+          points: 10
+        - id: ACCEPTANCE_CRITERIA
+          description: "ACs are testable and complete"
+          points: 15
+        - id: SIZING
+          description: "Story points are reasonable"
+          points: 10
+        - id: DEPENDENCIES
+          description: "Dependencies correctly identified"
+          points: 5
+    - name: planning
+      weight: 15
+      criteria:
+        - id: PRIORITIZATION
+          description: "Logical implementation order"
+          points: 8
+        - id: MVP_SCOPE
+          description: "Clear MVP vs future distinction"
+          points: 7
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances story clarity"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+# Three dimensions, each described by a three-label spectrum. The labels of
+# `story_granularity` are the ones reused by `granularity_prediction` in
+# `expected_tendencies`.
+persona_influence:
+  dimensions:
+    - name: story_granularity
+      description: "How fine-grained the breakdown is"
+      spectrum:
+        coarse: "Fewer, larger stories"
+        balanced: "Right-sized stories"
+        fine: "Many small stories"
+    - name: assumption_style
+      description: "How assumptions are handled"
+      spectrum:
+        conservative: "Minimal assumptions, notes unknowns"
+        moderate: "Reasonable defaults with documentation"
+        aggressive: "Makes decisions, moves forward"
+    - name: technical_depth
+      description: "How much technical detail included"
+      spectrum:
+        business_only: "Focuses on user value"
+        balanced: "Includes technical enablers"
+        technical: "Detailed technical considerations"
+# Per-persona expectations. For the two character personas,
+# `granularity_prediction` is a label from the `story_granularity` spectrum
+# (balanced, fine); the control entry uses the free-text value
+# "baseline reference" instead of a spectrum label.
+expected_tendencies:
+  discworld_sm:
+    character: "Captain Carrot"
+    expected_traits:
+      - "Practical - reasonable assumptions"
+      - "Clear - well-written stories"
+      - "May be optimistic about scope"
+    granularity_prediction: "balanced"
+  star_trek_sm:
+    character: "Deanna Troi"
+    expected_traits:
+      - "Thorough - many questions identified"
+      - "User-focused - emphasizes user value"
+      - "May over-analyze requirements"
+    granularity_prediction: "fine"
+  control_sm:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard story breakdown approach"
+    granularity_prediction: "baseline reference"