npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/sm/dependency-deadlock.yaml ADDED Viewed

@@ -0,0 +1,414 @@
+---
+# Scenario: Cross-Team Dependency Deadlock
+# Category: sm
+# Purpose: Test technical coordination and dependency resolution skills
+id: sm-003
+name: dependency-deadlock
+title: "Cross-Team Dependency Deadlock"
+category: sm
+difficulty: easy  # Empirical: 87.20 ± 2.36 (n=10) - structured problem, clear solution
+version: "1.1"
+# Empirical calibration: 2026-01-01
+# Control baseline: mean=87.20, std=2.36, CI=[85.5, 88.9]
+# Original label "extreme" was incorrect - code interfaces provide clear solution path
+description: |
+  Four teams are blocked in a circular dependency: Team Alpha needs an API
+  from Beta, Team Beta needs a schema from Gamma, Team Gamma needs UX
+  requirements from Delta, and Team Delta needs the API response shape from
+  Alpha. Each claims it can't start until the team it depends on goes first.
+  Includes actual code interfaces showing the dependencies. The SM must break
+  the deadlock.
+purpose: |
+  This scenario tests both technical understanding and coordination skills.
+  A SM needs to understand the code dependencies well enough to propose
+  a breaking strategy, while also managing the political dynamics of four
+  teams blaming each other. Originally designed as an extreme-difficulty,
+  finals-caliber challenge; empirical calibration (see header) rates it easy.
+prompt: |
+  You are a Scrum Master asked to help resolve a cross-team dependency deadlock.
+  Four teams have been stuck for 2 weeks, each claiming the other needs to go first.
+  **THE SITUATION:**
+  **Team Alpha (Orders Service):**
+  "We can't implement order creation until Team Beta gives us the inventory
+  reservation API. We've been waiting 2 weeks. Here's what we need:"
+  ```typescript
+  // What Alpha needs from Beta
+  interface InventoryReservation {
+    reserveItems(orderId: string, items: LineItem[]): Promise<ReservationResult>;
+    releaseReservation(reservationId: string): Promise<void>;
+  }
+  ```
+  **Team Beta (Inventory Service):**
+  "We can't build the reservation API until Team Gamma finalizes the database
+  schema. They keep changing the product table structure. We need:"
+  ```sql
+  -- What Beta needs from Gamma
+  CREATE TABLE products (
+    id UUID PRIMARY KEY,
+    sku VARCHAR(50) UNIQUE NOT NULL,
+    quantity_available INTEGER NOT NULL,
+    quantity_reserved INTEGER NOT NULL,
+    -- Gamma hasn't decided: warehouse_id or location_json?
+  );
+  CREATE TABLE reservations (
+    id UUID PRIMARY KEY,
+    product_id UUID REFERENCES products(id),
+    -- Need to know if multi-warehouse support needed
+  );
+  ```
+  **Team Gamma (Platform/Data):**
+  "We can't finalize the schema until Team Delta gives us the UX requirements.
+  Are we doing multi-warehouse? Single location? They keep changing the design:"
+  ```
+  OPEN QUESTIONS FROM GAMMA:
+  1. Single warehouse or multi-warehouse?
+  2. If multi-warehouse, does reservation need to specify location?
+  3. What's the split/consolidation logic for orders across warehouses?
+  4. Need UX mockups to understand user mental model
+  ```
+  **Team Delta (Frontend/UX):**
+  "We can't finalize the designs until Team Alpha tells us what data the
+  API will return. We need to know the order response shape to design the
+  confirmation screen:"
+  ```typescript
+  // What Delta needs from Alpha
+  interface OrderConfirmation {
+    orderId: string;
+    // What fields are available?
+    // Will we show warehouse location?
+    // Estimated delivery per item or per order?
+    // What about partial fulfillment scenarios?
+  }
+  ```
+  **ADDITIONAL CONTEXT:**
+  - Sprint ends in 1 week
+  - All 4 teams have committed to stakeholders
+  - No single team has authority over the others
+  - Previous attempts at a joint meeting devolved into blame
+  - Product owner says "just figure it out"
+  **YOUR TASK:**
+  Break this deadlock. You must:
+  1. Analyze the actual dependencies (they may not be as circular as claimed)
+  2. Identify what can be parallelized with contracts/interfaces
+  3. Propose a concrete sequencing or parallel work strategy
+  4. Suggest technical approaches (contract-first, mocks, feature flags)
+  5. Facilitate agreement across all 4 teams
+  Be specific about:
+  - Who does what first
+  - What decisions can be made now vs. deferred
+  - How to prevent this pattern in the future
+code:
+  language: typescript
+  filename: dependency-overview.ts
+  content: |
+    /**
+     * DEPENDENCY ANALYSIS
+     *
+     * The claimed circular dependency:
+     *   Alpha → Beta → Gamma → Delta → Alpha
+     *
+     * But is it really circular? Let's trace the actual blocks:
+     *
+     * Alpha (Orders) claims to need:
+     *   - Beta's InventoryReservation API
+     *   - Actually just needs: interface contract, can mock
+     *
+     * Beta (Inventory) claims to need:
+     *   - Gamma's product schema
+     *   - Actually needs: decision on multi-warehouse (1 bit of info)
+     *
+     * Gamma (Platform) claims to need:
+     *   - Delta's UX requirements
+     *   - Actually needs: business decision, not UX design
+     *
+     * Delta (UX) claims to need:
+     *   - Alpha's API response shape
+     *   - Actually needs: data model concepts, not implementation
+     *
+     * POSSIBLE BREAKING POINTS:
+     * 1. Multi-warehouse decision can be made NOW by product owner
+     * 2. Interface contracts can be defined before implementation
+     * 3. UX can design for "worst case" (multi-warehouse) and simplify later
+     * 4. Each team can work against interface, not implementation
+     */
+    // Alpha can define this NOW (their output contract)
+    interface OrderConfirmation {
+      orderId: string;
+      status: 'confirmed' | 'partial' | 'pending';
+      items: Array<{
+        productId: string;
+        quantity: number;
+        fulfillmentLocation?: string;  // optional for now
+        estimatedDelivery: string;
+      }>;
+      // Feature flag: show warehouse details
+      showWarehouseDetails: boolean;
+    }
+    // Beta can define this NOW (their input contract)
+    interface InventoryReservation {
+      reserveItems(
+        orderId: string,
+        items: Array<{ productId: string; quantity: number }>
+      ): Promise<{
+        reservationId: string;
+        reservedItems: Array<{
+          productId: string;
+          quantity: number;
+          location?: string;  // optional until multi-warehouse decided
+        }>;
+        partialFulfillment: boolean;
+      }>;
+      releaseReservation(reservationId: string): Promise<void>;
+    }
+    // Gamma can start with this schema (add warehouse later)
+    const INITIAL_SCHEMA = `
+      CREATE TABLE products (
+        id UUID PRIMARY KEY,
+        sku VARCHAR(50) UNIQUE NOT NULL,
+        quantity_available INTEGER NOT NULL,
+        quantity_reserved INTEGER NOT NULL DEFAULT 0,
+        warehouse_id UUID NULL  -- nullable for now, add FK later
+      );
+      CREATE TABLE reservations (
+        id UUID PRIMARY KEY,
+        order_id UUID NOT NULL,
+        product_id UUID REFERENCES products(id),
+        quantity INTEGER NOT NULL,
+        warehouse_id UUID NULL,  -- nullable for now
+        expires_at TIMESTAMP NOT NULL,
+        created_at TIMESTAMP DEFAULT NOW()
+      );
+    `;
+    // Delta can design with this abstraction
+    interface UXDataContract {
+      // Order confirmation screen needs:
+      order: {
+        id: string;
+        items: Array<{
+          name: string;
+          quantity: number;
+          delivery: string;
+          // Optional warehouse info (show if feature enabled)
+          warehouseInfo?: {
+            name: string;
+            distance: string;
+          };
+        }>;
+      };
+    }
+context:
+  teams:
+    - name: Alpha
+      domain: Orders Service
+      blocked_by: Beta (inventory API)
+      actually_needs: Interface contract
+    - name: Beta
+      domain: Inventory Service
+      blocked_by: Gamma (schema)
+      actually_needs: Multi-warehouse decision
+    - name: Gamma
+      domain: Platform/Data
+      blocked_by: Delta (UX requirements)
+      actually_needs: Business decision
+    - name: Delta
+      domain: Frontend/UX
+      blocked_by: Alpha (API shape)
+      actually_needs: Data model concepts
+  breaking_strategies:
+    - name: Contract-First
+      description: Define interfaces before implementation
+    - name: Feature Flags
+      description: Implement with optional fields, enable later
+    - name: Decision Forcing
+      description: Get product owner to make warehouse decision NOW
+    - name: Parallel with Mocks
+      description: Each team works against mocked dependencies
+# =============================================================================
+# EVALUATION CRITERIA
+# =============================================================================
+baseline_criteria:
+  analysis:
+    - id: IDENTIFIES_FALSE_DEPENDENCIES
+      description: "Recognizes that some 'dependencies' are actually decisions"
+    - id: TRACES_REAL_BLOCKS
+      description: "Identifies what each team actually needs"
+    - id: FINDS_BREAKING_POINT
+      description: "Identifies that warehouse decision breaks the cycle"
+  technical_solutions:
+    - id: PROPOSES_CONTRACTS
+      description: "Suggests interface/contract-first approach"
+    - id: SUGGESTS_PARALLEL_WORK
+      description: "Shows how teams can work in parallel"
+    - id: USES_FEATURE_FLAGS
+      description: "Suggests progressive disclosure via flags"
+  coordination:
+    - id: SEQUENCES_WORK
+      description: "Provides clear sequencing for the teams"
+    - id: ASSIGNS_OWNERSHIP
+      description: "Clarifies who owns which decision"
+    - id: SETS_TIMELINE
+      description: "Proposes realistic timeline for resolution"
+  facilitation:
+    - id: MANAGES_BLAME
+      description: "Redirects blame to systemic solutions"
+    - id: CREATES_SHARED_UNDERSTANDING
+      description: "Helps all teams see the full picture"
+bonus_criteria:
+  prevention:
+    - id: PROPOSES_DEPENDENCY_PROCESS
+      description: "Suggests cross-team dependency management process"
+    - id: ARCHITECTURE_INSIGHT
+      description: "Notes how better API design prevents this"
+    - id: DOCUMENTATION_PRACTICE
+      description: "Suggests interface documentation practices"
+  advanced_technical:
+    - id: CDC_TESTING
+      description: "Suggests consumer-driven contract testing"
+    - id: API_VERSIONING
+      description: "Notes API versioning as mitigation"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  categories:
+    - name: technical_analysis
+      weight: 30
+      criteria:
+        - id: UNDERSTANDS_CODE
+          description: "Correctly interprets the code dependencies"
+          points: 15
+        - id: FINDS_SOLUTION
+          description: "Proposes viable technical breaking strategy"
+          points: 15
+    - name: coordination
+      weight: 30
+      criteria:
+        - id: CLEAR_PLAN
+          description: "Provides actionable plan for all 4 teams"
+          points: 15
+        - id: REALISTIC
+          description: "Plan fits 1-week timeline"
+          points: 15
+    - name: facilitation
+      weight: 25
+      criteria:
+        - id: MANAGES_DYNAMICS
+          description: "Handles inter-team blame constructively"
+          points: 10
+        - id: GETS_AGREEMENT
+          description: "Proposes path to agreement"
+          points: 10
+        - id: ESCALATION
+          description: "Knows when to escalate (warehouse decision)"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances technical explanation"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: technical_depth
+      description: "How deeply technical solutions are explored"
+      spectrum:
+        high_level: "Focuses on process, delegates technical details"
+        balanced: "Provides both process and technical guidance"
+        deep_technical: "Dives into code-level solutions"
+    - name: coordination_style
+      description: "How the cross-team work is organized"
+      spectrum:
+        sequential: "Clear ordering, one at a time"
+        parallel: "Maximum parallelization"
+        hybrid: "Critical path + parallel non-blocking"
+    - name: escalation_threshold
+      description: "When to escalate vs. solve locally"
+      spectrum:
+        low: "Escalates decisions to product owner early"
+        medium: "Tries to solve, escalates blockers"
+        high: "Attempts to resolve everything at team level"
+expected_tendencies:
+  discworld_sm:
+    character: "Captain Carrot"
+    expected_traits:
+      - "Practical - finds workable solutions"
+      - "May not dive deep into code"
+      - "Good at getting teams to cooperate"
+    technical_depth_prediction: "balanced"
+  star_trek_sm:
+    character: "Deanna Troi"
+    expected_traits:
+      - "Senses underlying frustrations"
+      - "May focus on team dynamics over technical"
+      - "Good at facilitating agreement"
+    technical_depth_prediction: "high_level"
+  control_sm:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard coordination approach"
+    technical_depth_prediction: "baseline reference"