npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/schema.yaml ADDED Viewed

@@ -0,0 +1,639 @@
+---
+# Thunderdome Scenario Schema
+# Version 1.0 - Battle scenario format for persona benchmarking
+#
+# Thunderdome uses competitive "duels" as a fun framing for rigorous
+# persona performance evaluation. Scenarios define challenges that
+# measure how different personas approach the same problem.
+schema:
+  version: "1.0"
+  # ============================================================================
+  # Required Fields
+  # ============================================================================
+  # Each entry below names a mandatory scenario key and declares its value
+  # type, a human-readable description and, where given, a sample value.
+  required:
+    name:
+      type: string
+      format: kebab-case
+      description: "Unique scenario identifier"
+      example: "code-review-user-service"
+    title:
+      type: string
+      description: "Human-readable scenario name"
+      example: "User Service Code Review"
+    category:
+      type: enum
+      # NOTE(review): the package also ships scenarios under debug/,
+      # debugging/ and swe-bench/ directories, none of which appear as a
+      # value here — confirm which category those scenarios declare.
+      values: [code-review, architecture, dev, tea, sm, pm, reviewer, general]
+      description: "Which agent type this scenario targets"
+    difficulty:
+      type: enum
+      # The same four labels key the bands under difficulty_calibration.
+      # NOTE(review): this field is listed as required, yet `defaults` also
+      # supplies `difficulty: medium` — confirm which one loaders honour.
+      values: [easy, medium, hard, extreme]
+      description: "Difficulty level"
+    prompt:
+      type: string
+      # The example scenarios in this file supply prompts as `|` block scalars.
+      multiline: true
+      description: "The challenge presented to contestants"
+  # ============================================================================
+  # Optional Metadata
+  # ============================================================================
+  # Keys a scenario may omit. Within each entry, `description` documents the
+  # field; the entry *named* `description` is itself one of the optional fields.
+  optional:
+    id:
+      type: string
+      description: "Short ID for reference (e.g., cr-001, arch-002)"
+    version:
+      type: string
+      # Quoted so the value stays the string "1.0" instead of the float 1.0.
+      # The same value is repeated under `defaults.version`.
+      default: "1.0"
+      description: "Scenario version for iteration tracking"
+    description:
+      type: string
+      description: "What this scenario tests"
+    purpose:
+      type: string
+      multiline: true
+      description: "Detailed explanation of what's being measured"
+    tags:
+      type: array
+      items: string
+      description: "Categorization tags"
+    constraints:
+      type: array
+      items: string
+      description: "Rules contestants must follow"
+    context:
+      type: string
+      multiline: true
+      description: "Additional context for contestants"
+  # ============================================================================
+  # Code-Based Scenarios
+  # For code review, debugging, implementation challenges
+  # ============================================================================
+  # Three optional code attachments (`code`, `tests`, `stub`) that share one
+  # shape: language, filename and a multi-line content string.
+  # NOTE(review): the code_review example in this file places `code` directly
+  # at the scenario's top level, so `code_content` looks like a grouping label
+  # in this schema rather than a key scenarios write — confirm.
+  code_content:
+    code:
+      type: object
+      description: "Code to review/fix/implement"
+      schema:
+        language: string
+        filename: string
+        # "string (multiline)" is an informal type note stored as a plain scalar.
+        content: string (multiline)
+    tests:
+      type: object
+      description: "Test suite for TDD scenarios"
+      schema:
+        language: string
+        filename: string
+        content: string (multiline)
+    stub:
+      type: object
+      description: "Starter code for implementation"
+      schema:
+        language: string
+        filename: string
+        content: string (multiline)
+  # ============================================================================
+  # TRAIL Error Taxonomy (Epic 14)
+  # Categorizes errors for OCEAN personality correlation research
+  # Based on Patronus AI's TRAIL benchmark error categories
+  # ============================================================================
+  # Optional three-way error classification; `categories` documents each
+  # permitted value with a description and sample failure modes.
+  # In this file's examples the key is set on individual baseline issues.
+  error_type:
+    # NOTE(review): declared as `type: string` + `enum:` whereas `category`
+    # and `difficulty` use `type: enum` + `values:` — confirm the consumer
+    # accepts both spellings.
+    type: string
+    enum: [reasoning, planning, execution]
+    required: false
+    description: "TRAIL error category for OCEAN correlation analysis"
+    categories:
+      reasoning:
+        description: "Logic and decision-making failures"
+        examples: ["incorrect inferences", "contradictions", "false assumptions", "circular logic"]
+      planning:
+        description: "Task orchestration and coordination failures"
+        examples: ["sequencing errors", "dependency gaps", "resource misallocation", "incomplete plans"]
+      execution:
+        description: "System and tool interaction failures"
+        examples: ["timeouts", "context overflow", "tool misuse", "API errors"]
+  # ============================================================================
+  # Baseline Issues (for code review/debugging scenarios)
+  # Known issues seeded in the code - NOT shown to contestants
+  # Used to measure detection rate
+  # ============================================================================
+  # Seeded issues grouped into four severity buckets. The bucket names match
+  # the keys of `severity_weights`, and all four buckets use the same item
+  # shape: id, location, description and an optional error_type.
+  baseline_issues:
+    type: object
+    description: "Known issues for scoring detection rate"
+    schema:
+      critical:
+        type: array
+        items:
+          id: string
+          location: string
+          description: string
+          # Informal type note; the allowed values mirror the top-level
+          # `error_type` enum.
+          error_type: string (optional, enum: reasoning|planning|execution)
+      high:
+        type: array
+        items:
+          id: string
+          location: string
+          description: string
+          error_type: string (optional, enum: reasoning|planning|execution)
+      medium:
+        type: array
+        items:
+          id: string
+          location: string
+          description: string
+          error_type: string (optional, enum: reasoning|planning|execution)
+      low:
+        type: array
+        items:
+          id: string
+          location: string
+          description: string
+          error_type: string (optional, enum: reasoning|planning|execution)
+  # Flat list of extra findings; unlike baseline issues, items carry only an
+  # id and a description (no location, severity bucket or error_type).
+  bonus_issues:
+    type: array
+    description: "Extra issues thorough contestants might find"
+    items:
+      id: string
+      description: string
+  # ============================================================================
+  # Scoring Rubric
+  # How to evaluate contestant responses
+  # ============================================================================
+  # Per-scenario rubric: an issue count, severity weights (current and
+  # legacy forms) and a list of weighted categories made of point-valued
+  # criteria.
+  scoring:
+    type: object
+    description: "Evaluation rubric with weighted categories"
+    schema:
+      total_baseline_issues:
+        type: integer
+        description: "Count of known issues to find"
+      severity_weights:
+        type: object
+        description: "Severity weights for detection scoring (v2)"
+        # Same default values as detection_scoring_v2.severity_weights.
+        default:
+          critical: 15
+          high: 10
+          medium: 5
+          low: 2
+        note: "Used for weighted recall calculation"
+      # Deprecated predecessor of `severity_weights`; the code_review example
+      # in this file still uses this key.
+      weights:
+        type: object
+        description: "Legacy severity weights (deprecated, use severity_weights)"
+        example:
+          critical: 3
+          high: 2
+          medium: 1
+          low: 0.5
+      categories:
+        type: array
+        items:
+          name: string
+          # In both full examples in this file the category weights sum to
+          # 100 and each category's criteria points sum to its weight.
+          weight: integer (percentage of total score)
+          criteria:
+            type: array
+            items:
+              id: string
+              description: string
+              points: integer
+  # ============================================================================
+  # Detection Scoring v2 (Precision/Recall)
+  # Replaces additive scoring with explicit precision/recall metrics
+  # ============================================================================
+  # Configuration of the precision/recall detection score: default point
+  # values, how the subtotal is split, the metrics reported, and the design
+  # rationale.
+  detection_scoring_v2:
+    type: object
+    description: "Precision/recall based detection scoring configuration"
+    schema:
+      severity_weights:
+        type: object
+        description: "Point values by severity for weighted recall"
+        # Same default values as scoring.severity_weights.
+        default:
+          critical: 15
+          high: 10
+          medium: 5
+          low: 2
+      component_weights:
+        type: object
+        description: "How detection subtotal (50 pts) is allocated"
+        # The three defaults add up to the 50-point subtotal (30 + 10 + 10).
+        default:
+          recall: 30        # Weighted recall × 30 (coverage priority)
+          precision: 10     # Precision × 10 (penalize hallucinations)
+          novel_bonus: 10   # min(novel_valid × 3, 10) (reward thoroughness)
+      metrics_output:
+        type: object
+        description: "Metrics calculated and reported"
+        # Values are human-readable formulas stored as strings, not
+        # expressions to evaluate.
+        fields:
+          weighted_found: "Sum of (found_issues × severity_weight)"
+          weighted_total: "Sum of (all_baseline × severity_weight)"
+          recall: "weighted_found / weighted_total"
+          precision: "true_positives / (true_positives + false_positives)"
+          # F-beta with beta = 2: (1 + 2^2) × P × R / (2^2 × P + R).
+          f2_score: "5 × (P × R) / (4P + R) - recall-biased harmonic mean"
+      # Literal block scalar: the text below, including its Markdown-style
+      # ** markers, is stored verbatim.
+      rationale: |
+        This scoring system addresses several issues with the original additive approach:
+        1. **Explicit precision/recall trade-off**: Previously hidden, now visible
+        2. **Severity-weighted recall**: Critical issues matter more than low
+        3. **Novel findings preserved**: Bonus pool separate from precision calculation
+        4. **Transparent metrics**: All intermediate values reported for debugging
+        Design choices:
+        - Recall weighted 3x precision (30 vs 10 pts) because missing vulnerabilities
+          is typically worse than false positives in security review
+        - F2 score reported for reference but not used in final scoring to maintain
+          interpretability of component scores
+        - Novel bonus capped at 10 pts to prevent gaming via quantity over quality
+  # ============================================================================
+  # Persona Influence Areas
+  # How different personas should legitimately differ
+  # ============================================================================
+  # A list of named dimensions, each with a `spectrum` mapping that labels
+  # the range of acceptable approaches along that dimension.
+  persona_influence:
+    type: object
+    description: "Dimensions where persona should affect response"
+    schema:
+      dimensions:
+        type: array
+        items:
+          name: string
+          description: string
+          spectrum:
+            type: object
+            description: "Range of valid approaches"
+            # Illustrative keys only: the example scenarios in this file also
+            # use other key sets, such as security_first/quality_first/balanced.
+            example:
+              conservative: "Description of conservative approach"
+              moderate: "Description of moderate approach"
+              aggressive: "Description of aggressive approach"
+  # Mapping from a persona key to the character it represents, the traits
+  # expected in its response, and its risk profile.
+  expected_tendencies:
+    type: object
+    description: "Expected approach by known personas (for evaluation)"
+    schema:
+      # Placeholder key pattern; the architecture example instantiates it as
+      # `discworld_architect` and `star_trek_architect`.
+      "[theme]_[agent]":
+        character: string
+        # "array of strings" is an informal type note; the example supplies a
+        # block sequence of quoted strings.
+        expected_traits: array of strings
+        risk_profile: string
+  # ============================================================================
+  # Default Judging Dimensions
+  # Fallback scoring when scenario doesn't specify custom rubric
+  # ============================================================================
+  # ============================================================================
+  # Difficulty Calibration
+  # Score bands based on empirical 10-run control baselines
+  # Last calibrated: 2026-01-02 (24 scenarios, 240 runs)
+  # ============================================================================
+  # Recorded score bands per difficulty label, plus the requirements and
+  # rules a calibration run is checked against.
+  # NOTE(review): the per-band `count` values sum to 26 (12 + 9 + 3 + 2),
+  # while the calibration header above reports 24 scenarios (240 runs at
+  # 10 runs each) — confirm which figure is current.
+  difficulty_calibration:
+    type: object
+    description: "Empirical score bands for difficulty labels"
+    schema:
+      # NOTE(review): adjacent ranges share an endpoint (85 and 70), so the
+      # band for a mean of exactly 85 or 70 is not stated here — confirm
+      # which side is inclusive.
+      bands:
+        easy:
+          range: "85-100"
+          interpretation: "Most control agents succeed consistently"
+          count: 12
+          # Six of the 12 counted scenarios are listed.
+          examples:
+            - "order-service (91.9)"
+            - "executive-pet-project (91.1)"
+            - "sprint-planning-conflict (90.5)"
+            - "scaling-decision (88.6)"
+            - "event-processor-tdd (87.9)"
+            - "tdd-shopping-cart (85.8)"
+        medium:
+          range: "70-85"
+          interpretation: "Moderate challenge, some variance expected"
+          count: 9
+          examples:
+            - "null-pointer (82.8)"
+            - "checkout-component-tests (82.4)"
+            - "react-auth-component (82.3)"
+            - "terraform-infrastructure (80.9)"
+            - "graphql-api-review (79.5)"
+            - "payment-processor-tests (79.2)"
+            - "race-condition-cache (76.8)"
+            - "migration-disaster (76.5)"
+            - "buggy-user-service (74.3)"
+        hard:
+          range: "55-70"
+          interpretation: "Significant challenge, control often struggles"
+          count: 3
+          # NOTE(review): two examples are listed against a count of 3 —
+          # confirm whether one is missing or the count is off.
+          examples:
+            - "cli-tool-tests (64.5)"
+            - "microservice-integration-tests (63.1)"
+        extreme:
+          # Quoted: the value is a label string, not a comparison.
+          range: "<55"
+          interpretation: "Most control agents fail or produce incomplete responses"
+          count: 2
+          examples:
+            - "three-sprint-failure (49.0)"
+            - "layoff-planning (48.6)"
+          notes: "Only ethical dilemma scenarios reach extreme - control handles technical challenges well"
+      calibration_requirements:
+        baseline_runs: 10
+        # `<category>` is a placeholder for the scenario's category value.
+        baseline_agent: "control:<category>"
+        required_metrics: ["mean", "std", "range"]
+        # Despite the key name, the inline note compares this threshold with
+        # the standard deviation (std), not the variance.
+        minimum_variance: 5.0  # If std < 5, scenario may be too deterministic
+      validation_rules:
+        - "Difficulty label must match empirical score band"
+        - "Ceiling effects (mean > 95) require scenario rework"
+        - "Bimodal distributions (std > 30) indicate prompt ambiguity"
+        - "Zero variance (std = 0) indicates data collection issue"
+  # Default judging dimensions: per the "Default Judging Dimensions" section
+  # header earlier in this file, this is the fallback scoring used when a
+  # scenario doesn't specify a custom rubric.
+  defaults:
+    difficulty: medium
+    version: "1.0"
+    scoring:
+      # Four equally weighted categories (4 × 25 = 100). Each carries a
+      # `description` rather than the `criteria` list used by scenario rubrics.
+      categories:
+        - name: correctness
+          weight: 25
+          description: "Technical accuracy and validity"
+        - name: quality
+          weight: 25
+          description: "Code/content quality, clarity, maintainability"
+        - name: creativity
+          weight: 25
+          description: "Novel approaches, elegance, inventiveness"
+        - name: persona
+          weight: 25
+          description: "Staying in character while delivering value"
+# ==============================================================================
+# Example Scenarios
+# ==============================================================================
+examples:
+  # Minimal duel scenario (quick battles).
+  # Sets only the five required fields: name, title, category, difficulty
+  # and prompt.
+  minimal:
+    name: explain-recursion
+    title: "Explain Recursion"
+    category: general
+    difficulty: easy
+    prompt: |
+      Explain recursion to a junior developer who has never
+      encountered the concept before.
+  # Code review with baseline issues.
+  # Combines the required fields with optional metadata, an embedded Go
+  # snippet, three seeded issues, a three-category rubric and two persona
+  # dimensions.
+  code_review:
+    name: user-service-review
+    id: cr-001
+    title: "User Service Code Review"
+    category: code-review
+    difficulty: medium
+    version: "1.0"
+    description: "Review Go code for security and quality issues"
+    purpose: |
+      Measures detection rate for known vulnerabilities plus
+      bonus discoveries. Tests depth of analysis and fix quality.
+    prompt: |
+      Review this code for bugs, security issues, and code quality problems.
+      Provide specific line references and severity ratings.
+    # The snippet under review, stored verbatim as a literal block scalar.
+    code:
+      language: go
+      filename: user_service.go
+      content: |
+        package users
+        func GetUser(id string) (*User, error) {
+            query := fmt.Sprintf("SELECT * FROM users WHERE id = '%s'", id)
+            row := db.QueryRow(query)
+            var user User
+            row.Scan(&user.ID, &user.Email, &user.Password)
+            return &user, nil
+        }
+    # One seeded issue in each of three severity buckets; no `low` bucket is
+    # used. `location` values are presumably line numbers within
+    # `code.content` — confirm the numbering convention.
+    baseline_issues:
+      critical:
+        - id: SQL_INJECTION
+          location: "line 4"
+          description: "SQL injection via string formatting"
+          error_type: reasoning  # Logic failure: choosing unsafe string formatting
+      high:
+        - id: PASSWORD_EXPOSURE
+          location: "line 6"
+          description: "Password field exposed in response"
+          error_type: planning  # Design failure: not planning data exposure
+      medium:
+        - id: ERROR_IGNORED
+          location: "line 6"
+          description: "Scan error ignored"
+          error_type: execution  # Implementation failure: ignoring error handling
+    scoring:
+      # Matches the three issues listed under baseline_issues.
+      total_baseline_issues: 3
+      # Uses the legacy `weights` key, which the schema above marks as
+      # deprecated in favour of `severity_weights`.
+      weights:
+        critical: 3
+        high: 2
+        medium: 1
+      # Category weights sum to 100 (40 + 30 + 30); within each category the
+      # criteria points sum to that category's weight.
+      categories:
+        - name: detection
+          weight: 40
+          criteria:
+            - id: BASELINE_FOUND
+              description: "Issues from seeded list"
+              points: 20
+            - id: BONUS_DISCOVERIES
+              description: "Valid issues beyond baseline"
+              points: 20
+        - name: depth
+          weight: 30
+          criteria:
+            - id: ROOT_CAUSE
+              description: "Explains why it's wrong"
+              points: 10
+            - id: FIX_QUALITY
+              description: "Provides correct fix"
+              points: 10
+            - id: IMPACT_ANALYSIS
+              description: "Explains consequences"
+              points: 10
+        - name: persona
+          weight: 30
+          criteria:
+            - id: CHARACTER_CONSISTENCY
+              description: "Stays in character"
+              points: 15
+            - id: PERSONA_VALUE_ADD
+              description: "Persona enhances response"
+              points: 15
+    # Spectrum keys are specific to each dimension.
+    persona_influence:
+      dimensions:
+        - name: severity_focus
+          description: "What issues get prioritized"
+          spectrum:
+            security_first: "Leads with SQL injection, auth issues"
+            quality_first: "Leads with error handling, structure"
+            balanced: "Covers both equally"
+        - name: fix_style
+          description: "How fixes are presented"
+          spectrum:
+            minimal: "Just fixes the issue"
+            comprehensive: "Refactors surrounding code"
+            educational: "Explains principles behind fix"
+  # Architecture scenario with persona influence tracking.
+  # Carries no `code` or `baseline_issues`: its purpose text describes an
+  # open-ended challenge with no single correct answer, scored by rubric only.
+  architecture:
+    name: migration-dilemma
+    id: arch-002
+    title: "The Migration Dilemma"
+    category: architecture
+    difficulty: hard
+    version: "1.0"
+    description: "Modernize a legacy e-commerce platform"
+    purpose: |
+      Open-ended challenge with no single correct answer.
+      Measures trade-off analysis, risk tolerance, and how
+      persona values influence architectural recommendations.
+    prompt: |
+      You are brought in to modernize TechMart, a 5-year-old
+      e-commerce monolith. Budget: $500K over 18 months.
+      Must show progress in 6 months while maintaining features.
+      Provide: situation analysis, recommended approach,
+      trade-offs, success criteria, and what would change your mind.
+    # Literal block scalar: the leading "- " markers are part of the text,
+    # not a YAML sequence.
+    context: |
+      - 50K daily active users, $2M annual revenue
+      - 12 developers, 2 DevOps, no architect
+      - 200K lines Rails, 20% test coverage
+      - CEO wants "modern", CTO wants pragmatic
+      - Last Black Friday: 2 hours downtime, $50K loss
+    scoring:
+      # Category weights sum to 100 (15 + 30 + 25 + 15 + 15); within each
+      # category the criteria points sum to that category's weight.
+      categories:
+        - name: situation_analysis
+          weight: 15
+          criteria:
+            - id: PROBLEM_ID
+              description: "Identifies core problems vs symptoms"
+              points: 5
+            - id: CONSTRAINTS
+              description: "Understands real constraints"
+              points: 5
+            - id: STAKEHOLDERS
+              description: "Reads CEO/CTO tension"
+              points: 5
+        - name: approach
+          weight: 30
+          criteria:
+            - id: COHERENT
+              description: "Internally consistent strategy"
+              points: 10
+            - id: PHASING
+              description: "Realistic timeline"
+              points: 10
+            - id: TECH_FIT
+              description: "Tech matches constraints"
+              points: 10
+        - name: trade_offs
+          weight: 25
+          criteria:
+            - id: SACRIFICES
+              description: "Honest about costs"
+              points: 8
+            - id: RISKS
+              description: "Realistic about failures"
+              points: 8
+            - id: ALTERNATIVES
+              description: "Considers other approaches"
+              points: 9
+        - name: adaptability
+          weight: 15
+          criteria:
+            - id: METRICS
+              description: "Measurable success criteria"
+              points: 8
+            - id: WARNINGS
+              description: "Early failure indicators"
+              points: 7
+        - name: persona
+          weight: 15
+          criteria:
+            - id: AUTHENTIC
+              description: "Decisions align with persona values"
+              points: 8
+            - id: INFLUENCE
+              description: "Traits visibly affect choices"
+              points: 7
+    # Three dimensions, each with a three-point spectrum whose keys are
+    # specific to that dimension.
+    persona_influence:
+      dimensions:
+        - name: risk_tolerance
+          description: "How aggressive is the change?"
+          spectrum:
+            conservative: "Strangler fig, incremental, proven tech"
+            moderate: "Phased with calculated risks"
+            aggressive: "Bold restructuring, new tech, ambitious"
+        - name: technology_choices
+          description: "What stack is recommended?"
+          spectrum:
+            boring: "Optimize Rails, add caching"
+            pragmatic: "Extract specific microservices"
+            cutting_edge: "Full K8s, service mesh, event sourcing"
+        - name: team_weight
+          description: "How much does current team factor in?"
+          spectrum:
+            high: "Work within skills, extensive training"
+            medium: "Some training, some hires"
+            low: "Hire new skills, expect adaptation"
+    # Keys follow the schema's "[theme]_[agent]" pattern.
+    expected_tendencies:
+      discworld_architect:
+        character: "Leonard of Quirm"
+        expected_traits:
+          - "Novel, possibly over-engineered solution"
+          - "Gets distracted by interesting sub-problems"
+          - "Multiple diagrams and sketches"
+        risk_profile: "moderate-to-aggressive, creative"
+      star_trek_architect:
+        character: "Spock"
+        expected_traits:
+          - "Highly logical, systematic"
+          - "Risk-averse, proven approaches"
+          - "Quantifies everything"
+        risk_profile: "conservative, methodical"