npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/tea/payment-processor-tests.yaml ADDED Viewed

@@ -0,0 +1,550 @@
+---
+# Scenario: Payment Processor Test Suite
+# Category: tea (Test Engineer/Architect)
+# Ported from: Pennyfarthing benchmarks/test-cases/tea/tea-001-payment-processor.yaml
+# Purpose: Measure thoroughness in test coverage for a complex system
+id: tea-001
+name: payment-processor-tests
+title: "Payment Processor Test Suite Challenge"
+category: tea
+difficulty: medium  # Empirical: control mean 79.17
+version: "1.0"
+description: |
+  A payment processor module that needs comprehensive test coverage.
+  Tests the TEA agent's ability to identify test scenarios, edge cases,
+  error conditions, concurrency issues, and write meaningful assertions.
+purpose: |
+  This scenario tests whether persona traits affect test strategy.
+  A "methodical" TEA might systematically cover every branch.
+  An "experienced" TEA might prioritize security tests.
+  A "cautious" TEA might focus on error handling.
+  All are valid but measurably different in coverage patterns.
+prompt: |
+  You are a Test Engineer/Architect tasked with writing a comprehensive
+  test suite for this payment processor module.
+  Consider:
+  - Happy path scenarios
+  - Edge cases and boundary conditions
+  - Error handling and failure modes
+  - Concurrency concerns (idempotency, race conditions)
+  - Integration points (gateway, store, rate limiter)
+  - Security-related test cases
+  For each test:
+  1. Name it descriptively (TestProcessPayment_CardDeclined, etc.)
+  2. Explain what scenario it covers
+  3. Include meaningful assertions
+  4. Consider setup/teardown needs (mocks for interfaces)
+  Your goal is THOROUGH COVERAGE. Missing important test scenarios
+  is the primary failure mode we're measuring.
+  There are 40 known test scenarios (19 baseline + 21 bonus). How many can you find?
+code:
+  language: go
+  filename: payment_processor.go
+  content: |
+    package payments
+    import (
+        "context"
+        "errors"
+        "fmt"
+        "sync"
+        "time"
+    )
+    var (
+        ErrInsufficientFunds = errors.New("insufficient funds")
+        ErrInvalidAmount     = errors.New("invalid amount")
+        ErrCardExpired       = errors.New("card expired")
+        ErrCardDeclined      = errors.New("card declined")
+        ErrDuplicatePayment  = errors.New("duplicate payment")
+        ErrPaymentNotFound   = errors.New("payment not found")
+        ErrRefundExceedsPayment = errors.New("refund exceeds original payment")
+    )
+    type PaymentStatus string
+    const (
+        StatusPending   PaymentStatus = "pending"
+        StatusCompleted PaymentStatus = "completed"
+        StatusFailed    PaymentStatus = "failed"
+        StatusRefunded  PaymentStatus = "refunded"
+        StatusPartialRefund PaymentStatus = "partial_refund"
+    )
+    type Payment struct {
+        ID            string
+        Amount        int64  // in cents
+        Currency      string
+        CardNumber    string // last 4 digits only
+        CardExpMonth  int
+        CardExpYear   int
+        Status        PaymentStatus
+        RefundedAmount int64
+        CreatedAt     time.Time
+        CompletedAt   *time.Time
+        Metadata      map[string]string
+        IdempotencyKey string
+    }
+    type PaymentProcessor struct {
+        gateway      PaymentGateway
+        store        PaymentStore
+        rateLimiter  RateLimiter
+        mutex        sync.RWMutex
+        processedKeys map[string]string // idempotency key -> payment ID
+    }
+    type PaymentGateway interface {
+        Charge(ctx context.Context, amount int64, currency string, cardToken string) (string, error)
+        Refund(ctx context.Context, chargeID string, amount int64) error
+        ValidateCard(cardToken string) (bool, error)
+    }
+    type PaymentStore interface {
+        Save(payment *Payment) error
+        Get(id string) (*Payment, error)
+        Update(payment *Payment) error
+        GetByIdempotencyKey(key string) (*Payment, error)
+    }
+    type RateLimiter interface {
+        Allow(key string) bool
+    }
+    func NewPaymentProcessor(gateway PaymentGateway, store PaymentStore, limiter RateLimiter) *PaymentProcessor {
+        return &PaymentProcessor{
+            gateway:       gateway,
+            store:         store,
+            rateLimiter:   limiter,
+            processedKeys: make(map[string]string),
+        }
+    }
+    // ProcessPayment handles a new payment request
+    func (p *PaymentProcessor) ProcessPayment(ctx context.Context, req PaymentRequest) (*Payment, error) {
+        // Validate amount
+        if req.Amount <= 0 {
+            return nil, ErrInvalidAmount
+        }
+        if req.Amount > 10000000 { // $100,000 max
+            return nil, ErrInvalidAmount
+        }
+        // Check rate limit
+        if !p.rateLimiter.Allow(req.CustomerID) {
+            return nil, errors.New("rate limit exceeded")
+        }
+        // Check idempotency
+        if req.IdempotencyKey != "" {
+            p.mutex.RLock()
+            if existingID, ok := p.processedKeys[req.IdempotencyKey]; ok {
+                p.mutex.RUnlock()
+                return p.store.Get(existingID)
+            }
+            p.mutex.RUnlock()
+        }
+        // Validate card expiration
+        now := time.Now()
+        if req.CardExpYear < now.Year() ||
+           (req.CardExpYear == now.Year() && req.CardExpMonth < int(now.Month())) {
+            return nil, ErrCardExpired
+        }
+        // Create payment record
+        payment := &Payment{
+            ID:            generateID(),
+            Amount:        req.Amount,
+            Currency:      req.Currency,
+            CardNumber:    req.CardNumber[len(req.CardNumber)-4:],
+            CardExpMonth:  req.CardExpMonth,
+            CardExpYear:   req.CardExpYear,
+            Status:        StatusPending,
+            CreatedAt:     time.Now(),
+            Metadata:      req.Metadata,
+            IdempotencyKey: req.IdempotencyKey,
+        }
+        // Save pending payment
+        if err := p.store.Save(payment); err != nil {
+            return nil, fmt.Errorf("failed to save payment: %w", err)
+        }
+        // Process with gateway
+        chargeID, err := p.gateway.Charge(ctx, req.Amount, req.Currency, req.CardToken)
+        if err != nil {
+            payment.Status = StatusFailed
+            p.store.Update(payment)
+            if errors.Is(err, ErrCardDeclined) {
+                return nil, ErrCardDeclined
+            }
+            return nil, fmt.Errorf("payment failed: %w", err)
+        }
+        // Update payment as completed
+        now2 := time.Now()
+        payment.CompletedAt = &now2
+        payment.Status = StatusCompleted
+        payment.Metadata["charge_id"] = chargeID
+        if err := p.store.Update(payment); err != nil {
+            // Payment succeeded but update failed - critical state
+            return payment, fmt.Errorf("payment completed but failed to update: %w", err)
+        }
+        // Record idempotency key
+        if req.IdempotencyKey != "" {
+            p.mutex.Lock()
+            p.processedKeys[req.IdempotencyKey] = payment.ID
+            p.mutex.Unlock()
+        }
+        return payment, nil
+    }
+    // RefundPayment processes a refund
+    func (p *PaymentProcessor) RefundPayment(ctx context.Context, paymentID string, amount int64) (*Payment, error) {
+        payment, err := p.store.Get(paymentID)
+        if err != nil {
+            return nil, ErrPaymentNotFound
+        }
+        if payment.Status != StatusCompleted && payment.Status != StatusPartialRefund {
+            return nil, errors.New("payment cannot be refunded")
+        }
+        remainingAmount := payment.Amount - payment.RefundedAmount
+        if amount > remainingAmount {
+            return nil, ErrRefundExceedsPayment
+        }
+        chargeID := payment.Metadata["charge_id"]
+        if err := p.gateway.Refund(ctx, chargeID, amount); err != nil {
+            return nil, fmt.Errorf("refund failed: %w", err)
+        }
+        payment.RefundedAmount += amount
+        if payment.RefundedAmount == payment.Amount {
+            payment.Status = StatusRefunded
+        } else {
+            payment.Status = StatusPartialRefund
+        }
+        if err := p.store.Update(payment); err != nil {
+            return payment, fmt.Errorf("refund completed but failed to update: %w", err)
+        }
+        return payment, nil
+    }
+    // GetPayment retrieves a payment by ID
+    func (p *PaymentProcessor) GetPayment(ctx context.Context, id string) (*Payment, error) {
+        return p.store.Get(id)
+    }
+    type PaymentRequest struct {
+        Amount         int64
+        Currency       string
+        CardNumber     string
+        CardToken      string
+        CardExpMonth   int
+        CardExpYear    int
+        CustomerID     string
+        IdempotencyKey string
+        Metadata       map[string]string
+    }
+    func generateID() string {
+        return fmt.Sprintf("pay_%d", time.Now().UnixNano())
+    }
+# =============================================================================
+# BASELINE TEST CASES (minimum expected to write)
+# These are NOT shown to contestants - used to measure coverage
+# =============================================================================
+baseline_issues:
+  happy_path:
+    - id: TEST_SUCCESSFUL_PAYMENT
+      description: "Basic successful payment flow"
+    - id: TEST_SUCCESSFUL_REFUND
+      description: "Basic successful full refund"
+    - id: TEST_PARTIAL_REFUND
+      description: "Partial refund updates status correctly"
+    - id: TEST_MULTIPLE_PARTIAL_REFUNDS
+      description: "Multiple partial refunds until full refund"
+    - id: TEST_GET_PAYMENT
+      description: "Retrieve payment by ID"
+  validation:
+    - id: TEST_ZERO_AMOUNT
+      description: "Reject zero amount payment"
+    - id: TEST_NEGATIVE_AMOUNT
+      description: "Reject negative amount payment"
+    - id: TEST_EXCEEDS_MAX_AMOUNT
+      description: "Reject amount over $100,000"
+    - id: TEST_EXPIRED_CARD
+      description: "Reject expired card"
+    - id: TEST_EXPIRED_CARD_SAME_MONTH
+      description: "Handle card expiring this month correctly"
+  error_handling:
+    - id: TEST_CARD_DECLINED
+      description: "Handle card declined from gateway"
+    - id: TEST_GATEWAY_ERROR
+      description: "Handle gateway errors gracefully"
+    - id: TEST_PAYMENT_NOT_FOUND_REFUND
+      description: "Refund non-existent payment"
+    - id: TEST_REFUND_EXCEEDS_AMOUNT
+      description: "Refund more than original payment"
+    - id: TEST_REFUND_WRONG_STATUS
+      description: "Cannot refund pending/failed payment"
+  idempotency:
+    - id: TEST_IDEMPOTENCY_SAME_KEY
+      description: "Same idempotency key returns same payment"
+    - id: TEST_IDEMPOTENCY_DIFFERENT_KEYS
+      description: "Different keys create different payments"
+  rate_limiting:
+    - id: TEST_RATE_LIMIT_EXCEEDED
+      description: "Rate limit blocks excessive requests"
+    - id: TEST_RATE_LIMIT_ALLOWED
+      description: "Normal request rate succeeds"
+# =============================================================================
+# BONUS TEST CASES (thorough testers might include)
+# =============================================================================
+bonus_issues:
+  concurrency:
+    - id: TEST_CONCURRENT_SAME_IDEMPOTENCY
+      description: "Concurrent requests with same idempotency key"
+    - id: TEST_CONCURRENT_REFUNDS
+      description: "Concurrent refund requests on same payment"
+    - id: TEST_RACE_CONDITION_IDEMPOTENCY_MAP
+      description: "Thread safety of processedKeys map"
+  edge_cases:
+    - id: TEST_CARD_NUMBER_EXACTLY_4_CHARS
+      description: "Card number with only 4 digits"
+    - id: TEST_CARD_NUMBER_LESS_THAN_4
+      description: "Card number shorter than 4 digits causes panic"
+    - id: TEST_EMPTY_CARD_NUMBER
+      description: "Empty card number handling"
+    - id: TEST_NIL_METADATA
+      description: "Nil metadata map handling"
+    - id: TEST_EXACTLY_MAX_AMOUNT
+      description: "Payment at exactly $100,000 limit"
+    - id: TEST_ONE_CENT_PAYMENT
+      description: "Minimum valid payment (1 cent)"
+    - id: TEST_CURRENCY_VALIDATION
+      description: "Invalid currency codes"
+  failure_modes:
+    - id: TEST_STORE_SAVE_FAILS
+      description: "Initial save to store fails"
+    - id: TEST_STORE_UPDATE_AFTER_CHARGE_FAILS
+      description: "Update fails after successful charge"
+    - id: TEST_STORE_UPDATE_AFTER_REFUND_FAILS
+      description: "Update fails after successful refund"
+    - id: TEST_CONTEXT_CANCELLED
+      description: "Context cancellation during processing"
+    - id: TEST_CONTEXT_TIMEOUT
+      description: "Context timeout during gateway call"
+  security:
+    - id: TEST_FULL_CARD_NOT_STORED
+      description: "Verify only last 4 digits stored"
+    - id: TEST_CARD_TOKEN_USED_NOT_NUMBER
+      description: "Card token passed to gateway, not number"
+  state_transitions:
+    - id: TEST_PENDING_TO_COMPLETED
+      description: "Status transitions from pending to completed"
+    - id: TEST_PENDING_TO_FAILED
+      description: "Status transitions from pending to failed"
+    - id: TEST_COMPLETED_TO_PARTIAL
+      description: "Status transitions to partial_refund"
+    - id: TEST_PARTIAL_TO_REFUNDED
+      description: "Status transitions from partial to full refund"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  # Totals mirror the answer key in this file: baseline_issues lists 19 ids
+  # (5 happy_path + 5 validation + 5 error_handling + 2 idempotency
+  # + 2 rate_limiting) and bonus_issues lists 21 ids (3 concurrency
+  # + 7 edge_cases + 5 failure_modes + 2 security + 4 state_transitions).
+  total_baseline_tests: 19
+  total_bonus_tests: 21
+  # Category weights sum to 100; the criteria points inside each category
+  # sum to that category's weight.
+  categories:
+    - name: coverage
+      weight: 50
+      description: "How many test scenarios are covered"
+      criteria:
+        - id: HAPPY_PATH_COVERED
+          description: "All 5 happy path tests"
+          points: 15
+        - id: VALIDATION_COVERED
+          description: "All 5 validation tests"
+          points: 15
+        - id: ERROR_HANDLING_COVERED
+          description: "All error handling tests"
+          points: 10
+        - id: IDEMPOTENCY_RATE_LIMIT
+          description: "Idempotency and rate limit tests"
+          points: 10
+    - name: test_quality
+      weight: 25
+      description: "Quality of the tests themselves"
+      criteria:
+        - id: PROPER_MOCKING
+          description: "Correct use of interface mocks"
+          points: 8
+        - id: CLEAR_ASSERTIONS
+          description: "Meaningful, specific assertions"
+          points: 8
+        - id: ISOLATION
+          description: "Tests are properly isolated"
+          points: 4
+        - id: DOCUMENTATION
+          description: "Tests have clear descriptions"
+          points: 5
+    - name: advanced_coverage
+      weight: 10
+      description: "Bonus test scenarios"
+      criteria:
+        - id: CONCURRENCY_TESTS
+          description: "Tests for race conditions"
+          points: 5
+        - id: EDGE_CASE_TESTS
+          description: "Unusual edge cases"
+          points: 3
+        - id: FAILURE_MODE_TESTS
+          description: "Complex failure scenarios"
+          points: 2
+    - name: persona
+      weight: 15
+      description: "Persona consistency and value"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_ENHANCES
+          description: "Persona adds value to test strategy"
+          points: 7
+# =============================================================================
+# ENHANCED METRICS
+# =============================================================================
+enhanced_metrics:
+  # Denominators are the number of ids listed in this file under
+  # baseline_issues (19) and bonus_issues (21), so each ratio tops out at 1.0.
+  coverage_ratio:
+    formula: "tests_found / 19"
+    interpretation: "100% = found all baseline scenarios"
+  bonus_discovery_rate:
+    formula: "bonus_tests_found / 21"
+    interpretation: "Shows exceptional thoroughness"
+  category_balance:
+    formula: "min(category_coverage) / max(category_coverage)"
+    interpretation: "1.0 = balanced, <1.0 = gaps"
+  mock_sophistication:
+    formula: "advanced_mocking_patterns / 5"
+    interpretation: "Use of table-driven tests, setup helpers, etc."
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: test_strategy
+      description: "How tests are prioritized"
+      spectrum:
+        security_first: "Starts with security and error cases"
+        happy_path_first: "Starts with basic functionality"
+        edge_cases_first: "Focuses on unusual scenarios"
+    - name: mock_approach
+      description: "How interfaces are mocked"
+      spectrum:
+        minimal: "Basic mocks, just enough to run"
+        comprehensive: "Detailed mock behavior"
+        table_driven: "Table-driven tests with mock matrices"
+    - name: documentation_style
+      description: "How tests are documented"
+      spectrum:
+        minimal: "Test names only"
+        moderate: "Brief comments"
+        thorough: "Full scenario documentation"
+expected_tendencies:
+  discworld_tea:
+    character: "Igor"
+    expected_traits:
+      - "Methodical, systematic coverage"
+      - "Practical focus on what breaks"
+      - "References to previous systems"
+    coverage_prediction: "high - systematic approach"
+  star_trek_tea:
+    character: "Data"
+    expected_traits:
+      - "Exhaustive, logical enumeration"
+      - "Perfect organization"
+      - "May over-test trivial cases"
+    coverage_prediction: "very high - exhaustive"
+  control_tea:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard LLM test generation"
+      - "No persona influence"
+    coverage_prediction: "baseline reference"