npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/code-review/order-service.yaml ADDED Viewed

@@ -0,0 +1,622 @@
+---
+# Scenario: Order Service Code Review (Easy)
+# NOTE: Re-ranked to "easy" based on control baseline mean 91.9 ± 1.45 (Story 7-2)
+# Category: code-review
+# Ported from: Pennyfarthing benchmarks/test-cases/code-review/cr-002-order-service.yaml
+# Purpose: Differentiate persona thoroughness with layered complexity
+id: cr-002
+name: order-service
+title: "Order Service Code Review (Enhanced)"
+category: code-review
+difficulty: easy
+version: "2.0"
+description: |
+  An e-commerce order service with payment processing, inventory management,
+  and notification handling. Contains multiple layers of issues - some obvious,
+  some subtle, some architectural. Designed to measure thoroughness differences.
+purpose: |
+  This scenario tests whether personas with different traits (e.g., "uncompromising"
+  vs "pragmatic") actually find different numbers of issues, or if persona only
+  affects communication style. Finding all 22 baseline issues = competent.
+  Finding bonus issues = thorough. Missing obvious issues = concerning.
+prompt: |
+  You are reviewing a pull request for an order processing service.
+  The developer says "it works in staging" and wants to merge to main.
+  Review this code thoroughly for:
+  - Security vulnerabilities
+  - Bug and logic errors
+  - Performance issues
+  - Code quality problems
+  - Architectural concerns
+  - Missing error handling
+  - Race conditions and concurrency issues
+  For each issue:
+  1. Identify the specific location (line number)
+  2. Classify severity (Critical/High/Medium/Low)
+  3. Explain the impact
+  4. Provide a fix or recommendation
+  Be thorough. This code handles real money.
+code:
+  language: go
+  filename: order_service.go
+  content: |
+    package orders
+    import (
+        "bytes"
+        "context"
+        "database/sql"
+        "encoding/json"
+        "fmt"
+        "log"
+        "net/http"
+        "strconv"
+        "sync"
+        "time"
+    )
+    type OrderService struct {
+        db           *sql.DB
+        paymentAPI   string
+        inventoryAPI string
+        cache        map[string]*Order
+        cacheMutex   sync.Mutex
+    }
+    type Order struct {
+        ID          int64     `json:"id"`
+        UserID      int64     `json:"user_id"`
+        Items       []Item    `json:"items"`
+        Total       float64   `json:"total"`
+        Status      string    `json:"status"`
+        PaymentID   string    `json:"payment_id"`
+        CreatedAt   time.Time `json:"created_at"`
+        CreditCard  string    `json:"credit_card,omitempty"`
+    }
+    type Item struct {
+        ProductID int64   `json:"product_id"`
+        Quantity  int     `json:"quantity"`
+        Price     float64 `json:"price"`
+    }
+    // CreateOrder processes a new order
+    func (s *OrderService) CreateOrder(w http.ResponseWriter, r *http.Request) {
+        var order Order
+        json.NewDecoder(r.Body).Decode(&order)
+        // Calculate total
+        var total float64
+        for _, item := range order.Items {
+            total += item.Price * float64(item.Quantity)
+        }
+        order.Total = total
+        // Check inventory for each item
+        for _, item := range order.Items {
+            resp, _ := http.Get(fmt.Sprintf("%s/check?product=%d&qty=%d",
+                s.inventoryAPI, item.ProductID, item.Quantity))
+            if resp.StatusCode != 200 {
+                http.Error(w, "Inventory check failed", 400)
+                return
+            }
+        }
+        // Process payment
+        paymentReq := map[string]interface{}{
+            "amount":     order.Total,
+            "card":       order.CreditCard,
+            "order_ref":  order.ID,
+        }
+        paymentBody, _ := json.Marshal(paymentReq)
+        resp, _ := http.Post(s.paymentAPI+"/charge", "application/json",
+            bytes.NewReader(paymentBody))
+        var paymentResp map[string]string
+        json.NewDecoder(resp.Body).Decode(&paymentResp)
+        order.PaymentID = paymentResp["payment_id"]
+        // Reserve inventory
+        for _, item := range order.Items {
+            go func(i Item) {
+                http.Post(fmt.Sprintf("%s/reserve", s.inventoryAPI),
+                    "application/json",
+                    bytes.NewReader([]byte(fmt.Sprintf(
+                        `{"product_id":%d,"quantity":%d}`, i.ProductID, i.Quantity))))
+            }(item)
+        }
+        // Save order
+        result, err := s.db.Exec(
+            "INSERT INTO orders (user_id, total, status, payment_id, credit_card) VALUES (?, ?, ?, ?, ?)",
+            order.UserID, order.Total, "pending", order.PaymentID, order.CreditCard)
+        if err != nil {
+            log.Printf("Failed to save order: %v", err)
+            http.Error(w, "Order failed", 500)
+            return
+        }
+        order.ID, _ = result.LastInsertId()
+        order.Status = "pending"
+        // Cache the order
+        s.cache[strconv.FormatInt(order.ID, 10)] = &order
+        json.NewEncoder(w).Encode(order)
+    }
+    // GetOrder retrieves an order by ID
+    func (s *OrderService) GetOrder(w http.ResponseWriter, r *http.Request) {
+        orderID := r.URL.Query().Get("id")
+        // Check cache first
+        if cached, ok := s.cache[orderID]; ok {
+            json.NewEncoder(w).Encode(cached)
+            return
+        }
+        query := fmt.Sprintf("SELECT * FROM orders WHERE id = %s", orderID)
+        row := s.db.QueryRow(query)
+        var order Order
+        err := row.Scan(&order.ID, &order.UserID, &order.Total, &order.Status,
+            &order.PaymentID, &order.CreditCard, &order.CreatedAt)
+        if err != nil {
+            http.Error(w, "Order not found", 404)
+            return
+        }
+        // Load items
+        itemRows, _ := s.db.Query(
+            fmt.Sprintf("SELECT product_id, quantity, price FROM order_items WHERE order_id = %s", orderID))
+        for itemRows.Next() {
+            var item Item
+            itemRows.Scan(&item.ProductID, &item.Quantity, &item.Price)
+            order.Items = append(order.Items, item)
+        }
+        s.cache[orderID] = &order
+        json.NewEncoder(w).Encode(order)
+    }
+    // CancelOrder cancels an existing order
+    func (s *OrderService) CancelOrder(w http.ResponseWriter, r *http.Request) {
+        orderID := r.URL.Query().Get("id")
+        // Update status
+        s.db.Exec("UPDATE orders SET status = 'cancelled' WHERE id = " + orderID)
+        // Refund payment
+        var paymentID string
+        s.db.QueryRow("SELECT payment_id FROM orders WHERE id = ?", orderID).Scan(&paymentID)
+        http.Post(s.paymentAPI+"/refund", "application/json",
+            bytes.NewReader([]byte(fmt.Sprintf(`{"payment_id":"%s"}`, paymentID))))
+        // Release inventory
+        rows, _ := s.db.Query("SELECT product_id, quantity FROM order_items WHERE order_id = ?", orderID)
+        for rows.Next() {
+            var productID int64
+            var quantity int
+            rows.Scan(&productID, &quantity)
+            go func() {
+                http.Post(fmt.Sprintf("%s/release", s.inventoryAPI),
+                    "application/json",
+                    bytes.NewReader([]byte(fmt.Sprintf(
+                        `{"product_id":%d,"quantity":%d}`, productID, quantity))))
+            }()
+        }
+        // Remove from cache
+        delete(s.cache, orderID)
+        w.Write([]byte("Order cancelled"))
+    }
+    // GetUserOrders returns all orders for a user
+    func (s *OrderService) GetUserOrders(w http.ResponseWriter, r *http.Request) {
+        userID := r.URL.Query().Get("user_id")
+        limit := r.URL.Query().Get("limit")
+        if limit == "" {
+            limit = "100"
+        }
+        query := fmt.Sprintf(
+            "SELECT id, total, status, created_at FROM orders WHERE user_id = %s ORDER BY created_at DESC LIMIT %s",
+            userID, limit)
+        rows, err := s.db.Query(query)
+        if err != nil {
+            log.Printf("Query error: %v", err)
+        }
+        var orders []Order
+        for rows.Next() {
+            var o Order
+            rows.Scan(&o.ID, &o.Total, &o.Status, &o.CreatedAt)
+            orders = append(orders, o)
+        }
+        json.NewEncoder(w).Encode(orders)
+    }
+    // ProcessRefund handles refund requests
+    func (s *OrderService) ProcessRefund(w http.ResponseWriter, r *http.Request) {
+        var req struct {
+            OrderID int64   `json:"order_id"`
+            Amount  float64 `json:"amount"`
+            Reason  string  `json:"reason"`
+        }
+        json.NewDecoder(r.Body).Decode(&req)
+        // Get order
+        var order Order
+        s.db.QueryRow("SELECT id, total, payment_id FROM orders WHERE id = ?", req.OrderID).
+            Scan(&order.ID, &order.Total, &order.PaymentID)
+        // Process refund
+        refundReq := fmt.Sprintf(`{"payment_id":"%s","amount":%f,"reason":"%s"}`,
+            order.PaymentID, req.Amount, req.Reason)
+        resp, _ := http.Post(s.paymentAPI+"/refund", "application/json",
+            bytes.NewReader([]byte(refundReq)))
+        if resp.StatusCode == 200 {
+            s.db.Exec("UPDATE orders SET status = 'refunded' WHERE id = ?", req.OrderID)
+            s.db.Exec(fmt.Sprintf(
+                "INSERT INTO refund_log (order_id, amount, reason, processed_at) VALUES (%d, %f, '%s', NOW())",
+                req.OrderID, req.Amount, req.Reason))
+        }
+        w.Write([]byte("Refund processed"))
+    }
+    // BulkUpdatePrices updates prices for multiple products
+    func (s *OrderService) BulkUpdatePrices(w http.ResponseWriter, r *http.Request) {
+        var updates []struct {
+            ProductID int64   `json:"product_id"`
+            NewPrice  float64 `json:"new_price"`
+        }
+        json.NewDecoder(r.Body).Decode(&updates)
+        for _, u := range updates {
+            s.db.Exec(fmt.Sprintf(
+                "UPDATE products SET price = %f WHERE id = %d", u.NewPrice, u.ProductID))
+        }
+        w.Write([]byte(fmt.Sprintf("Updated %d products", len(updates))))
+    }
+    // ExportOrders exports orders to CSV
+    func (s *OrderService) ExportOrders(w http.ResponseWriter, r *http.Request) {
+        startDate := r.URL.Query().Get("start")
+        endDate := r.URL.Query().Get("end")
+        query := fmt.Sprintf(
+            "SELECT id, user_id, total, status, credit_card, created_at FROM orders WHERE created_at BETWEEN '%s' AND '%s'",
+            startDate, endDate)
+        rows, _ := s.db.Query(query)
+        w.Header().Set("Content-Type", "text/csv")
+        w.Write([]byte("id,user_id,total,status,card_last4,created_at\n"))
+        for rows.Next() {
+            var id, userID int64
+            var total float64
+            var status, creditCard string
+            var createdAt time.Time
+            rows.Scan(&id, &userID, &total, &status, &creditCard, &createdAt)
+            // Mask credit card
+            cardLast4 := creditCard[len(creditCard)-4:]
+            w.Write([]byte(fmt.Sprintf("%d,%d,%.2f,%s,%s,%s\n",
+                id, userID, total, status, cardLast4, createdAt.Format(time.RFC3339))))
+        }
+    }
+# =============================================================================
+# BASELINE ISSUES (minimum expected to find)
+# Finding all of these = 100% baseline score
+# These are seeded, known issues - NOT shown to contestants
+# =============================================================================
+baseline_issues:
+  critical:
+    - id: SQL_INJECTION_GET_ORDER
+      location: "line 107"
+      description: "SQL injection in GetOrder via string formatting"
+    - id: SQL_INJECTION_CANCEL
+      location: "line 136"
+      description: "SQL injection in CancelOrder via string concatenation"
+    - id: SQL_INJECTION_USER_ORDERS
+      location: "lines 169-171"
+      description: "SQL injection in GetUserOrders (both userID and limit)"
+    - id: SQL_INJECTION_REFUND_LOG
+      location: "lines 209-211"
+      description: "SQL injection in refund log insert (reason field)"
+    - id: SQL_INJECTION_EXPORT
+      location: "lines 232-234"
+      description: "SQL injection in ExportOrders (date parameters)"
+    - id: CREDIT_CARD_STORED
+      location: "line 89"
+      description: "Credit card stored in database unencrypted"
+  high:
+    - id: CREDIT_CARD_EXPOSED_JSON
+      location: "line 31"
+      description: "Credit card included in JSON response"
+    - id: CREDIT_CARD_IN_CSV
+      location: "line 243"
+      description: "Credit card exposed in CSV export"
+    - id: NO_AUTH_CANCEL
+      location: "line 132"
+      description: "No authorization check on CancelOrder"
+    - id: NO_AUTH_REFUND
+      location: "line 186"
+      description: "No authorization check on ProcessRefund"
+    - id: NO_AUTH_BULK_UPDATE
+      location: "line 216"
+      description: "No authorization check on BulkUpdatePrices"
+    - id: NO_AUTH_EXPORT
+      location: "line 227"
+      description: "No authorization check on ExportOrders"
+  medium:
+    - id: RACE_CONDITION_INVENTORY
+      location: "lines 77-83"
+      description: "Goroutines for inventory reserve without synchronization"
+    - id: RACE_CONDITION_CANCEL
+      location: "lines 151-158"
+      description: "Goroutines in cancel without proper closure"
+    - id: CACHE_NO_MUTEX
+      location: "line 96"
+      description: "Cache write without mutex lock"
+    - id: CACHE_NO_MUTEX_READ
+      location: "line 103"
+      description: "Cache read without mutex lock"
+    - id: ROWS_NOT_CLOSED_GET
+      location: "line 116"
+      description: "itemRows not closed in GetOrder"
+    - id: ROWS_NOT_CLOSED_CANCEL
+      location: "line 147"
+      description: "rows not closed in CancelOrder"
+    - id: ERROR_IGNORED_DECODE
+      location: "line 45"
+      description: "JSON decode error ignored in CreateOrder"
+    - id: ERROR_IGNORED_PAYMENT
+      location: "line 68"
+      description: "Payment POST error ignored"
+  low:
+    - id: MISSING_CONTENT_TYPE
+      location: "multiple handlers"
+      description: "JSON responses don't set Content-Type header"
+    - id: FLOAT_FOR_MONEY
+      location: "lines 27, 49-52"
+      description: "Using float64 for monetary values (precision issues)"
+# =============================================================================
+# BONUS ISSUES (thorough reviewers might find these)
+# Finding these demonstrates above-average thoroughness
+# =============================================================================
+bonus_issues:
+  architectural:
+    - id: NO_TRANSACTION
+      description: "CreateOrder should use transaction for payment + DB + inventory"
+    - id: SAGA_PATTERN_MISSING
+      description: "Distributed transaction needs saga/compensation pattern"
+    - id: NO_IDEMPOTENCY
+      description: "CreateOrder not idempotent - duplicate orders possible"
+    - id: TIGHT_COUPLING
+      description: "Direct HTTP calls to services instead of abstraction"
+  security:
+    - id: SSRF_POTENTIAL
+      description: "API URLs from config could be exploited for SSRF"
+    - id: NO_RATE_LIMITING
+      description: "No rate limiting on any endpoints"
+    - id: NO_INPUT_VALIDATION
+      description: "No validation on order items, quantities, prices"
+    - id: CARD_MASKING_UNSAFE
+      location: "line 243"
+      description: "Card masking panics on cards < 4 chars"
+  reliability:
+    - id: NO_TIMEOUT_HTTP
+      description: "HTTP calls have no timeout set"
+    - id: NO_RETRY_LOGIC
+      description: "External service calls have no retry"
+    - id: NO_CIRCUIT_BREAKER
+      description: "No circuit breaker for failing services"
+    - id: CACHE_UNBOUNDED
+      description: "Cache grows forever, no eviction"
+  observability:
+    - id: POOR_ERROR_MESSAGES
+      description: "Generic error messages hide root cause"
+    - id: NO_REQUEST_ID
+      description: "No correlation ID for tracing"
+    - id: INCONSISTENT_LOGGING
+      description: "Some errors logged, others ignored"
+  code_quality:
+    - id: DUPLICATE_SQL_PATTERNS
+      description: "Same SQL injection pattern repeated"
+    - id: NO_CONTEXT_PROPAGATION
+      description: "context.Context not used for cancellation"
+    - id: MAGIC_STRINGS
+      description: "Status values as magic strings"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  # Issue counts; these must match the entries listed under baseline_issues
+  # and bonus_issues in this file.
+  total_baseline_issues: 22  # 6 critical + 6 high + 8 medium + 2 low
+  total_bonus_issues: 18  # 4 architectural + 4 security + 4 reliability + 3 observability + 3 code_quality
+  # Per-issue weight for each baseline severity bucket.
+  weights:
+    critical: 3
+    high: 2
+    medium: 1
+    low: 0.5
+  # Weighted sum over all baseline issues using the weights above.
+  max_baseline_score: 39.0  # 6*3 + 6*2 + 8*1 + 2*0.5 = 18 + 12 + 8 + 1
+  # Rubric categories. Each category's criteria points sum to its weight,
+  # and the four category weights sum to 100.
+  categories:
+    - name: detection
+      weight: 40
+      criteria:
+        - id: BASELINE_FOUND
+          description: "Issues from the seeded baseline list"
+          points: 25
+        - id: BONUS_DISCOVERIES
+          description: "Valid issues beyond the baseline"
+          points: 15
+    - name: depth
+      weight: 30
+      criteria:
+        - id: ROOT_CAUSE_ANALYSIS
+          description: "Traces to underlying cause, not just symptom"
+          points: 10
+        - id: FIX_SPECIFICITY
+          description: "Provides actual code fixes with line numbers"
+          points: 10
+        - id: IMPACT_ASSESSMENT
+          description: "Explains full attack chain or cascade effects"
+          points: 10
+    - name: quality
+      weight: 15
+      criteria:
+        - id: SEVERITY_ACCURACY
+          description: "Correctly classifies severity levels"
+          points: 5
+        - id: REASONING_QUALITY
+          description: "Clear logical chain for each issue"
+          points: 5
+        - id: ORGANIZATION
+          description: "Prioritized, scannable structure"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances memorability/clarity"
+          points: 7
+# =============================================================================
+# ENHANCED METRICS (for scientific comparison)
+# =============================================================================
+enhanced_metrics:
+  thoroughness_ratio:
+    formula: "total_findings / baseline_issues"
+    interpretation: "100% = found baseline, 150% = found 50% more"
+  bonus_discovery_rate:
+    formula: "bonus_found / bonus_issues"
+    interpretation: "What percentage of bonus issues discovered"
+  depth_score:
+    formula: "avg(root_cause, fix_specificity, impact_assessment)"
+    scale: "1-5"
+  quality_score:
+    formula: "avg(severity_accuracy, reasoning, organization)"
+    scale: "1-5"
+# =============================================================================
+# PERSONA INFLUENCE (how different traits should affect approach)
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: thoroughness
+      description: "How many issues are found"
+      spectrum:
+        minimal: "Finds obvious issues, moves on"
+        adequate: "Finds most baseline issues"
+        exhaustive: "Finds baseline + bonus issues"
+    - name: severity_focus
+      description: "What gets prioritized"
+      spectrum:
+        security_first: "Leads with SQL injection, auth gaps"
+        quality_first: "Leads with code quality, patterns"
+        balanced: "Covers all categories systematically"
+    - name: fix_style
+      description: "How fixes are presented"
+      spectrum:
+        minimal: "Just identifies the problem"
+        practical: "Shows the fix inline"
+        comprehensive: "Refactors surrounding code, explains principles"
+expected_tendencies:
+  discworld_reviewer:
+    character: "Granny Weatherwax"
+    expected_traits:
+      - "Uncompromising - should find more issues"
+      - "No-nonsense - severity should be accurate"
+      - "Headology - may note developer psychology issues"
+    thoroughness_prediction: "high"
+  star_trek_reviewer:
+    character: "Spock"
+    expected_traits:
+      - "Logical - systematic coverage"
+      - "Precise - accurate line numbers"
+      - "Unemotional - may miss 'soft' issues"
+    thoroughness_prediction: "high"
+  control_reviewer:
+    character: "None (baseline)"
+    expected_traits:
+      - "Minimal persona influence"
+      - "Standard code review behavior"
+    thoroughness_prediction: "baseline reference"