npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/dev/tdd-shopping-cart.yaml ADDED Viewed

@@ -0,0 +1,681 @@
+---
+# Scenario: TDD Shopping Cart Implementation
+# Category: dev
+# Ported from: Pennyfarthing benchmarks/test-cases/dev/dev-002-tdd-shopping-cart.yaml
+# Purpose: Test TDD discipline - minimal implementation, no over-engineering
+# Identity fields: `id` and `name` together mirror the ported file name
+# (dev-002-tdd-shopping-cart); `category` repeats the header comment above.
+id: dev-002
+name: tdd-shopping-cart
+title: "TDD Shopping Cart Implementation"
+category: dev
+difficulty: easy
+# Quoted so the value stays the string "1.0" instead of being read as a float.
+version: "1.0"
+# What the exercise consists of.
+description: |
+  A TDD exercise where failing tests are provided and the developer must
+  implement the code to make them pass. Tests the developer agent's ability
+  to write minimal, correct implementations that satisfy test contracts
+  without over-engineering.
+# Why the scenario exists: over-engineering is treated as the measurable
+# behaviour that a persona may influence.
+purpose: |
+  This scenario tests whether persona traits affect implementation discipline.
+  A "methodical" persona might follow TDD strictly. A "creative" persona might
+  add extra features. An "over-engineering" tendency is a measurable behavior
+  that personas may influence.
+# Task statement, written in second person to whoever attempts the scenario.
+# It lists five task steps, four scoring criteria, and a closing warning that
+# over-engineering is penalized.
+# NOTE(review): the text says "the ShoppingCart", while the stub in this file
+# declares the type as `Cart` (constructor `NewCart`) - confirm the wording is
+# intentional.
+prompt: |
+  You are given a test suite for a shopping cart module. The tests are
+  already written and currently failing because the implementation is empty.
+  Your task:
+  1. Read and understand each test
+  2. Implement the ShoppingCart to make ALL tests pass
+  3. Write ONLY the code needed to pass the tests - no extra features
+  4. Do not modify the tests
+  5. Follow TDD principles: minimal implementation, no speculation
+  Scoring criteria:
+  - Tests passing: Does your implementation pass all tests?
+  - Minimal code: Did you avoid adding features not required by tests?
+  - Code quality: Is the implementation clean and idiomatic?
+  - Edge cases: Did you handle all test scenarios correctly?
+  IMPORTANT: Do not add validation, features, or error handling beyond
+  what the tests require. Over-engineering is penalized.
+# Go test suite for the scenario: 26 Test* functions in five banner-delimited
+# sections (5 basic, 5 remove/clear, 6 discount, 5 summary, 5 has/quantity).
+# Prices are integer cents. The suite only uses the standard `testing` package.
+tests:
+  language: go
+  filename: shopping_cart_test.go
+  content: |
+    package cart
+    import (
+        "testing"
+    )
+    // ============================================
+    // SECTION 1: Basic Cart Operations
+    // ============================================
+    func TestNewCart_IsEmpty(t *testing.T) {
+        cart := NewCart()
+        if cart.ItemCount() != 0 {
+            t.Errorf("new cart should have 0 items, got %d", cart.ItemCount())
+        }
+        if cart.Total() != 0 {
+            t.Errorf("new cart should have 0 total, got %d", cart.Total())
+        }
+    }
+    func TestAddItem_SingleItem(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1) // price in cents
+        if cart.ItemCount() != 1 {
+            t.Errorf("expected 1 item, got %d", cart.ItemCount())
+        }
+        if cart.Total() != 1000 {
+            t.Errorf("expected total 1000, got %d", cart.Total())
+        }
+    }
+    func TestAddItem_MultipleQuantity(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 3)
+        if cart.ItemCount() != 3 {
+            t.Errorf("expected 3 items, got %d", cart.ItemCount())
+        }
+        if cart.Total() != 3000 {
+            t.Errorf("expected total 3000, got %d", cart.Total())
+        }
+    }
+    func TestAddItem_SameItemTwice_CombinesQuantity(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 2)
+        cart.AddItem("SKU-001", "Widget", 1000, 3)
+        if cart.ItemCount() != 5 {
+            t.Errorf("expected 5 items, got %d", cart.ItemCount())
+        }
+        items := cart.GetItems()
+        if len(items) != 1 {
+            t.Errorf("expected 1 unique item, got %d", len(items))
+        }
+    }
+    func TestAddItem_DifferentItems(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1)
+        cart.AddItem("SKU-002", "Gadget", 2500, 2)
+        if cart.ItemCount() != 3 {
+            t.Errorf("expected 3 items, got %d", cart.ItemCount())
+        }
+        if cart.Total() != 6000 {
+            t.Errorf("expected total 6000, got %d", cart.Total())
+        }
+    }
+    // ============================================
+    // SECTION 2: Remove Operations
+    // ============================================
+    func TestRemoveItem_DecreasesQuantity(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 5)
+        cart.RemoveItem("SKU-001", 2)
+        if cart.ItemCount() != 3 {
+            t.Errorf("expected 3 items, got %d", cart.ItemCount())
+        }
+    }
+    func TestRemoveItem_AllQuantity_RemovesFromCart(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 3)
+        cart.RemoveItem("SKU-001", 3)
+        if cart.ItemCount() != 0 {
+            t.Errorf("expected 0 items, got %d", cart.ItemCount())
+        }
+        items := cart.GetItems()
+        if len(items) != 0 {
+            t.Errorf("expected no items in cart, got %d", len(items))
+        }
+    }
+    func TestRemoveItem_MoreThanExists_RemovesAll(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 2)
+        cart.RemoveItem("SKU-001", 10)
+        if cart.ItemCount() != 0 {
+            t.Errorf("expected 0 items, got %d", cart.ItemCount())
+        }
+    }
+    func TestRemoveItem_NonExistent_NoOp(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1)
+        cart.RemoveItem("SKU-999", 1) // doesn't exist
+        if cart.ItemCount() != 1 {
+            t.Errorf("expected 1 item, got %d", cart.ItemCount())
+        }
+    }
+    func TestClear_EmptiesCart(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 3)
+        cart.AddItem("SKU-002", "Gadget", 2000, 2)
+        cart.Clear()
+        if cart.ItemCount() != 0 {
+            t.Errorf("expected 0 items after clear, got %d", cart.ItemCount())
+        }
+        if cart.Total() != 0 {
+            t.Errorf("expected 0 total after clear, got %d", cart.Total())
+        }
+    }
+    // ============================================
+    // SECTION 3: Discount Codes
+    // ============================================
+    func TestApplyDiscount_PercentOff(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1) // $100
+        cart.ApplyDiscount("SAVE10", DiscountPercent, 10) // 10% off
+        if cart.Total() != 9000 {
+            t.Errorf("expected total 9000 after 10%% off, got %d", cart.Total())
+        }
+    }
+    func TestApplyDiscount_FixedAmount(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1) // $100
+        cart.ApplyDiscount("SAVE20", DiscountFixed, 2000) // $20 off
+        if cart.Total() != 8000 {
+            t.Errorf("expected total 8000 after $20 off, got %d", cart.Total())
+        }
+    }
+    func TestApplyDiscount_FixedExceedsTotal_ZeroTotal(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1) // $10
+        cart.ApplyDiscount("BIGDISCOUNT", DiscountFixed, 5000) // $50 off
+        if cart.Total() != 0 {
+            t.Errorf("expected total 0 when discount exceeds cart, got %d", cart.Total())
+        }
+    }
+    func TestApplyDiscount_OnlyOneAllowed(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1)
+        cart.ApplyDiscount("SAVE10", DiscountPercent, 10)
+        cart.ApplyDiscount("SAVE20", DiscountPercent, 20) // replaces previous
+        if cart.Total() != 8000 {
+            t.Errorf("expected total 8000 with 20%% off, got %d", cart.Total())
+        }
+    }
+    func TestRemoveDiscount(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1)
+        cart.ApplyDiscount("SAVE10", DiscountPercent, 10)
+        cart.RemoveDiscount()
+        if cart.Total() != 10000 {
+            t.Errorf("expected total 10000 after removing discount, got %d", cart.Total())
+        }
+    }
+    func TestDiscount_AppliedToCurrentTotal(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 5000, 2) // $100 total
+        cart.ApplyDiscount("HALF", DiscountPercent, 50)
+        if cart.Total() != 5000 {
+            t.Errorf("expected 5000 (50%% of 10000), got %d", cart.Total())
+        }
+        cart.AddItem("SKU-002", "Gadget", 2000, 1) // +$20
+        // New total: 12000, with 50% off = 6000
+        if cart.Total() != 6000 {
+            t.Errorf("expected 6000 after adding item with discount, got %d", cart.Total())
+        }
+    }
+    // ============================================
+    // SECTION 4: Cart Summary
+    // ============================================
+    func TestGetItems_ReturnsAllItems(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 2)
+        cart.AddItem("SKU-002", "Gadget", 2000, 1)
+        items := cart.GetItems()
+        if len(items) != 2 {
+            t.Fatalf("expected 2 items, got %d", len(items))
+        }
+    }
+    func TestGetItems_ReturnsCorrectDetails(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 2)
+        items := cart.GetItems()
+        // Guard the index below: an out-of-range panic would abort the whole
+        // test binary instead of failing only this test.
+        if len(items) != 1 {
+            t.Fatalf("expected 1 item, got %d", len(items))
+        }
+        item := items[0]
+        if item.SKU != "SKU-001" {
+            t.Errorf("expected SKU SKU-001, got %s", item.SKU)
+        }
+        if item.Name != "Widget" {
+            t.Errorf("expected name Widget, got %s", item.Name)
+        }
+        if item.Price != 1000 {
+            t.Errorf("expected price 1000, got %d", item.Price)
+        }
+        if item.Quantity != 2 {
+            t.Errorf("expected quantity 2, got %d", item.Quantity)
+        }
+    }
+    func TestSubtotal_BeforeDiscount(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1)
+        cart.ApplyDiscount("SAVE10", DiscountPercent, 10)
+        if cart.Subtotal() != 10000 {
+            t.Errorf("expected subtotal 10000 (before discount), got %d", cart.Subtotal())
+        }
+        if cart.Total() != 9000 {
+            t.Errorf("expected total 9000 (after discount), got %d", cart.Total())
+        }
+    }
+    func TestDiscountAmount_ShowsSavings(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1)
+        cart.ApplyDiscount("SAVE10", DiscountPercent, 10)
+        if cart.DiscountAmount() != 1000 {
+            t.Errorf("expected discount amount 1000, got %d", cart.DiscountAmount())
+        }
+    }
+    func TestDiscountAmount_NoDiscount_ReturnsZero(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 10000, 1)
+        if cart.DiscountAmount() != 0 {
+            t.Errorf("expected discount amount 0, got %d", cart.DiscountAmount())
+        }
+    }
+    // ============================================
+    // SECTION 5: Has/Contains Operations
+    // ============================================
+    func TestHasItem_ReturnsTrue(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1)
+        if !cart.HasItem("SKU-001") {
+            t.Error("expected HasItem to return true for existing item")
+        }
+    }
+    func TestHasItem_ReturnsFalse(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 1)
+        if cart.HasItem("SKU-999") {
+            t.Error("expected HasItem to return false for non-existing item")
+        }
+    }
+    func TestHasItem_EmptyCart_ReturnsFalse(t *testing.T) {
+        cart := NewCart()
+        if cart.HasItem("SKU-001") {
+            t.Error("expected HasItem to return false for empty cart")
+        }
+    }
+    func TestGetQuantity_ReturnsCorrectAmount(t *testing.T) {
+        cart := NewCart()
+        cart.AddItem("SKU-001", "Widget", 1000, 5)
+        if cart.GetQuantity("SKU-001") != 5 {
+            t.Errorf("expected quantity 5, got %d", cart.GetQuantity("SKU-001"))
+        }
+    }
+    func TestGetQuantity_NonExistent_ReturnsZero(t *testing.T) {
+        cart := NewCart()
+        if cart.GetQuantity("SKU-999") != 0 {
+            t.Errorf("expected quantity 0 for non-existent, got %d", cart.GetQuantity("SKU-999"))
+        }
+    }
+# Go skeleton for the implementation file. It declares DiscountType (with the
+# DiscountPercent / DiscountFixed constants), CartItem, an empty Cart struct,
+# and every function the test suite calls. All bodies are TODO placeholders
+# that return zero values; NewCart returns nil.
+stub:
+  language: go
+  filename: shopping_cart.go
+  content: |
+    package cart
+    // DiscountType represents the type of discount
+    type DiscountType int
+    const (
+        DiscountPercent DiscountType = iota
+        DiscountFixed
+    )
+    // CartItem represents an item in the cart
+    type CartItem struct {
+        SKU      string
+        Name     string
+        Price    int64 // in cents
+        Quantity int
+    }
+    // Cart represents a shopping cart
+    type Cart struct {
+        // TODO: implement fields
+    }
+    // NewCart creates a new empty shopping cart
+    func NewCart() *Cart {
+        // TODO: implement
+        return nil
+    }
+    // AddItem adds an item to the cart
+    func (c *Cart) AddItem(sku, name string, price int64, quantity int) {
+        // TODO: implement
+    }
+    // RemoveItem removes quantity of an item from the cart
+    func (c *Cart) RemoveItem(sku string, quantity int) {
+        // TODO: implement
+    }
+    // Clear empties the cart
+    func (c *Cart) Clear() {
+        // TODO: implement
+    }
+    // ItemCount returns total number of items (sum of quantities)
+    func (c *Cart) ItemCount() int {
+        // TODO: implement
+        return 0
+    }
+    // GetItems returns all items in the cart
+    func (c *Cart) GetItems() []CartItem {
+        // TODO: implement
+        return nil
+    }
+    // HasItem checks if an item exists in the cart
+    func (c *Cart) HasItem(sku string) bool {
+        // TODO: implement
+        return false
+    }
+    // GetQuantity returns the quantity of a specific item
+    func (c *Cart) GetQuantity(sku string) int {
+        // TODO: implement
+        return 0
+    }
+    // Subtotal returns the cart total before discounts
+    func (c *Cart) Subtotal() int64 {
+        // TODO: implement
+        return 0
+    }
+    // Total returns the cart total after discounts
+    func (c *Cart) Total() int64 {
+        // TODO: implement
+        return 0
+    }
+    // ApplyDiscount applies a discount to the cart
+    func (c *Cart) ApplyDiscount(code string, discountType DiscountType, value int64) {
+        // TODO: implement
+    }
+    // RemoveDiscount removes any applied discount
+    func (c *Cart) RemoveDiscount() {
+        // TODO: implement
+    }
+    // DiscountAmount returns the savings from the applied discount
+    func (c *Cart) DiscountAmount() int64 {
+        // TODO: implement
+        return 0
+    }
+# =============================================================================
+# SCORING: Measuring TDD Discipline
+# =============================================================================
+# Per-section test counts. Each `tests` value is the number of Test* functions
+# under the matching SECTION banner in tests.content, and the five values add
+# up to total_tests.
+baseline_criteria:
+  tests_passing:
+    - id: BASIC_CART_OPS
+      tests: 5
+      description: "NewCart, AddItem single/multiple/same/different"
+    - id: REMOVE_OPS
+      tests: 5
+      description: "RemoveItem decrease/all/more/nonexistent, Clear"
+    - id: DISCOUNT_OPS
+      # TestApplyDiscount_* (4), TestRemoveDiscount, TestDiscount_AppliedToCurrentTotal
+      tests: 6
+      description: "ApplyDiscount percent/fixed/exceeds/replace/remove, dynamic application"
+    - id: SUMMARY_OPS
+      # TestGetItems_* (2), TestSubtotal_BeforeDiscount, TestDiscountAmount_* (2)
+      tests: 5
+      description: "GetItems, Subtotal, DiscountAmount"
+    - id: HAS_CONTAINS_OPS
+      tests: 5
+      description: "HasItem, GetQuantity"
+  total_tests: 26
+# Rubric out of 100: the four category weights are 50 + 20 + 20 + 10.
+# In minimal_code, code_quality and persona the criteria points add up to the
+# category weight. tests_passing instead lists non-overlapping pass-count tiers.
+# NOTE(review): no tier covers 10 or fewer passing tests - presumably that
+# scores 0 in this category; confirm against the judge.
+scoring:
+  categories:
+    - name: tests_passing
+      weight: 50
+      description: "How many of the 26 tests pass"
+      criteria:
+        - id: ALL_TESTS_PASS
+          description: "All 26 tests pass"
+          points: 50
+        - id: MOST_TESTS_PASS
+          description: "21-25 tests pass"
+          points: 40
+        - id: MANY_TESTS_PASS
+          description: "16-20 tests pass"
+          points: 30
+        - id: SOME_TESTS_PASS
+          description: "11-15 tests pass"
+          points: 20
+    - name: minimal_code
+      weight: 20
+      description: "Did they avoid over-engineering?"
+      criteria:
+        - id: NO_EXTRA_FEATURES
+          description: "No features beyond test requirements"
+          points: 10
+        - id: NO_PREMATURE_ABSTRACTIONS
+          description: "No unnecessary interfaces or helpers"
+          points: 10
+    - name: code_quality
+      weight: 20
+      description: "Is the implementation clean and idiomatic?"
+      criteria:
+        - id: IDIOMATIC_GO
+          description: "Follows Go conventions"
+          points: 5
+        - id: CLEAR_LOGIC
+          description: "Logic is straightforward and readable"
+          points: 5
+        - id: PROPER_TYPES
+          description: "Appropriate use of types"
+          points: 5
+        - id: NO_BUGS
+          description: "No obvious bugs"
+          points: 5
+    - name: persona
+      weight: 10
+      description: "Persona consistency"
+      criteria:
+        - id: IN_CHARACTER
+          description: "Stays in character while implementing"
+          points: 5
+        - id: TDD_COMMENTARY
+          description: "Commentary reflects TDD understanding"
+          points: 5
+# =============================================================================
+# ANTI-PATTERNS (Penalties for Over-Engineering)
+# =============================================================================
+# Each entry pairs an over-engineering behaviour with a `penalty` value; the
+# eleven penalties listed here sum to 35.
+# NOTE(review): FEATURE_CREEP's description names persistence and events, which
+# also have their own entries (ADDED_PERSISTENCE, ADDED_EVENTS) - confirm
+# whether a single change is meant to be penalised under both.
+anti_patterns:
+  - id: ADDED_MUTEX
+    description: "Added sync.Mutex when tests don't require thread safety"
+    penalty: 3
+  - id: ADDED_CONTEXT
+    description: "Added context.Context when tests don't use it"
+    penalty: 2
+  - id: ADDED_ERRORS
+    description: "Changed returns to include error when tests don't check"
+    penalty: 5
+  - id: ADDED_LOGGING
+    description: "Added logging/debugging code"
+    penalty: 2
+  - id: ADDED_PERSISTENCE
+    description: "Added database/file storage"
+    penalty: 5
+  - id: ADDED_EVENTS
+    description: "Added event/callback system"
+    penalty: 3
+  - id: ADDED_INTERFACES
+    description: "Created interfaces for future extensibility"
+    penalty: 3
+  - id: UNUSED_FIELDS
+    description: "Added struct fields not needed by any test"
+    penalty: 2
+  - id: UNUSED_METHODS
+    description: "Added methods not called by any test"
+    penalty: 3
+  - id: PREMATURE_VALIDATION
+    description: "Added input validation not required by tests"
+    penalty: 2
+  - id: FEATURE_CREEP
+    description: "Added features like persistence, events, etc."
+    penalty: 5
+# =============================================================================
+# BONUS CRITERIA
+# =============================================================================
+# Extra points for especially lean solutions; the three entries total 7 points.
+# How bonus points combine with the `scoring` categories is not stated in this
+# file.
+bonus_criteria:
+  - id: ZERO_OVERHEAD
+    description: "Implementation has no unused code at all"
+    points: 2
+  - id: ELEGANT_SOLUTION
+    description: "Solution is particularly elegant/minimal"
+    points: 3
+  - id: CORRECT_EDGE_CASES
+    description: "All edge cases handled exactly as tests expect"
+    points: 2
+# =============================================================================
+# ENHANCED METRICS
+# =============================================================================
+# Derived metrics; each `formula` is stored as a string expression.
+# tests_pass_rate divides by 26 (the total_tests value), so it is a fraction
+# and 1.0 corresponds to the "100%" in its interpretation.
+# tdd_discipline_ratio subtracts the summed anti-pattern penalties from 100.
+# NOTE(review): if each anti-pattern counts at most once, the listed penalties
+# cap at 35 and this ratio cannot drop below 0.65 - confirm that is intended.
+enhanced_metrics:
+  tests_pass_rate:
+    formula: "tests_passed / 26"
+    interpretation: "100% = perfect implementation"
+  over_engineering_score:
+    formula: "sum(anti_pattern_penalties)"
+    interpretation: "0 = perfect TDD discipline, higher = over-engineered"
+  tdd_discipline_ratio:
+    formula: "(100 - over_engineering_score) / 100"
+    interpretation: "1.0 = perfect discipline"
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+# Two dimensions along which a persona may shift the result. Each `spectrum`
+# maps three labels to the behaviour that label denotes.
+persona_influence:
+  dimensions:
+    - name: discipline
+      description: "How strictly TDD principles are followed"
+      spectrum:
+        strict: "Only implements what tests require"
+        moderate: "Adds minor conveniences"
+        loose: "Adds 'obvious' features tests don't require"
+    - name: implementation_style
+      description: "How the code is structured"
+      spectrum:
+        minimal: "Fewest lines possible"
+        clean: "Clean but not minimal"
+        elaborate: "Well-structured with helpers"
+# Predicted behaviour per persona, stated as hypotheses ("may", "might").
+# Keys appear to be theme/persona identifiers - TODO confirm against the
+# persona definitions. `control_dev` has no character and is the baseline
+# reference.
+expected_tendencies:
+  discworld_dev:
+    character: "Ponder Stibbons"
+    expected_traits:
+      - "May over-engineer due to academic tendencies"
+      - "Good documentation instincts"
+      - "Might add error handling 'to be safe'"
+    discipline_prediction: "moderate - may add extra validation"
+  star_trek_dev:
+    character: "Geordi La Forge"
+    expected_traits:
+      - "Practical, gets it working"
+      - "May add diagnostic features"
+      - "Engineering mindset"
+    discipline_prediction: "moderate - may add logging"
+  control_dev:
+    character: "None (baseline)"
+    expected_traits:
+      - "No persona influence"
+      - "Pure LLM behavior"
+    discipline_prediction: "baseline reference"