npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/dev/migration-disaster.yaml ADDED Viewed

@@ -0,0 +1,415 @@
+---
+# Scenario: Database Migration Disaster Debug
+# Category: dev
+# Purpose: Test production incident debugging and migration fix skills
+id: dev-004
+name: migration-disaster
+title: "Database Migration Disaster"
+category: dev
+difficulty: medium  # Calibrated 2026-01-01: mean=76.50, was hard
+version: "1.0"
+description: |
+  A database migration that passed in staging but corrupted production data.
+  The migration has partial state, data type coercion bugs, foreign key constraint
+  violations, and timezone conversion errors. Developer must identify all issues
+  and write a corrective migration to restore data integrity.
+purpose: |
+  This scenario tests crisis debugging skills. A "thorough" persona should
+  find more issues. A "fast" persona might miss subtle corruption. Measures
+  ability to think systematically about data integrity under pressure.
+prompt: |
+  INCIDENT: A database migration was applied to production and is now causing
+  data integrity issues. Users are reporting incorrect order totals, missing
+  relationships, and timezone problems.
+  The migration was applied partially - it ran until line 47 before failing,
+  leaving the database in an inconsistent state.
+  Your task:
+  1. Analyze the migration code to identify ALL bugs
+  2. Explain the impact of each bug on production data
+  3. Write a corrective migration to fix the damage
+  4. Document any data that cannot be recovered
+  For each issue:
+  1. Identify the specific line and problem
+  2. Classify severity (Critical/High/Medium/Low)
+  3. Explain how it corrupts data
+  4. Provide SQL to detect affected rows
+  5. Provide SQL to fix or document the damage
+  IMPORTANT: This is production. Every fix must be safe and reversible.
+code:
+  language: python
+  filename: migrations/0042_normalize_orders.py
+  content: |
+    """
+    Migration 0042: Normalize orders table and add new pricing fields
+    Applied: 2024-01-15 03:45:00 UTC (during maintenance window)
+    Status: FAILED at line 47
+    """
+    from django.db import migrations, connection
+    def forward_migration(apps, schema_editor):
+        with connection.cursor() as cursor:
+            # Step 1: Add new columns for normalized pricing
+            cursor.execute("""
+                ALTER TABLE orders
+                ADD COLUMN subtotal_cents INTEGER,
+                ADD COLUMN tax_cents INTEGER,
+                ADD COLUMN shipping_cents INTEGER,
+                ADD COLUMN discount_cents INTEGER,
+                ADD COLUMN total_cents INTEGER
+            """)
+            # Step 2: Migrate existing price data (stored as decimal dollars)
+            cursor.execute("""
+                UPDATE orders
+                SET subtotal_cents = subtotal * 100,
+                    tax_cents = tax_amount * 100,
+                    shipping_cents = shipping_cost * 100,
+                    discount_cents = discount_amount * 100,
+                    total_cents = total_amount * 100
+            """)
+            # Step 3: Convert timestamps from local time to UTC
+            # Note: Server was running in America/New_York timezone
+            cursor.execute("""
+                UPDATE orders
+                SET created_at = created_at - INTERVAL '5 hours',
+                    updated_at = updated_at - INTERVAL '5 hours'
+            """)
+            # Step 4: Create new order_items table from denormalized data
+            cursor.execute("""
+                CREATE TABLE order_items_new (
+                    id SERIAL PRIMARY KEY,
+                    order_id INTEGER REFERENCES orders(id),
+                    product_id INTEGER,
+                    quantity INTEGER,
+                    unit_price_cents INTEGER,
+                    line_total_cents INTEGER
+                )
+            """)
+            # MIGRATION FAILED HERE - line 47
+            # Step 5: Migrate order items from JSON column
+            cursor.execute("""
+                INSERT INTO order_items_new (order_id, product_id, quantity, unit_price_cents, line_total_cents)
+                SELECT
+                    o.id,
+                    (item->>'product_id')::int,
+                    (item->>'quantity')::int,
+                    (item->>'price')::decimal * 100,
+                    (item->>'quantity')::int * (item->>'price')::decimal * 100
+                FROM orders o,
+                jsonb_array_elements(o.items_json) AS item
+            """)
+            # Step 6: Update foreign keys to use new user_uuid
+            cursor.execute("""
+                UPDATE orders o
+                SET user_id = u.new_id
+                FROM users u
+                WHERE o.user_email = u.email
+            """)
+            # Step 7: Drop old columns
+            cursor.execute("""
+                ALTER TABLE orders
+                DROP COLUMN subtotal,
+                DROP COLUMN tax_amount,
+                DROP COLUMN shipping_cost,
+                DROP COLUMN discount_amount,
+                DROP COLUMN total_amount,
+                DROP COLUMN items_json,
+                DROP COLUMN user_email
+            """)
+            # Step 8: Add NOT NULL constraints
+            cursor.execute("""
+                ALTER TABLE orders
+                ALTER COLUMN subtotal_cents SET NOT NULL,
+                ALTER COLUMN total_cents SET NOT NULL,
+                ALTER COLUMN user_id SET NOT NULL
+            """)
+            # Step 9: Create indexes
+            cursor.execute("""
+                CREATE INDEX idx_orders_user_id ON orders(user_id);
+                CREATE INDEX idx_orders_created_at ON orders(created_at);
+                CREATE INDEX idx_order_items_order_id ON order_items_new(order_id);
+            """)
+    def reverse_migration(apps, schema_editor):
+        # NOTE: This reverse migration was never implemented
+        pass
+    class Migration(migrations.Migration):
+        dependencies = [
+            ('orders', '0041_add_user_uuid'),
+        ]
+        operations = [
+            migrations.RunPython(forward_migration, reverse_migration),
+        ]
+    # ================================================================
+    # POST-INCIDENT NOTES (added by on-call engineer)
+    # ================================================================
+    #
+    # Migration ran at 03:45 UTC but failed at Step 5.
+    # Steps 1-4 completed. Steps 5-9 did not run.
+    #
+    # Current database state:
+    # - orders table has new _cents columns (populated)
+    # - orders table still has old decimal columns (not dropped)
+    # - orders table timestamps were modified
+    # - order_items_new table exists (empty)
+    # - user_id column NOT updated
+    # - items_json still exists
+    #
+    # Sample data before migration:
+    #   order_id=1234, subtotal=99.99, tax_amount=8.25, total_amount=108.24
+    #   created_at='2024-01-10 14:30:00' (Eastern Time)
+    #
+    # Sample data after migration (current):
+    #   order_id=1234, subtotal=99.99, subtotal_cents=9999
+    #   tax_cents=825, total_cents=10824
+    #   created_at='2024-01-10 09:30:00' (should be UTC)
+    #
+# =============================================================================
+# BASELINE ISSUES (minimum expected to find)
+# =============================================================================
+baseline_issues:
+  critical:
+    - id: FLOAT_TO_INT_TRUNCATION
+      location: "lines 21-26"
+      description: "Multiplying DECIMAL by 100 and storing as INTEGER truncates cents"
+      impact: "Orders with prices like $19.99 become 1998 cents instead of 1999"
+      affected_query: "SELECT id FROM orders WHERE (subtotal * 100) != subtotal_cents"
+    - id: TIMEZONE_WRONG_DIRECTION
+      location: "lines 29-32"
+      description: "Subtracting 5 hours is backwards - EST is UTC-5, should ADD"
+      impact: "All timestamps now 10 hours off (shifted wrong direction)"
+      affected_query: "All orders - timestamps are systematically wrong"
+    - id: TIMEZONE_DST_IGNORED
+      location: "lines 29-32"
+      description: "Fixed 5-hour offset ignores Daylight Saving Time"
+      impact: "Orders need a 4-hour shift during EDT (summer) and a 5-hour shift during EST (winter); one fixed offset cannot be right for both"
+      affected_query: "Orders between March-November have different offset error"
+    - id: PARTIAL_MIGRATION_STATE
+      location: "entire migration"
+      description: "Migration has no transaction - partial state left on failure"
+      impact: "Database in inconsistent state, items not migrated but prices converted"
+  high:
+    - id: FK_WILL_FAIL_STEP5
+      location: "line 45"
+      description: "order_id references orders(id) but some items_json has invalid order refs"
+      impact: "Step 5 INSERT would fail on FK constraint when it runs"
+      note: "This is WHY the migration failed"
+    - id: USER_ID_ORPHAN
+      location: "lines 57-60"
+      description: "UPDATE joins on email but some orders have emails not in users table"
+      impact: "Orders with deleted users would get NULL user_id, then fail NOT NULL"
+    - id: JSON_INVALID_ENTRIES
+      location: "line 51"
+      description: "Some items_json entries have string product_id, not int"
+      impact: "::int cast would fail on items like {\"product_id\": \"SKU-123\"}"
+    - id: NO_REVERSE_MIGRATION
+      location: "lines 68-69"
+      description: "reverse_migration is not implemented"
+      impact: "Cannot safely rollback - manual intervention required"
+  medium:
+    - id: DECIMAL_PRECISION_LOSS
+      location: "lines 49-50"
+      description: "(item->>'price')::decimal loses precision before multiply"
+      impact: "Accumulated rounding errors in line_total_cents"
+    - id: SUBTOTAL_TAX_MISMATCH
+      location: "post-migration data"
+      description: "Some orders now have subtotal_cents + tax_cents != total_cents"
+      impact: "Financial reports will show discrepancies"
+    - id: INDEX_CREATION_ORDER
+      location: "lines 73-76"
+      description: "Creating indexes on empty table then bulk INSERT"
+      impact: "Would be slow - should create indexes after data load"
+    - id: MISSING_ORDER_ITEMS_MIGRATION
+      location: "current state"
+      description: "order_items_new exists but is empty, items_json still has data"
+      impact: "Application may try to read from empty table"
+  low:
+    - id: COLUMN_NULLABLE
+      location: "lines 10-15"
+      description: "New columns added without DEFAULT, remain NULL until UPDATE"
+      impact: "Brief window where columns are NULL"
+    - id: NO_BATCH_PROCESSING
+      location: "line 21"
+      description: "Single UPDATE for entire table - could timeout on large tables"
+      impact: "Migration might timeout on production-scale data"
+# =============================================================================
+# BONUS ISSUES (thorough reviewers might find these)
+# =============================================================================
+bonus_issues:
+  data_integrity:
+    - id: TIMEZONE_PAST_ORDERS
+      description: "Orders from years ago may have different DST rules"
+    - id: NEGATIVE_DISCOUNT
+      description: "Some discount_amount may be negative (store credit) - breaks cents logic"
+    - id: CURRENCY_ASSUMPTION
+      description: "Migration assumes USD - international orders may have different precision"
+  operational:
+    - id: NO_PROGRESS_TRACKING
+      description: "No way to resume migration from failure point"
+    - id: NO_VALIDATION_STEP
+      description: "No checksums or row counts to verify migration success"
+    - id: MAINTENANCE_WINDOW_SHORT
+      description: "Large UPDATE statements may exceed maintenance window"
+  recovery:
+    - id: ORIGINAL_DATA_GONE
+      description: "If columns were dropped, original decimal values unrecoverable"
+    - id: AUDIT_TRAIL_BROKEN
+      description: "Timestamp changes break audit/compliance requirements"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  # Counts must match the lists above: baseline_issues has 4 critical,
+  # 4 high, 4 medium and 2 low entries (14); bonus_issues has 3 + 3 + 2 (8).
+  total_baseline_issues: 14
+  total_bonus_issues: 8
+  # Points per baseline issue, keyed by severity tier.
+  weights:
+    critical: 3
+    high: 2
+    medium: 1
+    low: 0.5
+  max_baseline_score: 25  # 4*3 + 4*2 + 4*1 + 2*0.5
+  # Rubric categories: the weights total 100, and each category's criteria
+  # points add up to that category's weight.
+  categories:
+    - name: detection
+      weight: 35
+      criteria:
+        - id: BASELINE_FOUND
+          description: "Issues from the seeded baseline list"
+          points: 25
+        - id: BONUS_DISCOVERIES
+          description: "Valid issues beyond the baseline"
+          points: 10
+    - name: depth
+      weight: 30
+      criteria:
+        - id: ROOT_CAUSE_ANALYSIS
+          description: "Traces to data corruption mechanism"
+          points: 10
+        - id: DETECTION_QUERIES
+          description: "Provides SQL to find affected rows"
+          points: 10
+        - id: RECOVERY_PLAN
+          description: "Provides corrective migration"
+          points: 10
+    - name: quality
+      weight: 20
+      criteria:
+        - id: SEVERITY_ACCURACY
+          description: "Correctly classifies data vs operational issues"
+          points: 7
+        - id: SAFE_FIXES
+          description: "Fixes are transactional and reversible"
+          points: 8
+        - id: PRIORITIZATION
+          description: "Addresses critical data issues first"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: CRISIS_COMMUNICATION
+          description: "Communicates findings clearly for incident response"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: crisis_approach
+      description: "How the incident is prioritized"
+      spectrum:
+        systematic: "Catalogs all issues before proposing fixes"
+        pragmatic: "Fixes critical issues first, documents rest"
+        rapid: "Provides quick fix, may miss secondary issues"
+    - name: data_recovery_focus
+      description: "Balance between speed and data integrity"
+      spectrum:
+        integrity_first: "Ensures no data loss, even if slower"
+        balanced: "Prioritizes but accepts some data may be unrecoverable"
+        speed_first: "Gets system running, documents data loss"
+    - name: documentation_style
+      description: "How findings are communicated"
+      spectrum:
+        technical: "SQL-focused, precise"
+        narrative: "Explains impact to stakeholders"
+        minimal: "Just the facts and fixes"
+expected_tendencies:
+  discworld_dev:
+    character: "Ponder Stibbons"
+    expected_traits:
+      - "Systematic - should catalog all issues"
+      - "May over-explain magical (technical) details"
+      - "Thorough documentation"
+    crisis_approach_prediction: "systematic"
+  star_trek_dev:
+    character: "Data"
+    expected_traits:
+      - "Logical - precise identification of issues"
+      - "May provide optimal recovery sequence"
+      - "Clear prioritization"
+    crisis_approach_prediction: "systematic"
+  control_dev:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard incident response"
+    crisis_approach_prediction: "baseline reference"