npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/tea/microservice-integration-tests.yaml ADDED Viewed

@@ -0,0 +1,520 @@
+---
+# Scenario: Microservice Integration Test Design
+# Category: tea
+# Purpose: Test system-level integration testing skills
+#
+# Identity and classification fields for this scenario.
+id: tea-003
+name: microservice-integration-tests
+title: "Microservice Integration Test Design"
+category: tea
+difficulty: extreme  # Empirical: control mean 63.06 (hardest)
+# Quoted so the value stays the string "1.0" rather than being read as a float.
+version: "1.0"
+# Summary of the task: what has to be designed and which concerns it must handle.
+description: |
+  Design integration tests for an order fulfillment flow across 4 services:
+  Order, Inventory, Payment, and Notification. Must handle service dependencies,
+  test data management, async operations, and failure scenarios. Tests system-level
+  testing expertise beyond unit testing.
+# Rationale: which differences between personas this scenario is meant to surface.
+purpose: |
+  This scenario tests integration testing philosophy. A "thorough" persona might
+  design more failure scenarios. A "pragmatic" persona might focus on happy paths
+  with fewer mocks. Measures ability to think about distributed system testing.
+# Task statement for the scenario. It is a literal block scalar (`|`), so the
+# line breaks and the Markdown markers inside it are kept exactly as written.
+prompt: |
+  You are a Test Engineer designing integration tests for an order fulfillment system.
+  The system has 4 microservices:
+  1. **Order Service** - Receives orders, orchestrates fulfillment
+  2. **Inventory Service** - Manages stock, reserves items
+  3. **Payment Service** - Processes payments, handles refunds
+  4. **Notification Service** - Sends emails/SMS for order updates
+  The order flow is:
+  1. Order received → Inventory check
+  2. Inventory reserved → Payment processed
+  3. Payment successful → Order confirmed, notification sent
+  4. Any failure → Compensating transactions (unreserve, refund)
+  Design a comprehensive integration test suite covering:
+  1. Happy path - complete order flow
+  2. Service failure scenarios (each service can fail)
+  3. Timeout and retry behavior
+  4. Compensating transaction correctness
+  5. Async message handling
+  6. Test data isolation
+  For each test scenario:
+  1. Name and description
+  2. Setup requirements (test data, mocks)
+  3. Steps to execute
+  4. Assertions to verify
+  5. Cleanup requirements
+  Use pytest with appropriate fixtures. Consider using testcontainers
+  for realistic service simulation.
+# Embedded Python source for the scenario (filename: services.py).
+# `content` is a literal block scalar: every line under it, including Python
+# `#` comments, is string data rather than YAML, so it is left untouched here.
+# NOTE(review): inside `content`, only the OrderService methods call
+# response.raise_for_status(); the Inventory, Payment, Notification and Chaos
+# clients read response.json() (or ignore the response) without checking the
+# HTTP status. Confirm this asymmetry is intended.
+# NOTE(review): wait_for_status() computes its deadline from datetime.now(),
+# i.e. wall-clock time. Confirm whether a monotonic clock was intended.
+# NOTE(review): get_reservations() builds InventoryReservation(**r) straight
+# from the JSON payload, so `expires_at` is not converted the way
+# _parse_order() converts `created_at` with datetime.fromisoformat. Verify.
+code:
+  language: python
+  filename: services.py
+  content: |
+    """
+    Order Fulfillment System - Service Interfaces
+    These are the service contracts your tests must verify.
+    Actual implementations connect to real databases and message queues.
+    """
+    from dataclasses import dataclass
+    from enum import Enum
+    from typing import List, Optional
+    from datetime import datetime
+    import httpx
+    import asyncio
+    class OrderStatus(Enum):
+        PENDING = "pending"
+        INVENTORY_RESERVED = "inventory_reserved"
+        PAYMENT_PROCESSING = "payment_processing"
+        CONFIRMED = "confirmed"
+        FAILED = "failed"
+        CANCELLED = "cancelled"
+    @dataclass
+    class OrderItem:
+        product_id: str
+        quantity: int
+        unit_price: float
+    @dataclass
+    class Order:
+        order_id: str
+        customer_id: str
+        items: List[OrderItem]
+        status: OrderStatus
+        created_at: datetime
+        total: float
+        payment_id: Optional[str] = None
+        failure_reason: Optional[str] = None
+    @dataclass
+    class InventoryReservation:
+        reservation_id: str
+        order_id: str
+        product_id: str
+        quantity: int
+        expires_at: datetime
+    @dataclass
+    class PaymentResult:
+        payment_id: str
+        status: str  # "success", "declined", "error"
+        amount: float
+        error_message: Optional[str] = None
+    class OrderService:
+        """
+        Orchestrates the order fulfillment flow.
+        Communicates with other services via HTTP and message queue.
+        """
+        def __init__(self, base_url: str):
+            self.base_url = base_url
+            self.client = httpx.AsyncClient(base_url=base_url, timeout=30.0)
+        async def create_order(self, customer_id: str, items: List[dict]) -> Order:
+            """
+            Creates order and initiates fulfillment flow.
+            Returns immediately with PENDING status.
+            Fulfillment happens asynchronously.
+            """
+            response = await self.client.post("/orders", json={
+                "customer_id": customer_id,
+                "items": items
+            })
+            response.raise_for_status()
+            return self._parse_order(response.json())
+        async def get_order(self, order_id: str) -> Order:
+            """Get current order status."""
+            response = await self.client.get(f"/orders/{order_id}")
+            response.raise_for_status()
+            return self._parse_order(response.json())
+        async def cancel_order(self, order_id: str) -> Order:
+            """
+            Cancel order. Triggers compensating transactions:
+            - Unreserve inventory
+            - Refund payment (if processed)
+            - Send cancellation notification
+            """
+            response = await self.client.post(f"/orders/{order_id}/cancel")
+            response.raise_for_status()
+            return self._parse_order(response.json())
+        async def wait_for_status(
+            self,
+            order_id: str,
+            expected_status: OrderStatus,
+            timeout_seconds: int = 30
+        ) -> Order:
+            """Poll until order reaches expected status or timeout."""
+            deadline = datetime.now().timestamp() + timeout_seconds
+            while datetime.now().timestamp() < deadline:
+                order = await self.get_order(order_id)
+                if order.status == expected_status:
+                    return order
+                if order.status == OrderStatus.FAILED:
+                    return order  # Don't wait if already failed
+                await asyncio.sleep(0.5)
+            raise TimeoutError(f"Order {order_id} did not reach {expected_status}")
+        def _parse_order(self, data: dict) -> Order:
+            return Order(
+                order_id=data["order_id"],
+                customer_id=data["customer_id"],
+                items=[OrderItem(**item) for item in data["items"]],
+                status=OrderStatus(data["status"]),
+                created_at=datetime.fromisoformat(data["created_at"]),
+                total=data["total"],
+                payment_id=data.get("payment_id"),
+                failure_reason=data.get("failure_reason")
+            )
+    class InventoryService:
+        """Manages product inventory and reservations."""
+        def __init__(self, base_url: str):
+            self.base_url = base_url
+            self.client = httpx.AsyncClient(base_url=base_url, timeout=10.0)
+        async def check_availability(self, product_id: str, quantity: int) -> bool:
+            """Check if quantity is available (doesn't reserve)."""
+            response = await self.client.get(
+                f"/inventory/{product_id}/available",
+                params={"quantity": quantity}
+            )
+            return response.json()["available"]
+        async def get_reservations(self, order_id: str) -> List[InventoryReservation]:
+            """Get all reservations for an order."""
+            response = await self.client.get(
+                "/reservations",
+                params={"order_id": order_id}
+            )
+            return [
+                InventoryReservation(**r)
+                for r in response.json()["reservations"]
+            ]
+        async def get_stock_level(self, product_id: str) -> int:
+            """Get current stock level for a product."""
+            response = await self.client.get(f"/inventory/{product_id}")
+            return response.json()["quantity"]
+    class PaymentService:
+        """Processes payments and refunds."""
+        def __init__(self, base_url: str):
+            self.base_url = base_url
+            self.client = httpx.AsyncClient(base_url=base_url, timeout=30.0)
+        async def get_payment(self, payment_id: str) -> PaymentResult:
+            """Get payment details."""
+            response = await self.client.get(f"/payments/{payment_id}")
+            data = response.json()
+            return PaymentResult(
+                payment_id=data["payment_id"],
+                status=data["status"],
+                amount=data["amount"],
+                error_message=data.get("error_message")
+            )
+        async def get_payments_for_order(self, order_id: str) -> List[PaymentResult]:
+            """Get all payment attempts for an order."""
+            response = await self.client.get(
+                "/payments",
+                params={"order_id": order_id}
+            )
+            return [
+                PaymentResult(**p)
+                for p in response.json()["payments"]
+            ]
+    class NotificationService:
+        """Sends order notifications."""
+        def __init__(self, base_url: str):
+            self.base_url = base_url
+            self.client = httpx.AsyncClient(base_url=base_url, timeout=10.0)
+        async def get_notifications(self, customer_id: str) -> List[dict]:
+            """Get notifications sent to a customer."""
+            response = await self.client.get(
+                "/notifications",
+                params={"customer_id": customer_id}
+            )
+            return response.json()["notifications"]
+        async def get_notification_by_order(self, order_id: str) -> List[dict]:
+            """Get all notifications for an order."""
+            response = await self.client.get(
+                "/notifications",
+                params={"order_id": order_id}
+            )
+            return response.json()["notifications"]
+    # Test helper: Chaos injection for failure testing
+    class ChaosController:
+        """
+        Controls chaos injection for testing failure scenarios.
+        Each service supports chaos endpoints for testing.
+        """
+        def __init__(self, service_urls: dict):
+            self.urls = service_urls
+            self.client = httpx.AsyncClient(timeout=5.0)
+        async def inject_failure(
+            self,
+            service: str,
+            failure_type: str,
+            duration_seconds: int = 30
+        ):
+            """
+            Inject a failure into a service.
+            failure_type options:
+            - "timeout": Service responds slowly (>30s)
+            - "error_500": Service returns 500 errors
+            - "error_503": Service returns 503 (unavailable)
+            - "partial": Service fails 50% of requests
+            - "payment_declined": Payment always declined (payment service only)
+            """
+            url = f"{self.urls[service]}/chaos/inject"
+            await self.client.post(url, json={
+                "failure_type": failure_type,
+                "duration_seconds": duration_seconds
+            })
+        async def clear_failures(self, service: str):
+            """Remove all injected failures from a service."""
+            url = f"{self.urls[service]}/chaos/clear"
+            await self.client.post(url)
+        async def clear_all(self):
+            """Clear failures from all services."""
+            for service in self.urls:
+                await self.clear_failures(service)
+# -----------------------------------------------------------------------------
+# Baseline test scenarios: the minimum set a response is expected to cover.
+# Entries are grouped by theme; each one is an `id` plus a short `description`.
+# -----------------------------------------------------------------------------
+baseline_issues:
+  # End-to-end success flow and its observable effects.
+  happy_path:
+    - id: COMPLETE_ORDER_FLOW
+      description: 'Order flows from creation to confirmation with all services'
+    - id: INVENTORY_RESERVED
+      description: 'Inventory is reserved during order processing'
+    - id: PAYMENT_PROCESSED
+      description: 'Payment is successfully processed'
+    - id: NOTIFICATION_SENT
+      description: 'Confirmation notification sent to customer'
+    - id: STOCK_DECREMENTED
+      description: 'Stock level decreases after order confirmed'
+  # Individual service failures and how the order reacts to them.
+  failure_scenarios:
+    - id: INVENTORY_UNAVAILABLE
+      description: 'Order fails gracefully when inventory insufficient'
+    - id: PAYMENT_DECLINED
+      description: 'Order fails when payment declined, inventory unreserved'
+    - id: PAYMENT_TIMEOUT
+      description: 'Order handles payment service timeout with retry'
+    - id: INVENTORY_SERVICE_DOWN
+      description: 'Order fails gracefully when inventory service unavailable'
+    - id: NOTIFICATION_FAILURE
+      description: 'Order succeeds even if notification fails (non-critical)'
+  # Rollback behaviour after cancellation or failure.
+  compensating_transactions:
+    - id: CANCEL_UNRESERVES
+      description: 'Cancellation unreserves inventory'
+    - id: CANCEL_REFUNDS
+      description: 'Cancellation triggers refund if paid'
+    - id: FAILURE_CLEANUP
+      description: 'Failed order releases all reservations'
+  # Asynchronous processing guarantees.
+  async_behavior:
+    - id: EVENTUAL_CONSISTENCY
+      description: 'Status eventually consistent across services'
+    - id: IDEMPOTENT_RETRY
+      # Double quotes kept because the text contains an apostrophe.
+      description: "Retried operations don't duplicate effects"
+  # Keeping tests independent of one another.
+  data_isolation:
+    - id: TEST_DATA_CLEANUP
+      description: 'Tests clean up created data'
+    - id: PARALLEL_SAFE
+      description: 'Tests can run in parallel without interference'
+# -----------------------------------------------------------------------------
+# Bonus test scenarios: optional coverage beyond the baseline set.
+# Same entry shape as the baseline list (`id` plus `description`).
+# -----------------------------------------------------------------------------
+bonus_issues:
+  # Behaviour under injected or partial failures.
+  chaos_engineering:
+    - id: PARTIAL_FAILURE
+      description: 'System handles partial service failures gracefully'
+    - id: CASCADE_PREVENTION
+      # Double quotes kept because the text contains an apostrophe.
+      description: "Failure in one service doesn't cascade"
+    - id: RECOVERY_TEST
+      description: 'System recovers when failed service comes back'
+  # Correctness under concurrent load.
+  performance:
+    - id: CONCURRENT_ORDERS
+      description: 'Multiple orders processed correctly in parallel'
+    - id: INVENTORY_CONTENTION
+      description: 'Concurrent orders for same product handled correctly'
+  # Unusual request sequences and timing.
+  edge_cases:
+    - id: DUPLICATE_ORDER
+      description: 'Duplicate order request handled idempotently'
+    - id: RACE_CONDITIONS
+      description: 'Cancel during processing handled correctly'
+    - id: EXPIRED_RESERVATION
+      description: 'Expired reservations released correctly'
+  # Interface-level validation between services.
+  contract_testing:
+    - id: API_CONTRACTS
+      description: 'Service API contracts validated'
+    - id: EVENT_CONTRACTS
+      description: 'Message queue event contracts validated'
+# =============================================================================
+# SCORING
+# =============================================================================
+# Rubric for judging a response. Each category's `weight` equals the sum of the
+# `points` of its criteria (40 + 30 + 15 + 15 = 100).
+scoring:
+  # Totals must match the number of entries listed in this file:
+  #   baseline_issues: happy_path 5 + failure_scenarios 5 +
+  #                    compensating_transactions 3 + async_behavior 2 +
+  #                    data_isolation 2 = 17
+  #   bonus_issues:    chaos_engineering 3 + performance 2 + edge_cases 3 +
+  #                    contract_testing 2 = 10
+  # The previous values (18 and 12) did not correspond to any listed scenarios,
+  # so full coverage could never be reached when scored against them.
+  total_baseline_scenarios: 17
+  total_bonus_scenarios: 10
+  categories:
+    # How many of the listed scenarios the response covers.
+    - name: coverage
+      weight: 40
+      criteria:
+        - id: BASELINE_COVERED
+          description: "All baseline test scenarios covered"
+          points: 30
+        - id: BONUS_COVERED
+          description: "Additional valuable test scenarios"
+          points: 10
+    # Technical quality of the proposed tests.
+    - name: quality
+      weight: 30
+      criteria:
+        - id: TEST_STRUCTURE
+          description: "Well-organized test structure with fixtures"
+          points: 10
+        - id: ASSERTIONS
+          description: "Comprehensive assertions for each scenario"
+          points: 10
+        - id: ASYNC_HANDLING
+          description: "Correct async/await patterns"
+          points: 10
+    # Whether the suite could realistically be set up and maintained.
+    - name: practicality
+      weight: 15
+      criteria:
+        - id: REALISTIC_SETUP
+          description: "Practical test data and service setup"
+          points: 8
+        - id: MAINTAINABLE
+          description: "Tests are maintainable and readable"
+          points: 7
+    # Adherence to, and usefulness of, the assigned persona.
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances test documentation"
+          points: 7
+# -----------------------------------------------------------------------------
+# Persona influence: the dimensions along which a persona may shift a response.
+# Each dimension names a `spectrum` of labelled positions.
+# -----------------------------------------------------------------------------
+persona_influence:
+  dimensions:
+    # Mocked versus real dependencies.
+    - name: test_philosophy
+      description: 'Integration vs isolation tradeoff'
+      spectrum:
+        isolated: 'Heavy mocking, fast tests'
+        balanced: 'Mix of real and mocked services'
+        realistic: 'Testcontainers, real dependencies'
+    # Share of attention given to failure cases.
+    - name: failure_focus
+      description: 'How much emphasis on failure scenarios'
+      spectrum:
+        happy_path: 'Focuses on success scenarios'
+        balanced: 'Equal coverage of success and failure'
+        chaos_focused: 'Emphasizes failure and recovery'
+    # Amount of explanatory text attached to the tests.
+    - name: documentation_style
+      description: 'How tests are documented'
+      spectrum:
+        minimal: 'Self-documenting test names'
+        moderate: 'Docstrings on complex tests'
+        comprehensive: 'Full scenario documentation'
+# Predicted behaviour per persona key: the character, the traits expected to
+# show up in its answer, and a coverage prediction.
+expected_tendencies:
+  discworld_tea:
+    character: 'Igor'
+    expected_traits:
+      - 'Thorough - covers many failure modes'
+      - 'Practical - focuses on what breaks'
+      - 'May suggest unusual edge cases'
+    coverage_prediction: 'high'
+  star_trek_tea:
+    character: 'Scotty'
+    expected_traits:
+      - 'Systematic - organized test structure'
+      - 'Engineering focus - realistic scenarios'
+      - 'May emphasize performance testing'
+    coverage_prediction: 'high'
+  # Reference entry with no persona applied.
+  control_tea:
+    character: 'None (baseline)'
+    expected_traits:
+      - 'Standard integration testing approach'
+    coverage_prediction: 'baseline reference'