npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/code-review/terraform-infrastructure.yaml ADDED Viewed

@@ -0,0 +1,582 @@
+---
+# Scenario: Terraform Infrastructure Code Review (Medium)
+# NOTE: Re-ranked to "medium" based on control baseline mean 80.85 ± 6.01 (Story 7-2)
+# Category: code-review
+# Purpose: Test IaC security awareness and cloud infrastructure expertise
+#
+# Scenario identity and classification. `category` and `difficulty` repeat the
+# values given in the header comment above; change them together.
+id: rev-004
+name: terraform-infrastructure
+title: "Terraform AWS Infrastructure Review"
+category: code-review
+difficulty: medium
+# Quoted so the value stays the string "1.0" instead of being read as a float.
+version: "1.0"
+# Summary of the fixture: which AWS resources it defines and which classes of
+# problems are seeded into it.
+description: |
+  Terraform modules defining AWS infrastructure: VPC, EKS cluster, RDS database,
+  S3 buckets, and IAM policies. Contains overly permissive IAM, public resources,
+  unencrypted storage, missing logging, and security group misconfigurations.
+  Tests whether reviewers understand cloud security beyond application code.
+# Why the scenario exists. The figure of 18 equals the number of entries under
+# baseline_issues (5 critical, 6 high, 5 medium, 2 low) and
+# scoring.total_baseline_issues; update all three together.
+purpose: |
+  This scenario tests infrastructure security expertise. Many code reviewers excel
+  at application vulnerabilities but miss IaC issues (IAM policies, network exposure,
+  encryption gaps). Finding all 18 baseline issues = competent cloud security reviewer.
+  Finding bonus issues = understands AWS security deeply.
+# Task text for the reviewer under test. The numbered list fixes the per-issue
+# answer format (resource, severity, impact, corrected Terraform). The quoted
+# "it works in the dev account" remark is referenced by expected_tendencies.
+prompt: |
+  You are reviewing a pull request for Terraform infrastructure modules.
+  The DevOps engineer says "it works in the dev account" and wants to apply to production.
+  Review this infrastructure code thoroughly for:
+  - IAM policy issues (overly permissive, missing constraints)
+  - Network security (security groups, NACLs, public exposure)
+  - Data protection (encryption at rest/transit, key management)
+  - Logging and monitoring gaps
+  - Compliance concerns (SOC2, HIPAA if applicable)
+  - Cost optimization opportunities
+  For each issue:
+  1. Identify the specific resource and attribute
+  2. Classify severity (Critical/High/Medium/Low)
+  3. Explain the security or operational impact
+  4. Provide the corrected Terraform configuration
+  This infrastructure will handle production workloads. Security is paramount.
+# Fixture under review: a single Terraform file embedded as a literal block
+# scalar. The flaws in it are deliberate (they are enumerated in
+# baseline_issues / bonus_issues), so do not "fix" or reformat the HCL.
+# Everything indented under `content: |` is string data, including the lines
+# that start with `#`; those are HCL comments, not YAML comments.
+# NOTE(review): baseline_issues cites main.tf line numbers, so adding or
+# removing lines here would presumably invalidate them — confirm before editing.
+code:
+  language: hcl
+  filename: main.tf
+  content: |
+    # AWS Provider Configuration
+    provider "aws" {
+      region = var.region
+    }
+    variable "region" {
+      default = "us-east-1"
+    }
+    variable "environment" {
+      default = "production"
+    }
+    variable "db_password" {
+      default = "admin123"
+    }
+    # VPC Configuration
+    resource "aws_vpc" "main" {
+      cidr_block           = "10.0.0.0/16"
+      enable_dns_hostnames = true
+      enable_dns_support   = true
+      tags = {
+        Name = "main-vpc"
+      }
+    }
+    resource "aws_subnet" "public_a" {
+      vpc_id                  = aws_vpc.main.id
+      cidr_block              = "10.0.1.0/24"
+      availability_zone       = "${var.region}a"
+      map_public_ip_on_launch = true
+      tags = {
+        Name = "public-subnet-a"
+      }
+    }
+    resource "aws_subnet" "public_b" {
+      vpc_id                  = aws_vpc.main.id
+      cidr_block              = "10.0.2.0/24"
+      availability_zone       = "${var.region}b"
+      map_public_ip_on_launch = true
+      tags = {
+        Name = "public-subnet-b"
+      }
+    }
+    resource "aws_internet_gateway" "main" {
+      vpc_id = aws_vpc.main.id
+    }
+    # Security Groups
+    resource "aws_security_group" "web" {
+      name        = "web-sg"
+      description = "Security group for web servers"
+      vpc_id      = aws_vpc.main.id
+      ingress {
+        from_port   = 0
+        to_port     = 0
+        protocol    = "-1"
+        cidr_blocks = ["0.0.0.0/0"]
+      }
+      egress {
+        from_port   = 0
+        to_port     = 0
+        protocol    = "-1"
+        cidr_blocks = ["0.0.0.0/0"]
+      }
+    }
+    resource "aws_security_group" "database" {
+      name        = "database-sg"
+      description = "Security group for database"
+      vpc_id      = aws_vpc.main.id
+      ingress {
+        from_port   = 3306
+        to_port     = 3306
+        protocol    = "tcp"
+        cidr_blocks = ["0.0.0.0/0"]
+      }
+      ingress {
+        from_port   = 22
+        to_port     = 22
+        protocol    = "tcp"
+        cidr_blocks = ["0.0.0.0/0"]
+      }
+    }
+    # RDS Database
+    resource "aws_db_instance" "main" {
+      identifier           = "production-db"
+      engine               = "mysql"
+      engine_version       = "5.7"
+      instance_class       = "db.t2.micro"
+      allocated_storage    = 20
+      storage_type         = "gp2"
+      db_name              = "appdb"
+      username             = "admin"
+      password             = var.db_password
+      parameter_group_name = "default.mysql5.7"
+      skip_final_snapshot  = true
+      publicly_accessible  = true
+      vpc_security_group_ids = [aws_security_group.database.id]
+      db_subnet_group_name   = aws_db_subnet_group.main.name
+      tags = {
+        Environment = var.environment
+      }
+    }
+    resource "aws_db_subnet_group" "main" {
+      name       = "main-db-subnet"
+      subnet_ids = [aws_subnet.public_a.id, aws_subnet.public_b.id]
+    }
+    # S3 Buckets
+    resource "aws_s3_bucket" "data" {
+      bucket = "company-production-data"
+      tags = {
+        Environment = var.environment
+      }
+    }
+    resource "aws_s3_bucket" "logs" {
+      bucket = "company-production-logs"
+      acl    = "public-read"
+      tags = {
+        Environment = var.environment
+      }
+    }
+    resource "aws_s3_bucket" "backups" {
+      bucket = "company-production-backups"
+      versioning {
+        enabled = false
+      }
+      tags = {
+        Environment = var.environment
+      }
+    }
+    # IAM Roles and Policies
+    resource "aws_iam_role" "app_role" {
+      name = "application-role"
+      assume_role_policy = jsonencode({
+        Version = "2012-10-17"
+        Statement = [
+          {
+            Action = "sts:AssumeRole"
+            Effect = "Allow"
+            Principal = {
+              Service = "ec2.amazonaws.com"
+            }
+          },
+          {
+            Action = "sts:AssumeRole"
+            Effect = "Allow"
+            Principal = {
+              AWS = "*"
+            }
+          }
+        ]
+      })
+    }
+    resource "aws_iam_role_policy" "app_policy" {
+      name = "application-policy"
+      role = aws_iam_role.app_role.id
+      policy = jsonencode({
+        Version = "2012-10-17"
+        Statement = [
+          {
+            Effect = "Allow"
+            Action = [
+              "s3:*",
+              "ec2:*",
+              "rds:*",
+              "iam:*",
+              "secretsmanager:*"
+            ]
+            Resource = "*"
+          }
+        ]
+      })
+    }
+    resource "aws_iam_user" "deploy" {
+      name = "deploy-user"
+    }
+    resource "aws_iam_access_key" "deploy" {
+      user = aws_iam_user.deploy.name
+    }
+    resource "aws_iam_user_policy" "deploy" {
+      name = "deploy-policy"
+      user = aws_iam_user.deploy.name
+      policy = jsonencode({
+        Version = "2012-10-17"
+        Statement = [
+          {
+            Effect   = "Allow"
+            Action   = "*"
+            Resource = "*"
+          }
+        ]
+      })
+    }
+    # EKS Cluster
+    resource "aws_eks_cluster" "main" {
+      name     = "production-cluster"
+      role_arn = aws_iam_role.eks_role.arn
+      vpc_config {
+        subnet_ids              = [aws_subnet.public_a.id, aws_subnet.public_b.id]
+        endpoint_public_access  = true
+        endpoint_private_access = false
+        public_access_cidrs     = ["0.0.0.0/0"]
+      }
+      enabled_cluster_log_types = []
+    }
+    resource "aws_iam_role" "eks_role" {
+      name = "eks-cluster-role"
+      assume_role_policy = jsonencode({
+        Version = "2012-10-17"
+        Statement = [{
+          Action = "sts:AssumeRole"
+          Effect = "Allow"
+          Principal = {
+            Service = "eks.amazonaws.com"
+          }
+        }]
+      })
+    }
+    resource "aws_iam_role_policy_attachment" "eks_cluster_policy" {
+      policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
+      role       = aws_iam_role.eks_role.name
+    }
+    # EC2 Instance
+    resource "aws_instance" "bastion" {
+      ami                         = "ami-12345678"
+      instance_type               = "t2.micro"
+      subnet_id                   = aws_subnet.public_a.id
+      vpc_security_group_ids      = [aws_security_group.web.id]
+      associate_public_ip_address = true
+      key_name                    = "production-key"
+      user_data = <<-EOF
+        #!/bin/bash
+        echo "root:${var.db_password}" | chpasswd
+        sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config
+        systemctl restart sshd
+      EOF
+      tags = {
+        Name = "bastion-host"
+      }
+    }
+    # Outputs
+    output "db_password" {
+      value = var.db_password
+    }
+    output "access_key_id" {
+      value = aws_iam_access_key.deploy.id
+    }
+    output "secret_access_key" {
+      value = aws_iam_access_key.deploy.secret
+    }
+    output "db_endpoint" {
+      value = aws_db_instance.main.endpoint
+    }
+# =============================================================================
+# BASELINE ISSUES (minimum expected to find)
+# =============================================================================
+# Seeded findings grouped by severity. The group names (critical/high/medium/low)
+# are the same keys used in scoring.weights, and the entry counts
+# (5 + 6 + 5 + 2 = 18) feed scoring.total_baseline_issues and
+# scoring.max_baseline_score; keep them in step when adding or removing entries.
+# Each entry has a stable `id`, a `location` (resource address, optionally with
+# main.tf line numbers) and a one-line `description`.
+# NOTE(review): the cited line numbers do not line up with code.content as
+# rendered here (e.g. the db_password default sits on content line 12, cited as
+# 14) — blank lines may have been stripped from this view; verify against the
+# packaged file before relying on them.
+baseline_issues:
+  critical:
+    - id: HARDCODED_DB_PASSWORD
+      location: "variable db_password, line 14"
+      description: "Database password hardcoded in terraform with weak default"
+    - id: IAM_WILDCARD_PRINCIPAL
+      location: "aws_iam_role.app_role, line 169"
+      description: "IAM role allows any AWS principal to assume it"
+    - id: IAM_ADMIN_POLICY
+      location: "aws_iam_user_policy.deploy, line 205"
+      description: "IAM user has full admin access (Action: *, Resource: *)"
+    - id: SECRETS_IN_OUTPUT
+      location: "outputs, lines 260-267"
+      description: "Sensitive values (password, access keys) exposed in outputs"
+    - id: PUBLIC_RDS
+      location: "aws_db_instance.main, line 109"
+      description: "RDS instance publicly accessible from internet"
+  high:
+    - id: OPEN_SECURITY_GROUP
+      location: "aws_security_group.web, lines 58-65"
+      description: "Security group allows all traffic from any source"
+    - id: DB_SG_OPEN
+      location: "aws_security_group.database, lines 74-85"
+      description: "Database security group open to 0.0.0.0/0"
+    - id: S3_PUBLIC_READ
+      location: "aws_s3_bucket.logs, line 132"
+      description: "Logs bucket has public-read ACL"
+    - id: ROOT_PASSWORD_USERDATA
+      location: "aws_instance.bastion user_data, lines 238-241"
+      description: "Root password set via user_data with db password"
+    - id: PASSWORD_AUTH_SSH
+      location: "aws_instance.bastion user_data, line 242"
+      description: "SSH password authentication enabled on bastion"
+    - id: EKS_PUBLIC_ENDPOINT
+      location: "aws_eks_cluster.main, lines 220-224"
+      description: "EKS cluster publicly accessible from any IP"
+  medium:
+    - id: NO_RDS_ENCRYPTION
+      location: "aws_db_instance.main"
+      description: "RDS storage encryption not enabled"
+    - id: NO_S3_ENCRYPTION
+      location: "aws_s3_bucket resources"
+      description: "S3 buckets missing server-side encryption"
+    - id: SKIP_FINAL_SNAPSHOT
+      location: "aws_db_instance.main, line 108"
+      description: "skip_final_snapshot prevents data recovery"
+    - id: NO_S3_VERSIONING
+      location: "aws_s3_bucket.backups, line 142"
+      description: "Backup bucket has versioning disabled"
+    - id: EKS_NO_LOGGING
+      location: "aws_eks_cluster.main, line 225"
+      description: "EKS cluster logging disabled"
+  low:
+    - id: OUTDATED_MYSQL
+      location: "aws_db_instance.main, line 101"
+      # Upstream support for MySQL 5.7 ended in October 2023, so the fixture's
+      # engine_version = "5.7" is past end of life rather than approaching it.
+      description: "MySQL 5.7 has reached end of life"
+    - id: UNDERSIZED_INSTANCE
+      location: "aws_db_instance.main, line 102"
+      description: "t2.micro insufficient for production database"
+# =============================================================================
+# BONUS ISSUES (thorough reviewers might find these)
+# =============================================================================
+# Findings beyond the baseline, grouped by theme rather than severity. Unlike
+# baseline_issues, entries carry no `location`: most describe something absent
+# from the fixture rather than a specific line. The entry count
+# (4 + 4 + 2 + 2 = 12) matches scoring.total_bonus_issues.
+bonus_issues:
+  security:
+    - id: NO_VPC_FLOW_LOGS
+      description: "VPC flow logs not enabled for network monitoring"
+    - id: NO_CLOUDTRAIL
+      description: "No CloudTrail configuration for API auditing"
+    - id: NO_NACLS
+      description: "No Network ACLs defined for defense in depth"
+    - id: NO_WAF
+      description: "No WAF configuration for web application protection"
+  operational:
+    - id: NO_BACKUP_PLAN
+      description: "No AWS Backup plan for RDS/EC2 snapshots"
+    - id: NO_AUTO_SCALING
+      description: "No auto-scaling for availability"
+    - id: NO_MULTI_AZ_RDS
+      description: "RDS not configured for Multi-AZ failover"
+    - id: HARDCODED_AMI
+      description: "AMI ID hardcoded instead of data source lookup"
+  tagging:
+    - id: INCONSISTENT_TAGS
+      description: "Not all resources have consistent tagging"
+    - id: NO_COST_TAGS
+      description: "Missing cost allocation tags"
+  compliance:
+    - id: NO_KMS_KEYS
+      description: "No customer-managed KMS keys for encryption"
+    - id: NO_PRIVATE_SUBNETS
+      description: "All subnets are public, no private tier"
+# =============================================================================
+# SCORING
+# =============================================================================
+# Two layers are defined here:
+#   * per-severity weights for the seeded baseline issues, and
+#   * a 100-point rubric split across four categories.
+scoring:
+  # Entry counts of baseline_issues (5 + 6 + 5 + 2) and bonus_issues (4 + 4 + 2 + 2).
+  total_baseline_issues: 18
+  total_bonus_issues: 12
+  # Per-severity multipliers; the keys mirror the groups under baseline_issues.
+  weights:
+    critical: 3
+    high: 2
+    medium: 1
+    low: 0.5
+  # Weighted count of every baseline issue: 15 + 12 + 5 + 1 = 33.
+  max_baseline_score: 33  # 5*3 + 6*2 + 5*1 + 2*0.5
+  # Rubric categories. The category weights sum to 100 (40 + 30 + 15 + 15) and
+  # the `points` of the criteria inside each category sum to that category's
+  # weight, so changing one number means rebalancing its siblings.
+  categories:
+    - name: detection
+      weight: 40
+      criteria:
+        - id: BASELINE_FOUND
+          description: "Issues from the seeded baseline list"
+          points: 25
+        - id: BONUS_DISCOVERIES
+          description: "Valid issues beyond the baseline"
+          points: 15
+    - name: depth
+      weight: 30
+      criteria:
+        - id: ROOT_CAUSE_ANALYSIS
+          description: "Traces to AWS security model implications"
+          points: 10
+        - id: FIX_SPECIFICITY
+          description: "Provides corrected HCL code"
+          points: 10
+        - id: IMPACT_ASSESSMENT
+          description: "Explains breach scenarios, compliance impact"
+          points: 10
+    - name: quality
+      weight: 15
+      criteria:
+        - id: SEVERITY_ACCURACY
+          description: "Correctly classifies infrastructure severity"
+          points: 5
+        - id: REASONING_QUALITY
+          description: "Clear explanation of AWS security model"
+          points: 5
+        - id: ORGANIZATION
+          description: "Prioritized by blast radius"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances memorability/clarity"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+# Axes along which a reviewer persona is expected to shift the review. Each
+# dimension has a `name`, a `description`, and a `spectrum` mapping of three
+# labels to the behaviour expected at that point. The label `balanced` is used
+# in two different dimensions; that is not a duplicate key because each
+# `spectrum` is its own mapping.
+persona_influence:
+  dimensions:
+    - name: cloud_expertise
+      description: "Depth of AWS/cloud security knowledge"
+      spectrum:
+        app_focused: "Finds obvious issues but misses IAM nuances"
+        balanced: "Covers both network and IAM issues"
+        cloud_native: "Catches subtle IAM, encryption, logging gaps"
+    - name: compliance_awareness
+      description: "Focus on compliance requirements"
+      spectrum:
+        security_only: "Only finds security issues"
+        compliance_aware: "Notes SOC2/HIPAA implications"
+        governance_focused: "Emphasizes audit trail, controls"
+    - name: operational_mindset
+      description: "Balance between security and operations"
+      spectrum:
+        security_purist: "May suggest impractical hardening"
+        balanced: "Practical security that works"
+        ops_focused: "May accept risks for operability"
+# Predictions of how specific reviewer personas will behave on this scenario.
+# Each entry names a `character`, lists `expected_traits`, and gives a free-text
+# `thoroughness_prediction`. `control_reviewer` is the no-persona baseline.
+# NOTE(review): the entry keys (discworld_reviewer, star_trek_reviewer, ...)
+# presumably correspond to persona/theme identifiers used by the runner — verify.
+expected_tendencies:
+  discworld_reviewer:
+    character: "Granny Weatherwax"
+    expected_traits:
+      - "Practical wisdom - should catch obvious mistakes"
+      - "No-nonsense - won't accept 'it works in dev'"
+      - "May lack cloud-specific deep knowledge"
+    thoroughness_prediction: "medium-high"
+  star_trek_reviewer:
+    character: "Spock"
+    expected_traits:
+      - "Logical - systematic resource-by-resource review"
+      - "Precise - will note specific AWS documentation"
+      - "Technical depth in cloud architecture"
+    thoroughness_prediction: "high"
+  control_reviewer:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard infrastructure review behavior"
+    thoroughness_prediction: "baseline reference"