npm - @pennyfarthing/benchmark - Versions diffs - 10.2.0 - Mend

@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

package/commands/benchmark-control.md +69 -0
package/commands/benchmark.md +485 -0
package/commands/job-fair.md +102 -0
package/commands/solo.md +447 -0
package/dist/benchmark-integration.d.ts +182 -0
package/dist/benchmark-integration.d.ts.map +1 -0
package/dist/benchmark-integration.js +710 -0
package/dist/benchmark-integration.js.map +1 -0
package/dist/benchmark-integration.test.d.ts +6 -0
package/dist/benchmark-integration.test.d.ts.map +1 -0
package/dist/benchmark-integration.test.js +41 -0
package/dist/benchmark-integration.test.js.map +1 -0
package/dist/index.d.ts +3 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +5 -0
package/dist/index.js.map +1 -0
package/dist/job-fair-aggregator.d.ts +150 -0
package/dist/job-fair-aggregator.d.ts.map +1 -0
package/dist/job-fair-aggregator.js +547 -0
package/dist/job-fair-aggregator.js.map +1 -0
package/dist/job-fair-aggregator.test.d.ts +6 -0
package/dist/job-fair-aggregator.test.d.ts.map +1 -0
package/dist/job-fair-aggregator.test.js +35 -0
package/dist/job-fair-aggregator.test.js.map +1 -0
package/dist/package-exports.test.d.ts +13 -0
package/dist/package-exports.test.d.ts.map +1 -0
package/dist/package-exports.test.js +192 -0
package/dist/package-exports.test.js.map +1 -0
package/docs/BENCHMARK-METHODOLOGY.md +105 -0
package/docs/BENCHMARKING.md +311 -0
package/docs/OCEAN-BENCHMARKING.md +210 -0
package/docs/benchmarks-guide.md +62 -0
package/package.json +66 -0
package/scenarios/README.md +145 -0
package/scenarios/architecture/database-selection.yaml +119 -0
package/scenarios/architecture/legacy-modernization.yaml +153 -0
package/scenarios/architecture/scaling-decision.yaml +88 -0
package/scenarios/code-review/graphql-api-review.yaml +714 -0
package/scenarios/code-review/order-service.yaml +622 -0
package/scenarios/code-review/react-auth-component.yaml +569 -0
package/scenarios/code-review/security-review.yaml +145 -0
package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
package/scenarios/debug/buggy-user-service.yaml +541 -0
package/scenarios/debug/null-pointer.yaml +130 -0
package/scenarios/debugging/async-control-flow.yaml +161 -0
package/scenarios/debugging/auth-bypass.yaml +197 -0
package/scenarios/debugging/error-handling.yaml +178 -0
package/scenarios/debugging/input-validation.yaml +157 -0
package/scenarios/debugging/null-check-missing.yaml +139 -0
package/scenarios/debugging/off-by-one-loop.yaml +132 -0
package/scenarios/debugging/race-condition.yaml +180 -0
package/scenarios/debugging/resource-leak.yaml +166 -0
package/scenarios/debugging/simple-logic-error.yaml +115 -0
package/scenarios/debugging/sql-injection.yaml +163 -0
package/scenarios/dev/event-processor-tdd.yaml +764 -0
package/scenarios/dev/migration-disaster.yaml +415 -0
package/scenarios/dev/race-condition-cache.yaml +546 -0
package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
package/scenarios/schema.yaml +639 -0
package/scenarios/sm/dependency-deadlock.yaml +414 -0
package/scenarios/sm/executive-pet-project.yaml +336 -0
package/scenarios/sm/layoff-planning.yaml +356 -0
package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
package/scenarios/sm/story-breakdown.yaml +240 -0
package/scenarios/sm/three-sprint-failure.yaml +397 -0
package/scenarios/swe-bench/README.md +57 -0
package/scenarios/swe-bench/astropy-12907.yaml +128 -0
package/scenarios/swe-bench/astropy-13398.yaml +177 -0
package/scenarios/swe-bench/astropy-14309.yaml +180 -0
package/scenarios/swe-bench/django-10097.yaml +106 -0
package/scenarios/swe-bench/django-10554.yaml +140 -0
package/scenarios/swe-bench/django-10973.yaml +93 -0
package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
package/scenarios/swe-bench/flask-5014.yaml +91 -0
package/scenarios/swe-bench/import-swebench.py +246 -0
package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
package/scenarios/swe-bench/requests-1142.yaml +100 -0
package/scenarios/swe-bench/requests-2931.yaml +98 -0
package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
package/scenarios/swe-bench/xarray-3993.yaml +104 -0
package/scenarios/swe-bench/xarray-6992.yaml +136 -0
package/scenarios/tea/checkout-component-tests.yaml +596 -0
package/scenarios/tea/cli-tool-tests.yaml +561 -0
package/scenarios/tea/microservice-integration-tests.yaml +520 -0
package/scenarios/tea/payment-processor-tests.yaml +550 -0
package/scripts/aggregate-benchmark-stats.js +315 -0
package/scripts/aggregate-benchmark-stats.sh +8 -0
package/scripts/benchmark-runner.js +392 -0
package/scripts/benchmark-runner.sh +8 -0
package/scripts/consolidate-job-fair.sh +107 -0
package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
package/scripts/job-fair-batch.sh +116 -0
package/scripts/job-fair-progress.sh +35 -0
package/scripts/job-fair-runner.sh +278 -0
package/scripts/job-fair-status.sh +80 -0
package/scripts/job-fair-watcher-v2.sh +38 -0
package/scripts/job-fair-watcher.sh +50 -0
package/scripts/parallel-benchmark.sh +140 -0
package/scripts/solo-runner.sh +344 -0
package/scripts/test/ensure-swebench-data.sh +59 -0
package/scripts/test/ground-truth-judge.py +220 -0
package/scripts/test/swebench-judge.py +374 -0
package/scripts/test/test-cache.sh +165 -0
package/scripts/test/test-setup.sh +337 -0
package/scripts/theme/compute-theme-tiers.sh +13 -0
package/scripts/theme/compute_theme_tiers.py +402 -0
package/scripts/theme/update-theme-tiers.sh +97 -0
package/skills/finalize-run/SKILL.md +261 -0
package/skills/judge/SKILL.md +644 -0
package/skills/persona-benchmark/SKILL.md +187 -0

package/scenarios/code-review/react-auth-component.yaml ADDED Viewed

@@ -0,0 +1,569 @@
+---
+# Scenario: React Authentication Component Code Review (Medium)
+# NOTE: Re-ranked to "medium" based on control baseline mean 82.25 ± 3.81 (Story 7-2)
+# Category: code-review
+# Purpose: Test frontend security awareness and React-specific vulnerability detection
+id: rev-003
+name: react-auth-component
+title: "React Authentication Component Review"
+category: code-review
+difficulty: medium
+version: "1.0"
+description: |
+  A React authentication component handling login, signup, password reset, and session
+  management. Contains XSS vulnerabilities, insecure token storage, CSRF issues,
+  and client-side validation bypasses. Tests whether reviewers understand frontend
+  threat models beyond traditional backend security concerns.
+purpose: |
+  This scenario tests frontend security expertise. Many code reviewers excel at
+  backend vulnerabilities (SQL injection, auth bypass) but miss client-side issues
+  (XSS, insecure storage, CSRF). Finding all 15 baseline issues = competent frontend
+  reviewer. Finding bonus issues = understands React security deeply.
+prompt: |
+  You are reviewing a pull request for an authentication module in a React application.
+  The junior developer says "I followed the design mockups exactly" and wants to ship.
+  Review this code thoroughly for:
+  - Security vulnerabilities (XSS, CSRF, token handling)
+  - React anti-patterns and bugs
+  - State management issues
+  - Authentication flow problems
+  - Input validation gaps
+  - Accessibility concerns
+  For each issue:
+  1. Identify the specific location (line number or component)
+  2. Classify severity (Critical/High/Medium/Low)
+  3. Explain the security or UX impact
+  4. Provide a fix or recommendation
+  This handles user authentication. Security is paramount.
+code:
+  language: typescript
+  filename: AuthModule.tsx
+  content: |
+    import React, { useState, useEffect } from 'react';
+    import axios from 'axios';
+    const API_URL = 'https://api.example.com';
+    interface User {
+      id: string;
+      email: string;
+      name: string;
+      role: string;
+    }
+    // Store auth token globally for easy access
+    let authToken: string | null = null;
+    export function setAuthToken(token: string) {
+      authToken = token;
+      localStorage.setItem('auth_token', token);
+      localStorage.setItem('user_data', JSON.stringify({ token }));
+    }
+    export function getAuthToken(): string | null {
+      return authToken || localStorage.getItem('auth_token');
+    }
+    export const LoginForm: React.FC = () => {
+      const [email, setEmail] = useState('');
+      const [password, setPassword] = useState('');
+      const [error, setError] = useState('');
+      const [rememberMe, setRememberMe] = useState(false);
+      const handleSubmit = async (e: React.FormEvent) => {
+        e.preventDefault();
+        // Client-side validation
+        if (!email.includes('@')) {
+          setError('Invalid email format');
+          return;
+        }
+        if (password.length < 6) {
+          setError('Password must be at least 6 characters');
+          return;
+        }
+        try {
+          const response = await axios.post(`${API_URL}/auth/login`, {
+            email,
+            password,
+            remember: rememberMe
+          });
+          setAuthToken(response.data.token);
+          if (rememberMe) {
+            localStorage.setItem('saved_email', email);
+            localStorage.setItem('saved_password', password);
+          }
+          // Redirect to dashboard
+          window.location.href = response.data.redirectUrl;
+        } catch (err: any) {
+          setError(err.response?.data?.message || 'Login failed');
+        }
+      };
+      return (
+        <form onSubmit={handleSubmit}>
+          <div dangerouslySetInnerHTML={{ __html: error }} />
+          <input
+            type="text"
+            value={email}
+            onChange={(e) => setEmail(e.target.value)}
+            placeholder="Email"
+          />
+          <input
+            type="password"
+            value={password}
+            onChange={(e) => setPassword(e.target.value)}
+            placeholder="Password"
+          />
+          <label>
+            <input
+              type="checkbox"
+              checked={rememberMe}
+              onChange={(e) => setRememberMe(e.target.checked)}
+            />
+            Remember me
+          </label>
+          <button type="submit">Login</button>
+          <a href="#" onClick={() => window.location.href = '/forgot-password'}>
+            Forgot Password?
+          </a>
+        </form>
+      );
+    };
+    export const SignupForm: React.FC = () => {
+      const [formData, setFormData] = useState({
+        email: '',
+        password: '',
+        confirmPassword: '',
+        name: ''
+      });
+      const [message, setMessage] = useState('');
+      const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+        setFormData({ ...formData, [e.target.name]: e.target.value });
+      };
+      const handleSubmit = async (e: React.FormEvent) => {
+        e.preventDefault();
+        // Password match check on client only
+        if (formData.password !== formData.confirmPassword) {
+          setMessage('Passwords do not match');
+          return;
+        }
+        const response = await fetch(`${API_URL}/auth/signup`, {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify(formData)
+        });
+        const data = await response.json();
+        setMessage(data.message);
+        if (response.ok) {
+          setAuthToken(data.token);
+          eval('window.trackSignup("' + formData.email + '")');
+        }
+      };
+      return (
+        <form onSubmit={handleSubmit}>
+          <p>{message}</p>
+          <input name="name" value={formData.name} onChange={handleChange} placeholder="Name" />
+          <input name="email" value={formData.email} onChange={handleChange} placeholder="Email" />
+          <input name="password" type="password" value={formData.password} onChange={handleChange} placeholder="Password" />
+          <input name="confirmPassword" type="password" value={formData.confirmPassword} onChange={handleChange} placeholder="Confirm Password" />
+          <button type="submit">Sign Up</button>
+        </form>
+      );
+    };
+    export const PasswordReset: React.FC = () => {
+      const [email, setEmail] = useState('');
+      const [token, setToken] = useState('');
+      const [newPassword, setNewPassword] = useState('');
+      const [step, setStep] = useState<'request' | 'reset'>('request');
+      useEffect(() => {
+        // Check URL for reset token
+        const params = new URLSearchParams(window.location.search);
+        const urlToken = params.get('token');
+        if (urlToken) {
+          setToken(urlToken);
+          setStep('reset');
+        }
+      }, []);
+      const requestReset = async () => {
+        await axios.get(`${API_URL}/auth/reset?email=${email}`);
+        alert('Check your email for reset link');
+      };
+      const performReset = async () => {
+        const response = await axios.post(`${API_URL}/auth/reset`, {
+          token,
+          newPassword
+        });
+        if (response.data.success) {
+          document.cookie = `reset_complete=true`;
+          window.location.href = '/login?message=Password reset successful';
+        }
+      };
+      if (step === 'request') {
+        return (
+          <div>
+            <input value={email} onChange={(e) => setEmail(e.target.value)} placeholder="Email" />
+            <button onClick={requestReset}>Request Reset</button>
+          </div>
+        );
+      }
+      return (
+        <div>
+          <input type="password" value={newPassword} onChange={(e) => setNewPassword(e.target.value)} placeholder="New Password" />
+          <button onClick={performReset}>Reset Password</button>
+        </div>
+      );
+    };
+    export const UserProfile: React.FC<{ userId: string }> = ({ userId }) => {
+      const [user, setUser] = useState<User | null>(null);
+      const [bio, setBio] = useState('');
+      useEffect(() => {
+        const fetchUser = async () => {
+          const response = await axios.get(`${API_URL}/users/${userId}`, {
+            headers: { Authorization: getAuthToken() }
+          });
+          setUser(response.data);
+          setBio(response.data.bio || '');
+        };
+        fetchUser();
+      }, [userId]);
+      const updateBio = async () => {
+        await axios.put(`${API_URL}/users/${userId}`, { bio }, {
+          headers: { Authorization: getAuthToken() }
+        });
+      };
+      if (!user) return <div>Loading...</div>;
+      return (
+        <div>
+          <h1>{user.name}</h1>
+          <p>Email: {user.email}</p>
+          <p>Role: {user.role}</p>
+          <div dangerouslySetInnerHTML={{ __html: bio }} />
+          <textarea value={bio} onChange={(e) => setBio(e.target.value)} />
+          <button onClick={updateBio}>Update Bio</button>
+        </div>
+      );
+    };
+    export const AdminPanel: React.FC = () => {
+      const [users, setUsers] = useState<User[]>([]);
+      const [isAdmin, setIsAdmin] = useState(false);
+      useEffect(() => {
+        // Check admin status
+        const userData = localStorage.getItem('user_data');
+        if (userData) {
+          const parsed = JSON.parse(userData);
+          setIsAdmin(parsed.role === 'admin');
+        }
+        if (isAdmin) {
+          fetchUsers();
+        }
+      }, [isAdmin]);
+      const fetchUsers = async () => {
+        const response = await axios.get(`${API_URL}/admin/users`);
+        setUsers(response.data);
+      };
+      const deleteUser = (id: string) => {
+        axios.delete(`${API_URL}/admin/users/${id}`);
+        setUsers(users.filter(u => u.id !== id));
+      };
+      if (!isAdmin) return <div>Access Denied</div>;
+      return (
+        <div>
+          <h1>Admin Panel</h1>
+          {users.map(user => (
+            <div key={user.id}>
+              <span>{user.email}</span>
+              <button onClick={() => deleteUser(user.id)}>Delete</button>
+            </div>
+          ))}
+        </div>
+      );
+    };
+    // Session timeout handler
+    export const SessionManager: React.FC<{ children: React.ReactNode }> = ({ children }) => {
+      useEffect(() => {
+        const checkSession = setInterval(() => {
+          const token = getAuthToken();
+          if (!token) {
+            window.location.href = '/login';
+          }
+        }, 60000);
+        // Log activity to server
+        const logActivity = () => {
+          navigator.sendBeacon(`${API_URL}/activity?token=${getAuthToken()}`);
+        };
+        document.addEventListener('click', logActivity);
+        return () => {
+          clearInterval(checkSession);
+          document.removeEventListener('click', logActivity);
+        };
+      }, []);
+      return <>{children}</>;
+    };
+# =============================================================================
+# BASELINE ISSUES (minimum expected to find)
+# =============================================================================
+baseline_issues:
+  critical:
+    - id: XSS_ERROR_MESSAGE
+      location: "line 66"
+      description: "dangerouslySetInnerHTML renders error message from server - XSS via error"
+    - id: XSS_BIO_RENDER
+      location: "line 195"
+      description: "dangerouslySetInnerHTML renders user bio - stored XSS"
+    - id: EVAL_INJECTION
+      location: "line 120"
+      description: "eval() with user email - code injection vulnerability"
+    - id: PASSWORD_IN_LOCALSTORAGE
+      location: "lines 54-55"
+      description: "Saving password in localStorage - credential exposure"
+  high:
+    - id: TOKEN_IN_LOCALSTORAGE
+      location: "lines 17-18"
+      description: "JWT token stored in localStorage - vulnerable to XSS theft"
+    - id: OPEN_REDIRECT
+      location: "line 58"
+      description: "Redirect URL from server response - open redirect vulnerability"
+    - id: CLIENT_SIDE_ADMIN_CHECK
+      location: "lines 210-214"
+      description: "Admin role check uses localStorage - client-side bypass"
+    - id: TOKEN_IN_URL
+      location: "line 248"
+      description: "Auth token in URL query parameter - token leakage via logs/referrer"
+  medium:
+    - id: NO_CSRF_TOKEN
+      location: "all API calls"
+      description: "No CSRF protection on state-changing requests"
+    - id: MISSING_ERROR_HANDLING_SIGNUP
+      location: "lines 106-119"
+      description: "No try/catch on signup - unhandled errors crash component"
+    - id: PASSWORD_RESET_GET
+      location: "line 148"
+      description: "Password reset request via GET - should be POST"
+    - id: ADMIN_FETCH_NO_AUTH
+      location: "line 223"
+      description: "Admin user fetch missing Authorization header"
+    - id: DELETE_NO_CONFIRM
+      location: "line 227"
+      description: "User deletion without confirmation dialog"
+  low:
+    - id: WEAK_EMAIL_VALIDATION
+      location: "line 37"
+      description: "Email validation only checks for @ symbol"
+    - id: MISSING_INPUT_TYPES
+      location: "line 68"
+      description: "Email input has type='text' instead of type='email'"
+# =============================================================================
+# BONUS ISSUES (thorough reviewers might find these)
+# =============================================================================
+bonus_issues:
+  accessibility:
+    - id: NO_LABELS
+      description: "Form inputs missing associated labels"
+    - id: NO_ERROR_ARIA
+      description: "Error messages not announced to screen readers"
+    - id: NO_FOCUS_MANAGEMENT
+      description: "No focus handling after form submission"
+  react_patterns:
+    - id: MEMORY_LEAK_USEEFFECT
+      location: "line 188-193"
+      description: "Async operation in useEffect without cleanup/abort"
+    - id: STALE_CLOSURE
+      location: "line 227"
+      description: "deleteUser uses stale users array in filter"
+    - id: MISSING_DEPS
+      location: "line 219"
+      description: "useEffect both sets and depends on isAdmin (redundant re-run) and omits fetchUsers from its dependency array"
+  security:
+    - id: COOKIE_NO_FLAGS
+      location: "line 163"
+      description: "Cookie set without Secure/HttpOnly/SameSite flags"
+    - id: NO_RATE_LIMITING_MENTION
+      description: "No client-side rate limiting on auth attempts"
+    - id: SENSITIVE_DATA_IN_STATE
+      description: "Password kept in state after submission"
+  performance:
+    - id: BEACON_ON_EVERY_CLICK
+      location: "line 248"
+      description: "Activity beacon fires on every click - excessive requests"
+# =============================================================================
+# SCORING
+# =============================================================================
+scoring:
+  total_baseline_issues: 15
+  total_bonus_issues: 10
+  weights:
+    critical: 3
+    high: 2
+    medium: 1
+    low: 0.5
+  # Weighted maximum over the 15 baseline issues (4 critical, 4 high, 5 medium, 2 low).
+  max_baseline_score: 26  # 4*3 + 4*2 + 5*1 + 2*0.5 = 12 + 8 + 5 + 1
+  categories:
+    - name: detection
+      weight: 40
+      criteria:
+        - id: BASELINE_FOUND
+          description: "Issues from the seeded baseline list"
+          points: 25
+        - id: BONUS_DISCOVERIES
+          description: "Valid issues beyond the baseline"
+          points: 15
+    - name: depth
+      weight: 30
+      criteria:
+        - id: ROOT_CAUSE_ANALYSIS
+          description: "Traces XSS/injection to full attack chain"
+          points: 10
+        - id: FIX_SPECIFICITY
+          description: "Provides React-specific fixes"
+          points: 10
+        - id: IMPACT_ASSESSMENT
+          description: "Explains session hijack, account takeover scenarios"
+          points: 10
+    - name: quality
+      weight: 15
+      criteria:
+        - id: SEVERITY_ACCURACY
+          description: "Correctly classifies frontend vs backend severity"
+          points: 5
+        - id: REASONING_QUALITY
+          description: "Clear explanation of frontend threat model"
+          points: 5
+        - id: ORGANIZATION
+          description: "Prioritized by exploitability"
+          points: 5
+    - name: persona
+      weight: 15
+      criteria:
+        - id: CHARACTER_CONSISTENCY
+          description: "Stays in character throughout"
+          points: 8
+        - id: PERSONA_VALUE_ADD
+          description: "Persona enhances memorability/clarity"
+          points: 7
+# =============================================================================
+# PERSONA INFLUENCE
+# =============================================================================
+persona_influence:
+  dimensions:
+    - name: frontend_expertise
+      description: "Depth of React/frontend security knowledge"
+      spectrum:
+        backend_focused: "Finds XSS but misses React-specific issues"
+        balanced: "Finds both security and React anti-patterns"
+        frontend_expert: "Catches accessibility, hooks issues, React patterns"
+    - name: security_vs_quality
+      description: "Balance between security and code quality concerns"
+      spectrum:
+        security_only: "Only finds vulnerabilities, ignores patterns"
+        balanced: "Covers both security and code health"
+        quality_focused: "May prioritize React patterns over security"
+    - name: user_advocacy
+      description: "Focus on end-user impact"
+      spectrum:
+        technical: "Focuses on code-level issues"
+        user_focused: "Emphasizes UX and accessibility impact"
+expected_tendencies:
+  discworld_reviewer:
+    character: "Granny Weatherwax"
+    expected_traits:
+      - "Headology - should spot client-side bypass attempts"
+      - "Practical - will note user-facing issues"
+      - "Uncompromising on security fundamentals"
+    thoroughness_prediction: "high"
+  star_trek_reviewer:
+    character: "Spock"
+    expected_traits:
+      - "Logical - systematic coverage of all components"
+      - "May focus on technical correctness over UX"
+      - "Precise vulnerability classification"
+    thoroughness_prediction: "high"
+  control_reviewer:
+    character: "None (baseline)"
+    expected_traits:
+      - "Standard frontend review behavior"
+    thoroughness_prediction: "baseline reference"