@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Security Code Review (Easy)
|
|
3
|
+
# NOTE: Re-ranked to "easy" based on control baseline mean 86.42 ± 9.44 (Story 7-2)
|
|
4
|
+
# Category: code-review
|
|
5
|
+
# Tests ability to identify security vulnerabilities in authentication code
|
|
6
|
+
# Includes both obvious and subtle issues requiring deeper security analysis
|
|
7
|
+
|
|
8
|
+
name: security-review
|
|
9
|
+
title: "Security Code Review: Authentication Module"
|
|
10
|
+
category: code-review
|
|
11
|
+
difficulty: easy
|
|
12
|
+
description: Review authentication code for security vulnerabilities including subtle timing and authorization issues
|
|
13
|
+
|
|
14
|
+
prompt: |
|
|
15
|
+
You are reviewing a pull request for an authentication module.
|
|
16
|
+
The junior developer says it works fine in testing and passes all unit tests.
|
|
17
|
+
|
|
18
|
+
Review this code for security issues. For each issue found:
|
|
19
|
+
1. Identify the vulnerability type and severity
|
|
20
|
+
2. Explain why it's dangerous
|
|
21
|
+
3. Provide a corrected code snippet
|
|
22
|
+
|
|
23
|
+
code:
|
|
24
|
+
language: python
|
|
25
|
+
filename: auth.py
|
|
26
|
+
content: |
|
|
27
|
+
import hashlib
|
|
28
|
+
import sqlite3
|
|
29
|
+
import os
|
|
30
|
+
from flask import session, request
|
|
31
|
+
|
|
32
|
+
MAX_LOGIN_ATTEMPTS = 5 # Note: not actually enforced anywhere
|
|
33
|
+
|
|
34
|
+
def authenticate_user(username, password):
|
|
35
|
+
"""Authenticate user against database."""
|
|
36
|
+
conn = sqlite3.connect('users.db')
|
|
37
|
+
cursor = conn.cursor()
|
|
38
|
+
|
|
39
|
+
# Hash the password
|
|
40
|
+
password_hash = hashlib.md5(password.encode()).hexdigest()
|
|
41
|
+
|
|
42
|
+
# Check credentials
|
|
43
|
+
query = f"SELECT * FROM users WHERE username = '{username}' AND password_hash = '{password_hash}'"
|
|
44
|
+
cursor.execute(query)
|
|
45
|
+
user = cursor.fetchone()
|
|
46
|
+
|
|
47
|
+
conn.close()
|
|
48
|
+
|
|
49
|
+
if user is None:
|
|
50
|
+
return None, "Invalid username"
|
|
51
|
+
|
|
52
|
+
# Verify password matches
|
|
53
|
+
stored_hash = user[2] # password_hash column
|
|
54
|
+
if password_hash == stored_hash:
|
|
55
|
+
session['user_id'] = user[0]
|
|
56
|
+
session['logged_in'] = True
|
|
57
|
+
return user, None
|
|
58
|
+
|
|
59
|
+
return None, "Invalid password"
|
|
60
|
+
|
|
61
|
+
def create_user(username, password, email):
|
|
62
|
+
"""Create a new user account."""
|
|
63
|
+
conn = sqlite3.connect('users.db')
|
|
64
|
+
cursor = conn.cursor()
|
|
65
|
+
|
|
66
|
+
password_hash = hashlib.md5(password.encode()).hexdigest()
|
|
67
|
+
|
|
68
|
+
query = f"INSERT INTO users (username, password_hash, email) VALUES ('{username}', '{password_hash}', '{email}')"
|
|
69
|
+
cursor.execute(query)
|
|
70
|
+
conn.commit()
|
|
71
|
+
conn.close()
|
|
72
|
+
|
|
73
|
+
return True
|
|
74
|
+
|
|
75
|
+
def get_user_by_id(user_id):
|
|
76
|
+
"""Get user details by ID - used for profile pages."""
|
|
77
|
+
conn = sqlite3.connect('users.db')
|
|
78
|
+
cursor = conn.cursor()
|
|
79
|
+
|
|
80
|
+
# Get user from database
|
|
81
|
+
cursor.execute("SELECT id, username, email, created_at FROM users WHERE id = ?", (user_id,))
|
|
82
|
+
user = cursor.fetchone()
|
|
83
|
+
conn.close()
|
|
84
|
+
|
|
85
|
+
return user
|
|
86
|
+
|
|
87
|
+
def reset_password(email):
|
|
88
|
+
"""Send password reset link."""
|
|
89
|
+
conn = sqlite3.connect('users.db')
|
|
90
|
+
cursor = conn.cursor()
|
|
91
|
+
|
|
92
|
+
cursor.execute("SELECT id, username FROM users WHERE email = ?", (email,))
|
|
93
|
+
user = cursor.fetchone()
|
|
94
|
+
conn.close()
|
|
95
|
+
|
|
96
|
+
if user is None:
|
|
97
|
+
return False, f"No account found with email: {email}"
|
|
98
|
+
|
|
99
|
+
# Generate reset token (not shown - assume secure)
|
|
100
|
+
return True, "Reset link sent"
|
|
101
|
+
|
|
102
|
+
baseline_issues:
|
|
103
|
+
critical:
|
|
104
|
+
- id: sql-injection-auth
|
|
105
|
+
location: "line 17"
|
|
106
|
+
description: "SQL injection via f-string formatting in authenticate_user"
|
|
107
|
+
- id: sql-injection-create
|
|
108
|
+
location: "line 42"
|
|
109
|
+
description: "SQL injection via f-string formatting in create_user"
|
|
110
|
+
high:
|
|
111
|
+
- id: weak-hash-md5
|
|
112
|
+
location: "lines 14, 40"
|
|
113
|
+
description: "MD5 is cryptographically broken for password hashing"
|
|
114
|
+
- id: no-salt
|
|
115
|
+
location: "lines 14, 40"
|
|
116
|
+
description: "Password hashing without salt enables rainbow table attacks"
|
|
117
|
+
- id: timing-attack
|
|
118
|
+
location: "line 28"
|
|
119
|
+
description: "String comparison with == allows timing attacks on password hash"
|
|
120
|
+
- id: session-fixation
|
|
121
|
+
location: "lines 29-30"
|
|
122
|
+
description: "Session ID not regenerated after authentication - allows session fixation"
|
|
123
|
+
medium:
|
|
124
|
+
- id: idor-user-lookup
|
|
125
|
+
location: "get_user_by_id function"
|
|
126
|
+
description: "No authorization check - any authenticated user can view any profile"
|
|
127
|
+
- id: user-enumeration
|
|
128
|
+
location: "lines 24, 33"
|
|
129
|
+
description: "Different error messages reveal whether username exists"
|
|
130
|
+
- id: email-enumeration
|
|
131
|
+
location: "line 71"
|
|
132
|
+
description: "Reset password reveals whether email is registered"
|
|
133
|
+
- id: rate-limiting-unused
|
|
134
|
+
location: "line 6"
|
|
135
|
+
description: "MAX_LOGIN_ATTEMPTS defined but never enforced - brute force possible"
|
|
136
|
+
low:
|
|
137
|
+
- id: no-error-handling
|
|
138
|
+
location: "throughout"
|
|
139
|
+
description: "No exception handling for database operations"
|
|
140
|
+
- id: connection-not-context-managed
|
|
141
|
+
location: "lines 10, 37, 51, 63"
|
|
142
|
+
description: "Database connections should use context managers"
|
|
143
|
+
|
|
144
|
+
# Scoring: Uses default judge rubric (correctness, depth, quality, persona @ 25% each)
|
|
145
|
+
# baseline_issues above document expected findings for reference
|