@pennyfarthing/benchmark 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/commands/benchmark-control.md +69 -0
- package/commands/benchmark.md +485 -0
- package/commands/job-fair.md +102 -0
- package/commands/solo.md +447 -0
- package/dist/benchmark-integration.d.ts +182 -0
- package/dist/benchmark-integration.d.ts.map +1 -0
- package/dist/benchmark-integration.js +710 -0
- package/dist/benchmark-integration.js.map +1 -0
- package/dist/benchmark-integration.test.d.ts +6 -0
- package/dist/benchmark-integration.test.d.ts.map +1 -0
- package/dist/benchmark-integration.test.js +41 -0
- package/dist/benchmark-integration.test.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -0
- package/dist/job-fair-aggregator.d.ts +150 -0
- package/dist/job-fair-aggregator.d.ts.map +1 -0
- package/dist/job-fair-aggregator.js +547 -0
- package/dist/job-fair-aggregator.js.map +1 -0
- package/dist/job-fair-aggregator.test.d.ts +6 -0
- package/dist/job-fair-aggregator.test.d.ts.map +1 -0
- package/dist/job-fair-aggregator.test.js +35 -0
- package/dist/job-fair-aggregator.test.js.map +1 -0
- package/dist/package-exports.test.d.ts +13 -0
- package/dist/package-exports.test.d.ts.map +1 -0
- package/dist/package-exports.test.js +192 -0
- package/dist/package-exports.test.js.map +1 -0
- package/docs/BENCHMARK-METHODOLOGY.md +105 -0
- package/docs/BENCHMARKING.md +311 -0
- package/docs/OCEAN-BENCHMARKING.md +210 -0
- package/docs/benchmarks-guide.md +62 -0
- package/package.json +66 -0
- package/scenarios/README.md +145 -0
- package/scenarios/architecture/database-selection.yaml +119 -0
- package/scenarios/architecture/legacy-modernization.yaml +153 -0
- package/scenarios/architecture/scaling-decision.yaml +88 -0
- package/scenarios/code-review/graphql-api-review.yaml +714 -0
- package/scenarios/code-review/order-service.yaml +622 -0
- package/scenarios/code-review/react-auth-component.yaml +569 -0
- package/scenarios/code-review/security-review.yaml +145 -0
- package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
- package/scenarios/debug/buggy-user-service.yaml +541 -0
- package/scenarios/debug/null-pointer.yaml +130 -0
- package/scenarios/debugging/async-control-flow.yaml +161 -0
- package/scenarios/debugging/auth-bypass.yaml +197 -0
- package/scenarios/debugging/error-handling.yaml +178 -0
- package/scenarios/debugging/input-validation.yaml +157 -0
- package/scenarios/debugging/null-check-missing.yaml +139 -0
- package/scenarios/debugging/off-by-one-loop.yaml +132 -0
- package/scenarios/debugging/race-condition.yaml +180 -0
- package/scenarios/debugging/resource-leak.yaml +166 -0
- package/scenarios/debugging/simple-logic-error.yaml +115 -0
- package/scenarios/debugging/sql-injection.yaml +163 -0
- package/scenarios/dev/event-processor-tdd.yaml +764 -0
- package/scenarios/dev/migration-disaster.yaml +415 -0
- package/scenarios/dev/race-condition-cache.yaml +546 -0
- package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
- package/scenarios/schema.yaml +639 -0
- package/scenarios/sm/dependency-deadlock.yaml +414 -0
- package/scenarios/sm/executive-pet-project.yaml +336 -0
- package/scenarios/sm/layoff-planning.yaml +356 -0
- package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
- package/scenarios/sm/story-breakdown.yaml +240 -0
- package/scenarios/sm/three-sprint-failure.yaml +397 -0
- package/scenarios/swe-bench/README.md +57 -0
- package/scenarios/swe-bench/astropy-12907.yaml +128 -0
- package/scenarios/swe-bench/astropy-13398.yaml +177 -0
- package/scenarios/swe-bench/astropy-14309.yaml +180 -0
- package/scenarios/swe-bench/django-10097.yaml +106 -0
- package/scenarios/swe-bench/django-10554.yaml +140 -0
- package/scenarios/swe-bench/django-10973.yaml +93 -0
- package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
- package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
- package/scenarios/swe-bench/flask-5014.yaml +91 -0
- package/scenarios/swe-bench/import-swebench.py +246 -0
- package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
- package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
- package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
- package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
- package/scenarios/swe-bench/requests-1142.yaml +100 -0
- package/scenarios/swe-bench/requests-2931.yaml +98 -0
- package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
- package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
- package/scenarios/swe-bench/xarray-3993.yaml +104 -0
- package/scenarios/swe-bench/xarray-6992.yaml +136 -0
- package/scenarios/tea/checkout-component-tests.yaml +596 -0
- package/scenarios/tea/cli-tool-tests.yaml +561 -0
- package/scenarios/tea/microservice-integration-tests.yaml +520 -0
- package/scenarios/tea/payment-processor-tests.yaml +550 -0
- package/scripts/aggregate-benchmark-stats.js +315 -0
- package/scripts/aggregate-benchmark-stats.sh +8 -0
- package/scripts/benchmark-runner.js +392 -0
- package/scripts/benchmark-runner.sh +8 -0
- package/scripts/consolidate-job-fair.sh +107 -0
- package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
- package/scripts/job-fair-batch.sh +116 -0
- package/scripts/job-fair-progress.sh +35 -0
- package/scripts/job-fair-runner.sh +278 -0
- package/scripts/job-fair-status.sh +80 -0
- package/scripts/job-fair-watcher-v2.sh +38 -0
- package/scripts/job-fair-watcher.sh +50 -0
- package/scripts/parallel-benchmark.sh +140 -0
- package/scripts/solo-runner.sh +344 -0
- package/scripts/test/ensure-swebench-data.sh +59 -0
- package/scripts/test/ground-truth-judge.py +220 -0
- package/scripts/test/swebench-judge.py +374 -0
- package/scripts/test/test-cache.sh +165 -0
- package/scripts/test/test-setup.sh +337 -0
- package/scripts/theme/compute-theme-tiers.sh +13 -0
- package/scripts/theme/compute_theme_tiers.py +402 -0
- package/scripts/theme/update-theme-tiers.sh +97 -0
- package/skills/finalize-run/SKILL.md +261 -0
- package/skills/judge/SKILL.md +644 -0
- package/skills/persona-benchmark/SKILL.md +187 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: Simple Logic Errors
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: easy
|
|
5
|
+
# Error Type Focus: reasoning (single-type)
|
|
6
|
+
|
|
7
|
+
id: debug-003
|
|
8
|
+
name: simple-logic-error
|
|
9
|
+
title: "Logic Errors: When the Code Does What You Said, Not What You Meant"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: easy
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
A discount calculator with straightforward logic errors.
|
|
16
|
+
Tests detection of incorrect boolean logic and comparisons.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario measures detection of reasoning-level bugs - where the
|
|
20
|
+
logic itself is flawed. Analytical agents will trace through the logic.
|
|
21
|
+
Pattern-matching agents might miss subtle comparison errors.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
BUG REPORT
|
|
25
|
+
|
|
26
|
+
Service: discount-calculator
|
|
27
|
+
Severity: P2
|
|
28
|
+
Status: Customers getting wrong discount amounts
|
|
29
|
+
|
|
30
|
+
The discount calculator is applying incorrect discounts:
|
|
31
|
+
- Premium members sometimes don't get their discount
|
|
32
|
+
- Some orders qualify for multiple discounts when they shouldn't
|
|
33
|
+
|
|
34
|
+
Your task:
|
|
35
|
+
1. Trace through the logic and find the errors
|
|
36
|
+
2. Explain what the code does vs what it should do
|
|
37
|
+
3. Provide the corrected logic
|
|
38
|
+
|
|
39
|
+
There are 5 known issues. How many can you find?
|
|
40
|
+
|
|
41
|
+
code:
|
|
42
|
+
language: python
|
|
43
|
+
filename: discount_calculator.py
|
|
44
|
+
content: |
|
|
45
|
+
class DiscountCalculator:
|
|
46
|
+
def calculate_discount(self, order_total, is_premium, items_count):
|
|
47
|
+
"""Calculate discount based on order and membership."""
|
|
48
|
+
discount = 0
|
|
49
|
+
|
|
50
|
+
# Premium member discount (should be 10%)
|
|
51
|
+
# Bug: using assignment instead of comparison
|
|
52
|
+
if is_premium = True:
|
|
53
|
+
discount = 0.10
|
|
54
|
+
|
|
55
|
+
# Bulk discount (should apply for 10+ items)
|
|
56
|
+
# Bug: wrong comparison operator
|
|
57
|
+
if items_count > 10: # Should be >= 10
|
|
58
|
+
discount = max(discount, 0.15)
|
|
59
|
+
|
|
60
|
+
# Large order discount (should apply for $100+)
|
|
61
|
+
# Bug: inverted condition
|
|
62
|
+
if order_total < 100: # Should be >= 100
|
|
63
|
+
discount = max(discount, 0.20)
|
|
64
|
+
|
|
65
|
+
return discount
|
|
66
|
+
|
|
67
|
+
def is_eligible_for_free_shipping(self, order_total, distance_miles):
|
|
68
|
+
"""Check if order qualifies for free shipping."""
|
|
69
|
+
# Bug: AND should be OR (either condition should qualify)
|
|
70
|
+
if order_total >= 50 and distance_miles <= 100:
|
|
71
|
+
return True
|
|
72
|
+
return False
|
|
73
|
+
|
|
74
|
+
def calculate_final_price(self, original_price, discount_percent):
|
|
75
|
+
"""Apply discount and return final price."""
|
|
76
|
+
# Bug: adding discount instead of subtracting
|
|
77
|
+
discount_amount = original_price * discount_percent
|
|
78
|
+
final_price = original_price + discount_amount
|
|
79
|
+
return final_price
|
|
80
|
+
|
|
81
|
+
baseline_issues:
|
|
82
|
+
critical:
|
|
83
|
+
- id: logic-001
|
|
84
|
+
location: "line 8"
|
|
85
|
+
description: "Using = (assignment) instead of == (comparison) - a SyntaxError in Python, so the module cannot even be imported"
|
|
86
|
+
error_type: reasoning
|
|
87
|
+
high:
|
|
88
|
+
- id: logic-002
|
|
89
|
+
location: "line 13"
|
|
90
|
+
description: "Using > instead of >= excludes boundary case of exactly 10 items"
|
|
91
|
+
error_type: reasoning
|
|
92
|
+
- id: logic-003
|
|
93
|
+
location: "line 18"
|
|
94
|
+
description: "Condition is inverted - applies discount for small orders instead of large"
|
|
95
|
+
error_type: reasoning
|
|
96
|
+
medium:
|
|
97
|
+
- id: logic-004
|
|
98
|
+
location: "line 26"
|
|
99
|
+
description: "Using AND instead of OR - both conditions required instead of either"
|
|
100
|
+
error_type: reasoning
|
|
101
|
+
- id: logic-005
|
|
102
|
+
location: "line 34"
|
|
103
|
+
description: "Adding discount instead of subtracting - increases price"
|
|
104
|
+
error_type: reasoning
|
|
105
|
+
|
|
106
|
+
scoring:
|
|
107
|
+
detection:
|
|
108
|
+
weight: 50
|
|
109
|
+
criteria: "Finding all 5 logic errors"
|
|
110
|
+
fix_quality:
|
|
111
|
+
weight: 30
|
|
112
|
+
criteria: "Providing correct boolean/arithmetic fixes"
|
|
113
|
+
explanation:
|
|
114
|
+
weight: 20
|
|
115
|
+
criteria: "Explaining expected vs actual behavior"
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Scenario: SQL Injection Vulnerabilities
|
|
3
|
+
# Category: debugging
|
|
4
|
+
# Difficulty: hard
|
|
5
|
+
# Error Type Focus: reasoning (security analysis)
|
|
6
|
+
|
|
7
|
+
id: debug-009
|
|
8
|
+
name: sql-injection
|
|
9
|
+
title: "SQL Injection: The Database Destroyer"
|
|
10
|
+
category: debugging
|
|
11
|
+
difficulty: hard
|
|
12
|
+
version: "1.0"
|
|
13
|
+
|
|
14
|
+
description: |
|
|
15
|
+
A product search API with multiple SQL injection vulnerabilities.
|
|
16
|
+
Tests detection of injection points and understanding of attack vectors.
|
|
17
|
+
|
|
18
|
+
purpose: |
|
|
19
|
+
This scenario measures reasoning about security vulnerabilities. Agents
|
|
20
|
+
must understand how untrusted input flows to SQL queries and identify
|
|
21
|
+
all injection points, including subtle ones.
|
|
22
|
+
|
|
23
|
+
prompt: |
|
|
24
|
+
SECURITY AUDIT
|
|
25
|
+
|
|
26
|
+
Service: product-search
|
|
27
|
+
Severity: P0
|
|
28
|
+
Status: SQL injection vulnerabilities discovered
|
|
29
|
+
|
|
30
|
+
A security audit found the product search API is vulnerable:
|
|
31
|
+
- Direct string concatenation in queries
|
|
32
|
+
- Insufficient input sanitization
|
|
33
|
+
- Dynamic query building from user input
|
|
34
|
+
|
|
35
|
+
Your task:
|
|
36
|
+
1. Find all SQL injection vulnerabilities
|
|
37
|
+
2. Demonstrate how each could be exploited
|
|
38
|
+
3. Implement parameterized queries
|
|
39
|
+
|
|
40
|
+
There are 7 known issues. How many can you find?
|
|
41
|
+
|
|
42
|
+
code:
|
|
43
|
+
language: python
|
|
44
|
+
filename: product_search.py
|
|
45
|
+
content: |
|
|
46
|
+
import sqlite3
|
|
47
|
+
from typing import List, Optional
|
|
48
|
+
|
|
49
|
+
class ProductSearch:
|
|
50
|
+
def __init__(self, db_path: str):
|
|
51
|
+
self.conn = sqlite3.connect(db_path)
|
|
52
|
+
|
|
53
|
+
def search_by_name(self, name: str) -> List[dict]:
|
|
54
|
+
"""Search products by name."""
|
|
55
|
+
# Bug: Direct string concatenation
|
|
56
|
+
query = f"SELECT * FROM products WHERE name LIKE '%{name}%'"
|
|
57
|
+
cursor = self.conn.execute(query)
|
|
58
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
59
|
+
|
|
60
|
+
def get_by_id(self, product_id: str) -> Optional[dict]:
|
|
61
|
+
"""Get product by ID."""
|
|
62
|
+
# Bug: Even numeric IDs can be injected
|
|
63
|
+
query = f"SELECT * FROM products WHERE id = {product_id}"
|
|
64
|
+
cursor = self.conn.execute(query)
|
|
65
|
+
row = cursor.fetchone()
|
|
66
|
+
return dict(row) if row else None
|
|
67
|
+
|
|
68
|
+
def search_by_category(self, category: str, min_price: float) -> List[dict]:
|
|
69
|
+
"""Search by category with minimum price."""
|
|
70
|
+
# Bug: Multiple injection points
|
|
71
|
+
query = f"""
|
|
72
|
+
SELECT * FROM products
|
|
73
|
+
WHERE category = '{category}'
|
|
74
|
+
AND price >= {min_price}
|
|
75
|
+
"""
|
|
76
|
+
cursor = self.conn.execute(query)
|
|
77
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
78
|
+
|
|
79
|
+
def advanced_search(self, filters: dict) -> List[dict]:
|
|
80
|
+
"""Build dynamic query from filters."""
|
|
81
|
+
query = "SELECT * FROM products WHERE 1=1"
|
|
82
|
+
|
|
83
|
+
# Bug: Dynamic query building from untrusted input
|
|
84
|
+
for key, value in filters.items():
|
|
85
|
+
query += f" AND {key} = '{value}'"
|
|
86
|
+
|
|
87
|
+
cursor = self.conn.execute(query)
|
|
88
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
89
|
+
|
|
90
|
+
def search_with_sort(self, search_term: str, sort_by: str, order: str) -> List[dict]:
|
|
91
|
+
"""Search with custom sort."""
|
|
92
|
+
# Bug: ORDER BY injection (parameterization doesn't work here)
|
|
93
|
+
query = f"""
|
|
94
|
+
SELECT * FROM products
|
|
95
|
+
WHERE name LIKE '%{search_term}%'
|
|
96
|
+
ORDER BY {sort_by} {order}
|
|
97
|
+
"""
|
|
98
|
+
cursor = self.conn.execute(query)
|
|
99
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
100
|
+
|
|
101
|
+
def bulk_lookup(self, ids: List[str]) -> List[dict]:
|
|
102
|
+
"""Look up multiple products by ID."""
|
|
103
|
+
# Bug: IN clause built from user input
|
|
104
|
+
id_list = ",".join(ids)
|
|
105
|
+
query = f"SELECT * FROM products WHERE id IN ({id_list})"
|
|
106
|
+
cursor = self.conn.execute(query)
|
|
107
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
108
|
+
|
|
109
|
+
def authenticated_search(self, user_id: int, search_term: str) -> List[dict]:
|
|
110
|
+
"""Search products user has access to."""
|
|
111
|
+
# Bug: Assuming user_id is safe because it's an int parameter
|
|
112
|
+
query = f"""
|
|
113
|
+
SELECT p.* FROM products p
|
|
114
|
+
JOIN user_products up ON p.id = up.product_id
|
|
115
|
+
WHERE up.user_id = {user_id}
|
|
116
|
+
AND p.name LIKE '%{search_term}%'
|
|
117
|
+
"""
|
|
118
|
+
cursor = self.conn.execute(query)
|
|
119
|
+
return [dict(row) for row in cursor.fetchall()]
|
|
120
|
+
|
|
121
|
+
baseline_issues:
|
|
122
|
+
critical:
|
|
123
|
+
- id: sqli-001
|
|
124
|
+
location: "line 11"
|
|
125
|
+
description: "LIKE pattern built by interpolating the name argument into the query string - directly injectable search term"
|
|
126
|
+
error_type: reasoning
|
|
127
|
+
- id: sqli-002
|
|
128
|
+
location: "lines 39-40"
|
|
129
|
+
description: "Dynamic column names from user input - full query control"
|
|
130
|
+
error_type: reasoning
|
|
131
|
+
high:
|
|
132
|
+
- id: sqli-003
|
|
133
|
+
location: "line 18"
|
|
134
|
+
description: "Numeric ID not parameterized - type juggling bypass"
|
|
135
|
+
error_type: reasoning
|
|
136
|
+
- id: sqli-004
|
|
137
|
+
location: "lines 26-30"
|
|
138
|
+
description: "Multiple injection points in single query"
|
|
139
|
+
error_type: reasoning
|
|
140
|
+
- id: sqli-005
|
|
141
|
+
location: "lines 48-52"
|
|
142
|
+
description: "ORDER BY clause injection - can leak data via sorting"
|
|
143
|
+
error_type: reasoning
|
|
144
|
+
medium:
|
|
145
|
+
- id: sqli-006
|
|
146
|
+
location: "lines 59-60"
|
|
147
|
+
description: "IN clause built from array - each element injectable"
|
|
148
|
+
error_type: reasoning
|
|
149
|
+
- id: sqli-007
|
|
150
|
+
location: "lines 67-72"
|
|
151
|
+
description: "user_id (int type hint is not enforced at runtime) and user-controlled search term both interpolated into the query despite authenticated context"
|
|
152
|
+
error_type: reasoning
|
|
153
|
+
|
|
154
|
+
scoring:
|
|
155
|
+
detection:
|
|
156
|
+
weight: 40
|
|
157
|
+
criteria: "Finding all 7 injection vulnerabilities"
|
|
158
|
+
fix_quality:
|
|
159
|
+
weight: 35
|
|
160
|
+
criteria: "Implementing parameterized queries correctly"
|
|
161
|
+
explanation:
|
|
162
|
+
weight: 25
|
|
163
|
+
criteria: "Demonstrating exploitation scenarios"
|