@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,115 @@
1
+ ---
2
+ # Scenario: Simple Logic Errors
3
+ # Category: debugging
4
+ # Difficulty: easy
5
+ # Error Type Focus: reasoning (single-type)
6
+
7
+ id: debug-003
8
+ name: simple-logic-error
9
+ title: "Logic Errors: When the Code Does What You Said, Not What You Meant"
10
+ category: debugging
11
+ difficulty: easy
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A discount calculator with straightforward logic errors.
16
+ Tests detection of incorrect boolean logic and comparisons.
17
+
18
+ purpose: |
19
+ This scenario measures detection of reasoning-level bugs - where the
20
+ logic itself is flawed. Analytical agents will trace through the logic.
21
+ Pattern-matching agents might miss subtle comparison errors.
22
+
23
+ prompt: |
24
+ BUG REPORT
25
+
26
+ Service: discount-calculator
27
+ Severity: P2
28
+ Status: Customers getting wrong discount amounts
29
+
30
+ The discount calculator is applying incorrect discounts:
31
+ - Premium members sometimes don't get their discount
32
+ - Some orders qualify for multiple discounts when they shouldn't
33
+
34
+ Your task:
35
+ 1. Trace through the logic and find the errors
36
+ 2. Explain what the code does vs what it should do
37
+ 3. Provide the corrected logic
38
+
39
+ There are 5 known issues. How many can you find?
40
+
41
+ code:
42
+ language: python
43
+ filename: discount_calculator.py
44
+ content: |
45
+ class DiscountCalculator:
46
+ def calculate_discount(self, order_total, is_premium, items_count):
47
+ """Calculate discount based on order and membership."""
48
+ discount = 0
49
+
50
+ # Premium member discount (should be 10%)
51
+ # Bug: using assignment instead of comparison
52
+ if is_premium = True:
53
+ discount = 0.10
54
+
55
+ # Bulk discount (should apply for 10+ items)
56
+ # Bug: wrong comparison operator
57
+ if items_count > 10: # Should be >= 10
58
+ discount = max(discount, 0.15)
59
+
60
+ # Large order discount (should apply for $100+)
61
+ # Bug: inverted condition
62
+ if order_total < 100: # Should be >= 100
63
+ discount = max(discount, 0.20)
64
+
65
+ return discount
66
+
67
+ def is_eligible_for_free_shipping(self, order_total, distance_miles):
68
+ """Check if order qualifies for free shipping."""
69
+ # Bug: AND should be OR (either condition should qualify)
70
+ if order_total >= 50 and distance_miles <= 100:
71
+ return True
72
+ return False
73
+
74
+ def calculate_final_price(self, original_price, discount_percent):
75
+ """Apply discount and return final price."""
76
+ # Bug: adding discount instead of subtracting
77
+ discount_amount = original_price * discount_percent
78
+ final_price = original_price + discount_amount
79
+ return final_price
80
+
81
+ baseline_issues:
82
+ critical:
83
+ - id: logic-001
84
+ location: "line 8"
85
+ description: "Using = (assignment) instead of == (comparison)"
86
+ error_type: reasoning
87
+ high:
88
+ - id: logic-002
89
+ location: "line 13"
90
+ description: "Using > instead of >= excludes boundary case of exactly 10 items"
91
+ error_type: reasoning
92
+ - id: logic-003
93
+ location: "line 18"
94
+ description: "Condition is inverted - applies discount for small orders instead of large"
95
+ error_type: reasoning
96
+ medium:
97
+ - id: logic-004
98
+ location: "line 26"
99
+ description: "Using AND instead of OR - both conditions required instead of either"
100
+ error_type: reasoning
101
+ - id: logic-005
102
+ location: "line 34"
103
+ description: "Adding discount instead of subtracting - increases price"
104
+ error_type: reasoning
105
+
106
+ scoring:
107
+ detection:
108
+ weight: 50
109
+ criteria: "Finding all 5 logic errors"
110
+ fix_quality:
111
+ weight: 30
112
+ criteria: "Providing correct boolean/arithmetic fixes"
113
+ explanation:
114
+ weight: 20
115
+ criteria: "Explaining expected vs actual behavior"
@@ -0,0 +1,163 @@
1
+ ---
2
+ # Scenario: SQL Injection Vulnerabilities
3
+ # Category: debugging
4
+ # Difficulty: hard
5
+ # Error Type Focus: reasoning (security analysis)
6
+
7
+ id: debug-009
8
+ name: sql-injection
9
+ title: "SQL Injection: The Database Destroyer"
10
+ category: debugging
11
+ difficulty: hard
12
+ version: "1.0"
13
+
14
+ description: |
15
+ A product search API with multiple SQL injection vulnerabilities.
16
+ Tests detection of injection points and understanding of attack vectors.
17
+
18
+ purpose: |
19
+ This scenario measures reasoning about security vulnerabilities. Agents
20
+ must understand how untrusted input flows to SQL queries and identify
21
+ all injection points, including subtle ones.
22
+
23
+ prompt: |
24
+ SECURITY AUDIT
25
+
26
+ Service: product-search
27
+ Severity: P0
28
+ Status: SQL injection vulnerabilities discovered
29
+
30
+ A security audit found the product search API is vulnerable:
31
+ - Direct string concatenation in queries
32
+ - Insufficient input sanitization
33
+ - Dynamic query building from user input
34
+
35
+ Your task:
36
+ 1. Find all SQL injection vulnerabilities
37
+ 2. Demonstrate how each could be exploited
38
+ 3. Implement parameterized queries
39
+
40
+ There are 7 known issues. How many can you find?
41
+
42
+ code:
43
+ language: python
44
+ filename: product_search.py
45
+ content: |
46
+ import sqlite3
47
+ from typing import List, Optional
48
+
49
+ class ProductSearch:
50
+ def __init__(self, db_path: str):
51
+ self.conn = sqlite3.connect(db_path)
52
+
53
+ def search_by_name(self, name: str) -> List[dict]:
54
+ """Search products by name."""
55
+ # Bug: Direct string concatenation
56
+ query = f"SELECT * FROM products WHERE name LIKE '%{name}%'"
57
+ cursor = self.conn.execute(query)
58
+ return [dict(row) for row in cursor.fetchall()]
59
+
60
+ def get_by_id(self, product_id: str) -> Optional[dict]:
61
+ """Get product by ID."""
62
+ # Bug: Even numeric IDs can be injected
63
+ query = f"SELECT * FROM products WHERE id = {product_id}"
64
+ cursor = self.conn.execute(query)
65
+ row = cursor.fetchone()
66
+ return dict(row) if row else None
67
+
68
+ def search_by_category(self, category: str, min_price: float) -> List[dict]:
69
+ """Search by category with minimum price."""
70
+ # Bug: Multiple injection points
71
+ query = f"""
72
+ SELECT * FROM products
73
+ WHERE category = '{category}'
74
+ AND price >= {min_price}
75
+ """
76
+ cursor = self.conn.execute(query)
77
+ return [dict(row) for row in cursor.fetchall()]
78
+
79
+ def advanced_search(self, filters: dict) -> List[dict]:
80
+ """Build dynamic query from filters."""
81
+ query = "SELECT * FROM products WHERE 1=1"
82
+
83
+ # Bug: Dynamic query building from untrusted input
84
+ for key, value in filters.items():
85
+ query += f" AND {key} = '{value}'"
86
+
87
+ cursor = self.conn.execute(query)
88
+ return [dict(row) for row in cursor.fetchall()]
89
+
90
+ def search_with_sort(self, search_term: str, sort_by: str, order: str) -> List[dict]:
91
+ """Search with custom sort."""
92
+ # Bug: ORDER BY injection (parameterization doesn't work here)
93
+ query = f"""
94
+ SELECT * FROM products
95
+ WHERE name LIKE '%{search_term}%'
96
+ ORDER BY {sort_by} {order}
97
+ """
98
+ cursor = self.conn.execute(query)
99
+ return [dict(row) for row in cursor.fetchall()]
100
+
101
+ def bulk_lookup(self, ids: List[str]) -> List[dict]:
102
+ """Look up multiple products by ID."""
103
+ # Bug: IN clause built from user input
104
+ id_list = ",".join(ids)
105
+ query = f"SELECT * FROM products WHERE id IN ({id_list})"
106
+ cursor = self.conn.execute(query)
107
+ return [dict(row) for row in cursor.fetchall()]
108
+
109
+ def authenticated_search(self, user_id: int, search_term: str) -> List[dict]:
110
+ """Search products user has access to."""
111
+ # Bug: Assuming user_id is safe because it's an int parameter
112
+ query = f"""
113
+ SELECT p.* FROM products p
114
+ JOIN user_products up ON p.id = up.product_id
115
+ WHERE up.user_id = {user_id}
116
+ AND p.name LIKE '%{search_term}%'
117
+ """
118
+ cursor = self.conn.execute(query)
119
+ return [dict(row) for row in cursor.fetchall()]
120
+
121
+ baseline_issues:
122
+ critical:
123
+ - id: sqli-001
124
+ location: "line 12"
125
+ description: "LIKE clause with string concatenation - searchable injection"
126
+ error_type: reasoning
127
+ - id: sqli-002
128
+ location: "lines 38-41"
129
+ description: "Dynamic column names from user input - full query control"
130
+ error_type: reasoning
131
+ high:
132
+ - id: sqli-003
133
+ location: "line 19"
134
+ description: "Numeric ID not parameterized - type juggling bypass"
135
+ error_type: reasoning
136
+ - id: sqli-004
137
+ location: "lines 26-30"
138
+ description: "Multiple injection points in single query"
139
+ error_type: reasoning
140
+ - id: sqli-005
141
+ location: "lines 47-51"
142
+ description: "ORDER BY clause injection - can leak data via sorting"
143
+ error_type: reasoning
144
+ medium:
145
+ - id: sqli-006
146
+ location: "lines 59-60"
147
+ description: "IN clause built from array - each element injectable"
148
+ error_type: reasoning
149
+ - id: sqli-007
150
+ location: "lines 67-72"
151
+ description: "User-controlled search term despite authenticated context"
152
+ error_type: reasoning
153
+
154
+ scoring:
155
+ detection:
156
+ weight: 40
157
+ criteria: "Finding all 7 injection vulnerabilities"
158
+ fix_quality:
159
+ weight: 35
160
+ criteria: "Implementing parameterized queries correctly"
161
+ explanation:
162
+ weight: 25
163
+ criteria: "Demonstrating exploitation scenarios"