@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,145 @@
1
+ ---
2
+ # Scenario: Security Code Review (Easy)
3
+ # NOTE: Re-ranked to "easy" based on control baseline mean 86.42 ± 9.44 (Story 7-2)
4
+ # Category: code-review
5
+ # Tests ability to identify security vulnerabilities in authentication code
6
+ # Includes both obvious and subtle issues requiring deeper security analysis
7
+
8
+ name: security-review
9
+ title: "Security Code Review: Authentication Module"
10
+ category: code-review
11
+ difficulty: easy
12
+ description: Review authentication code for security vulnerabilities including subtle timing and authorization issues
13
+
14
+ prompt: |
15
+ You are reviewing a pull request for an authentication module.
16
+ The junior developer says it works fine in testing and passes all unit tests.
17
+
18
+ Review this code for security issues. For each issue found:
19
+ 1. Identify the vulnerability type and severity
20
+ 2. Explain why it's dangerous
21
+ 3. Provide a corrected code snippet
22
+
23
+ code:
24
+ language: python
25
+ filename: auth.py
26
+ content: |
27
+ import hashlib
28
+ import sqlite3
29
+ import os
30
+ from flask import session, request
31
+
32
+ MAX_LOGIN_ATTEMPTS = 5 # Note: not actually enforced anywhere
33
+
34
+ def authenticate_user(username, password):
35
+ """Authenticate user against database."""
36
+ conn = sqlite3.connect('users.db')
37
+ cursor = conn.cursor()
38
+
39
+ # Hash the password
40
+ password_hash = hashlib.md5(password.encode()).hexdigest()
41
+
42
+ # Check credentials
43
+ query = f"SELECT * FROM users WHERE username = '{username}' AND password_hash = '{password_hash}'"
44
+ cursor.execute(query)
45
+ user = cursor.fetchone()
46
+
47
+ conn.close()
48
+
49
+ if user is None:
50
+ return None, "Invalid username"
51
+
52
+ # Verify password matches
53
+ stored_hash = user[2] # password_hash column
54
+ if password_hash == stored_hash:
55
+ session['user_id'] = user[0]
56
+ session['logged_in'] = True
57
+ return user, None
58
+
59
+ return None, "Invalid password"
60
+
61
+ def create_user(username, password, email):
62
+ """Create a new user account."""
63
+ conn = sqlite3.connect('users.db')
64
+ cursor = conn.cursor()
65
+
66
+ password_hash = hashlib.md5(password.encode()).hexdigest()
67
+
68
+ query = f"INSERT INTO users (username, password_hash, email) VALUES ('{username}', '{password_hash}', '{email}')"
69
+ cursor.execute(query)
70
+ conn.commit()
71
+ conn.close()
72
+
73
+ return True
74
+
75
+ def get_user_by_id(user_id):
76
+ """Get user details by ID - used for profile pages."""
77
+ conn = sqlite3.connect('users.db')
78
+ cursor = conn.cursor()
79
+
80
+ # Get user from database
81
+ cursor.execute("SELECT id, username, email, created_at FROM users WHERE id = ?", (user_id,))
82
+ user = cursor.fetchone()
83
+ conn.close()
84
+
85
+ return user
86
+
87
+ def reset_password(email):
88
+ """Send password reset link."""
89
+ conn = sqlite3.connect('users.db')
90
+ cursor = conn.cursor()
91
+
92
+ cursor.execute("SELECT id, username FROM users WHERE email = ?", (email,))
93
+ user = cursor.fetchone()
94
+ conn.close()
95
+
96
+ if user is None:
97
+ return False, f"No account found with email: {email}"
98
+
99
+ # Generate reset token (not shown - assume secure)
100
+ return True, "Reset link sent"
101
+
102
+ baseline_issues:
103
+ critical:
104
+ - id: sql-injection-auth
105
+ location: "line 17"
106
+ description: "SQL injection via f-string formatting in authenticate_user"
107
+ - id: sql-injection-create
108
+ location: "line 42"
109
+ description: "SQL injection via f-string formatting in create_user"
110
+ high:
111
+ - id: weak-hash-md5
112
+ location: "lines 14, 40"
113
+ description: "MD5 is cryptographically broken for password hashing"
114
+ - id: no-salt
115
+ location: "lines 14, 40"
116
+ description: "Password hashing without salt enables rainbow table attacks"
117
+ - id: timing-attack
118
+ location: "line 28"
119
+ description: "String comparison with == allows timing attacks on password hash"
120
+ - id: session-fixation
121
+ location: "lines 29-30"
122
+ description: "Session ID not regenerated after authentication - allows session fixation"
123
+ medium:
124
+ - id: idor-user-lookup
125
+ location: "get_user_by_id function"
126
+ description: "No authorization check - any authenticated user can view any profile"
127
+ - id: user-enumeration
128
+ location: "lines 24, 33"
129
+ description: "Different error messages reveal whether username exists"
130
+ - id: email-enumeration
131
+ location: "line 71"
132
+ description: "Reset password reveals whether email is registered"
133
+ - id: rate-limiting-unused
134
+ location: "line 6"
135
+ description: "MAX_LOGIN_ATTEMPTS defined but never enforced - brute force possible"
136
+ low:
137
+ - id: no-error-handling
138
+ location: "throughout"
139
+ description: "No exception handling for database operations"
140
+ - id: connection-not-context-managed
141
+ location: "lines 10, 37, 51, 63"
142
+ description: "Database connections should use context managers"
143
+
144
+ # Scoring: Uses default judge rubric (correctness, depth, quality, persona @ 25% each)
145
+ # baseline_issues above document expected findings for reference