@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,153 @@
1
+ ---
2
+ # Scenario: Legacy System Modernization Strategy
3
+ # Category: architecture
4
+ # Empirical Difficulty: easy (control baseline: 87.2 ± 3.6)
5
+ # Note: Designed as "hard" but control agent handles impossible constraints well
6
+ # Complexity: Impossible constraints, mutually exclusive requirements, no good options
7
+
8
+ name: legacy-modernization
9
+ title: "The Strangler's Dilemma"
10
+ category: architecture
11
+ difficulty: easy # Empirically calibrated 2026-01-02
12
+ description: Navigate an impossible legacy modernization with mutually exclusive stakeholder requirements
13
+
14
+ prompt: |
15
+ You've been hired as the principal architect for "FinanceCore", a 15-year-old financial
16
+ services company whose core platform processes $2B in daily transactions.
17
+
18
+ THE LEGACY SYSTEM:
19
+ - 2.1 million lines of COBOL running on IBM mainframe (z/OS)
20
+ - 400,000 lines of Java (J2EE on WebSphere 7, circa 2008)
21
+ - Oracle 11g database with 8TB of data, 2,000+ stored procedures
22
+ - 47 batch jobs running nightly, average completion time: 6 hours
23
+ - Zero automated tests, no documentation except inline comments
24
+ - 3 developers who understand it (ages 58, 61, 63 - one retiring in 8 months)
25
+
26
+ IMMINENT DEADLINES (these are non-negotiable):
27
+ - Oracle 11g end-of-extended-support: December 31 (11 months away)
28
+ - IBM mainframe contract renewal: Due in 4 months, currently $800K/year
29
+ - SOX audit findings remediation deadline: Q3 (7 months), $2M fine if missed
30
+ - Board meeting for "transformation update": 6 months
31
+ - Your largest client (22% of revenue) has scheduled a security audit in 3 months
32
+
33
+ BUDGET REALITY:
34
+ - Total available: $2.4M over 18 months (was $5M, cut twice)
35
+ - Mainframe renewal alone: $800K/year = $1.2M for 18 months
36
+ - Oracle migration estimate (from vendor): $600K minimum
37
+ - Remaining for actual modernization: $600K
38
+ - Average fully-loaded developer cost: $180K/year
39
+
40
+ STAKEHOLDER POSITIONS (verbatim from interviews):
41
+ - CEO: "I already told the board we'd be 'cloud-first' by end of year. I cannot walk
42
+ that back. Find a way, or I'll find an architect who can."
43
+ - CFO: "The mainframe costs $800K/year. Kill it. I don't care how, just make it stop."
44
+ - CTO: "The last architect who tried to 'transform' this system cost us $1.2M and
45
+ delivered nothing. I had to fire him. Don't be that architect."
46
+ - Head of Compliance: "I've already told the auditors the SOX issues will be fixed.
47
+ If they're not, I'm naming you in the finding as the responsible party."
48
+ - VP Engineering: "I've lost 4 developers this year. If you propose another
49
+ multi-year death march, I'll lose the rest. Then you'll have no one to execute."
50
+
51
+ ADDITIONAL CONTEXT:
52
+ - Strangler pattern was attempted 5 years ago. It failed because the COBOL and Java
53
+ systems share database transactions - you cannot route around them independently.
54
+ - The Java layer doesn't "call" COBOL - they share Oracle stored procedures that
55
+ contain business logic neither team fully understands.
56
+ - Two of the three COBOL developers refuse to document their knowledge because
57
+ "job security." The third (retiring) has offered to help but has health issues.
58
+ - Last month's batch failures caused a $340K reconciliation error. The fix was
59
+ "restart and hope" - no root cause was ever found.
60
+
61
+ THE QUESTION YOU MUST ANSWER:
62
+ Given that you cannot satisfy all stakeholders, cannot meet all deadlines,
63
+ and cannot stay within budget while addressing all technical requirements:
64
+
65
+ 1. What do you recommend - and what are you explicitly choosing NOT to do?
66
+ 2. Which stakeholder(s) will you disappoint, and how will you deliver that message?
67
+ 3. What is your honest probability of success, and what would failure look like?
68
+ 4. Under what circumstances would you advise FinanceCore to NOT do this project?
69
+ 5. If they proceed against your advice, what should they expect?
70
+
71
+ Do not propose a plan that claims to satisfy everyone. That plan does not exist.
72
+
73
+ scoring:
74
+ # PENALTY-BASED SCORING: Start at 100, subtract for failures
75
+ # This approach penalizes common consulting-speak patterns
76
+
77
+ categories:
78
+ - name: fatal_errors
79
+ description: "Critical failures that cap the score"
80
+ penalties:
81
+ - id: CLAIMS_ALL_STAKEHOLDERS
82
+ description: "Claims to 'address all stakeholder concerns' or 'balance all needs'"
83
+ cap_at: 45
84
+ - id: CLOUD_BY_EOY
85
+ description: "Promises or implies cloud transformation possible by end of year"
86
+ cap_at: 50
87
+ - id: IGNORES_BUDGET_MATH
88
+ description: "Does not acknowledge $2.4M - $1.2M - $600K = only $600K left"
89
+ cap_at: 55
90
+ - id: RECOMMENDS_STRANGLER
91
+ description: "Proposes strangler pattern without addressing shared transaction problem"
92
+ cap_at: 50
93
+
94
+ - name: major_deductions
95
+ description: "Significant problems (10-15 points each)"
96
+ penalties:
97
+ - id: NO_PROBABILITY_ESTIMATE
98
+ description: "Gives vague risk statements instead of actual probability"
99
+ deduct: 15
100
+ - id: NO_NAMED_DISAPPOINTMENT
101
+ description: "Fails to explicitly name which stakeholders will be disappointed"
102
+ deduct: 15
103
+ - id: NO_WALK_AWAY_CONDITION
104
+ description: "Doesn't specify when to recommend NOT doing the project"
105
+ deduct: 12
106
+ - id: OPTIMISTIC_TIMELINE
107
+ description: "Proposes timeline that ignores staffing constraints ($600K remaining / $180K per dev-year = ~3.3 dev-years)"
108
+ deduct: 12
109
+ - id: GENERIC_PHASES
110
+ description: "Uses generic 'Phase 1: Assess, Phase 2: Pilot' without specific numbers"
111
+ deduct: 10
112
+
113
+ - name: minor_deductions
114
+ description: "Minor issues (5 points each)"
115
+ penalties:
116
+ - id: NO_DELIVERY_SCRIPT
117
+ description: "Doesn't provide specific language for delivering bad news"
118
+ deduct: 5
119
+ - id: VAGUE_CONSEQUENCES
120
+ description: "Doesn't specify what failure looks like in concrete terms"
121
+ deduct: 5
122
+ - id: IGNORES_COBOL_DEVELOPERS
123
+ description: "Doesn't address knowledge capture from the three COBOL developers (one retiring in 8 months)"
124
+ deduct: 5
125
+ - id: IGNORES_PREVIOUS_FAILURE
126
+ description: "Doesn't reference or learn from the $1.2M failed attempt"
127
+ deduct: 5
128
+ - id: PERSONA_MISSING
129
+ description: "No distinctive voice or perspective (expected for control)"
130
+ deduct: 5
131
+
132
+ - name: baseline_expectations
133
+ description: "Expected behaviors (no deduction if present)"
134
+ baseline:
135
+ - "Acknowledges budget arithmetic explicitly"
136
+ - "Names at least one stakeholder who won't get what they want"
137
+ - "Gives probability estimate with actual percentage"
138
+ - "Addresses when to recommend abandoning project"
139
+ - "References previous failed modernization attempt"
140
+
141
+ # SCORING ALGORITHM:
142
+ # 1. Start at 100
143
+ # 2. Check for fatal errors - if any are triggered, apply the lowest cap_at among them
144
+ # 3. Apply major deductions
145
+ # 4. Apply minor deductions
146
+ # 5. Minimum score is 20 (floor)
147
+ #
148
+ # EXPECTED SCORE DISTRIBUTION:
149
+ # - Agent that hits one or more fatal errors: 45-55
150
+ # - Agent that avoids fatal errors but has major issues: 55-70
151
+ # - Agent that handles basics but misses subtleties: 70-80
152
+ # - Agent that addresses everything explicitly: 80-90
153
+ # - Exceptional response: 90-100
@@ -0,0 +1,88 @@
1
+ ---
2
+ # Scenario: Scaling Architecture Decision
3
+ # Category: architecture
4
+ # Tests ability to analyze trade-offs and make architectural recommendations
5
+
6
+ name: scaling-decision
7
+ title: "The Black Friday Scaling Dilemma"
8
+ category: architecture
9
+ difficulty: hard
10
+ description: Choose a scaling strategy for an e-commerce platform facing 10x traffic spikes
11
+
12
+ prompt: |
13
+ Your e-commerce startup "QuickCart" is preparing for its first Black Friday.
14
+
15
+ CURRENT STATE:
16
+ - Normal load: 10,000 requests/minute
17
+ - Black Friday projection: 100,000 requests/minute for 4-6 hours
18
+ - Current architecture: Monolithic Django app on a single EC2 c5.2xlarge
19
+ - Database: PostgreSQL RDS (db.r5.large) with 500GB data
20
+ - 95th percentile response time: 800ms (target: <500ms)
21
+
22
+ CONSTRAINTS:
23
+ - Team: 3 backend developers, 1 DevOps engineer
24
+ - Budget: $50,000 for infrastructure improvements
25
+ - Timeline: 6 weeks until Black Friday
26
+ - Risk tolerance: Medium (some downtime acceptable during spike)
27
+
28
+ CONTEXT:
29
+ - CEO wants "cloud-native modern architecture"
30
+ - CTO wants "pragmatic, battle-tested solutions"
31
+ - Last stress test showed database CPU at 90% under 3x load
32
+
33
+ YOUR TASK:
34
+ 1. Analyze the bottlenecks
35
+ 2. Propose a scaling strategy
36
+ 3. Explain trade-offs and risks
37
+ 4. Provide a prioritized action plan
38
+ 5. Define success metrics
39
+
40
+ Remember: There's no single right answer. Show your reasoning.
41
+
42
+ # No code section - pure architecture discussion
43
+ # No baseline_issues - open-ended problem with multiple valid approaches
44
+
45
+ scoring:
46
+ categories:
47
+ - name: analysis
48
+ weight: 25
49
+ criteria:
50
+ - id: IDENTIFIES_BOTTLENECKS
51
+ description: "Correctly identifies database as primary bottleneck"
52
+ points: 10
53
+ - id: UNDERSTANDS_CONSTRAINTS
54
+ description: "Accounts for team size, budget, and timeline"
55
+ points: 10
56
+ - id: READS_STAKEHOLDERS
57
+ description: "Addresses CEO/CTO tension appropriately"
58
+ points: 5
59
+ - name: recommendation
60
+ weight: 35
61
+ criteria:
62
+ - id: COHERENT_STRATEGY
63
+ description: "Proposes internally consistent approach"
64
+ points: 12
65
+ - id: REALISTIC_TIMELINE
66
+ description: "Plan is achievable in 6 weeks"
67
+ points: 12
68
+ - id: WITHIN_BUDGET
69
+ description: "Stays within $50K budget"
70
+ points: 11
71
+ - name: tradeoffs
72
+ weight: 20
73
+ criteria:
74
+ - id: HONEST_ABOUT_RISKS
75
+ description: "Acknowledges what could go wrong"
76
+ points: 10
77
+ - id: CONSIDERS_ALTERNATIVES
78
+ description: "Mentions other valid approaches"
79
+ points: 10
80
+ - name: persona
81
+ weight: 20
82
+ criteria:
83
+ - id: AUTHENTIC_VOICE
84
+ description: "Recommendations reflect persona's philosophy"
85
+ points: 10
86
+ - id: CONSISTENT_CHARACTER
87
+ description: "Maintains character throughout response"
88
+ points: 10