@pennyfarthing/benchmark 10.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/commands/benchmark-control.md +69 -0
  2. package/commands/benchmark.md +485 -0
  3. package/commands/job-fair.md +102 -0
  4. package/commands/solo.md +447 -0
  5. package/dist/benchmark-integration.d.ts +182 -0
  6. package/dist/benchmark-integration.d.ts.map +1 -0
  7. package/dist/benchmark-integration.js +710 -0
  8. package/dist/benchmark-integration.js.map +1 -0
  9. package/dist/benchmark-integration.test.d.ts +6 -0
  10. package/dist/benchmark-integration.test.d.ts.map +1 -0
  11. package/dist/benchmark-integration.test.js +41 -0
  12. package/dist/benchmark-integration.test.js.map +1 -0
  13. package/dist/index.d.ts +3 -0
  14. package/dist/index.d.ts.map +1 -0
  15. package/dist/index.js +5 -0
  16. package/dist/index.js.map +1 -0
  17. package/dist/job-fair-aggregator.d.ts +150 -0
  18. package/dist/job-fair-aggregator.d.ts.map +1 -0
  19. package/dist/job-fair-aggregator.js +547 -0
  20. package/dist/job-fair-aggregator.js.map +1 -0
  21. package/dist/job-fair-aggregator.test.d.ts +6 -0
  22. package/dist/job-fair-aggregator.test.d.ts.map +1 -0
  23. package/dist/job-fair-aggregator.test.js +35 -0
  24. package/dist/job-fair-aggregator.test.js.map +1 -0
  25. package/dist/package-exports.test.d.ts +13 -0
  26. package/dist/package-exports.test.d.ts.map +1 -0
  27. package/dist/package-exports.test.js +192 -0
  28. package/dist/package-exports.test.js.map +1 -0
  29. package/docs/BENCHMARK-METHODOLOGY.md +105 -0
  30. package/docs/BENCHMARKING.md +311 -0
  31. package/docs/OCEAN-BENCHMARKING.md +210 -0
  32. package/docs/benchmarks-guide.md +62 -0
  33. package/package.json +66 -0
  34. package/scenarios/README.md +145 -0
  35. package/scenarios/architecture/database-selection.yaml +119 -0
  36. package/scenarios/architecture/legacy-modernization.yaml +153 -0
  37. package/scenarios/architecture/scaling-decision.yaml +88 -0
  38. package/scenarios/code-review/graphql-api-review.yaml +714 -0
  39. package/scenarios/code-review/order-service.yaml +622 -0
  40. package/scenarios/code-review/react-auth-component.yaml +569 -0
  41. package/scenarios/code-review/security-review.yaml +145 -0
  42. package/scenarios/code-review/terraform-infrastructure.yaml +582 -0
  43. package/scenarios/debug/buggy-user-service.yaml +541 -0
  44. package/scenarios/debug/null-pointer.yaml +130 -0
  45. package/scenarios/debugging/async-control-flow.yaml +161 -0
  46. package/scenarios/debugging/auth-bypass.yaml +197 -0
  47. package/scenarios/debugging/error-handling.yaml +178 -0
  48. package/scenarios/debugging/input-validation.yaml +157 -0
  49. package/scenarios/debugging/null-check-missing.yaml +139 -0
  50. package/scenarios/debugging/off-by-one-loop.yaml +132 -0
  51. package/scenarios/debugging/race-condition.yaml +180 -0
  52. package/scenarios/debugging/resource-leak.yaml +166 -0
  53. package/scenarios/debugging/simple-logic-error.yaml +115 -0
  54. package/scenarios/debugging/sql-injection.yaml +163 -0
  55. package/scenarios/dev/event-processor-tdd.yaml +764 -0
  56. package/scenarios/dev/migration-disaster.yaml +415 -0
  57. package/scenarios/dev/race-condition-cache.yaml +546 -0
  58. package/scenarios/dev/tdd-shopping-cart.yaml +681 -0
  59. package/scenarios/schema.yaml +639 -0
  60. package/scenarios/sm/dependency-deadlock.yaml +414 -0
  61. package/scenarios/sm/executive-pet-project.yaml +336 -0
  62. package/scenarios/sm/layoff-planning.yaml +356 -0
  63. package/scenarios/sm/sprint-planning-conflict.yaml +303 -0
  64. package/scenarios/sm/story-breakdown.yaml +240 -0
  65. package/scenarios/sm/three-sprint-failure.yaml +397 -0
  66. package/scenarios/swe-bench/README.md +57 -0
  67. package/scenarios/swe-bench/astropy-12907.yaml +128 -0
  68. package/scenarios/swe-bench/astropy-13398.yaml +177 -0
  69. package/scenarios/swe-bench/astropy-14309.yaml +180 -0
  70. package/scenarios/swe-bench/django-10097.yaml +106 -0
  71. package/scenarios/swe-bench/django-10554.yaml +140 -0
  72. package/scenarios/swe-bench/django-10973.yaml +93 -0
  73. package/scenarios/swe-bench/flask-5014-reviewer.yaml +145 -0
  74. package/scenarios/swe-bench/flask-5014-tea.yaml +123 -0
  75. package/scenarios/swe-bench/flask-5014.yaml +91 -0
  76. package/scenarios/swe-bench/import-swebench.py +246 -0
  77. package/scenarios/swe-bench/matplotlib-13989.yaml +139 -0
  78. package/scenarios/swe-bench/matplotlib-14623.yaml +127 -0
  79. package/scenarios/swe-bench/requests-1142-reviewer.yaml +144 -0
  80. package/scenarios/swe-bench/requests-1142-tea.yaml +135 -0
  81. package/scenarios/swe-bench/requests-1142.yaml +100 -0
  82. package/scenarios/swe-bench/requests-2931.yaml +98 -0
  83. package/scenarios/swe-bench/seaborn-3069.yaml +102 -0
  84. package/scenarios/swe-bench/sphinx-7590.yaml +108 -0
  85. package/scenarios/swe-bench/xarray-3993.yaml +104 -0
  86. package/scenarios/swe-bench/xarray-6992.yaml +136 -0
  87. package/scenarios/tea/checkout-component-tests.yaml +596 -0
  88. package/scenarios/tea/cli-tool-tests.yaml +561 -0
  89. package/scenarios/tea/microservice-integration-tests.yaml +520 -0
  90. package/scenarios/tea/payment-processor-tests.yaml +550 -0
  91. package/scripts/aggregate-benchmark-stats.js +315 -0
  92. package/scripts/aggregate-benchmark-stats.sh +8 -0
  93. package/scripts/benchmark-runner.js +392 -0
  94. package/scripts/benchmark-runner.sh +8 -0
  95. package/scripts/consolidate-job-fair.sh +107 -0
  96. package/scripts/convert-jobfair-to-benchmarks.sh +230 -0
  97. package/scripts/job-fair-batch.sh +116 -0
  98. package/scripts/job-fair-progress.sh +35 -0
  99. package/scripts/job-fair-runner.sh +278 -0
  100. package/scripts/job-fair-status.sh +80 -0
  101. package/scripts/job-fair-watcher-v2.sh +38 -0
  102. package/scripts/job-fair-watcher.sh +50 -0
  103. package/scripts/parallel-benchmark.sh +140 -0
  104. package/scripts/solo-runner.sh +344 -0
  105. package/scripts/test/ensure-swebench-data.sh +59 -0
  106. package/scripts/test/ground-truth-judge.py +220 -0
  107. package/scripts/test/swebench-judge.py +374 -0
  108. package/scripts/test/test-cache.sh +165 -0
  109. package/scripts/test/test-setup.sh +337 -0
  110. package/scripts/theme/compute-theme-tiers.sh +13 -0
  111. package/scripts/theme/compute_theme_tiers.py +402 -0
  112. package/scripts/theme/update-theme-tiers.sh +97 -0
  113. package/skills/finalize-run/SKILL.md +261 -0
  114. package/skills/judge/SKILL.md +644 -0
  115. package/skills/persona-benchmark/SKILL.md +187 -0
@@ -0,0 +1,192 @@
1
+ /**
2
+ * Tests for Story 93-1: Package Shell and Module Migration
3
+ *
4
+ * These tests verify that @pennyfarthing/benchmark correctly exports
5
+ * all functions and types from the two migrated modules:
6
+ * - job-fair-aggregator (9 functions, 11 types)
7
+ * - benchmark-integration (11 functions, 12 types)
8
+ *
9
+ * RED state: All tests fail because index.ts is an empty barrel.
10
+ * GREEN state: Tests pass once modules are moved and re-exported.
11
+ */
12
+ import { describe, it } from 'node:test';
13
+ import assert from 'node:assert';
14
+ import { readFileSync, existsSync } from 'fs';
15
+ import { join, dirname } from 'path';
16
+ import { fileURLToPath } from 'url';
17
+ const __dirname = dirname(fileURLToPath(import.meta.url));
18
+ // Use dynamic import + Record cast so tests compile but fail at runtime
19
+ async function loadPackageIndex() {
20
+ return await import('./index.js');
21
+ }
22
+ // ============================================================================
23
+ // AC5: Barrel index.ts exports all public functions and types
24
+ // ============================================================================
25
+ describe('Package Exports: job-fair-aggregator functions', () => {
26
+ it('should export aggregateJobFairResults', async () => {
27
+ const mod = await loadPackageIndex();
28
+ assert.strictEqual(typeof mod.aggregateJobFairResults, 'function', 'aggregateJobFairResults should be exported as a function');
29
+ });
30
+ it('should export getBaselineComparison', async () => {
31
+ const mod = await loadPackageIndex();
32
+ assert.strictEqual(typeof mod.getBaselineComparison, 'function', 'getBaselineComparison should be exported as a function');
33
+ });
34
+ it('should export getRoleStatistics', async () => {
35
+ const mod = await loadPackageIndex();
36
+ assert.strictEqual(typeof mod.getRoleStatistics, 'function', 'getRoleStatistics should be exported as a function');
37
+ });
38
+ it('should export getTopPerformers', async () => {
39
+ const mod = await loadPackageIndex();
40
+ assert.strictEqual(typeof mod.getTopPerformers, 'function', 'getTopPerformers should be exported as a function');
41
+ });
42
+ it('should export getHistoricalTrend', async () => {
43
+ const mod = await loadPackageIndex();
44
+ assert.strictEqual(typeof mod.getHistoricalTrend, 'function', 'getHistoricalTrend should be exported as a function');
45
+ });
46
+ it('should export saveHistoricalSnapshot', async () => {
47
+ const mod = await loadPackageIndex();
48
+ assert.strictEqual(typeof mod.saveHistoricalSnapshot, 'function', 'saveHistoricalSnapshot should be exported as a function');
49
+ });
50
+ it('should export aggregateByDimension', async () => {
51
+ const mod = await loadPackageIndex();
52
+ assert.strictEqual(typeof mod.aggregateByDimension, 'function', 'aggregateByDimension should be exported as a function');
53
+ });
54
+ it('should export getDimensionValues', async () => {
55
+ const mod = await loadPackageIndex();
56
+ assert.strictEqual(typeof mod.getDimensionValues, 'function', 'getDimensionValues should be exported as a function');
57
+ });
58
+ it('should export generateDifferentialReport', async () => {
59
+ const mod = await loadPackageIndex();
60
+ assert.strictEqual(typeof mod.generateDifferentialReport, 'function', 'generateDifferentialReport should be exported as a function');
61
+ });
62
+ });
63
+ describe('Package Exports: benchmark-integration functions', () => {
64
+ it('should export loadBenchmarkData', async () => {
65
+ const mod = await loadPackageIndex();
66
+ assert.strictEqual(typeof mod.loadBenchmarkData, 'function', 'loadBenchmarkData should be exported as a function');
67
+ });
68
+ it('should export getBenchmarkWithFace', async () => {
69
+ const mod = await loadPackageIndex();
70
+ assert.strictEqual(typeof mod.getBenchmarkWithFace, 'function', 'getBenchmarkWithFace should be exported as a function');
71
+ });
72
+ it('should export calculateOceanCorrelation', async () => {
73
+ const mod = await loadPackageIndex();
74
+ assert.strictEqual(typeof mod.calculateOceanCorrelation, 'function', 'calculateOceanCorrelation should be exported as a function');
75
+ });
76
+ it('should export generateCorrelationReport', async () => {
77
+ const mod = await loadPackageIndex();
78
+ assert.strictEqual(typeof mod.generateCorrelationReport, 'function', 'generateCorrelationReport should be exported as a function');
79
+ });
80
+ it('should export getOptimalProfile', async () => {
81
+ const mod = await loadPackageIndex();
82
+ assert.strictEqual(typeof mod.getOptimalProfile, 'function', 'getOptimalProfile should be exported as a function');
83
+ });
84
+ it('should export getRoleRecommendations', async () => {
85
+ const mod = await loadPackageIndex();
86
+ assert.strictEqual(typeof mod.getRoleRecommendations, 'function', 'getRoleRecommendations should be exported as a function');
87
+ });
88
+ it('should export findTopPerformers', async () => {
89
+ const mod = await loadPackageIndex();
90
+ assert.strictEqual(typeof mod.findTopPerformers, 'function', 'findTopPerformers should be exported as a function');
91
+ });
92
+ it('should export queryBenchmarks', async () => {
93
+ const mod = await loadPackageIndex();
94
+ assert.strictEqual(typeof mod.queryBenchmarks, 'function', 'queryBenchmarks should be exported as a function');
95
+ });
96
+ it('should export calculateErrorTypeCorrelation', async () => {
97
+ const mod = await loadPackageIndex();
98
+ assert.strictEqual(typeof mod.calculateErrorTypeCorrelation, 'function', 'calculateErrorTypeCorrelation should be exported as a function');
99
+ });
100
+ it('should export generateOceanErrorHeatMap', async () => {
101
+ const mod = await loadPackageIndex();
102
+ assert.strictEqual(typeof mod.generateOceanErrorHeatMap, 'function', 'generateOceanErrorHeatMap should be exported as a function');
103
+ });
104
+ it('should export generateBenchmarkReport', async () => {
105
+ const mod = await loadPackageIndex();
106
+ assert.strictEqual(typeof mod.generateBenchmarkReport, 'function', 'generateBenchmarkReport should be exported as a function');
107
+ });
108
+ });
109
+ // ============================================================================
110
+ // AC1: package.json exists with correct configuration
111
+ // ============================================================================
112
+ describe('Package Configuration', () => {
113
+ it('should have correct package name', () => {
114
+ const pkgPath = join(__dirname, '..', 'package.json');
115
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
116
+ assert.strictEqual(pkg.name, '@pennyfarthing/benchmark', 'Package name should be @pennyfarthing/benchmark');
117
+ });
118
+ it('should be type: module', () => {
119
+ const pkgPath = join(__dirname, '..', 'package.json');
120
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
121
+ assert.strictEqual(pkg.type, 'module', 'Package should use ESM');
122
+ });
123
+ it('should have peer dependency on @pennyfarthing/core', () => {
124
+ const pkgPath = join(__dirname, '..', 'package.json');
125
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
126
+ assert.ok(pkg.peerDependencies?.['@pennyfarthing/core'], 'Should have peer dependency on @pennyfarthing/core');
127
+ });
128
+ it('should have peer dependency on @pennyfarthing/shared', () => {
129
+ const pkgPath = join(__dirname, '..', 'package.json');
130
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
131
+ assert.ok(pkg.peerDependencies?.['@pennyfarthing/shared'], 'Should have peer dependency on @pennyfarthing/shared');
132
+ });
133
+ it('should have exports map with types and default', () => {
134
+ const pkgPath = join(__dirname, '..', 'package.json');
135
+ const pkg = JSON.parse(readFileSync(pkgPath, 'utf-8'));
136
+ assert.ok(pkg.exports?.['.']?.types, 'Should have types export');
137
+ assert.ok(pkg.exports?.['.']?.default, 'Should have default export');
138
+ });
139
+ });
140
+ // ============================================================================
141
+ // AC2: tsconfig.json extends base with composite: true
142
+ // ============================================================================
143
+ describe('TypeScript Configuration', () => {
144
+ it('should extend tsconfig.base.json', () => {
145
+ const tsconfigPath = join(__dirname, '..', 'tsconfig.json');
146
+ const tsconfig = JSON.parse(readFileSync(tsconfigPath, 'utf-8'));
147
+ assert.strictEqual(tsconfig.extends, '../../tsconfig.base.json', 'Should extend ../../tsconfig.base.json');
148
+ });
149
+ it('should have composite: true', () => {
150
+ const tsconfigPath = join(__dirname, '..', 'tsconfig.json');
151
+ const tsconfig = JSON.parse(readFileSync(tsconfigPath, 'utf-8'));
152
+ assert.strictEqual(tsconfig.compilerOptions?.composite, true, 'Should have composite: true');
153
+ });
154
+ });
155
+ // ============================================================================
156
+ // AC3/AC4: Source files exist in new location
157
+ // ============================================================================
158
+ describe('Source Files Exist', () => {
159
+ it('should have job-fair-aggregator.js in dist/', () => {
160
+ const filePath = join(__dirname, 'job-fair-aggregator.js');
161
+ assert.ok(existsSync(filePath), 'job-fair-aggregator.js should exist in dist/ (compiled from src/)');
162
+ });
163
+ it('should have benchmark-integration.js in dist/', () => {
164
+ const filePath = join(__dirname, 'benchmark-integration.js');
165
+ assert.ok(existsSync(filePath), 'benchmark-integration.js should exist in dist/ (compiled from src/)');
166
+ });
167
+ });
168
+ // ============================================================================
169
+ // AC6: Test files exist alongside source
170
+ // ============================================================================
171
+ describe('Test Files Exist', () => {
172
+ it('should have job-fair-aggregator.test.js in dist/', () => {
173
+ const filePath = join(__dirname, 'job-fair-aggregator.test.js');
174
+ assert.ok(existsSync(filePath), 'job-fair-aggregator.test.js should exist in dist/');
175
+ });
176
+ it('should have benchmark-integration.test.js in dist/', () => {
177
+ const filePath = join(__dirname, 'benchmark-integration.test.js');
178
+ assert.ok(existsSync(filePath), 'benchmark-integration.test.js should exist in dist/');
179
+ });
180
+ });
181
+ // ============================================================================
182
+ // Export count verification
183
+ // ============================================================================
184
+ describe('Export Completeness', () => {
185
+ it('should export at least 20 functions total', async () => {
186
+ const mod = await loadPackageIndex();
187
+ const exportedFunctions = Object.entries(mod)
188
+ .filter(([, v]) => typeof v === 'function');
189
+ assert.ok(exportedFunctions.length >= 20, `Should export at least 20 functions, got ${exportedFunctions.length}: ${exportedFunctions.map(([k]) => k).join(', ')}`);
190
+ });
191
+ });
192
+ //# sourceMappingURL=package-exports.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"package-exports.test.js","sourceRoot":"","sources":["../src/package-exports.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,WAAW,CAAC;AACzC,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAC9C,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAEpC,MAAM,SAAS,GAAG,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAE1D,wEAAwE;AACxE,KAAK,UAAU,gBAAgB;IAC7B,OAAO,MAAM,MAAM,CAAC,YAAY,CAA4B,CAAC;AAC/D,CAAC;AAED,+EAA+E;AAC/E,8DAA8D;AAC9D,+EAA+E;AAE/E,QAAQ,CAAC,gDAAgD,EAAE,GAAG,EAAE;IAC9D,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,uBAAuB,EAAE,UAAU,EAC/D,0DAA0D,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;QACnD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,qBAAqB,EAAE,UAAU,EAC7D,wDAAwD,CAAC,CAAC;IAC9D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,iBAAiB,EAAE,UAAU,EACzD,oDAAoD,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gCAAgC,EAAE,KAAK,IAAI,EAAE;QAC9C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,gBAAgB,EAAE,UAAU,EACxD,mDAAmD,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QAChD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,kBAAkB,EAAE,UAAU,EAC1D,qDAAqD,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,sBAAsB,EAAE,UAAU,EAC9D,yDAAyD,CAAC,CAAC;IAC/D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,oBAAoB,EAAE,UAAU,EAC5D,uDAAuD,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;QAChD,MAAM,GAAG,GAAG,MAAM,g
BAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,kBAAkB,EAAE,UAAU,EAC1D,qDAAqD,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,0BAA0B,EAAE,UAAU,EAClE,6DAA6D,CAAC,CAAC;IACnE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,QAAQ,CAAC,kDAAkD,EAAE,GAAG,EAAE;IAChE,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,iBAAiB,EAAE,UAAU,EACzD,oDAAoD,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;QAClD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,oBAAoB,EAAE,UAAU,EAC5D,uDAAuD,CAAC,CAAC;IAC7D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,yBAAyB,EAAE,UAAU,EACjE,4DAA4D,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,yBAAyB,EAAE,UAAU,EACjE,4DAA4D,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,iBAAiB,EAAE,UAAU,EACzD,oDAAoD,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;QACpD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,sBAAsB,EAAE,UAAU,EAC9D,yDAAyD,CAAC,CAAC;IAC/D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,iBAAiB,EAAE,UAAU,EACzD,oDAAoD,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+BAA+B,EAAE,KAAK,IAAI,EAAE;QAC7C,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,eAAe,EAAE,UAAU,EACvD,kDAAkD,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;QAC3D,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,6BAA6B,EAAE,UAAU,
EACrE,gEAAgE,CAAC,CAAC;IACtE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,yBAAyB,EAAE,UAAU,EACjE,4DAA4D,CAAC,CAAC;IAClE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,CAAC,WAAW,CAAC,OAAO,GAAG,CAAC,uBAAuB,EAAE,UAAU,EAC/D,0DAA0D,CAAC,CAAC;IAChE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAC/E,sDAAsD;AACtD,+EAA+E;AAE/E,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;QACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,EAAE,0BAA0B,EACrD,iDAAiD,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,wBAAwB,EAAE,GAAG,EAAE;QAChC,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;QACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,EAAE,QAAQ,EAAE,wBAAwB,CAAC,CAAC;IACnE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,GAAG,EAAE;QAC5D,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;QACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC,qBAAqB,CAAC,EACrD,oDAAoD,CAAC,CAAC;IAC1D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;QACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,gBAAgB,EAAE,CAAC,uBAAuB,CAAC,EACvD,sDAAsD,CAAC,CAAC;IAC5D,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,cAAc,CAAC,CAAC;QACtD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;QACvD,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,EAAE,0BAA0B,CAAC,CAAC;QACjE,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,OAAO,EAAE,CAAC,GAAG,CAAC,EA
AE,OAAO,EAAE,4BAA4B,CAAC,CAAC;IACvE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAC/E,uDAAuD;AACvD,+EAA+E;AAE/E,QAAQ,CAAC,0BAA0B,EAAE,GAAG,EAAE;IACxC,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;QAC5D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC,CAAC;QACjE,MAAM,CAAC,WAAW,CAAC,QAAQ,CAAC,OAAO,EAAE,0BAA0B,EAC7D,wCAAwC,CAAC,CAAC;IAC9C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;QAC5D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC,CAAC;QACjE,MAAM,CAAC,WAAW,CAAC,QAAQ,CAAC,eAAe,EAAE,SAAS,EAAE,IAAI,EAC1D,6BAA6B,CAAC,CAAC;IACnC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAC/E,8CAA8C;AAC9C,+EAA+E;AAE/E,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;IAClC,EAAE,CAAC,6CAA6C,EAAE,GAAG,EAAE;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,wBAAwB,CAAC,CAAC;QAC3D,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAC5B,mEAAmE,CAAC,CAAC;IACzE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,0BAA0B,CAAC,CAAC;QAC7D,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAC5B,qEAAqE,CAAC,CAAC;IAC3E,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAC/E,yCAAyC;AACzC,+EAA+E;AAE/E,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,EAAE,CAAC,kDAAkD,EAAE,GAAG,EAAE;QAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,6BAA6B,CAAC,CAAC;QAChE,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAC5B,mDAAmD,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,oDAAoD,EAAE,GAAG,EAAE;QAC5D,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,EAAE,+BAA+B,CAAC,CAAC;QAClE,MAAM,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAC5B,qDAAqD,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,+EAA+E;AAC/E,4BAA4B;AAC5B,+EAA+E;AAE/E,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;QACzD,MAAM,GAAG,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACrC,MAAM,iBAAiB,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;aAC1C,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,UAAU,CAAC,CAAC;QAC9C,MAAM,CAAC,EAAE,CAAC,i
BAAiB,CAAC,MAAM,IAAI,EAAE,EACtC,4CAA4C,iBAAiB,CAAC,MAAM,KAAK,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC7H,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,105 @@
1
+ # Benchmark Tier Methodology
2
+
3
+ This document explains how theme benchmark tiers are computed and what they mean.
4
+
5
+ ## Overview
6
+
7
+ Benchmark tiers measure how well a theme's personas perform compared to a **control baseline** (no persona applied). Higher tiers indicate better performance vs control.
8
+
9
+ ## Tier Definitions
10
+
11
+ | Tier | Delta vs Control | Description |
12
+ |------|------------------|-------------|
13
+ | S | >= +7 | Elite - top performers that significantly outperform control |
14
+ | A | >= +5 | Excellent - strong positive impact vs control |
15
+ | B | >= +3 | Strong - solid performers with measurable improvement |
16
+ | C | >= +1 | Good - above average, slight improvement |
17
+ | D | < +1 | Average/Below - no measurable improvement or worse |
18
+ | U | — | Unbenchmarked - no benchmark data available |
19
+
20
+ ## How Tiers Are Computed
21
+
22
+ ### Data Source
23
+
24
+ Tiers are computed from **Job Fair** benchmark results in `internal/results/job-fair/*/summary.yaml`. Each run tests all characters in a theme across multiple agent roles.
25
+
26
+ ### Normalization
27
+
28
+ Benchmark runs exist in two formats with different role sets:
29
+ - **Old format:** dev, reviewer, sm, tea (4 roles)
30
+ - **New format:** dev-codegen, dev-debug, reviewer, sm, tea, architect (6 roles)
31
+
32
+ To enable fair comparison across formats, we normalize dev roles:
33
+
34
+ ```
35
+ dev-codegen + dev-debug → averaged "dev" score
36
+ ```
37
+
38
+ Final comparison uses 4 normalized roles: **dev, reviewer, sm, tea**. The `architect` role exists only in the new format, so it is left out of the tier comparison.
39
+
40
+ ### Algorithm
41
+
42
+ 1. **Find summary files** in `internal/results/job-fair/*/`
43
+
44
+ 2. **Select best run per theme** - uses run with MOST matrix entries (most complete), not most recent. Minimum 20 entries required.
45
+
46
+ 3. **Normalize dev roles** - if dev-codegen/dev-debug exist, average them into synthetic "dev"
47
+
48
+ 4. **Compute role deltas** - for each role, compare theme mean vs control baseline mean
49
+
50
+ 5. **Average deltas** - mean delta across all 4 normalized roles
51
+
52
+ 6. **Assign tier** based on mean delta thresholds
53
+
54
+ ### Formula
55
+
56
+ ```
57
+ delta_role = theme_mean_role - baseline_mean_role
58
+ mean_delta = sum(delta_role) / 4 # across dev, reviewer, sm, tea
59
+ tier = threshold(mean_delta)
60
+ ```
61
+
62
+ ## Relationship to Zeitgeist Scores
63
+
64
+ Benchmark tiers measure **performance** - do personas help or hurt task completion?
65
+
66
+ Zeitgeist scores measure **articulation depth** - how much personality signal is embedded in the theme definition?
67
+
68
+ These are orthogonal dimensions:
69
+ - A theme can have high Zeitgeist (rich personalities) but low tier (poor performance)
70
+ - A theme can have low Zeitgeist (minimal personality) but high tier (great performance)
71
+
72
+ The ideal is high scores on both dimensions.
73
+
74
+ ## Running the Tier Script
75
+
76
+ ```bash
77
+ # Dry run - show what would change
78
+ pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh --dry-run
79
+
80
+ # Apply changes to theme files
81
+ pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh
82
+
83
+ # Verbose output with skipped runs
84
+ pennyfarthing-dist/scripts/theme/compute-theme-tiers.sh --dry-run --verbose
85
+ ```
86
+
87
+ ## Current Distribution
88
+
89
+ As of 2026-01-23 (percentages are of the 77 benchmarked themes; unbenchmarked "U" themes are excluded):
90
+
91
+ | Tier | Count | Percentage |
92
+ |------|-------|------------|
93
+ | S | 8 | 10% |
94
+ | A | 25 | 32% |
95
+ | B | 27 | 35% |
96
+ | C | 4 | 5% |
97
+ | D | 13 | 17% |
98
+ | U | 25 | — |
99
+
100
+ ## Key Design Decisions
101
+
102
+ 1. **Use most complete run** - prevents incomplete runs from overriding good data
103
+ 2. **Normalize dev roles** - enables fair comparison across benchmark formats
104
+ 3. **Minimum 20 entries** - rejects runs with too little data to produce a reliable mean (a sample-size floor, not a formal significance test)
105
+ 4. **4-role comparison** - dev, reviewer, sm, tea are the stable roles across formats
@@ -0,0 +1,311 @@
1
+ # Scientific Benchmarking Guide
2
+
3
+ Pennyfarthing provides a scientific benchmarking system for measuring persona performance against standardized scenarios. This guide explains how to use the benchmarking commands and interpret results.
4
+
5
+ ## Prerequisites
6
+
7
+ ### For Sequential Runs
8
+ No special setup required. Sequential benchmarks (one at a time) use standard interactive prompts.
9
+
10
+ ### For Parallel Runs
11
+ Running multiple benchmarks simultaneously via subagents requires explicit permissions. Run:
12
+
13
+ ```bash
14
+ pennyfarthing doctor --fix
15
+ ```
16
+
17
+ This adds the required permissions for parallel execution. See [PERMISSIONS.md](PERMISSIONS.md#benchmarking-permissions-parallel-runs) for details.
18
+
19
+ ## Overview
20
+
21
+ The benchmarking system allows you to:
22
+ - Run agents on standardized scenarios
23
+ - Create control baselines for comparison
24
+ - Quantify effect size (Cohen's d) and check statistical significance (95% confidence interval)
25
+ - Correlate OCEAN personality profiles with performance
26
+
27
+ ## Commands
28
+
29
+ ### `/solo` - Single Agent Evaluation
30
+
31
+ Run a single agent on a scenario.
32
+
33
+ ```bash
34
+ /solo discworld:reviewer --scenario order-service
35
+ /solo ted-lasso:sm --scenario sprint-planning-conflict --runs 4
36
+ /solo control:dev --scenario tdd-shopping-cart --no-judge
37
+ /solo shakespeare:prospero --as dev --scenario django-10554
38
+ ```
39
+
40
+ **Arguments:**
41
+ - `theme:agent` - Persona theme and role (e.g., `discworld:reviewer`), or theme and character name when using `--as` (e.g., `shakespeare:prospero`)
42
+ - `--scenario <name>` - Scenario from `scenarios/` directory
43
+ - `--as <role>` - (Optional) Cross-role testing: run character as different role
44
+ - `--runs N` - Number of runs (default: 1, max: 20)
45
+ - `--no-judge` - Skip evaluation, return raw response
46
+
47
+ **Output:**
48
+ - Agent response with character embodiment
49
+ - Judge evaluation (unless `--no-judge`)
50
+ - Score out of 100 with dimension breakdown
51
+ - Results saved to `results/solo/` or `results/benchmarks/`
52
+
53
+ ### `/benchmark-control` - Create Baseline
54
+
55
+ Create a control baseline for a scenario. Required before comparing personas.
56
+
57
+ ```bash
58
+ /benchmark-control reviewer --scenario order-service
59
+ /benchmark-control dev --scenario tdd-shopping-cart --runs 10
60
+ ```
61
+
62
+ **Arguments:**
63
+ - `agent` - Role to benchmark (sm, dev, reviewer, architect, tea)
64
+ - `--scenario <name>` - (Optional) Scenario name, or choose interactively
65
+ - `--runs N` - Number of runs (default: 10 for baselines)
66
+
67
+ **Output:**
68
+ - Baseline saved to `results/baselines/{scenario}/{role}/`
69
+ - Summary with mean, standard deviation, 95% CI
70
+
71
+ ### `/benchmark` - Compare Against Baseline
72
+
73
+ Compare a persona's performance against the control baseline.
74
+
75
+ ```bash
76
+ /benchmark discworld reviewer --scenario order-service
77
+ /benchmark the-expanse sm --scenario sprint-planning-conflict --runs 8
78
+ /benchmark shakespeare prospero --as dev --scenario django-10554
79
+ ```
80
+
81
+ **Arguments:**
82
+ - `theme` - Persona theme (e.g., `discworld`, `the-expanse`)
83
+ - `agent` - Role to benchmark (or character name if using `--as`)
84
+ - `--as <role>` - (Optional) Cross-role testing: run any character as any role
85
+ - `--scenario <name>` - (Optional) Scenario name, or choose interactively
86
+ - `--runs N` - Number of runs (default: 4)
87
+
88
+ **Output:**
89
+ - Comparison against baseline with effect size
90
+ - Results saved to `results/benchmarks/{scenario}/{theme}-{role}/`
91
+
92
+ ## Scenarios
93
+
94
+ Scenarios are standardized challenges located in `scenarios/`:
95
+
96
+ ```
97
+ scenarios/
98
+ ├── schema.yaml # Scenario format specification
99
+ ├── README.md # Scenario authoring guide
100
+ ├── architecture/ # Architect challenges
101
+ ├── code-review/ # Reviewer challenges
102
+ ├── dev/ # Developer challenges
103
+ ├── sm/ # Scrum Master challenges
104
+ ├── tea/ # Test Engineer challenges
105
+ └── debug/ # Debugging challenges
106
+ ```
107
+
108
+ ### Scenario Format
109
+
110
+ Each scenario is a YAML file with:
111
+
112
+ ```yaml
113
+ name: order-service
114
+ title: "E-commerce Order Service Review"
115
+ difficulty: medium
116
+ category: code-review
117
+
118
+ prompt: |
119
+ Review the following order processing service...
120
+
121
+ code: |
122
+ // Code to review (optional)
123
+ function processOrder(order) { ... }
124
+
125
+ # Optional: Expected findings for checklist-based scoring
126
+ baseline_issues:
127
+ critical:
128
+ - id: SQL_INJECTION
129
+ description: "Unsanitized user input in query"
130
+ high:
131
+ - id: MISSING_VALIDATION
132
+ description: "No input validation on order amounts"
133
+ ```
134
+
135
+ ### Difficulty Calibration
136
+
137
+ | Difficulty | Expected Score Range | Description |
138
+ |------------|---------------------|-------------|
139
+ | easy | 85-100 | Most agents succeed |
140
+ | medium | 70-85 | Moderate challenge |
141
+ | hard | 55-70 | Significant challenge |
142
+ | extreme | <55 | Most agents struggle |
143
+
144
+ ## Evaluation Rubrics
145
+
146
+ ### Generic Rubric (25% each)
147
+
148
+ | Dimension | Criteria |
149
+ |-----------|----------|
150
+ | **Correctness** | Technical accuracy. Right issues? Valid solutions? |
151
+ | **Depth** | Thoroughness. Root causes? Implications? |
152
+ | **Quality** | Clarity and actionability. Organized? Useful? |
153
+ | **Persona** | Character embodiment. Consistent? Added value? |
154
+
155
+ ### Checklist Rubric (for scenarios with baseline_issues)
156
+
157
+ | Component | Weight | Scoring |
158
+ |-----------|--------|---------|
159
+ | Detection | 50% | critical×15 + high×10 + medium×5 + low×2 |
160
+ | Quality | 25% | Explanations + actionable fixes |
161
+ | Persona | 25% | In-character + professional tone |
162
+
163
+ ## Statistical Analysis
164
+
165
+ ### Effect Size (Cohen's d)
166
+
167
+ Measures the magnitude of difference between persona and baseline:
168
+
169
+ | Cohen's d | Interpretation |
170
+ |-----------|----------------|
171
+ | < 0.2 | Negligible |
172
+ | 0.2 - 0.5 | Small |
173
+ | 0.5 - 0.8 | Medium |
174
+ | ≥ 0.8 | Large |
175
+
176
+ ### 95% Confidence Interval
177
+
178
+ If the 95% confidence interval for the difference between the persona mean and the baseline mean doesn't include 0, the difference is statistically significant (p < 0.05).
179
+
180
+ ## Results Structure
181
+
182
+ ```
183
+ results/
184
+ ├── solo/ # Single runs (not benchmarking)
185
+ │ └── {timestamp}-{theme}-{role}.json
186
+ ├── baselines/ # Control baselines
187
+ │ └── {scenario}/
188
+ │ └── {role}/
189
+ │ ├── runs/
190
+ │ │ ├── run_1.json
191
+ │ │ └── judge_1.json
192
+ │ └── summary.yaml
193
+ ├── benchmarks/ # Persona comparisons
194
+ │ └── {scenario}/
195
+ │ └── {theme}-{role}/
196
+ │ ├── runs/
197
+ │ │ ├── run_1.json
198
+ │ │ └── judge_1.json
199
+ │ └── summary.yaml
200
+ └── job-fair/ # Job fair results
201
+ └── {theme}-{timestamp}/
202
+ └── report.md
203
+ ```
204
+
205
+ ### Summary.yaml Format
206
+
207
+ ```yaml
208
+ agent:
209
+ theme: discworld
210
+ role: reviewer
211
+ spec: discworld:reviewer
212
+ character: Lord Vetinari
213
+
214
+ scenario:
215
+ name: order-service
216
+ category: code-review
217
+ difficulty: medium
218
+
219
+ statistics:
220
+ n: 4
221
+ mean: 85.50
222
+ std_dev: 3.42
223
+ min: 81
224
+ max: 89
225
+ scores: [81, 85, 87, 89]
226
+
227
+ baseline_comparison:
228
+ control_mean: 78.30
229
+ control_stddev: 4.21
230
+ delta: +7.20
231
+ cohens_d: 1.87
232
+
233
+ runs:
234
+ - run_1.json
235
+ - run_2.json
236
+ - run_3.json
237
+ - run_4.json
238
+ ```
239
+
240
+ ## OCEAN Correlation
241
+
242
+ Pennyfarthing tracks OCEAN (Big Five) personality profiles for all personas. The `benchmark-integration.ts` module correlates these with benchmark performance:
243
+
244
+ ```typescript
245
+ import { calculateOceanCorrelation } from './scripts/benchmark-integration.js';
246
+
247
+ const correlation = calculateOceanCorrelation('order-service', 'reviewer');
248
+ // Returns which OCEAN dimensions correlate with better performance
249
+ ```
250
+
251
+ ### Interpreting Correlations
252
+
253
+ - **Positive correlation**: Higher trait scores → better performance
254
+ - **Negative correlation**: Lower trait scores → better performance
255
+ - **Effect size**: Magnitude of the correlation in points
256
+
257
+ ## Environment Variables
258
+
259
+ | Variable | Description | Default |
260
+ |----------|-------------|---------|
261
+ | `BENCHMARK_PATH` | Override benchmark results location | `./results` |
262
+
263
+ ## Example Workflow
264
+
265
+ ```bash
266
+ # 1. Create baseline for reviewer role (run 10 times)
267
+ /benchmark-control reviewer --scenario order-service
268
+
269
+ # 2. Benchmark a persona against baseline
270
+ /benchmark discworld reviewer --scenario order-service --runs 4
271
+
272
+ # 3. Benchmark another persona
273
+ /benchmark the-expanse reviewer --scenario order-service --runs 4
274
+
275
+ # 4. Cross-role benchmark (character in different role)
276
+ /benchmark shakespeare prospero --as dev --scenario django-10554 --runs 4
277
+
278
+ # 5. View results
279
+ cat results/benchmarks/order-service/discworld-reviewer/summary.yaml
280
+ ```
281
+
282
+ ## Integrity Requirements
283
+
284
+ The benchmarking system enforces strict integrity:
285
+
286
+ 1. **Proof-of-Work**: All runs include timestamps, token counts, and full responses
287
+ 2. **Validation**: `/finalize-run` skill validates all data before saving
288
+ 3. **No Fabrication**: Missing or invalid data is rejected, not estimated
289
+ 4. **Tool Restriction**: `/solo` uses `--tools ""` to prevent multi-turn contamination
290
+
291
+ ## Related Files
292
+
293
+ - `scenarios/schema.yaml` - Full scenario schema
294
+ - `scenarios/README.md` - Scenario authoring guide
295
+ - `commands/solo.md` - Solo command implementation
296
+ - `commands/benchmark.md` - Benchmark command implementation
297
+ - `skills/judge/SKILL.md` - Evaluation rubrics
298
+ - `skills/finalize-run/SKILL.md` - Result validation
299
+ - `src/benchmark-integration.ts` - OCEAN correlation module
300
+
301
+ ---
302
+
303
+ ## TRAIL-OCEAN Research
304
+
305
+ For hypothesis-driven research correlating OCEAN dimensions with error detection:
306
+
307
+ - [TRAIL-OCEAN Hypothesis Mapping](../../../pennyfarthing-dist/personas/TRAIL-OCEAN-MAPPING.md) - Complete hypothesis document
308
+
309
+ ## Legacy Framework
310
+
311
+ Note: The legacy framework (using `just` commands) is documented in [docs/archive/benchmarks-legacy.md](../../../docs/archive/benchmarks-legacy.md). The current system uses `/solo`, `/benchmark-control`, and `/benchmark`.