@plaited/acp-harness 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -47
- package/bin/cli.ts +1 -1
- package/package.json +3 -3
- package/src/adapter-check.ts +3 -3
- package/src/adapter-scaffold.ts +10 -10
- package/src/balance.ts +31 -6
- package/src/calibrate.ts +23 -4
- package/src/summarize.ts +18 -4
- package/src/tests/adapter-scaffold.spec.ts +5 -5
- package/src/tests/balance-helpers.spec.ts +279 -0
- package/src/tests/calibrate-helpers.spec.ts +226 -0
- package/src/tests/capture-helpers.spec.ts +553 -0
- package/src/tests/summarize-helpers.spec.ts +339 -0
- package/src/tests/trials-calculations.spec.ts +209 -0
- package/src/trials.ts +14 -2
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import type { CaptureResult } from '../schemas.ts'
|
|
3
|
+
import { formatMarkdown, formatSummary } from '../summarize.ts'
|
|
4
|
+
|
|
5
|
+
// ============================================================================
|
|
6
|
+
// Test Fixtures
|
|
7
|
+
// ============================================================================
|
|
8
|
+
|
|
9
|
+
const createBasicResult = (overrides?: Partial<CaptureResult>): CaptureResult => ({
|
|
10
|
+
id: 'test-001',
|
|
11
|
+
input: 'What is 2+2?',
|
|
12
|
+
output: 'The answer is 4.',
|
|
13
|
+
trajectory: [
|
|
14
|
+
{ type: 'thought', content: 'Let me think about this...', timestamp: 0 },
|
|
15
|
+
{ type: 'message', content: 'The answer is 4.', timestamp: 100 },
|
|
16
|
+
],
|
|
17
|
+
metadata: { category: 'math', agent: 'test-agent' },
|
|
18
|
+
timing: { start: 1000, end: 2000 },
|
|
19
|
+
toolErrors: false,
|
|
20
|
+
...overrides,
|
|
21
|
+
})
|
|
22
|
+
|
|
23
|
+
const createResultWithToolCalls = (): CaptureResult => ({
|
|
24
|
+
id: 'test-002',
|
|
25
|
+
input: 'Read and summarize file.txt',
|
|
26
|
+
output: 'File contains important data.',
|
|
27
|
+
trajectory: [
|
|
28
|
+
{ type: 'thought', content: 'I will read the file first.', timestamp: 0 },
|
|
29
|
+
{
|
|
30
|
+
type: 'tool_call',
|
|
31
|
+
name: 'Read',
|
|
32
|
+
status: 'completed',
|
|
33
|
+
input: { file_path: '/path/to/file.txt' },
|
|
34
|
+
output: 'file contents here',
|
|
35
|
+
duration: 50,
|
|
36
|
+
timestamp: 100,
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
type: 'tool_call',
|
|
40
|
+
name: 'Write',
|
|
41
|
+
status: 'completed',
|
|
42
|
+
input: { file_path: '/output.md', content: 'Summary here' },
|
|
43
|
+
duration: 30,
|
|
44
|
+
timestamp: 200,
|
|
45
|
+
},
|
|
46
|
+
{ type: 'message', content: 'File contains important data.', timestamp: 300 },
|
|
47
|
+
],
|
|
48
|
+
metadata: { agent: 'test-agent' },
|
|
49
|
+
timing: { start: 1000, end: 1500 },
|
|
50
|
+
toolErrors: false,
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
// ============================================================================
|
|
54
|
+
// formatSummary
|
|
55
|
+
// ============================================================================
|
|
56
|
+
|
|
57
|
+
describe('formatSummary', () => {
|
|
58
|
+
test('extracts id, input, and output', () => {
|
|
59
|
+
const result = createBasicResult()
|
|
60
|
+
const summary = formatSummary(result)
|
|
61
|
+
|
|
62
|
+
expect(summary.id).toBe('test-001')
|
|
63
|
+
expect(summary.input).toBe('What is 2+2?')
|
|
64
|
+
expect(summary.output).toBe('The answer is 4.')
|
|
65
|
+
})
|
|
66
|
+
|
|
67
|
+
test('extracts tool call names', () => {
|
|
68
|
+
const result = createResultWithToolCalls()
|
|
69
|
+
const summary = formatSummary(result)
|
|
70
|
+
|
|
71
|
+
expect(summary.toolCalls).toEqual(['Read', 'Write'])
|
|
72
|
+
})
|
|
73
|
+
|
|
74
|
+
test('calculates duration from timing', () => {
|
|
75
|
+
const result = createBasicResult()
|
|
76
|
+
const summary = formatSummary(result)
|
|
77
|
+
|
|
78
|
+
expect(summary.duration).toBe(1000) // 2000 - 1000
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
test('handles empty trajectory', () => {
|
|
82
|
+
const result = createBasicResult({ trajectory: [] })
|
|
83
|
+
const summary = formatSummary(result)
|
|
84
|
+
|
|
85
|
+
expect(summary.toolCalls).toEqual([])
|
|
86
|
+
})
|
|
87
|
+
|
|
88
|
+
test('filters only tool_call steps for toolCalls list', () => {
|
|
89
|
+
const result = createBasicResult()
|
|
90
|
+
const summary = formatSummary(result)
|
|
91
|
+
|
|
92
|
+
// trajectory has thought and message, but no tool_call
|
|
93
|
+
expect(summary.toolCalls).toEqual([])
|
|
94
|
+
})
|
|
95
|
+
|
|
96
|
+
test('handles trajectory with only messages', () => {
|
|
97
|
+
const result = createBasicResult({
|
|
98
|
+
trajectory: [
|
|
99
|
+
{ type: 'message', content: 'First message', timestamp: 0 },
|
|
100
|
+
{ type: 'message', content: 'Second message', timestamp: 100 },
|
|
101
|
+
],
|
|
102
|
+
})
|
|
103
|
+
const summary = formatSummary(result)
|
|
104
|
+
|
|
105
|
+
expect(summary.toolCalls).toEqual([])
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
test('preserves original input/output exactly', () => {
|
|
109
|
+
const result = createBasicResult({
|
|
110
|
+
input: 'Input with\nnewlines and "quotes"',
|
|
111
|
+
output: 'Output with\ttabs',
|
|
112
|
+
})
|
|
113
|
+
const summary = formatSummary(result)
|
|
114
|
+
|
|
115
|
+
expect(summary.input).toBe('Input with\nnewlines and "quotes"')
|
|
116
|
+
expect(summary.output).toBe('Output with\ttabs')
|
|
117
|
+
})
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
// ============================================================================
|
|
121
|
+
// formatMarkdown
|
|
122
|
+
// ============================================================================
|
|
123
|
+
|
|
124
|
+
describe('formatMarkdown', () => {
|
|
125
|
+
test('includes evaluation record header with id', () => {
|
|
126
|
+
const result = createBasicResult()
|
|
127
|
+
const markdown = formatMarkdown(result)
|
|
128
|
+
|
|
129
|
+
expect(markdown).toContain('## Evaluation Record: test-001')
|
|
130
|
+
})
|
|
131
|
+
|
|
132
|
+
test('includes input field', () => {
|
|
133
|
+
const result = createBasicResult()
|
|
134
|
+
const markdown = formatMarkdown(result)
|
|
135
|
+
|
|
136
|
+
expect(markdown).toContain('**Input:** What is 2+2?')
|
|
137
|
+
})
|
|
138
|
+
|
|
139
|
+
test('includes trajectory section', () => {
|
|
140
|
+
const result = createBasicResult()
|
|
141
|
+
const markdown = formatMarkdown(result)
|
|
142
|
+
|
|
143
|
+
expect(markdown).toContain('**Trajectory:**')
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
test('formats thought steps with truncation', () => {
|
|
147
|
+
const result = createBasicResult({
|
|
148
|
+
trajectory: [
|
|
149
|
+
{ type: 'thought', content: 'Short thought', timestamp: 0 },
|
|
150
|
+
{ type: 'thought', content: 'A'.repeat(150), timestamp: 100 },
|
|
151
|
+
],
|
|
152
|
+
})
|
|
153
|
+
const markdown = formatMarkdown(result)
|
|
154
|
+
|
|
155
|
+
expect(markdown).toContain('[THOUGHT] Short thought')
|
|
156
|
+
expect(markdown).toContain(`[THOUGHT] ${'A'.repeat(100)}...`)
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
test('formats tool calls with status and duration', () => {
|
|
160
|
+
const result = createResultWithToolCalls()
|
|
161
|
+
const markdown = formatMarkdown(result)
|
|
162
|
+
|
|
163
|
+
expect(markdown).toContain('[TOOL:Read] -> completed (50ms)')
|
|
164
|
+
expect(markdown).toContain('[TOOL:Write] -> completed (30ms)')
|
|
165
|
+
})
|
|
166
|
+
|
|
167
|
+
test('includes file path for tool calls', () => {
|
|
168
|
+
const result = createResultWithToolCalls()
|
|
169
|
+
const markdown = formatMarkdown(result)
|
|
170
|
+
|
|
171
|
+
expect(markdown).toContain('File: /path/to/file.txt')
|
|
172
|
+
expect(markdown).toContain('File: /output.md')
|
|
173
|
+
})
|
|
174
|
+
|
|
175
|
+
test('includes step IDs for reference', () => {
|
|
176
|
+
const result = createBasicResult()
|
|
177
|
+
const markdown = formatMarkdown(result)
|
|
178
|
+
|
|
179
|
+
expect(markdown).toContain('[→test-001-step-1]')
|
|
180
|
+
expect(markdown).toContain('[→test-001-step-2]')
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
test('formats plan steps', () => {
|
|
184
|
+
const result = createBasicResult({
|
|
185
|
+
trajectory: [
|
|
186
|
+
{
|
|
187
|
+
type: 'plan',
|
|
188
|
+
entries: [
|
|
189
|
+
{ content: 'Step 1', status: 'completed' },
|
|
190
|
+
{ content: 'Step 2', status: 'in_progress' },
|
|
191
|
+
],
|
|
192
|
+
timestamp: 0,
|
|
193
|
+
},
|
|
194
|
+
],
|
|
195
|
+
})
|
|
196
|
+
const markdown = formatMarkdown(result)
|
|
197
|
+
|
|
198
|
+
expect(markdown).toContain('[PLAN]')
|
|
199
|
+
expect(markdown).toContain('Step 1: completed')
|
|
200
|
+
expect(markdown).toContain('Step 2: in_progress')
|
|
201
|
+
})
|
|
202
|
+
|
|
203
|
+
test('truncates long plan summaries', () => {
|
|
204
|
+
const result = createBasicResult({
|
|
205
|
+
trajectory: [
|
|
206
|
+
{
|
|
207
|
+
type: 'plan',
|
|
208
|
+
entries: [
|
|
209
|
+
{ content: 'A very long step description that goes on and on', status: 'completed' },
|
|
210
|
+
{ content: 'Another very long step description', status: 'pending' },
|
|
211
|
+
{ content: 'Yet another step', status: 'pending' },
|
|
212
|
+
],
|
|
213
|
+
timestamp: 0,
|
|
214
|
+
},
|
|
215
|
+
],
|
|
216
|
+
})
|
|
217
|
+
const markdown = formatMarkdown(result)
|
|
218
|
+
|
|
219
|
+
expect(markdown).toContain('...')
|
|
220
|
+
})
|
|
221
|
+
|
|
222
|
+
test('formats message steps', () => {
|
|
223
|
+
const result = createBasicResult({
|
|
224
|
+
trajectory: [{ type: 'message', content: 'Here is my response to your question.', timestamp: 0 }],
|
|
225
|
+
})
|
|
226
|
+
const markdown = formatMarkdown(result)
|
|
227
|
+
|
|
228
|
+
expect(markdown).toContain('[MESSAGE] Here is my response')
|
|
229
|
+
})
|
|
230
|
+
|
|
231
|
+
test('includes output preview', () => {
|
|
232
|
+
const result = createBasicResult()
|
|
233
|
+
const markdown = formatMarkdown(result)
|
|
234
|
+
|
|
235
|
+
expect(markdown).toContain('**Output:** The answer is 4.')
|
|
236
|
+
})
|
|
237
|
+
|
|
238
|
+
test('truncates long output', () => {
|
|
239
|
+
const result = createBasicResult({
|
|
240
|
+
output: 'X'.repeat(300),
|
|
241
|
+
})
|
|
242
|
+
const markdown = formatMarkdown(result)
|
|
243
|
+
|
|
244
|
+
expect(markdown).toContain(`${'X'.repeat(200)}...`)
|
|
245
|
+
})
|
|
246
|
+
|
|
247
|
+
test('includes metadata', () => {
|
|
248
|
+
const result = createBasicResult()
|
|
249
|
+
const markdown = formatMarkdown(result)
|
|
250
|
+
|
|
251
|
+
expect(markdown).toContain('**Metadata:**')
|
|
252
|
+
expect(markdown).toContain('category=math')
|
|
253
|
+
expect(markdown).toContain('agent=test-agent')
|
|
254
|
+
})
|
|
255
|
+
|
|
256
|
+
test('includes tool errors status', () => {
|
|
257
|
+
const result = createBasicResult({ toolErrors: true })
|
|
258
|
+
const markdown = formatMarkdown(result)
|
|
259
|
+
|
|
260
|
+
expect(markdown).toContain('**Tool Errors:** true')
|
|
261
|
+
})
|
|
262
|
+
|
|
263
|
+
test('includes duration', () => {
|
|
264
|
+
const result = createBasicResult()
|
|
265
|
+
const markdown = formatMarkdown(result)
|
|
266
|
+
|
|
267
|
+
expect(markdown).toContain('**Duration:** 1000ms')
|
|
268
|
+
})
|
|
269
|
+
|
|
270
|
+
test('includes score when present', () => {
|
|
271
|
+
const result = createBasicResult({
|
|
272
|
+
score: {
|
|
273
|
+
pass: true,
|
|
274
|
+
score: 0.95,
|
|
275
|
+
reasoning: 'Correct answer provided',
|
|
276
|
+
},
|
|
277
|
+
})
|
|
278
|
+
const markdown = formatMarkdown(result)
|
|
279
|
+
|
|
280
|
+
expect(markdown).toContain('**Score:** PASS (0.95)')
|
|
281
|
+
expect(markdown).toContain('**Reasoning:** Correct answer provided')
|
|
282
|
+
})
|
|
283
|
+
|
|
284
|
+
test('handles failed score', () => {
|
|
285
|
+
const result = createBasicResult({
|
|
286
|
+
score: {
|
|
287
|
+
pass: false,
|
|
288
|
+
score: 0.2,
|
|
289
|
+
reasoning: 'Incorrect answer',
|
|
290
|
+
},
|
|
291
|
+
})
|
|
292
|
+
const markdown = formatMarkdown(result)
|
|
293
|
+
|
|
294
|
+
expect(markdown).toContain('**Score:** FAIL (0.2)')
|
|
295
|
+
})
|
|
296
|
+
|
|
297
|
+
test('includes content preview with syntax highlighting', () => {
|
|
298
|
+
const result: CaptureResult = {
|
|
299
|
+
id: 'test-003',
|
|
300
|
+
input: 'Write a function',
|
|
301
|
+
output: 'Done',
|
|
302
|
+
trajectory: [
|
|
303
|
+
{
|
|
304
|
+
type: 'tool_call',
|
|
305
|
+
name: 'Write',
|
|
306
|
+
status: 'completed',
|
|
307
|
+
input: {
|
|
308
|
+
file_path: '/src/utils.ts',
|
|
309
|
+
content: 'export const add = (a: number, b: number) => a + b;',
|
|
310
|
+
},
|
|
311
|
+
duration: 20,
|
|
312
|
+
timestamp: 0,
|
|
313
|
+
},
|
|
314
|
+
],
|
|
315
|
+
metadata: { agent: 'test' },
|
|
316
|
+
timing: { start: 0, end: 100 },
|
|
317
|
+
toolErrors: false,
|
|
318
|
+
}
|
|
319
|
+
const markdown = formatMarkdown(result)
|
|
320
|
+
|
|
321
|
+
expect(markdown).toContain('```ts')
|
|
322
|
+
expect(markdown).toContain('export const add')
|
|
323
|
+
})
|
|
324
|
+
|
|
325
|
+
test('ends with horizontal rule separator', () => {
|
|
326
|
+
const result = createBasicResult()
|
|
327
|
+
const markdown = formatMarkdown(result)
|
|
328
|
+
|
|
329
|
+
expect(markdown).toContain('---')
|
|
330
|
+
})
|
|
331
|
+
|
|
332
|
+
test('handles empty trajectory', () => {
|
|
333
|
+
const result = createBasicResult({ trajectory: [] })
|
|
334
|
+
const markdown = formatMarkdown(result)
|
|
335
|
+
|
|
336
|
+
expect(markdown).toContain('**Trajectory:**')
|
|
337
|
+
expect(markdown).toContain('**Output:**')
|
|
338
|
+
})
|
|
339
|
+
})
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import { calculatePassAtK, calculatePassExpK } from '../trials.ts'
|
|
3
|
+
|
|
4
|
+
// ============================================================================
|
|
5
|
+
// calculatePassAtK
|
|
6
|
+
// ============================================================================
|
|
7
|
+
|
|
8
|
+
describe('calculatePassAtK', () => {
|
|
9
|
+
test('returns 1 when all trials pass', () => {
|
|
10
|
+
expect(calculatePassAtK(5, 5)).toBe(1)
|
|
11
|
+
expect(calculatePassAtK(10, 10)).toBe(1)
|
|
12
|
+
expect(calculatePassAtK(1, 1)).toBe(1)
|
|
13
|
+
})
|
|
14
|
+
|
|
15
|
+
test('returns 0 when no trials pass', () => {
|
|
16
|
+
expect(calculatePassAtK(0, 5)).toBe(0)
|
|
17
|
+
expect(calculatePassAtK(0, 10)).toBe(0)
|
|
18
|
+
expect(calculatePassAtK(0, 1)).toBe(0)
|
|
19
|
+
})
|
|
20
|
+
|
|
21
|
+
test('calculates probability correctly for partial passes', () => {
|
|
22
|
+
// pass@k = 1 - (1 - passRate)^k
|
|
23
|
+
// For 3 passes out of 5: passRate = 0.6
|
|
24
|
+
// pass@5 = 1 - (0.4)^5 = 1 - 0.01024 = 0.98976
|
|
25
|
+
const result = calculatePassAtK(3, 5)
|
|
26
|
+
expect(result).toBeCloseTo(0.98976, 5)
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
test('k=1 equals the pass rate', () => {
|
|
30
|
+
// For k=1, pass@1 = 1 - (1 - p)^1 = p
|
|
31
|
+
expect(calculatePassAtK(1, 1)).toBe(1)
|
|
32
|
+
|
|
33
|
+
// More interesting: 0 passes, 1 trial
|
|
34
|
+
expect(calculatePassAtK(0, 1)).toBe(0)
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
test('higher pass rate yields higher pass@k', () => {
|
|
38
|
+
const lowPassRate = calculatePassAtK(1, 5) // 20% pass rate
|
|
39
|
+
const highPassRate = calculatePassAtK(4, 5) // 80% pass rate
|
|
40
|
+
|
|
41
|
+
expect(highPassRate).toBeGreaterThan(lowPassRate)
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
test('larger k amplifies probability of at least one pass', () => {
|
|
45
|
+
// With 50% pass rate, larger k means higher chance of at least one pass
|
|
46
|
+
// k=2: 1 - (0.5)^2 = 0.75
|
|
47
|
+
// k=4: 1 - (0.5)^4 = 0.9375
|
|
48
|
+
|
|
49
|
+
const k2 = calculatePassAtK(1, 2) // 50% pass rate
|
|
50
|
+
const k4 = calculatePassAtK(2, 4) // Also 50% pass rate
|
|
51
|
+
|
|
52
|
+
expect(k4).toBeGreaterThan(k2)
|
|
53
|
+
})
|
|
54
|
+
|
|
55
|
+
test('handles edge case where passes equals k', () => {
|
|
56
|
+
expect(calculatePassAtK(3, 3)).toBe(1)
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
test('handles passes greater than k (returns 1)', () => {
|
|
60
|
+
// This shouldn't happen in practice, but the function handles it
|
|
61
|
+
expect(calculatePassAtK(10, 5)).toBe(1)
|
|
62
|
+
})
|
|
63
|
+
|
|
64
|
+
test('mathematical verification with known values', () => {
|
|
65
|
+
// 1 out of 3 passes: passRate = 1/3
|
|
66
|
+
// pass@3 = 1 - (2/3)^3 = 1 - 8/27 = 19/27 ≈ 0.7037
|
|
67
|
+
const result = calculatePassAtK(1, 3)
|
|
68
|
+
expect(result).toBeCloseTo(19 / 27, 5)
|
|
69
|
+
|
|
70
|
+
// 2 out of 4 passes: passRate = 0.5
|
|
71
|
+
// pass@4 = 1 - (0.5)^4 = 1 - 0.0625 = 0.9375
|
|
72
|
+
const result2 = calculatePassAtK(2, 4)
|
|
73
|
+
expect(result2).toBeCloseTo(0.9375, 5)
|
|
74
|
+
})
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
// ============================================================================
|
|
78
|
+
// calculatePassExpK
|
|
79
|
+
// ============================================================================
|
|
80
|
+
|
|
81
|
+
describe('calculatePassExpK', () => {
|
|
82
|
+
test('returns 1 when all trials pass', () => {
|
|
83
|
+
expect(calculatePassExpK(5, 5)).toBe(1)
|
|
84
|
+
expect(calculatePassExpK(10, 10)).toBe(1)
|
|
85
|
+
expect(calculatePassExpK(1, 1)).toBe(1)
|
|
86
|
+
})
|
|
87
|
+
|
|
88
|
+
test('returns 0 when no trials pass', () => {
|
|
89
|
+
expect(calculatePassExpK(0, 5)).toBe(0)
|
|
90
|
+
expect(calculatePassExpK(0, 10)).toBe(0)
|
|
91
|
+
expect(calculatePassExpK(0, 1)).toBe(0)
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
test('calculates probability correctly', () => {
|
|
95
|
+
// pass^k = passRate^k
|
|
96
|
+
// For 3 passes out of 5: passRate = 0.6
|
|
97
|
+
// pass^5 = (0.6)^5 = 0.07776
|
|
98
|
+
const result = calculatePassExpK(3, 5)
|
|
99
|
+
expect(result).toBeCloseTo(0.07776, 5)
|
|
100
|
+
})
|
|
101
|
+
|
|
102
|
+
test('k=1 equals the pass rate', () => {
|
|
103
|
+
// For k=1, pass^1 = p^1 = p
|
|
104
|
+
expect(calculatePassExpK(1, 1)).toBe(1)
|
|
105
|
+
})
|
|
106
|
+
|
|
107
|
+
test('higher pass rate yields higher pass^k', () => {
|
|
108
|
+
const lowPassRate = calculatePassExpK(1, 5) // 20% pass rate
|
|
109
|
+
const highPassRate = calculatePassExpK(4, 5) // 80% pass rate
|
|
110
|
+
|
|
111
|
+
expect(highPassRate).toBeGreaterThan(lowPassRate)
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('larger k reduces probability of all passing (for non-100% rates)', () => {
|
|
115
|
+
// With 80% pass rate:
|
|
116
|
+
// k=2: (0.8)^2 = 0.64
|
|
117
|
+
// k=5: (0.8)^5 = 0.32768
|
|
118
|
+
|
|
119
|
+
// Mathematical verification using known formulas
|
|
120
|
+
const k2_fair = 0.8 ** 2 // = 0.64
|
|
121
|
+
const k5_fair = 0.8 ** 5 // = 0.32768
|
|
122
|
+
|
|
123
|
+
expect(k5_fair).toBeLessThan(k2_fair)
|
|
124
|
+
|
|
125
|
+
// Also verify our function produces consistent results
|
|
126
|
+
// 4 out of 5 gives 80% pass rate
|
|
127
|
+
const result = calculatePassExpK(4, 5)
|
|
128
|
+
expect(result).toBeCloseTo(k5_fair, 5)
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
test('handles edge case where passes equals k', () => {
|
|
132
|
+
expect(calculatePassExpK(3, 3)).toBe(1)
|
|
133
|
+
})
|
|
134
|
+
|
|
135
|
+
test('mathematical verification with known values', () => {
|
|
136
|
+
// 1 out of 3 passes: passRate = 1/3
|
|
137
|
+
// pass^3 = (1/3)^3 = 1/27 ≈ 0.037
|
|
138
|
+
const result = calculatePassExpK(1, 3)
|
|
139
|
+
expect(result).toBeCloseTo(1 / 27, 5)
|
|
140
|
+
|
|
141
|
+
// 2 out of 4 passes: passRate = 0.5
|
|
142
|
+
// pass^4 = (0.5)^4 = 0.0625
|
|
143
|
+
const result2 = calculatePassExpK(2, 4)
|
|
144
|
+
expect(result2).toBeCloseTo(0.0625, 5)
|
|
145
|
+
|
|
146
|
+
// 3 out of 4 passes: passRate = 0.75
|
|
147
|
+
// pass^4 = (0.75)^4 = 0.31640625
|
|
148
|
+
const result3 = calculatePassExpK(3, 4)
|
|
149
|
+
expect(result3).toBeCloseTo(0.31640625, 5)
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
test('pass^k is always less than or equal to pass@k', () => {
|
|
153
|
+
// For any pass rate in [0, 1], pass^k <= pass@k (they coincide at 0% and 100%)
|
|
154
|
+
// This is because "all pass" is a subset of "at least one passes"
|
|
155
|
+
|
|
156
|
+
const testCases = [
|
|
157
|
+
{ passes: 1, k: 5 },
|
|
158
|
+
{ passes: 2, k: 5 },
|
|
159
|
+
{ passes: 3, k: 5 },
|
|
160
|
+
{ passes: 4, k: 5 },
|
|
161
|
+
{ passes: 1, k: 3 },
|
|
162
|
+
{ passes: 2, k: 4 },
|
|
163
|
+
]
|
|
164
|
+
|
|
165
|
+
for (const { passes, k } of testCases) {
|
|
166
|
+
const passExpK = calculatePassExpK(passes, k)
|
|
167
|
+
const passAtK = calculatePassAtK(passes, k)
|
|
168
|
+
expect(passExpK).toBeLessThanOrEqual(passAtK)
|
|
169
|
+
}
|
|
170
|
+
})
|
|
171
|
+
})
|
|
172
|
+
|
|
173
|
+
// ============================================================================
|
|
174
|
+
// Combined behavior tests
|
|
175
|
+
// ============================================================================
|
|
176
|
+
|
|
177
|
+
describe('pass@k and pass^k relationship', () => {
|
|
178
|
+
test('100% pass rate: both metrics equal 1', () => {
|
|
179
|
+
expect(calculatePassAtK(5, 5)).toBe(1)
|
|
180
|
+
expect(calculatePassExpK(5, 5)).toBe(1)
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
test('0% pass rate: both metrics equal 0', () => {
|
|
184
|
+
expect(calculatePassAtK(0, 5)).toBe(0)
|
|
185
|
+
expect(calculatePassExpK(0, 5)).toBe(0)
|
|
186
|
+
})
|
|
187
|
+
|
|
188
|
+
test('gap between metrics varies with pass rate', () => {
|
|
189
|
+
// For a fixed k >= 2, the gap (pass@k - pass^k) is maximized at a 50% pass rate
|
|
190
|
+
// At extreme pass rates (0% or 100%), the gap is 0
|
|
191
|
+
|
|
192
|
+
// 50% pass rate with k=4
|
|
193
|
+
const midAtK = calculatePassAtK(2, 4) // 0.9375
|
|
194
|
+
const midExpK = calculatePassExpK(2, 4) // 0.0625
|
|
195
|
+
const midGap = midAtK - midExpK // 0.875
|
|
196
|
+
|
|
197
|
+
// 80% pass rate with k=5
|
|
198
|
+
const highAtK = calculatePassAtK(4, 5)
|
|
199
|
+
const highExpK = calculatePassExpK(4, 5)
|
|
200
|
+
const highGap = highAtK - highExpK
|
|
201
|
+
|
|
202
|
+
// Both gaps should be positive (pass@k > pass^k for partial pass rates)
|
|
203
|
+
expect(midGap).toBeGreaterThan(0)
|
|
204
|
+
expect(highGap).toBeGreaterThan(0)
|
|
205
|
+
|
|
206
|
+
// Mid-range pass rate (50%, k=4) has a larger gap than the high pass rate (80%, k=5)
|
|
207
|
+
expect(midGap).toBeGreaterThan(highGap)
|
|
208
|
+
})
|
|
209
|
+
})
|
package/src/trials.ts
CHANGED
|
@@ -34,8 +34,14 @@ import { McpServerSchema } from './schemas.ts'
|
|
|
34
34
|
*
|
|
35
35
|
* For our case where n = k (we run exactly k trials per prompt):
|
|
36
36
|
* pass@k = 1 - (1 - passRate)^k (simplified)
|
|
37
|
+
*
|
|
38
|
+
* @param passes - Number of passing trials
|
|
39
|
+
* @param k - Total number of trials
|
|
40
|
+
* @returns Probability of at least one pass
|
|
41
|
+
*
|
|
42
|
+
* @public
|
|
37
43
|
*/
|
|
38
|
-
const calculatePassAtK = (passes: number, k: number): number => {
|
|
44
|
+
export const calculatePassAtK = (passes: number, k: number): number => {
|
|
39
45
|
if (passes >= k) return 1
|
|
40
46
|
if (passes === 0) return 0
|
|
41
47
|
|
|
@@ -49,8 +55,14 @@ const calculatePassAtK = (passes: number, k: number): number => {
|
|
|
49
55
|
*
|
|
50
56
|
* @remarks
|
|
51
57
|
* This is simply passRate^k
|
|
58
|
+
*
|
|
59
|
+
* @param passes - Number of passing trials
|
|
60
|
+
* @param k - Total number of trials
|
|
61
|
+
* @returns Probability of all k samples passing
|
|
62
|
+
*
|
|
63
|
+
* @public
|
|
52
64
|
*/
|
|
53
|
-
const calculatePassExpK = (passes: number, k: number): number => {
|
|
65
|
+
export const calculatePassExpK = (passes: number, k: number): number => {
|
|
54
66
|
if (passes === k) return 1
|
|
55
67
|
if (passes === 0) return 0
|
|
56
68
|
|