@plaited/acp-harness 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,339 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import type { CaptureResult } from '../schemas.ts'
3
+ import { formatMarkdown, formatSummary } from '../summarize.ts'
4
+
5
+ // ============================================================================
6
+ // Test Fixtures
7
+ // ============================================================================
8
+
9
+ const createBasicResult = (overrides?: Partial<CaptureResult>): CaptureResult => ({
10
+ id: 'test-001',
11
+ input: 'What is 2+2?',
12
+ output: 'The answer is 4.',
13
+ trajectory: [
14
+ { type: 'thought', content: 'Let me think about this...', timestamp: 0 },
15
+ { type: 'message', content: 'The answer is 4.', timestamp: 100 },
16
+ ],
17
+ metadata: { category: 'math', agent: 'test-agent' },
18
+ timing: { start: 1000, end: 2000 },
19
+ toolErrors: false,
20
+ ...overrides,
21
+ })
22
+
23
+ const createResultWithToolCalls = (): CaptureResult => ({
24
+ id: 'test-002',
25
+ input: 'Read and summarize file.txt',
26
+ output: 'File contains important data.',
27
+ trajectory: [
28
+ { type: 'thought', content: 'I will read the file first.', timestamp: 0 },
29
+ {
30
+ type: 'tool_call',
31
+ name: 'Read',
32
+ status: 'completed',
33
+ input: { file_path: '/path/to/file.txt' },
34
+ output: 'file contents here',
35
+ duration: 50,
36
+ timestamp: 100,
37
+ },
38
+ {
39
+ type: 'tool_call',
40
+ name: 'Write',
41
+ status: 'completed',
42
+ input: { file_path: '/output.md', content: 'Summary here' },
43
+ duration: 30,
44
+ timestamp: 200,
45
+ },
46
+ { type: 'message', content: 'File contains important data.', timestamp: 300 },
47
+ ],
48
+ metadata: { agent: 'test-agent' },
49
+ timing: { start: 1000, end: 1500 },
50
+ toolErrors: false,
51
+ })
52
+
53
+ // ============================================================================
54
+ // formatSummary
55
+ // ============================================================================
56
+
57
+ describe('formatSummary', () => {
58
+ test('extracts id, input, and output', () => {
59
+ const result = createBasicResult()
60
+ const summary = formatSummary(result)
61
+
62
+ expect(summary.id).toBe('test-001')
63
+ expect(summary.input).toBe('What is 2+2?')
64
+ expect(summary.output).toBe('The answer is 4.')
65
+ })
66
+
67
+ test('extracts tool call names', () => {
68
+ const result = createResultWithToolCalls()
69
+ const summary = formatSummary(result)
70
+
71
+ expect(summary.toolCalls).toEqual(['Read', 'Write'])
72
+ })
73
+
74
+ test('calculates duration from timing', () => {
75
+ const result = createBasicResult()
76
+ const summary = formatSummary(result)
77
+
78
+ expect(summary.duration).toBe(1000) // 2000 - 1000
79
+ })
80
+
81
+ test('handles empty trajectory', () => {
82
+ const result = createBasicResult({ trajectory: [] })
83
+ const summary = formatSummary(result)
84
+
85
+ expect(summary.toolCalls).toEqual([])
86
+ })
87
+
88
+ test('filters only tool_call steps for toolCalls list', () => {
89
+ const result = createBasicResult()
90
+ const summary = formatSummary(result)
91
+
92
+ // trajectory has thought and message, but no tool_call
93
+ expect(summary.toolCalls).toEqual([])
94
+ })
95
+
96
+ test('handles trajectory with only messages', () => {
97
+ const result = createBasicResult({
98
+ trajectory: [
99
+ { type: 'message', content: 'First message', timestamp: 0 },
100
+ { type: 'message', content: 'Second message', timestamp: 100 },
101
+ ],
102
+ })
103
+ const summary = formatSummary(result)
104
+
105
+ expect(summary.toolCalls).toEqual([])
106
+ })
107
+
108
+ test('preserves original input/output exactly', () => {
109
+ const result = createBasicResult({
110
+ input: 'Input with\nnewlines and "quotes"',
111
+ output: 'Output with\ttabs',
112
+ })
113
+ const summary = formatSummary(result)
114
+
115
+ expect(summary.input).toBe('Input with\nnewlines and "quotes"')
116
+ expect(summary.output).toBe('Output with\ttabs')
117
+ })
118
+ })
119
+
120
+ // ============================================================================
121
+ // formatMarkdown
122
+ // ============================================================================
123
+
124
+ describe('formatMarkdown', () => {
125
+ test('includes evaluation record header with id', () => {
126
+ const result = createBasicResult()
127
+ const markdown = formatMarkdown(result)
128
+
129
+ expect(markdown).toContain('## Evaluation Record: test-001')
130
+ })
131
+
132
+ test('includes input field', () => {
133
+ const result = createBasicResult()
134
+ const markdown = formatMarkdown(result)
135
+
136
+ expect(markdown).toContain('**Input:** What is 2+2?')
137
+ })
138
+
139
+ test('includes trajectory section', () => {
140
+ const result = createBasicResult()
141
+ const markdown = formatMarkdown(result)
142
+
143
+ expect(markdown).toContain('**Trajectory:**')
144
+ })
145
+
146
+ test('formats thought steps with truncation', () => {
147
+ const result = createBasicResult({
148
+ trajectory: [
149
+ { type: 'thought', content: 'Short thought', timestamp: 0 },
150
+ { type: 'thought', content: 'A'.repeat(150), timestamp: 100 },
151
+ ],
152
+ })
153
+ const markdown = formatMarkdown(result)
154
+
155
+ expect(markdown).toContain('[THOUGHT] Short thought')
156
+ expect(markdown).toContain(`[THOUGHT] ${'A'.repeat(100)}...`)
157
+ })
158
+
159
+ test('formats tool calls with status and duration', () => {
160
+ const result = createResultWithToolCalls()
161
+ const markdown = formatMarkdown(result)
162
+
163
+ expect(markdown).toContain('[TOOL:Read] -> completed (50ms)')
164
+ expect(markdown).toContain('[TOOL:Write] -> completed (30ms)')
165
+ })
166
+
167
+ test('includes file path for tool calls', () => {
168
+ const result = createResultWithToolCalls()
169
+ const markdown = formatMarkdown(result)
170
+
171
+ expect(markdown).toContain('File: /path/to/file.txt')
172
+ expect(markdown).toContain('File: /output.md')
173
+ })
174
+
175
+ test('includes step IDs for reference', () => {
176
+ const result = createBasicResult()
177
+ const markdown = formatMarkdown(result)
178
+
179
+ expect(markdown).toContain('[→test-001-step-1]')
180
+ expect(markdown).toContain('[→test-001-step-2]')
181
+ })
182
+
183
+ test('formats plan steps', () => {
184
+ const result = createBasicResult({
185
+ trajectory: [
186
+ {
187
+ type: 'plan',
188
+ entries: [
189
+ { content: 'Step 1', status: 'completed' },
190
+ { content: 'Step 2', status: 'in_progress' },
191
+ ],
192
+ timestamp: 0,
193
+ },
194
+ ],
195
+ })
196
+ const markdown = formatMarkdown(result)
197
+
198
+ expect(markdown).toContain('[PLAN]')
199
+ expect(markdown).toContain('Step 1: completed')
200
+ expect(markdown).toContain('Step 2: in_progress')
201
+ })
202
+
203
+ test('truncates long plan summaries', () => {
204
+ const result = createBasicResult({
205
+ trajectory: [
206
+ {
207
+ type: 'plan',
208
+ entries: [
209
+ { content: 'A very long step description that goes on and on', status: 'completed' },
210
+ { content: 'Another very long step description', status: 'pending' },
211
+ { content: 'Yet another step', status: 'pending' },
212
+ ],
213
+ timestamp: 0,
214
+ },
215
+ ],
216
+ })
217
+ const markdown = formatMarkdown(result)
218
+
219
+ expect(markdown).toContain('...')
220
+ })
221
+
222
+ test('formats message steps', () => {
223
+ const result = createBasicResult({
224
+ trajectory: [{ type: 'message', content: 'Here is my response to your question.', timestamp: 0 }],
225
+ })
226
+ const markdown = formatMarkdown(result)
227
+
228
+ expect(markdown).toContain('[MESSAGE] Here is my response')
229
+ })
230
+
231
+ test('includes output preview', () => {
232
+ const result = createBasicResult()
233
+ const markdown = formatMarkdown(result)
234
+
235
+ expect(markdown).toContain('**Output:** The answer is 4.')
236
+ })
237
+
238
+ test('truncates long output', () => {
239
+ const result = createBasicResult({
240
+ output: 'X'.repeat(300),
241
+ })
242
+ const markdown = formatMarkdown(result)
243
+
244
+ expect(markdown).toContain(`${'X'.repeat(200)}...`)
245
+ })
246
+
247
+ test('includes metadata', () => {
248
+ const result = createBasicResult()
249
+ const markdown = formatMarkdown(result)
250
+
251
+ expect(markdown).toContain('**Metadata:**')
252
+ expect(markdown).toContain('category=math')
253
+ expect(markdown).toContain('agent=test-agent')
254
+ })
255
+
256
+ test('includes tool errors status', () => {
257
+ const result = createBasicResult({ toolErrors: true })
258
+ const markdown = formatMarkdown(result)
259
+
260
+ expect(markdown).toContain('**Tool Errors:** true')
261
+ })
262
+
263
+ test('includes duration', () => {
264
+ const result = createBasicResult()
265
+ const markdown = formatMarkdown(result)
266
+
267
+ expect(markdown).toContain('**Duration:** 1000ms')
268
+ })
269
+
270
+ test('includes score when present', () => {
271
+ const result = createBasicResult({
272
+ score: {
273
+ pass: true,
274
+ score: 0.95,
275
+ reasoning: 'Correct answer provided',
276
+ },
277
+ })
278
+ const markdown = formatMarkdown(result)
279
+
280
+ expect(markdown).toContain('**Score:** PASS (0.95)')
281
+ expect(markdown).toContain('**Reasoning:** Correct answer provided')
282
+ })
283
+
284
+ test('handles failed score', () => {
285
+ const result = createBasicResult({
286
+ score: {
287
+ pass: false,
288
+ score: 0.2,
289
+ reasoning: 'Incorrect answer',
290
+ },
291
+ })
292
+ const markdown = formatMarkdown(result)
293
+
294
+ expect(markdown).toContain('**Score:** FAIL (0.2)')
295
+ })
296
+
297
+ test('includes content preview with syntax highlighting', () => {
298
+ const result: CaptureResult = {
299
+ id: 'test-003',
300
+ input: 'Write a function',
301
+ output: 'Done',
302
+ trajectory: [
303
+ {
304
+ type: 'tool_call',
305
+ name: 'Write',
306
+ status: 'completed',
307
+ input: {
308
+ file_path: '/src/utils.ts',
309
+ content: 'export const add = (a: number, b: number) => a + b;',
310
+ },
311
+ duration: 20,
312
+ timestamp: 0,
313
+ },
314
+ ],
315
+ metadata: { agent: 'test' },
316
+ timing: { start: 0, end: 100 },
317
+ toolErrors: false,
318
+ }
319
+ const markdown = formatMarkdown(result)
320
+
321
+ expect(markdown).toContain('```ts')
322
+ expect(markdown).toContain('export const add')
323
+ })
324
+
325
+ test('ends with horizontal rule separator', () => {
326
+ const result = createBasicResult()
327
+ const markdown = formatMarkdown(result)
328
+
329
+ expect(markdown).toContain('---')
330
+ })
331
+
332
+ test('handles empty trajectory', () => {
333
+ const result = createBasicResult({ trajectory: [] })
334
+ const markdown = formatMarkdown(result)
335
+
336
+ expect(markdown).toContain('**Trajectory:**')
337
+ expect(markdown).toContain('**Output:**')
338
+ })
339
+ })
@@ -0,0 +1,209 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import { calculatePassAtK, calculatePassExpK } from '../trials.ts'
3
+
4
+ // ============================================================================
5
+ // calculatePassAtK
6
+ // ============================================================================
7
+
8
+ describe('calculatePassAtK', () => {
9
+ test('returns 1 when all trials pass', () => {
10
+ expect(calculatePassAtK(5, 5)).toBe(1)
11
+ expect(calculatePassAtK(10, 10)).toBe(1)
12
+ expect(calculatePassAtK(1, 1)).toBe(1)
13
+ })
14
+
15
+ test('returns 0 when no trials pass', () => {
16
+ expect(calculatePassAtK(0, 5)).toBe(0)
17
+ expect(calculatePassAtK(0, 10)).toBe(0)
18
+ expect(calculatePassAtK(0, 1)).toBe(0)
19
+ })
20
+
21
+ test('calculates probability correctly for partial passes', () => {
22
+ // pass@k = 1 - (1 - passRate)^k
23
+ // For 3 passes out of 5: passRate = 0.6
24
+ // pass@5 = 1 - (0.4)^5 = 1 - 0.01024 = 0.98976
25
+ const result = calculatePassAtK(3, 5)
26
+ expect(result).toBeCloseTo(0.98976, 5)
27
+ })
28
+
29
+ test('k=1 equals the pass rate', () => {
30
+ // For k=1, pass@1 = 1 - (1 - p)^1 = p
31
+ expect(calculatePassAtK(1, 1)).toBe(1)
32
+
33
+ // The only other k=1 case: 0 passes out of 1 trial (pass rate 0)
34
+ expect(calculatePassAtK(0, 1)).toBe(0)
35
+ })
36
+
37
+ test('higher pass rate yields higher pass@k', () => {
38
+ const lowPassRate = calculatePassAtK(1, 5) // 20% pass rate
39
+ const highPassRate = calculatePassAtK(4, 5) // 80% pass rate
40
+
41
+ expect(highPassRate).toBeGreaterThan(lowPassRate)
42
+ })
43
+
44
+ test('larger k amplifies probability of at least one pass', () => {
45
+ // With 50% pass rate, larger k means higher chance of at least one pass
46
+ // k=2: 1 - (0.5)^2 = 0.75
47
+ // k=4: 1 - (0.5)^4 = 0.9375
48
+
49
+ const k2 = calculatePassAtK(1, 2) // 50% pass rate
50
+ const k4 = calculatePassAtK(2, 4) // Also 50% pass rate
51
+
52
+ expect(k4).toBeGreaterThan(k2)
53
+ })
54
+
55
+ test('handles edge case where passes equals k', () => {
56
+ expect(calculatePassAtK(3, 3)).toBe(1)
57
+ })
58
+
59
+ test('handles passes greater than k (returns 1)', () => {
60
+ // This shouldn't happen in practice, but the function handles it
61
+ expect(calculatePassAtK(10, 5)).toBe(1)
62
+ })
63
+
64
+ test('mathematical verification with known values', () => {
65
+ // 1 out of 3 passes: passRate = 1/3
66
+ // pass@3 = 1 - (2/3)^3 = 1 - 8/27 = 19/27 ≈ 0.7037
67
+ const result = calculatePassAtK(1, 3)
68
+ expect(result).toBeCloseTo(19 / 27, 5)
69
+
70
+ // 2 out of 4 passes: passRate = 0.5
71
+ // pass@4 = 1 - (0.5)^4 = 1 - 0.0625 = 0.9375
72
+ const result2 = calculatePassAtK(2, 4)
73
+ expect(result2).toBeCloseTo(0.9375, 5)
74
+ })
75
+ })
76
+
77
+ // ============================================================================
78
+ // calculatePassExpK
79
+ // ============================================================================
80
+
81
+ describe('calculatePassExpK', () => {
82
+ test('returns 1 when all trials pass', () => {
83
+ expect(calculatePassExpK(5, 5)).toBe(1)
84
+ expect(calculatePassExpK(10, 10)).toBe(1)
85
+ expect(calculatePassExpK(1, 1)).toBe(1)
86
+ })
87
+
88
+ test('returns 0 when no trials pass', () => {
89
+ expect(calculatePassExpK(0, 5)).toBe(0)
90
+ expect(calculatePassExpK(0, 10)).toBe(0)
91
+ expect(calculatePassExpK(0, 1)).toBe(0)
92
+ })
93
+
94
+ test('calculates probability correctly', () => {
95
+ // pass^k = passRate^k
96
+ // For 3 passes out of 5: passRate = 0.6
97
+ // pass^5 = (0.6)^5 = 0.07776
98
+ const result = calculatePassExpK(3, 5)
99
+ expect(result).toBeCloseTo(0.07776, 5)
100
+ })
101
+
102
+ test('k=1 equals the pass rate', () => {
103
+ // For k=1, pass^1 = p^1 = p
104
+ expect(calculatePassExpK(1, 1)).toBe(1)
105
+ })
106
+
107
+ test('higher pass rate yields higher pass^k', () => {
108
+ const lowPassRate = calculatePassExpK(1, 5) // 20% pass rate
109
+ const highPassRate = calculatePassExpK(4, 5) // 80% pass rate
110
+
111
+ expect(highPassRate).toBeGreaterThan(lowPassRate)
112
+ })
113
+
114
+ test('larger k reduces probability of all passing (for non-100% rates)', () => {
115
+ // With 80% pass rate:
116
+ // k=2: (0.8)^2 = 0.64
117
+ // k=5: (0.8)^5 = 0.32768
118
+
119
+ // Mathematical verification using known formulas
120
+ const k2_fair = 0.8 ** 2 // = 0.64
121
+ const k5_fair = 0.8 ** 5 // = 0.32768
122
+
123
+ expect(k5_fair).toBeLessThan(k2_fair)
124
+
125
+ // Also verify our function produces consistent results
126
+ // 4 out of 5 gives 80% pass rate
127
+ const result = calculatePassExpK(4, 5)
128
+ expect(result).toBeCloseTo(k5_fair, 5)
129
+ })
130
+
131
+ test('handles edge case where passes equals k', () => {
132
+ expect(calculatePassExpK(3, 3)).toBe(1)
133
+ })
134
+
135
+ test('mathematical verification with known values', () => {
136
+ // 1 out of 3 passes: passRate = 1/3
137
+ // pass^3 = (1/3)^3 = 1/27 ≈ 0.037
138
+ const result = calculatePassExpK(1, 3)
139
+ expect(result).toBeCloseTo(1 / 27, 5)
140
+
141
+ // 2 out of 4 passes: passRate = 0.5
142
+ // pass^4 = (0.5)^4 = 0.0625
143
+ const result2 = calculatePassExpK(2, 4)
144
+ expect(result2).toBeCloseTo(0.0625, 5)
145
+
146
+ // 3 out of 4 passes: passRate = 0.75
147
+ // pass^4 = (0.75)^4 = 0.31640625
148
+ const result3 = calculatePassExpK(3, 4)
149
+ expect(result3).toBeCloseTo(0.31640625, 5)
150
+ })
151
+
152
+ test('pass^k is always less than or equal to pass@k', () => {
153
+ // For any pass rate from 0% to 100%, pass^k <= pass@k (the two are equal at the extremes)
154
+ // This is because "all pass" is a subset of "at least one passes"
155
+
156
+ const testCases = [
157
+ { passes: 1, k: 5 },
158
+ { passes: 2, k: 5 },
159
+ { passes: 3, k: 5 },
160
+ { passes: 4, k: 5 },
161
+ { passes: 1, k: 3 },
162
+ { passes: 2, k: 4 },
163
+ ]
164
+
165
+ for (const { passes, k } of testCases) {
166
+ const passExpK = calculatePassExpK(passes, k)
167
+ const passAtK = calculatePassAtK(passes, k)
168
+ expect(passExpK).toBeLessThanOrEqual(passAtK)
169
+ }
170
+ })
171
+ })
172
+
173
+ // ============================================================================
174
+ // Combined behavior tests
175
+ // ============================================================================
176
+
177
+ describe('pass@k and pass^k relationship', () => {
178
+ test('100% pass rate: both metrics equal 1', () => {
179
+ expect(calculatePassAtK(5, 5)).toBe(1)
180
+ expect(calculatePassExpK(5, 5)).toBe(1)
181
+ })
182
+
183
+ test('0% pass rate: both metrics equal 0', () => {
184
+ expect(calculatePassAtK(0, 5)).toBe(0)
185
+ expect(calculatePassExpK(0, 5)).toBe(0)
186
+ })
187
+
188
+ test('gap between metrics varies with pass rate', () => {
189
+ // For a fixed k, the gap is maximized at a 50% pass rate (the cases below use different k, so this is only a sanity check)
190
+ // At extreme pass rates (0% or 100%), the gap is 0
191
+
192
+ // 50% pass rate with k=4
193
+ const midAtK = calculatePassAtK(2, 4) // 0.9375
194
+ const midExpK = calculatePassExpK(2, 4) // 0.0625
195
+ const midGap = midAtK - midExpK // 0.875
196
+
197
+ // 80% pass rate with k=5
198
+ const highAtK = calculatePassAtK(4, 5)
199
+ const highExpK = calculatePassExpK(4, 5)
200
+ const highGap = highAtK - highExpK
201
+
202
+ // Both gaps should be positive (pass@k > pass^k for partial pass rates)
203
+ expect(midGap).toBeGreaterThan(0)
204
+ expect(highGap).toBeGreaterThan(0)
205
+
206
+ // Mid-range pass rate has larger gap than high pass rate
207
+ expect(midGap).toBeGreaterThan(highGap)
208
+ })
209
+ })
package/src/trials.ts CHANGED
@@ -34,8 +34,14 @@ import { McpServerSchema } from './schemas.ts'
34
34
  *
35
35
  * For our case where n = k (we run exactly k trials per prompt):
36
36
  * pass@k = 1 - (1 - passRate)^k (simplified)
37
+ *
38
+ * @param passes - Number of passing trials
39
+ * @param k - Total number of trials
40
+ * @returns Probability of at least one pass
41
+ *
42
+ * @public
37
43
  */
38
- const calculatePassAtK = (passes: number, k: number): number => {
44
+ export const calculatePassAtK = (passes: number, k: number): number => {
39
45
  if (passes >= k) return 1
40
46
  if (passes === 0) return 0
41
47
 
@@ -49,8 +55,14 @@ const calculatePassAtK = (passes: number, k: number): number => {
49
55
  *
50
56
  * @remarks
51
57
  * This is simply passRate^k
58
+ *
59
+ * @param passes - Number of passing trials
60
+ * @param k - Total number of trials
61
+ * @returns Probability of all k samples passing
62
+ *
63
+ * @public
52
64
  */
53
- const calculatePassExpK = (passes: number, k: number): number => {
65
+ export const calculatePassExpK = (passes: number, k: number): number => {
54
66
  if (passes === k) return 1
55
67
  if (passes === 0) return 0
56
68