@plaited/acp-harness 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,279 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import { analyzeCategories, findUnderrepresented, generateSuggestions } from '../balance.ts'
3
+ import type { CategoryDistribution, PromptCase } from '../schemas.ts'
4
+
5
+ // ============================================================================
6
+ // analyzeCategories
7
+ // ============================================================================
8
+
9
/**
 * Tests for `analyzeCategories(prompts, key)`.
 *
 * As exercised below, the function is expected to return one entry per distinct
 * value found under `metadata[key]`, each carrying `name`, `count` and
 * `percentage`, ordered by `count` descending.
 */
describe('analyzeCategories', () => {
  test('counts prompts by category', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { category: 'math' } },
      { id: '2', input: 'test', metadata: { category: 'math' } },
      { id: '3', input: 'test', metadata: { category: 'code' } },
    ]

    const result = analyzeCategories(prompts, 'category')

    expect(result).toHaveLength(2)
    const math = result.find((d) => d.name === 'math')
    const code = result.find((d) => d.name === 'code')

    expect(math?.count).toBe(2)
    expect(code?.count).toBe(1)
  })

  test('calculates percentages correctly', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { category: 'a' } },
      { id: '2', input: 'test', metadata: { category: 'a' } },
      { id: '3', input: 'test', metadata: { category: 'b' } },
      { id: '4', input: 'test', metadata: { category: 'b' } },
    ]

    const result = analyzeCategories(prompts, 'category')

    // Two categories with two prompts each: both must report half of the total.
    expect(result[0]?.percentage).toBe(50)
    expect(result[1]?.percentage).toBe(50)
  })

  test('sorts by count descending', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { category: 'small' } },
      { id: '2', input: 'test', metadata: { category: 'large' } },
      { id: '3', input: 'test', metadata: { category: 'large' } },
      { id: '4', input: 'test', metadata: { category: 'large' } },
      { id: '5', input: 'test', metadata: { category: 'medium' } },
      { id: '6', input: 'test', metadata: { category: 'medium' } },
    ]

    const result = analyzeCategories(prompts, 'category')

    // Input order is deliberately not sorted (small, large, medium).
    expect(result[0]?.name).toBe('large')
    expect(result[1]?.name).toBe('medium')
    expect(result[2]?.name).toBe('small')
  })

  test('handles missing metadata as (uncategorized)', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { category: 'known' } },
      { id: '2', input: 'test' }, // No metadata
      { id: '3', input: 'test', metadata: {} }, // Empty metadata
    ]

    const result = analyzeCategories(prompts, 'category')

    // Both the absent-metadata and the missing-key case land in the same bucket.
    const uncategorized = result.find((d) => d.name === '(uncategorized)')
    expect(uncategorized?.count).toBe(2)
  })

  test('handles different metadata keys', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { difficulty: 'easy', category: 'math' } },
      { id: '2', input: 'test', metadata: { difficulty: 'hard', category: 'math' } },
      { id: '3', input: 'test', metadata: { difficulty: 'easy', category: 'code' } },
    ]

    const byDifficulty = analyzeCategories(prompts, 'difficulty')
    const byCategory = analyzeCategories(prompts, 'category')

    expect(byDifficulty.find((d) => d.name === 'easy')?.count).toBe(2)
    expect(byCategory.find((d) => d.name === 'math')?.count).toBe(2)
  })

  test('converts non-string metadata values to strings', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { level: 1 } },
      { id: '2', input: 'test', metadata: { level: 1 } },
      { id: '3', input: 'test', metadata: { level: 2 } },
    ]

    const result = analyzeCategories(prompts, 'level')

    // Numeric metadata values are looked up by their string form.
    expect(result.find((d) => d.name === '1')?.count).toBe(2)
    expect(result.find((d) => d.name === '2')?.count).toBe(1)
  })

  test('handles empty prompts array', () => {
    const result = analyzeCategories([], 'category')
    expect(result).toEqual([])
  })

  test('rounds percentages to integers', () => {
    const prompts: PromptCase[] = [
      { id: '1', input: 'test', metadata: { category: 'a' } },
      { id: '2', input: 'test', metadata: { category: 'b' } },
      { id: '3', input: 'test', metadata: { category: 'c' } },
    ]

    const result = analyzeCategories(prompts, 'category')

    // Guard against a vacuous pass: the loop below asserts nothing if the result is empty.
    expect(result).toHaveLength(3)

    // Each category holds 1/3 of the prompts (33.33...%), so an unrounded value would be fractional.
    for (const dist of result) {
      expect(Number.isInteger(dist.percentage)).toBe(true)
    }
  })
})
118
+
119
+ // ============================================================================
120
+ // findUnderrepresented
121
+ // ============================================================================
122
+
123
/**
 * Tests for `findUnderrepresented(distributions, threshold)`.
 *
 * The expectations follow the rule described by the original test notes: a
 * category is flagged when its percentage falls below `threshold`% of the share
 * it would have under a perfectly even split.
 */
describe('findUnderrepresented', () => {
  // Every fixture in this suite uses a total of 100, so count and percentage coincide.
  const share = (name: string, value: number): CategoryDistribution => ({
    name,
    count: value,
    percentage: value,
  })

  const HALF_OF_EVEN_SHARE = 50
  const FOUR_FIFTHS_OF_EVEN_SHARE = 80

  test('identifies categories below threshold', () => {
    const distributions: CategoryDistribution[] = [share('large', 50), share('medium', 30), share('small', 20)]

    // Three categories -> even share is 33.3%; half of that is a 16.65% cutoff.
    // The smallest category sits at 20%, above the cutoff, so nothing is flagged.
    const flagged = findUnderrepresented(distributions, HALF_OF_EVEN_SHARE)

    expect(flagged).toEqual([])
  })

  test('returns underrepresented categories at stricter threshold', () => {
    const distributions: CategoryDistribution[] = [share('large', 80), share('small', 20)]

    // Two categories -> even share is 50%; half of that is a 25% cutoff.
    // 'small' (20%) is below the cutoff, 'large' (80%) is not.
    const flagged = findUnderrepresented(distributions, HALF_OF_EVEN_SHARE)

    expect(flagged).toContain('small')
    expect(flagged).not.toContain('large')
  })

  test('handles even distribution (no underrepresentation)', () => {
    const distributions: CategoryDistribution[] = ['a', 'b', 'c', 'd'].map((name) => share(name, 25))

    expect(findUnderrepresented(distributions, HALF_OF_EVEN_SHARE)).toEqual([])
  })

  test('handles single category (never underrepresented)', () => {
    const distributions: CategoryDistribution[] = [share('only', 100)]

    expect(findUnderrepresented(distributions, HALF_OF_EVEN_SHARE)).toEqual([])
  })

  test('threshold affects sensitivity', () => {
    const distributions: CategoryDistribution[] = [share('large', 70), share('small', 30)]

    // Even share is 50%. A 50% threshold puts the cutoff at 25%: 'small' (30%) passes.
    const atFiftyPercent = findUnderrepresented(distributions, HALF_OF_EVEN_SHARE)
    expect(atFiftyPercent).toEqual([])

    // An 80% threshold raises the cutoff to 40%: 'small' (30%) is now flagged.
    const atEightyPercent = findUnderrepresented(distributions, FOUR_FIFTHS_OF_EVEN_SHARE)
    expect(atEightyPercent).toContain('small')
  })

  test('handles empty distributions', () => {
    expect(findUnderrepresented([], HALF_OF_EVEN_SHARE)).toEqual([])
  })
})
192
+
193
+ // ============================================================================
194
+ // generateSuggestions
195
+ // ============================================================================
196
+
197
/**
 * Tests for `generateSuggestions(distributions, underrepresented, total)`.
 *
 * Each case checks that at least one returned suggestion contains the text
 * expected for a given imbalance (underrepresented category, dominant category,
 * tiny category, small overall test set) or the "well-balanced" message.
 */
describe('generateSuggestions', () => {
  // Fixture builder; count and percentage are independent because totals vary per test.
  const dist = (name: string, count: number, percentage: number): CategoryDistribution => ({
    name,
    count,
    percentage,
  })

  // Predicate factories used with `Array.prototype.some` over the suggestion list.
  const containing = (needle: string) => (text: string) => text.includes(needle)
  const containingIgnoreCase = (needle: string) => (text: string) => text.toLowerCase().includes(needle)

  test('suggests adding cases for underrepresented categories', () => {
    const distributions: CategoryDistribution[] = [dist('math', 80, 80), dist('code', 20, 20)]

    const suggestions = generateSuggestions(distributions, ['code'], 100)

    expect(suggestions.some(containing('code'))).toBe(true)
    expect(suggestions.some(containingIgnoreCase('add'))).toBe(true)
  })

  test('warns about dominant category (>50%)', () => {
    const distributions: CategoryDistribution[] = [dist('dominant', 60, 60), dist('other', 40, 40)]

    const suggestions = generateSuggestions(distributions, [], 100)

    // A single suggestion must name the category and quote its percentage.
    const namesCategoryAndShare = suggestions.some((text) =>
      ['dominant', '60%'].every((part) => text.includes(part)),
    )
    expect(namesCategoryAndShare).toBe(true)
    expect(suggestions.some(containingIgnoreCase('diversify'))).toBe(true)
  })

  test('warns about tiny categories (<3 cases)', () => {
    const distributions: CategoryDistribution[] = [
      dist('large', 97, 97),
      dist('tiny', 2, 2),
      dist('also_tiny', 1, 1),
    ]

    const suggestions = generateSuggestions(distributions, [], 100)

    const namesATinyCategory = suggestions.some((text) =>
      ['tiny', 'also_tiny'].some((name) => text.includes(name)),
    )
    expect(namesATinyCategory).toBe(true)
    expect(suggestions.some(containing('< 3 cases'))).toBe(true)
  })

  test('suggests expanding small test sets (<20 cases)', () => {
    const distributions: CategoryDistribution[] = [dist('a', 5, 50), dist('b', 5, 50)]

    const suggestions = generateSuggestions(distributions, [], 10)

    const advisesExpansion = suggestions.some(
      (text) => text.includes('10 cases') && text.toLowerCase().includes('expand'),
    )
    expect(advisesExpansion).toBe(true)
  })

  test('returns "well-balanced" when no issues found', () => {
    const distributions: CategoryDistribution[] = ['a', 'b', 'c', 'd'].map((name) => dist(name, 25, 25))

    const suggestions = generateSuggestions(distributions, [], 100)

    expect(suggestions.some(containingIgnoreCase('well-balanced'))).toBe(true)
  })

  test('combines multiple suggestions', () => {
    const distributions: CategoryDistribution[] = [dist('huge', 8, 80), dist('tiny', 2, 20)]

    const suggestions = generateSuggestions(distributions, ['tiny'], 10)

    // Several conditions apply at once here (underrepresented, dominant, tiny count,
    // small test set); only a lower bound on the number of suggestions is asserted.
    expect(suggestions.length).toBeGreaterThanOrEqual(2)
  })

  test('handles empty distributions', () => {
    const suggestions = generateSuggestions([], [], 0)

    // With a total of zero, a suggestion mentioning "0 cases" is expected.
    expect(suggestions.some(containing('0 cases'))).toBe(true)
  })
})
@@ -0,0 +1,226 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import { getTrajectorySnippet, sampleArray } from '../calibrate.ts'
3
+ import type { TrajectoryStep } from '../schemas.ts'
4
+
5
+ // ============================================================================
6
+ // sampleArray
7
+ // ============================================================================
8
+
9
/**
 * Tests for `sampleArray(arr, n)`.
 *
 * Covers the size of the returned sample, boundary inputs (empty array, n = 0,
 * n larger than the array), non-mutation of the input, uniqueness and membership
 * of sampled items, and that repeated calls do not always yield the same sample.
 */
describe('sampleArray', () => {
  test('returns n elements from array', () => {
    const oneToTen = Array.from({ length: 10 }, (_, i) => i + 1)

    expect(sampleArray(oneToTen, 3)).toHaveLength(3)
  })

  test('returns all elements when n >= array length', () => {
    const source = [1, 2, 3]
    const sample = sampleArray(source, 5)

    expect(sample).toHaveLength(3)
    // Order is not asserted, only that the same members come back.
    expect(new Set(sample)).toEqual(new Set(source))
  })

  test('returns empty array for empty input', () => {
    expect(sampleArray([], 5)).toEqual([])
  })

  test('returns empty array when n is 0', () => {
    const source = [1, 2, 3]

    expect(sampleArray(source, 0)).toEqual([])
  })

  test('does not modify original array', () => {
    const source = [1, 2, 3, 4, 5]
    const snapshot = source.slice()

    sampleArray(source, 3)

    expect(source).toEqual(snapshot)
  })

  test('returns unique elements (no duplicates)', () => {
    const source = [1, 2, 3, 4, 5]
    const sample = sampleArray(source, 3)

    // The input has no repeated values, so a duplicate would shrink the set.
    expect(new Set(sample).size).toBe(sample.length)
  })

  test('all returned elements exist in original array', () => {
    const source = ['a', 'b', 'c', 'd', 'e']
    const sample = sampleArray(source, 3)

    sample.forEach((item) => {
      expect(source).toContain(item)
    })
  })

  test('works with objects', () => {
    const source = [1, 2, 3, 4].map((id) => ({ id }))
    const sample = sampleArray(source, 2)

    expect(sample).toHaveLength(2)
    sample.forEach((item) => {
      expect(source).toContainEqual(item)
    })
  })

  test('produces different results on multiple calls (randomness)', () => {
    const source = Array.from({ length: 100 }, (_, i) => i)

    // Draw ten samples and record each one as a comma-joined string.
    const distinctSamples = new Set<string>(Array.from({ length: 10 }, () => sampleArray(source, 10).join(',')))

    // Probabilistic check: ten identical draws of 10 out of 100 are extremely unlikely.
    expect(distinctSamples.size).toBeGreaterThan(1)
  })
})
87
+
88
+ // ============================================================================
89
+ // getTrajectorySnippet
90
+ // ============================================================================
91
+
92
/**
 * Tests for `getTrajectorySnippet(trajectory, maxSteps)`.
 *
 * As exercised below with `maxSteps = 5`, a trajectory no longer than `maxSteps`
 * is returned unchanged, and a longer one is reduced to its first two steps, one
 * middle step, and its last two steps.
 */
describe('getTrajectorySnippet', () => {
  /** Builds a `message` step whose content and timestamp are derived from `index`. */
  const createStep = (index: number): TrajectoryStep => ({
    type: 'message',
    content: `Step ${index}`,
    timestamp: index * 100,
  })

  test('returns full trajectory when length <= maxSteps', () => {
    const trajectory: TrajectoryStep[] = [createStep(1), createStep(2), createStep(3)]

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result).toHaveLength(3)
    expect(result).toEqual(trajectory)
  })

  test('returns maxSteps elements for longer trajectories', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result).toHaveLength(5)
  })

  test('includes first two steps', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result[0]).toEqual(createStep(1))
    expect(result[1]).toEqual(createStep(2))
  })

  test('includes last two steps', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result[3]).toEqual(createStep(9))
    expect(result[4]).toEqual(createStep(10))
  })

  test('includes middle step', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 10 }, (_, i) => createStep(i + 1))

    const result = getTrajectorySnippet(trajectory, 5)

    // Middle of 10 is index 5 (0-indexed), which is Step 6
    expect(result[2]).toEqual(createStep(6))
  })

  test('handles empty trajectory', () => {
    const result = getTrajectorySnippet([], 5)
    expect(result).toEqual([])
  })

  test('handles single element trajectory', () => {
    const trajectory: TrajectoryStep[] = [createStep(1)]

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result).toEqual([createStep(1)])
  })

  test('handles trajectory exactly at maxSteps boundary', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 5 }, (_, i) => createStep(i + 1))

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result).toHaveLength(5)
    expect(result).toEqual(trajectory)
  })

  test('respects custom maxSteps parameter', () => {
    const trajectory: TrajectoryStep[] = Array.from({ length: 20 }, (_, i) => createStep(i + 1))

    const result3 = getTrajectorySnippet(trajectory, 3)
    const result7 = getTrajectorySnippet(trajectory, 7)

    // NOTE(review): the snippet size for maxSteps values other than 5 is not pinned
    // here. The other tests only establish the first-2 / middle / last-2 shape for
    // maxSteps = 5, so this test asserts only that a snippet never exceeds its input.
    // TODO: tighten once the intended size for a custom maxSteps is confirmed.
    expect(result3.length).toBeLessThanOrEqual(trajectory.length)
    expect(result7.length).toBeLessThanOrEqual(trajectory.length)
  })

  test('works with different step types', () => {
    const trajectory: TrajectoryStep[] = [
      { type: 'thought', content: 'Thinking...', timestamp: 0 },
      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 },
      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 200 },
      { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 300 },
      { type: 'tool_call', name: 'Grep', status: 'completed', timestamp: 400 },
      { type: 'tool_call', name: 'Glob', status: 'completed', timestamp: 500 },
      { type: 'plan', entries: [{ content: 'Plan', status: 'done' }], timestamp: 600 },
      { type: 'message', content: 'Done!', timestamp: 700 },
    ]

    const result = getTrajectorySnippet(trajectory, 5)

    expect(result).toHaveLength(5)
    // First two
    expect(result[0]?.type).toBe('thought')
    expect(result[1]?.type).toBe('tool_call')
    // Last two
    expect(result[3]?.type).toBe('plan')
    expect(result[4]?.type).toBe('message')
  })

  test('preserves step content when extracting', () => {
    const trajectory: TrajectoryStep[] = [
      { type: 'thought', content: 'First thought', timestamp: 0 },
      { type: 'message', content: 'First message', timestamp: 100 },
      { type: 'tool_call', name: 'Read', status: 'completed', input: { file_path: '/test.ts' }, timestamp: 200 },
      { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 300 },
      { type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 400 },
      { type: 'message', content: 'Last message', timestamp: 500 },
    ]

    const result = getTrajectorySnippet(trajectory, 5)

    // First step should preserve all properties.
    // The type is asserted unconditionally so the narrowed checks below cannot be
    // skipped silently; the `if` remains only to narrow the union for the compiler.
    const firstStep = result[0]
    expect(firstStep?.type).toBe('thought')
    if (firstStep?.type === 'thought') {
      expect(firstStep.content).toBe('First thought')
      expect(firstStep.timestamp).toBe(0)
    }

    // Last step should preserve all properties (same unconditional type check).
    const lastStep = result[result.length - 1]
    expect(lastStep?.type).toBe('message')
    if (lastStep?.type === 'message') {
      expect(lastStep.content).toBe('Last message')
    }
  })
})
+ })