@plaited/acp-harness 0.3.2 → 0.4.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +53 -31
- package/bin/cli.ts +15 -0
- package/package.json +5 -7
- package/src/acp-client.ts +7 -4
- package/src/adapter-check.ts +0 -1
- package/src/adapter-scaffold.ts +16 -15
- package/src/calibrate.ts +28 -8
- package/src/capture.ts +114 -33
- package/src/grader-loader.ts +3 -3
- package/src/harness.ts +4 -0
- package/src/headless-cli.ts +433 -0
- package/src/headless-history-builder.ts +141 -0
- package/src/headless-output-parser.ts +251 -0
- package/src/headless-session-manager.ts +389 -0
- package/src/headless.schemas.ts +241 -0
- package/src/headless.ts +71 -0
- package/src/headless.types.ts +19 -0
- package/src/integration_tests/acp-claude.spec.ts +170 -0
- package/src/integration_tests/acp-gemini.spec.ts +174 -0
- package/src/schemas.ts +88 -36
- package/src/summarize.ts +4 -8
- package/src/tests/acp-client.spec.ts +1 -1
- package/src/tests/capture-cli.spec.ts +188 -0
- package/src/tests/capture-helpers.spec.ts +229 -67
- package/src/tests/constants.spec.ts +121 -0
- package/src/tests/fixtures/grader-exec.py +3 -3
- package/src/tests/fixtures/grader-module.ts +2 -2
- package/src/tests/grader-loader.spec.ts +5 -5
- package/src/tests/headless.spec.ts +460 -0
- package/src/tests/schemas-cli.spec.ts +142 -0
- package/src/tests/schemas.spec.ts +657 -0
- package/src/tests/summarize-helpers.spec.ts +3 -3
- package/src/tests/trials-cli.spec.ts +145 -0
- package/src/trials.ts +6 -19
- package/src/validate-refs.ts +1 -1
- package/src/tests/acp-integration.docker.ts +0 -214
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import { describe, expect, test } from 'bun:test'
|
|
2
2
|
import type { SessionNotification } from '@agentclientprotocol/sdk'
|
|
3
3
|
import {
|
|
4
|
+
detectTrajectoryRichness,
|
|
4
5
|
extractContent,
|
|
5
6
|
extractFilePath,
|
|
6
7
|
extractOutput,
|
|
8
|
+
extractTokenCounts,
|
|
7
9
|
extractTrajectory,
|
|
8
10
|
hasToolErrors,
|
|
9
11
|
headTailPreview,
|
|
@@ -16,13 +18,13 @@ import type { TrajectoryStep } from '../schemas.ts'
|
|
|
16
18
|
// ============================================================================
|
|
17
19
|
|
|
18
20
|
describe('loadPrompts', () => {
|
|
19
|
-
test('parses valid JSONL file', async () => {
|
|
21
|
+
test('parses valid JSONL file with string input', async () => {
|
|
20
22
|
// Create a temporary test file
|
|
21
23
|
const testPath = '/tmp/test-prompts-valid.jsonl'
|
|
22
24
|
await Bun.write(
|
|
23
25
|
testPath,
|
|
24
26
|
`{"id": "test-1", "input": "What is 2+2?"}
|
|
25
|
-
{"id": "test-2", "input": "Hello world", "
|
|
27
|
+
{"id": "test-2", "input": "Hello world", "hint": "greeting"}`,
|
|
26
28
|
)
|
|
27
29
|
|
|
28
30
|
const prompts = await loadPrompts(testPath)
|
|
@@ -31,7 +33,20 @@ describe('loadPrompts', () => {
|
|
|
31
33
|
expect(prompts[0]?.id).toBe('test-1')
|
|
32
34
|
expect(prompts[0]?.input).toBe('What is 2+2?')
|
|
33
35
|
expect(prompts[1]?.id).toBe('test-2')
|
|
34
|
-
expect(prompts[1]?.
|
|
36
|
+
expect(prompts[1]?.hint).toBe('greeting')
|
|
37
|
+
})
|
|
38
|
+
|
|
39
|
+
test('parses multi-turn input (string array)', async () => {
|
|
40
|
+
const testPath = '/tmp/test-prompts-multiturn.jsonl'
|
|
41
|
+
await Bun.write(testPath, `{"id": "test-1", "input": ["Hello", "How are you?", "Goodbye"], "hint": "farewell"}`)
|
|
42
|
+
|
|
43
|
+
const prompts = await loadPrompts(testPath)
|
|
44
|
+
|
|
45
|
+
expect(prompts).toHaveLength(1)
|
|
46
|
+
expect(prompts[0]?.id).toBe('test-1')
|
|
47
|
+
expect(Array.isArray(prompts[0]?.input)).toBe(true)
|
|
48
|
+
expect(prompts[0]?.input).toEqual(['Hello', 'How are you?', 'Goodbye'])
|
|
49
|
+
expect(prompts[0]?.hint).toBe('farewell')
|
|
35
50
|
})
|
|
36
51
|
|
|
37
52
|
test('parses prompts with metadata', async () => {
|
|
@@ -104,9 +119,9 @@ describe('extractTrajectory', () => {
|
|
|
104
119
|
|
|
105
120
|
expect(trajectory).toHaveLength(1)
|
|
106
121
|
expect(trajectory[0]?.type).toBe('thought')
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
122
|
+
// Type narrowing after explicit assertion
|
|
123
|
+
const step = trajectory[0]!
|
|
124
|
+
expect(step.type === 'thought' && step.content).toBe('Let me think about this...')
|
|
110
125
|
})
|
|
111
126
|
|
|
112
127
|
test('extracts messages from agent_message_chunk notifications', () => {
|
|
@@ -124,9 +139,9 @@ describe('extractTrajectory', () => {
|
|
|
124
139
|
|
|
125
140
|
expect(trajectory).toHaveLength(1)
|
|
126
141
|
expect(trajectory[0]?.type).toBe('message')
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
142
|
+
// Type narrowing after explicit assertion
|
|
143
|
+
const step = trajectory[0]!
|
|
144
|
+
expect(step.type === 'message' && step.content).toBe('Here is my answer.')
|
|
130
145
|
})
|
|
131
146
|
|
|
132
147
|
test('extracts tool calls with initial pending status', () => {
|
|
@@ -147,11 +162,11 @@ describe('extractTrajectory', () => {
|
|
|
147
162
|
|
|
148
163
|
expect(trajectory).toHaveLength(1)
|
|
149
164
|
expect(trajectory[0]?.type).toBe('tool_call')
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
}
|
|
165
|
+
// Type narrowing after explicit assertion
|
|
166
|
+
const step = trajectory[0]!
|
|
167
|
+
expect(step.type === 'tool_call' && step.name).toBe('Read')
|
|
168
|
+
expect(step.type === 'tool_call' && step.status).toBe('pending')
|
|
169
|
+
expect(step.type === 'tool_call' && step.input).toBe('{"file_path": "/test.ts"}')
|
|
155
170
|
})
|
|
156
171
|
|
|
157
172
|
test('updates tool call status on subsequent notifications', () => {
|
|
@@ -181,10 +196,11 @@ describe('extractTrajectory', () => {
|
|
|
181
196
|
|
|
182
197
|
// Should still be 1 entry, just updated
|
|
183
198
|
expect(trajectory).toHaveLength(1)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
199
|
+
expect(trajectory[0]?.type).toBe('tool_call')
|
|
200
|
+
// Type narrowing after explicit assertion
|
|
201
|
+
const step = trajectory[0]!
|
|
202
|
+
expect(step.type === 'tool_call' && step.status).toBe('completed')
|
|
203
|
+
expect(step.type === 'tool_call' && step.output).toBe('file contents here')
|
|
188
204
|
})
|
|
189
205
|
|
|
190
206
|
test('tracks multiple independent tool calls', () => {
|
|
@@ -202,8 +218,13 @@ describe('extractTrajectory', () => {
|
|
|
202
218
|
const trajectory = extractTrajectory(notifications, baseTime)
|
|
203
219
|
|
|
204
220
|
expect(trajectory).toHaveLength(2)
|
|
205
|
-
expect(trajectory[0]?.type
|
|
206
|
-
expect(trajectory[1]?.type
|
|
221
|
+
expect(trajectory[0]?.type).toBe('tool_call')
|
|
222
|
+
expect(trajectory[1]?.type).toBe('tool_call')
|
|
223
|
+
// Type narrowing after explicit assertions
|
|
224
|
+
const step0 = trajectory[0]!
|
|
225
|
+
const step1 = trajectory[1]!
|
|
226
|
+
expect(step0.type === 'tool_call' && step0.name).toBe('Read')
|
|
227
|
+
expect(step1.type === 'tool_call' && step1.name).toBe('Write')
|
|
207
228
|
})
|
|
208
229
|
|
|
209
230
|
test('extracts plan entries', () => {
|
|
@@ -224,9 +245,9 @@ describe('extractTrajectory', () => {
|
|
|
224
245
|
|
|
225
246
|
expect(trajectory).toHaveLength(1)
|
|
226
247
|
expect(trajectory[0]?.type).toBe('plan')
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
248
|
+
// Type narrowing after explicit assertion
|
|
249
|
+
const step = trajectory[0]!
|
|
250
|
+
expect(step.type === 'plan' && step.entries).toHaveLength(2)
|
|
230
251
|
})
|
|
231
252
|
|
|
232
253
|
test('handles empty notifications', () => {
|
|
@@ -237,69 +258,72 @@ describe('extractTrajectory', () => {
|
|
|
237
258
|
test('assigns timestamps relative to start time', () => {
|
|
238
259
|
// Mock Date.now to control timestamps
|
|
239
260
|
const originalNow = Date.now
|
|
240
|
-
|
|
261
|
+
try {
|
|
262
|
+
let currentTime = 1000
|
|
241
263
|
|
|
242
|
-
|
|
264
|
+
Date.now = () => currentTime
|
|
243
265
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
const startTime = 1000
|
|
252
|
-
currentTime = 1500 // 500ms later
|
|
266
|
+
const notifications: SessionNotification[] = [
|
|
267
|
+
{
|
|
268
|
+
sessionId: 's1',
|
|
269
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'First' } },
|
|
270
|
+
},
|
|
271
|
+
]
|
|
253
272
|
|
|
254
|
-
|
|
273
|
+
const startTime = 1000
|
|
274
|
+
currentTime = 1500 // 500ms later
|
|
255
275
|
|
|
256
|
-
|
|
276
|
+
const trajectory = extractTrajectory(notifications, startTime)
|
|
257
277
|
|
|
258
|
-
|
|
259
|
-
|
|
278
|
+
expect(trajectory[0]?.timestamp).toBe(500)
|
|
279
|
+
} finally {
|
|
280
|
+
Date.now = originalNow
|
|
281
|
+
}
|
|
260
282
|
})
|
|
261
283
|
|
|
262
284
|
test('calculates tool call duration correctly', () => {
|
|
263
285
|
const originalNow = Date.now
|
|
264
|
-
|
|
286
|
+
try {
|
|
287
|
+
let currentTime = 1000
|
|
265
288
|
|
|
266
|
-
|
|
289
|
+
Date.now = () => currentTime
|
|
267
290
|
|
|
268
|
-
|
|
291
|
+
const startTime = 1000
|
|
269
292
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
293
|
+
// Simulate time passing between notifications
|
|
294
|
+
// First notification at t=100 (currentTime = 1100)
|
|
295
|
+
// Second notification at t=600 (currentTime = 1600)
|
|
296
|
+
const notifications: SessionNotification[] = []
|
|
274
297
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
298
|
+
currentTime = 1100 // First call at 100ms relative to start
|
|
299
|
+
notifications.push({
|
|
300
|
+
sessionId: 's1',
|
|
301
|
+
update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'pending' },
|
|
302
|
+
})
|
|
280
303
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
304
|
+
currentTime = 1600 // Second call at 600ms relative to start
|
|
305
|
+
notifications.push({
|
|
306
|
+
sessionId: 's1',
|
|
307
|
+
update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'completed' },
|
|
308
|
+
})
|
|
286
309
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
310
|
+
// Now process all notifications in one call
|
|
311
|
+
// But the issue is extractTrajectory calls Date.now() for each notification
|
|
312
|
+
// so we need to mock it to return different values for each call
|
|
290
313
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
314
|
+
let callCount = 0
|
|
315
|
+
const times = [1100, 1600]
|
|
316
|
+
Date.now = () => times[callCount++] ?? 1600
|
|
294
317
|
|
|
295
|
-
|
|
318
|
+
const trajectory = extractTrajectory(notifications, startTime)
|
|
296
319
|
|
|
297
|
-
|
|
298
|
-
// Duration should be 500ms (600 - 100)
|
|
299
|
-
|
|
320
|
+
expect(trajectory[0]?.type).toBe('tool_call')
|
|
321
|
+
// Type narrowing after explicit assertion - Duration should be 500ms (600 - 100)
|
|
322
|
+
const step = trajectory[0]!
|
|
323
|
+
expect(step.type === 'tool_call' && step.duration).toBe(500)
|
|
324
|
+
} finally {
|
|
325
|
+
Date.now = originalNow
|
|
300
326
|
}
|
|
301
|
-
|
|
302
|
-
Date.now = originalNow
|
|
303
327
|
})
|
|
304
328
|
|
|
305
329
|
test('ignores non-text content in thought chunks', () => {
|
|
@@ -551,3 +575,141 @@ describe('extractContent', () => {
|
|
|
551
575
|
expect(extractContent(input)).toBe('line1\nline2\nline3')
|
|
552
576
|
})
|
|
553
577
|
})
|
|
578
|
+
|
|
579
|
+
// ============================================================================
|
|
580
|
+
// detectTrajectoryRichness
|
|
581
|
+
// ============================================================================
|
|
582
|
+
|
|
583
|
+
describe('detectTrajectoryRichness', () => {
|
|
584
|
+
test('returns "full" when trajectory has thoughts', () => {
|
|
585
|
+
const trajectory: TrajectoryStep[] = [
|
|
586
|
+
{ type: 'thought', content: 'Let me think...', timestamp: 0 },
|
|
587
|
+
{ type: 'message', content: 'Answer', timestamp: 100 },
|
|
588
|
+
]
|
|
589
|
+
|
|
590
|
+
expect(detectTrajectoryRichness(trajectory)).toBe('full')
|
|
591
|
+
})
|
|
592
|
+
|
|
593
|
+
test('returns "full" when trajectory has tool calls', () => {
|
|
594
|
+
const trajectory: TrajectoryStep[] = [
|
|
595
|
+
{ type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
|
|
596
|
+
{ type: 'message', content: 'Answer', timestamp: 100 },
|
|
597
|
+
]
|
|
598
|
+
|
|
599
|
+
expect(detectTrajectoryRichness(trajectory)).toBe('full')
|
|
600
|
+
})
|
|
601
|
+
|
|
602
|
+
test('returns "full" when trajectory has plans', () => {
|
|
603
|
+
const trajectory: TrajectoryStep[] = [
|
|
604
|
+
{ type: 'plan', entries: [{ content: 'Step 1', status: 'completed' }], timestamp: 0 },
|
|
605
|
+
{ type: 'message', content: 'Answer', timestamp: 100 },
|
|
606
|
+
]
|
|
607
|
+
|
|
608
|
+
expect(detectTrajectoryRichness(trajectory)).toBe('full')
|
|
609
|
+
})
|
|
610
|
+
|
|
611
|
+
test('returns "messages-only" when trajectory only has messages', () => {
|
|
612
|
+
const trajectory: TrajectoryStep[] = [
|
|
613
|
+
{ type: 'message', content: 'First', timestamp: 0 },
|
|
614
|
+
{ type: 'message', content: 'Second', timestamp: 100 },
|
|
615
|
+
]
|
|
616
|
+
|
|
617
|
+
expect(detectTrajectoryRichness(trajectory)).toBe('messages-only')
|
|
618
|
+
})
|
|
619
|
+
|
|
620
|
+
test('returns "minimal" when trajectory is empty', () => {
|
|
621
|
+
expect(detectTrajectoryRichness([])).toBe('minimal')
|
|
622
|
+
})
|
|
623
|
+
|
|
624
|
+
test('returns "full" when trajectory has mixed rich content', () => {
|
|
625
|
+
const trajectory: TrajectoryStep[] = [
|
|
626
|
+
{ type: 'thought', content: 'Thinking...', timestamp: 0 },
|
|
627
|
+
{ type: 'tool_call', name: 'Read', status: 'completed', timestamp: 50 },
|
|
628
|
+
{ type: 'plan', entries: [], timestamp: 100 },
|
|
629
|
+
{ type: 'message', content: 'Done', timestamp: 150 },
|
|
630
|
+
]
|
|
631
|
+
|
|
632
|
+
expect(detectTrajectoryRichness(trajectory)).toBe('full')
|
|
633
|
+
})
|
|
634
|
+
})
|
|
635
|
+
|
|
636
|
+
// ============================================================================
|
|
637
|
+
// extractTokenCounts
|
|
638
|
+
// ============================================================================
|
|
639
|
+
|
|
640
|
+
describe('extractTokenCounts', () => {
|
|
641
|
+
test('returns undefined when no usage data present', () => {
|
|
642
|
+
const updates: SessionNotification[] = [
|
|
643
|
+
{
|
|
644
|
+
sessionId: 's1',
|
|
645
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
|
|
646
|
+
},
|
|
647
|
+
]
|
|
648
|
+
|
|
649
|
+
const result = extractTokenCounts(updates)
|
|
650
|
+
|
|
651
|
+
expect(result.inputTokens).toBeUndefined()
|
|
652
|
+
expect(result.outputTokens).toBeUndefined()
|
|
653
|
+
})
|
|
654
|
+
|
|
655
|
+
test('extracts token counts from usage field when present', () => {
|
|
656
|
+
const updates: SessionNotification[] = [
|
|
657
|
+
{
|
|
658
|
+
sessionId: 's1',
|
|
659
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
|
|
660
|
+
// @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
|
|
661
|
+
usage: { inputTokens: 50, outputTokens: 30 },
|
|
662
|
+
},
|
|
663
|
+
]
|
|
664
|
+
|
|
665
|
+
const result = extractTokenCounts(updates)
|
|
666
|
+
|
|
667
|
+
expect(result.inputTokens).toBe(50)
|
|
668
|
+
expect(result.outputTokens).toBe(30)
|
|
669
|
+
})
|
|
670
|
+
|
|
671
|
+
test('accumulates token counts across multiple updates', () => {
|
|
672
|
+
const updates: SessionNotification[] = [
|
|
673
|
+
{
|
|
674
|
+
sessionId: 's1',
|
|
675
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'First' } },
|
|
676
|
+
// @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
|
|
677
|
+
usage: { inputTokens: 50, outputTokens: 30 },
|
|
678
|
+
},
|
|
679
|
+
{
|
|
680
|
+
sessionId: 's1',
|
|
681
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Second' } },
|
|
682
|
+
// @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
|
|
683
|
+
usage: { inputTokens: 25, outputTokens: 45 },
|
|
684
|
+
},
|
|
685
|
+
]
|
|
686
|
+
|
|
687
|
+
const result = extractTokenCounts(updates)
|
|
688
|
+
|
|
689
|
+
expect(result.inputTokens).toBe(75) // 50 + 25
|
|
690
|
+
expect(result.outputTokens).toBe(75) // 30 + 45
|
|
691
|
+
})
|
|
692
|
+
|
|
693
|
+
test('handles empty updates array', () => {
|
|
694
|
+
const result = extractTokenCounts([])
|
|
695
|
+
|
|
696
|
+
expect(result.inputTokens).toBeUndefined()
|
|
697
|
+
expect(result.outputTokens).toBeUndefined()
|
|
698
|
+
})
|
|
699
|
+
|
|
700
|
+
test('handles partial token counts (only input or output)', () => {
|
|
701
|
+
const updates: SessionNotification[] = [
|
|
702
|
+
{
|
|
703
|
+
sessionId: 's1',
|
|
704
|
+
update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
|
|
705
|
+
// @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
|
|
706
|
+
usage: { inputTokens: 100 },
|
|
707
|
+
},
|
|
708
|
+
]
|
|
709
|
+
|
|
710
|
+
const result = extractTokenCounts(updates)
|
|
711
|
+
|
|
712
|
+
expect(result.inputTokens).toBe(100)
|
|
713
|
+
expect(result.outputTokens).toBeUndefined()
|
|
714
|
+
})
|
|
715
|
+
})
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import {
|
|
3
|
+
ACP_METHODS,
|
|
4
|
+
ACP_PROTOCOL_VERSION,
|
|
5
|
+
DEFAULT_ACP_CLIENT_NAME,
|
|
6
|
+
DEFAULT_ACP_TIMEOUT,
|
|
7
|
+
DEFAULT_CALIBRATION_SAMPLE_SIZE,
|
|
8
|
+
DEFAULT_HARNESS_TIMEOUT,
|
|
9
|
+
DEFAULT_POLLING_INTERVAL,
|
|
10
|
+
DEFAULT_TRIAL_COUNT,
|
|
11
|
+
HEAD_LINES,
|
|
12
|
+
JSON_RPC_ERRORS,
|
|
13
|
+
MAX_CONTENT_LENGTH,
|
|
14
|
+
TAIL_LINES,
|
|
15
|
+
} from '../constants.ts'
|
|
16
|
+
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// ACP Protocol Constants
|
|
19
|
+
// ============================================================================
|
|
20
|
+
|
|
21
|
+
describe('ACP_METHODS', () => {
|
|
22
|
+
test('contains all required lifecycle methods', () => {
|
|
23
|
+
expect(ACP_METHODS.INITIALIZE).toBe('initialize')
|
|
24
|
+
expect(ACP_METHODS.SHUTDOWN).toBe('shutdown')
|
|
25
|
+
})
|
|
26
|
+
|
|
27
|
+
test('contains all required session methods', () => {
|
|
28
|
+
expect(ACP_METHODS.CREATE_SESSION).toBe('session/new')
|
|
29
|
+
expect(ACP_METHODS.LOAD_SESSION).toBe('session/load')
|
|
30
|
+
expect(ACP_METHODS.PROMPT).toBe('session/prompt')
|
|
31
|
+
expect(ACP_METHODS.CANCEL).toBe('session/cancel')
|
|
32
|
+
expect(ACP_METHODS.UPDATE).toBe('session/update')
|
|
33
|
+
expect(ACP_METHODS.REQUEST_PERMISSION).toBe('session/request_permission')
|
|
34
|
+
expect(ACP_METHODS.SET_MODEL).toBe('session/set_model')
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
test('contains protocol-level methods', () => {
|
|
38
|
+
expect(ACP_METHODS.CANCEL_REQUEST).toBe('$/cancel_request')
|
|
39
|
+
})
|
|
40
|
+
})
|
|
41
|
+
|
|
42
|
+
describe('ACP_PROTOCOL_VERSION', () => {
|
|
43
|
+
test('is version 1', () => {
|
|
44
|
+
expect(ACP_PROTOCOL_VERSION).toBe(1)
|
|
45
|
+
})
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
// ============================================================================
|
|
49
|
+
// JSON-RPC Error Codes
|
|
50
|
+
// ============================================================================
|
|
51
|
+
|
|
52
|
+
describe('JSON_RPC_ERRORS', () => {
|
|
53
|
+
test('contains standard JSON-RPC error codes', () => {
|
|
54
|
+
expect(JSON_RPC_ERRORS.PARSE_ERROR).toBe(-32700)
|
|
55
|
+
expect(JSON_RPC_ERRORS.INVALID_REQUEST).toBe(-32600)
|
|
56
|
+
expect(JSON_RPC_ERRORS.METHOD_NOT_FOUND).toBe(-32601)
|
|
57
|
+
expect(JSON_RPC_ERRORS.INVALID_PARAMS).toBe(-32602)
|
|
58
|
+
expect(JSON_RPC_ERRORS.INTERNAL_ERROR).toBe(-32603)
|
|
59
|
+
})
|
|
60
|
+
|
|
61
|
+
test('contains ACP extension error codes', () => {
|
|
62
|
+
expect(JSON_RPC_ERRORS.REQUEST_CANCELLED).toBe(-32800)
|
|
63
|
+
})
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
// ============================================================================
|
|
67
|
+
// ACP Client Defaults
|
|
68
|
+
// ============================================================================
|
|
69
|
+
|
|
70
|
+
describe('ACP Client defaults', () => {
|
|
71
|
+
test('DEFAULT_ACP_CLIENT_NAME is set', () => {
|
|
72
|
+
expect(DEFAULT_ACP_CLIENT_NAME).toBe('plaited-acp-client')
|
|
73
|
+
})
|
|
74
|
+
|
|
75
|
+
test('DEFAULT_ACP_TIMEOUT is 30 seconds', () => {
|
|
76
|
+
expect(DEFAULT_ACP_TIMEOUT).toBe(30000)
|
|
77
|
+
})
|
|
78
|
+
|
|
79
|
+
test('DEFAULT_POLLING_INTERVAL is 50ms', () => {
|
|
80
|
+
expect(DEFAULT_POLLING_INTERVAL).toBe(50)
|
|
81
|
+
})
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
// ============================================================================
|
|
85
|
+
// Harness Preview Configuration
|
|
86
|
+
// ============================================================================
|
|
87
|
+
|
|
88
|
+
describe('Preview configuration', () => {
|
|
89
|
+
test('HEAD_LINES is positive', () => {
|
|
90
|
+
expect(HEAD_LINES).toBeGreaterThan(0)
|
|
91
|
+
expect(HEAD_LINES).toBe(8)
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
test('TAIL_LINES is positive', () => {
|
|
95
|
+
expect(TAIL_LINES).toBeGreaterThan(0)
|
|
96
|
+
expect(TAIL_LINES).toBe(4)
|
|
97
|
+
})
|
|
98
|
+
|
|
99
|
+
test('MAX_CONTENT_LENGTH is reasonable', () => {
|
|
100
|
+
expect(MAX_CONTENT_LENGTH).toBeGreaterThan(0)
|
|
101
|
+
expect(MAX_CONTENT_LENGTH).toBe(500)
|
|
102
|
+
})
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
// ============================================================================
|
|
106
|
+
// Harness Defaults
|
|
107
|
+
// ============================================================================
|
|
108
|
+
|
|
109
|
+
describe('Harness defaults', () => {
|
|
110
|
+
test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => {
|
|
111
|
+
expect(DEFAULT_HARNESS_TIMEOUT).toBe(60000)
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('DEFAULT_TRIAL_COUNT is 5', () => {
|
|
115
|
+
expect(DEFAULT_TRIAL_COUNT).toBe(5)
|
|
116
|
+
})
|
|
117
|
+
|
|
118
|
+
test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => {
|
|
119
|
+
expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(10)
|
|
120
|
+
})
|
|
121
|
+
})
|
|
@@ -10,10 +10,10 @@ def main():
|
|
|
10
10
|
data = json.load(sys.stdin)
|
|
11
11
|
|
|
12
12
|
output = data.get("output", "").lower()
|
|
13
|
-
|
|
13
|
+
hint = (data.get("hint") or "").lower()
|
|
14
14
|
|
|
15
|
-
if
|
|
16
|
-
pass_result =
|
|
15
|
+
if hint:
|
|
16
|
+
pass_result = hint in output
|
|
17
17
|
else:
|
|
18
18
|
pass_result = True
|
|
19
19
|
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
import type { Grader } from '../../schemas.ts'
|
|
6
6
|
|
|
7
|
-
export const grade: Grader = async ({ input: _input, output,
|
|
8
|
-
const pass =
|
|
7
|
+
export const grade: Grader = async ({ input: _input, output, hint }) => {
|
|
8
|
+
const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true
|
|
9
9
|
return {
|
|
10
10
|
pass,
|
|
11
11
|
score: pass ? 1.0 : 0.0,
|
|
@@ -15,7 +15,7 @@ describe('loadGrader - module graders', () => {
|
|
|
15
15
|
const result = await grader({
|
|
16
16
|
input: 'What is 2+2?',
|
|
17
17
|
output: 'The answer is 4',
|
|
18
|
-
|
|
18
|
+
hint: '4',
|
|
19
19
|
})
|
|
20
20
|
|
|
21
21
|
expect(result.pass).toBe(true)
|
|
@@ -45,7 +45,7 @@ describe('loadGrader - executable graders', () => {
|
|
|
45
45
|
const result = await grader({
|
|
46
46
|
input: 'What is 2+2?',
|
|
47
47
|
output: 'The answer is 4',
|
|
48
|
-
|
|
48
|
+
hint: '4',
|
|
49
49
|
})
|
|
50
50
|
|
|
51
51
|
expect(result.pass).toBe(true)
|
|
@@ -59,7 +59,7 @@ describe('loadGrader - executable graders', () => {
|
|
|
59
59
|
const result = await grader({
|
|
60
60
|
input: 'What is 2+2?',
|
|
61
61
|
output: 'I do not know',
|
|
62
|
-
|
|
62
|
+
hint: '4',
|
|
63
63
|
})
|
|
64
64
|
|
|
65
65
|
expect(result.pass).toBe(false)
|
|
@@ -126,7 +126,7 @@ describe('loadGrader - trajectory support', () => {
|
|
|
126
126
|
const result = await grader({
|
|
127
127
|
input: 'test',
|
|
128
128
|
output: 'The answer is 4',
|
|
129
|
-
|
|
129
|
+
hint: '4',
|
|
130
130
|
trajectory,
|
|
131
131
|
})
|
|
132
132
|
|
|
@@ -144,7 +144,7 @@ describe('loadGrader - trajectory support', () => {
|
|
|
144
144
|
const result = await grader({
|
|
145
145
|
input: 'test',
|
|
146
146
|
output: 'The answer is 4',
|
|
147
|
-
|
|
147
|
+
hint: '4',
|
|
148
148
|
trajectory,
|
|
149
149
|
})
|
|
150
150
|
|