@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
package/src/core.ts
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core utilities re-export.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Public API for core utilities. Import from here for external use.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
export {
|
|
11
|
+
// Trajectory
|
|
12
|
+
detectTrajectoryRichness,
|
|
13
|
+
extractContent,
|
|
14
|
+
extractFilePath,
|
|
15
|
+
extractOutput,
|
|
16
|
+
extractTrajectory,
|
|
17
|
+
// Output
|
|
18
|
+
getInputPreview,
|
|
19
|
+
hasToolErrors,
|
|
20
|
+
headTailPreview,
|
|
21
|
+
// Loading
|
|
22
|
+
loadJsonl,
|
|
23
|
+
loadPrompts,
|
|
24
|
+
loadResults,
|
|
25
|
+
logProgress,
|
|
26
|
+
resolvePath,
|
|
27
|
+
writeOutput,
|
|
28
|
+
} from './core/core.ts'
|
package/src/harness.ts
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Harness commands for agent evaluation.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-exports all harness command modules for programmatic use.
|
|
6
|
+
* For CLI usage, run `agent-eval-harness <command> --help`.
|
|
7
|
+
*
|
|
8
|
+
* **Commands:**
|
|
9
|
+
* - `capture` - Core trajectory capture
|
|
10
|
+
* - `trials` - Multi-run pass@k/pass^k analysis
|
|
11
|
+
* - `summarize` - Derive compact views from results
|
|
12
|
+
* - `calibrate` - Sample failures for grader review
|
|
13
|
+
* - `validateRefs` - Check reference solutions
|
|
14
|
+
* - `balance` - Analyze test set coverage
|
|
15
|
+
* - `schemasCli` - Export JSON schemas
|
|
16
|
+
* - `headless` - Schema-driven adapter for headless CLI agents
|
|
17
|
+
*
|
|
18
|
+
* @packageDocumentation
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
export type { BalanceConfig } from './commands/balance.ts'
|
|
22
|
+
export { balance, runBalance } from './commands/balance.ts'
|
|
23
|
+
export type { CalibrateConfig } from './commands/calibrate.ts'
|
|
24
|
+
export { calibrate, runCalibrate } from './commands/calibrate.ts'
|
|
25
|
+
// Config types
|
|
26
|
+
export type { CaptureConfig } from './commands/capture.ts'
|
|
27
|
+
// Command implementations (for programmatic use)
|
|
28
|
+
export {
|
|
29
|
+
capture,
|
|
30
|
+
extractOutput,
|
|
31
|
+
extractTrajectory,
|
|
32
|
+
hasToolErrors,
|
|
33
|
+
loadPrompts,
|
|
34
|
+
runCapture,
|
|
35
|
+
} from './commands/capture.ts'
|
|
36
|
+
export type { SummarizeConfig } from './commands/summarize.ts'
|
|
37
|
+
export { runSummarize, summarize } from './commands/summarize.ts'
|
|
38
|
+
export type { TrialsConfig } from './commands/trials.ts'
|
|
39
|
+
export { runTrials, trials } from './commands/trials.ts'
|
|
40
|
+
export type { ValidateRefsConfig } from './commands/validate-refs.ts'
|
|
41
|
+
export { runValidateRefs, validateRefs } from './commands/validate-refs.ts'
|
|
42
|
+
export type { HeadlessAdapterConfig } from './headless.ts'
|
|
43
|
+
// Headless adapter factory
|
|
44
|
+
export { headless } from './headless.ts'
|
|
45
|
+
export type { SchemasConfig } from './schemas/schemas-cli.ts'
|
|
46
|
+
export { runSchemas, schemasCli } from './schemas/schemas-cli.ts'
|
|
package/src/headless/headless-cli.ts
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Headless adapter factory CLI entry point.
|
|
4
|
+
*
|
|
5
|
+
* @remarks
|
|
6
|
+
* This module implements a schema-driven adapter that can interact with
|
|
7
|
+
* ANY headless CLI agent. The adapter:
|
|
8
|
+
*
|
|
9
|
+
* 1. Reads a JSON schema defining how to interact with the CLI
|
|
10
|
+
* 2. Spawns the CLI process per schema's command + flags
|
|
11
|
+
* 3. Parses stdout using schema's outputEvents mappings
|
|
12
|
+
* 4. Emits session update notifications
|
|
13
|
+
* 5. Manages session state for multi-turn (stream or iterative mode)
|
|
14
|
+
*
|
|
15
|
+
* @packageDocumentation
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { createInterface } from 'node:readline'
|
|
19
|
+
import { parseArgs } from 'node:util'
|
|
20
|
+
import { PROTOCOL_VERSION } from '../schemas/constants.ts'
|
|
21
|
+
import { type HeadlessAdapterConfig, parseHeadlessConfig } from './headless.schemas.ts'
|
|
22
|
+
import { createSessionManager, type SessionManager } from './headless-session-manager.ts'
|
|
23
|
+
|
|
24
|
+
// ============================================================================
|
|
25
|
+
// Types
|
|
26
|
+
// ============================================================================
|
|
27
|
+
|
|
28
|
+
/** JSON-RPC 2.0 request */
|
|
29
|
+
type JsonRpcRequest = {
|
|
30
|
+
jsonrpc: '2.0'
|
|
31
|
+
id: string | number
|
|
32
|
+
method: string
|
|
33
|
+
params?: unknown
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/** JSON-RPC 2.0 notification */
|
|
37
|
+
type JsonRpcNotification = {
|
|
38
|
+
jsonrpc: '2.0'
|
|
39
|
+
method: string
|
|
40
|
+
params?: unknown
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/** JSON-RPC 2.0 success response */
|
|
44
|
+
type JsonRpcSuccessResponse = {
|
|
45
|
+
jsonrpc: '2.0'
|
|
46
|
+
id: string | number
|
|
47
|
+
result: unknown
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/** JSON-RPC 2.0 error response */
|
|
51
|
+
type JsonRpcErrorResponse = {
|
|
52
|
+
jsonrpc: '2.0'
|
|
53
|
+
id: string | number | null
|
|
54
|
+
error: {
|
|
55
|
+
code: number
|
|
56
|
+
message: string
|
|
57
|
+
data?: unknown
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/** JSON-RPC 2.0 response */
|
|
62
|
+
type JsonRpcResponse = JsonRpcSuccessResponse | JsonRpcErrorResponse
|
|
63
|
+
|
|
64
|
+
/** Content block for prompts */
|
|
65
|
+
type ContentBlock = { type: 'text'; text: string } | { type: 'image'; source: unknown }
|
|
66
|
+
|
|
67
|
+
// ============================================================================
|
|
68
|
+
// Message Sending
|
|
69
|
+
// ============================================================================
|
|
70
|
+
|
|
71
|
+
/**
 * Writes one JSON-RPC message to stdout as a single serialized line.
 *
 * @param message - Response or notification to emit
 */
const sendMessage = (message: JsonRpcResponse | JsonRpcNotification): void => {
  const serialized = JSON.stringify(message)
  // biome-ignore lint/suspicious/noConsole: Protocol output
  console.log(serialized)
}
|
|
78
|
+
|
|
79
|
+
/**
 * Emits a `session/update` notification for the given session.
 *
 * @param sessionId - Identifier of the session the update belongs to
 * @param update - Update payload, forwarded unchanged inside `params`
 */
const sendSessionUpdate = (sessionId: string, update: unknown): void => {
  const notification: JsonRpcNotification = {
    jsonrpc: '2.0',
    method: 'session/update',
    params: { sessionId, update },
  }
  sendMessage(notification)
}
|
|
89
|
+
|
|
90
|
+
// ============================================================================
|
|
91
|
+
// Request Handlers
|
|
92
|
+
// ============================================================================
|
|
93
|
+
|
|
94
|
+
/**
 * Builds the JSON-RPC handler functions used by the headless adapter.
 *
 * @remarks
 * Every handler receives the raw `params` of a message and casts it to the
 * shape it expects; no runtime validation happens here, so malformed params
 * surface as thrown errors for the caller to report.
 *
 * @param schema - Headless adapter configuration
 * @param sessions - Session manager instance
 * @returns The initialize, session/new, session/load, session/prompt and session/cancel handlers
 */
const createHandlers = (schema: HeadlessAdapterConfig, sessions: SessionManager) => {
  /**
   * `initialize`: accepts only the protocol version this adapter was built for
   * and answers with the agent's identity and capabilities.
   */
  const handleInitialize = async (params: unknown): Promise<unknown> => {
    const { protocolVersion } = params as { protocolVersion: number }

    if (protocolVersion === PROTOCOL_VERSION) {
      const agentInfo = { name: schema.name, version: '1.0.0' }
      const agentCapabilities = {
        // Session loading is advertised only when the schema configures resume
        loadSession: Boolean(schema.resume),
        promptCapabilities: { image: false },
      }
      return { protocolVersion: PROTOCOL_VERSION, agentInfo, agentCapabilities }
    }

    throw new Error(`Unsupported protocol version: ${protocolVersion}`)
  }

  /**
   * `session/new`: creates a session for the given working directory.
   */
  const handleSessionNew = async (params: unknown): Promise<unknown> => {
    const { cwd } = params as { cwd: string }
    const created = await sessions.create(cwd)
    return { sessionId: created.id }
  }

  /**
   * `session/load`: confirms that the session is known to the session manager.
   */
  const handleSessionLoad = async (params: unknown): Promise<unknown> => {
    const { sessionId } = params as { sessionId: string }

    if (sessions.get(sessionId)) {
      return { sessionId }
    }

    throw new Error(`Session not found: ${sessionId}`)
  }

  /**
   * `session/prompt`: runs the prompt text and streams each parsed update as a
   * `session/update` notification while it executes.
   */
  const handleSessionPrompt = async (params: unknown): Promise<unknown> => {
    const { sessionId, prompt } = params as { sessionId: string; prompt: ContentBlock[] }

    // Only text blocks contribute to the prompt; they are joined one per line
    const textBlocks = prompt.filter((block): block is ContentBlock & { type: 'text' } => block.type === 'text')
    const promptText = textBlocks.map(({ text }) => text).join('\n')

    const result = await sessions.prompt(sessionId, promptText, (update) => {
      sendSessionUpdate(sessionId, mapToSessionUpdate(update))
    })

    const content = [{ type: 'text', text: result.output }]
    return { content }
  }

  /**
   * `session/cancel` notification: forwards the cancellation to the session manager.
   */
  const handleSessionCancel = async (params: unknown): Promise<void> => {
    const { sessionId } = params as { sessionId: string }
    sessions.cancel(sessionId)
  }

  return { handleInitialize, handleSessionNew, handleSessionLoad, handleSessionPrompt, handleSessionCancel }
}
|
|
189
|
+
|
|
190
|
+
/**
 * Converts a parsed output update into the session update payload.
 *
 * @remarks
 * `thought`, `plan` and `tool_call` have dedicated payloads. `message` and
 * every unrecognized type are reported as an agent message chunk.
 *
 * @param update - Parsed update with its type and optional content, title and status
 * @returns Payload carried by a `session/update` notification
 */
const mapToSessionUpdate = (update: { type: string; content?: string; title?: string; status?: string }): unknown => {
  // Text payload shared by every non-tool update; missing content becomes ''
  const asText = () => ({ type: 'text', text: update.content ?? '' })

  if (update.type === 'tool_call') {
    return {
      sessionUpdate: 'agent_tool_call',
      toolCall: {
        name: update.title ?? 'unknown',
        status: update.status ?? 'pending',
      },
    }
  }

  if (update.type === 'thought') {
    return { sessionUpdate: 'agent_thought_chunk', content: asText() }
  }

  if (update.type === 'plan') {
    return { sessionUpdate: 'agent_plan', content: asText() }
  }

  // 'message' and anything unknown
  return { sessionUpdate: 'agent_message_chunk', content: asText() }
}
|
|
229
|
+
|
|
230
|
+
// ============================================================================
|
|
231
|
+
// Main Loop
|
|
232
|
+
// ============================================================================
|
|
233
|
+
|
|
234
|
+
/**
 * Runs the headless adapter main loop.
 *
 * @remarks
 * Reads one JSON-RPC 2.0 message per stdin line and writes responses to
 * stdout. Messages without an `id` member are notifications and are never
 * answered. Lines are dispatched as they arrive and are not awaited, so
 * several messages may be in flight at once.
 *
 * Error responses:
 * - `-32700` when the line is not valid JSON
 * - `-32600` when the JSON is not an object with a string `method`
 * - `-32601` when no request handler is registered for the method
 * - `-32603` when a request handler throws
 *
 * The function returns once the listeners are registered; the process keeps
 * running until stdin closes or SIGTERM/SIGINT is received.
 *
 * @param schema - Headless adapter configuration
 * @param verbose - Whether to show debug output
 */
const runAdapter = async (schema: HeadlessAdapterConfig, verbose = false): Promise<void> => {
  const sessions = createSessionManager({ schema, verbose })
  const handlers = createHandlers(schema, sessions)

  // Maps instead of plain objects: a method name such as "toString" or
  // "constructor" must not resolve to an inherited Object.prototype member.

  // Method handlers (requests expect responses)
  const methodHandlers = new Map<string, (params: unknown) => Promise<unknown>>([
    ['initialize', handlers.handleInitialize],
    ['session/new', handlers.handleSessionNew],
    ['session/load', handlers.handleSessionLoad],
    ['session/prompt', handlers.handleSessionPrompt],
  ])

  // Notification handlers (no response expected)
  const notificationHandlers = new Map<string, (params: unknown) => Promise<void>>([
    ['session/cancel', handlers.handleSessionCancel],
  ])

  /**
   * Process incoming JSON-RPC message.
   *
   * Never rejects for malformed input or handler failures: this function is
   * used as an event listener, where a rejection would be unhandled.
   */
  const processMessage = async (line: string): Promise<void> => {
    let parsed: unknown

    try {
      parsed = JSON.parse(line)
    } catch {
      sendMessage({
        jsonrpc: '2.0',
        id: null,
        error: { code: -32700, message: 'Parse error' },
      })
      return
    }

    // Valid JSON is not necessarily a request: primitives, null and arrays
    // (batches are not supported) are rejected before any member access.
    const message =
      typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)
        ? (parsed as Record<string, unknown>)
        : undefined
    const method = message?.method

    if (!message || typeof method !== 'string') {
      const rawId = message?.id
      sendMessage({
        jsonrpc: '2.0',
        // Echo the id only when it could be determined; otherwise null
        id: typeof rawId === 'string' || typeof rawId === 'number' ? rawId : null,
        error: { code: -32600, message: 'Invalid Request' },
      })
      return
    }

    // Check if it's a notification (no id)
    if (!('id' in message)) {
      const notify = notificationHandlers.get(method)
      if (notify) {
        try {
          await notify(message.params)
        } catch (error) {
          // Notifications are never answered, so the failure goes to stderr
          // (stdout is reserved for protocol output).
          console.error(
            `Error: notification "${method}" failed: ${error instanceof Error ? error.message : String(error)}`,
          )
        }
      }
      // No response for notifications
      return
    }

    // It's a request - send response
    const id = message.id as string | number
    const handler = methodHandlers.get(method)

    if (!handler) {
      sendMessage({
        jsonrpc: '2.0',
        id,
        error: { code: -32601, message: `Method not found: ${method}` },
      })
      return
    }

    try {
      const result = await handler(message.params)
      sendMessage({
        jsonrpc: '2.0',
        id,
        result,
      })
    } catch (error) {
      sendMessage({
        jsonrpc: '2.0',
        id,
        error: {
          code: -32603,
          message: error instanceof Error ? error.message : 'Internal error',
        },
      })
    }
  }

  // Main loop: read lines from stdin
  const rl = createInterface({
    input: process.stdin,
    output: process.stdout,
    terminal: false,
  })

  rl.on('line', processMessage)

  // Handle clean shutdown
  process.on('SIGTERM', () => {
    rl.close()
    process.exit(0)
  })

  process.on('SIGINT', () => {
    rl.close()
    process.exit(0)
  })
}
|
|
338
|
+
|
|
339
|
+
// ============================================================================
|
|
340
|
+
// CLI Entry Point
|
|
341
|
+
// ============================================================================
|
|
342
|
+
|
|
343
|
+
/**
 * CLI entry point for the headless adapter.
 *
 * @remarks
 * Parses `--schema`, `--verbose` and `--help`. With `--help` the usage text is
 * printed and the function returns. Otherwise the schema file is loaded and
 * validated, and the adapter main loop is started. A missing `--schema`, a
 * missing file, or an invalid schema prints an error and exits with code 1.
 *
 * @param args - Command line arguments
 */
export const headless = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      schema: { type: 'string', short: 's' },
      verbose: { type: 'boolean', short: 'v' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: false,
  })
  const { schema: schemaPath, verbose, help } = parsed.values

  if (help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness headless --schema <path> [--verbose]

Arguments:
-s, --schema Path to headless adapter schema (JSON)
-v, --verbose Show constructed commands (for debugging)
-h, --help Show this help message

Description:
Schema-driven adapter for ANY headless CLI agent. The adapter reads
a JSON schema defining how to interact with the CLI and translates between
protocol and CLI stdio.

Schema Format:
{
"version": 1,
"name": "my-agent",
"command": ["my-agent-cli"],
"sessionMode": "stream" | "iterative",
"prompt": { "flag": "-p" },
"output": { "flag": "--output-format", "value": "stream-json" },
"outputEvents": [...],
"result": { "matchPath": "$.type", "matchValue": "result", "contentPath": "$.content" }
}

Examples:
# Run with Claude headless schema
agent-eval-harness headless --schema ./claude-headless.json

# Use in capture pipeline
agent-eval-harness capture prompts.jsonl --schema ./claude-headless.json -o results.jsonl
`)
    return
  }

  if (!schemaPath) {
    console.error('Error: --schema is required')
    console.error('Example: agent-eval-harness headless --schema ./my-agent.json')
    process.exit(1)
  }

  // Load and validate schema
  const schemaFile = Bun.file(schemaPath)
  const schemaFound = await schemaFile.exists()

  if (!schemaFound) {
    console.error(`Error: schema file not found: ${schemaPath}`)
    process.exit(1)
  }

  let schema: HeadlessAdapterConfig
  try {
    schema = parseHeadlessConfig(await schemaFile.json())
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.error(`Error: invalid schema: ${reason}`)
    process.exit(1)
  }

  // Run the adapter
  await runAdapter(schema, verbose ?? false)
}
|
|
423
|
+
|
|
424
|
+
// Direct execution: run the CLI with the user-supplied arguments and exit
// with code 1 if it rejects.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2)
  headless(cliArgs).catch((error) => {
    const detail = error instanceof Error ? error.message : error
    console.error('Error:', detail)
    process.exit(1)
  })
}
|
|
package/src/headless/headless-history-builder.ts
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* History builder for iterative mode sessions.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* In iterative mode, each prompt spawns a new process. The history builder
|
|
6
|
+
* accumulates conversation context and formats it using the schema's
|
|
7
|
+
* historyTemplate for inclusion in subsequent prompts.
|
|
8
|
+
*
|
|
9
|
+
* @packageDocumentation
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
// ============================================================================
|
|
13
|
+
// Types
|
|
14
|
+
// ============================================================================
|
|
15
|
+
|
|
16
|
+
/** A single turn in conversation history */
|
|
17
|
+
export type HistoryTurn = {
|
|
18
|
+
/** User input */
|
|
19
|
+
input: string
|
|
20
|
+
/** Agent output */
|
|
21
|
+
output: string
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/** History builder configuration */
|
|
25
|
+
export type HistoryBuilderConfig = {
|
|
26
|
+
/** Template for formatting history (e.g., "User: {{input}}\nAssistant: {{output}}") */
|
|
27
|
+
template?: string
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
// ============================================================================
|
|
31
|
+
// Default Template
|
|
32
|
+
// ============================================================================
|
|
33
|
+
|
|
34
|
+
const DEFAULT_TEMPLATE = 'User: {{input}}\nAssistant: {{output}}'
|
|
35
|
+
|
|
36
|
+
// ============================================================================
|
|
37
|
+
// History Builder Factory
|
|
38
|
+
// ============================================================================
|
|
39
|
+
|
|
40
|
+
/**
 * Factory for the conversation-history accumulator used by iterative sessions.
 *
 * @remarks
 * The returned builder keeps its turns in a private array and exposes
 * functions to record turns, render them through the template, and prefix a
 * new input with the rendered history.
 *
 * @param config - History builder configuration; `template` falls back to the module default
 * @returns History builder with addTurn, formatHistory, buildPrompt, getLength, clear and getHistory
 */
export const createHistoryBuilder = (config: HistoryBuilderConfig = {}) => {
  const turnTemplate = config.template ?? DEFAULT_TEMPLATE
  const turns: HistoryTurn[] = []

  /**
   * Appends one input/output pair to the end of the history.
   *
   * @param input - User input
   * @param output - Agent output
   */
  const addTurn = (input: string, output: string): void => {
    turns.push({ input, output })
  }

  /**
   * Renders every stored turn through the template.
   *
   * @returns Rendered turns separated by a blank line ('' when there are none)
   */
  const formatHistory = (): string => {
    const rendered: string[] = []
    for (const turn of turns) {
      rendered.push(formatTurn(turn, turnTemplate))
    }
    return rendered.join('\n\n')
  }

  /**
   * Produces the prompt for the next turn.
   *
   * @remarks
   * With no recorded turns the input is returned untouched; otherwise the
   * rendered history is placed before a `User: ` line holding the new input.
   *
   * @param newInput - The new user input
   * @returns Full prompt including history context
   */
  const buildPrompt = (newInput: string): string =>
    turns.length === 0 ? newInput : `${formatHistory()}\n\nUser: ${newInput}`

  /**
   * Reports how many turns have been recorded.
   */
  const getLength = (): number => turns.length

  /**
   * Empties the history in place.
   */
  const clear = (): void => {
    turns.splice(0, turns.length)
  }

  /**
   * Returns a shallow copy of the recorded turns, so changes to the returned
   * array do not affect the builder.
   */
  const getHistory = (): HistoryTurn[] => turns.slice()

  return { addTurn, formatHistory, buildPrompt, getLength, clear, getHistory }
}
|
|
124
|
+
|
|
125
|
+
// ============================================================================
|
|
126
|
+
// Helper Functions
|
|
127
|
+
// ============================================================================
|
|
128
|
+
|
|
129
|
+
/**
 * Formats a single turn using the template.
 *
 * @remarks
 * Every `{{input}}` and `{{output}}` placeholder in the template is filled in
 * a single pass. The turn text is inserted verbatim: `$`-sequences such as
 * `$&` or `$'` in the text are not interpreted as replacement patterns, and
 * placeholder-looking text inside a turn (e.g. an input containing
 * `{{output}}`) is not substituted again.
 *
 * @param turn - History turn
 * @param template - Template string with {{input}} and {{output}} placeholders
 * @returns Formatted turn string
 */
const formatTurn = (turn: HistoryTurn, template: string): string => {
  // A function replacer bypasses the special `$` replacement patterns that a
  // string replacement would apply to user- and agent-supplied text.
  return template.replace(/\{\{(input|output)\}\}/g, (_placeholder, field: 'input' | 'output') => turn[field])
}
|
|
139
|
+
|
|
140
|
+
/** History builder type */
|
|
141
|
+
export type HistoryBuilder = ReturnType<typeof createHistoryBuilder>
|