@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generic output parser for headless CLI agents.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Uses schema-defined mappings to convert CLI JSON output into session updates.
|
|
6
|
+
* Supports JSONPath-like expressions for matching and extraction.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { HeadlessAdapterConfig, OutputEventMapping, PassthroughTypeMap } from './headless.schemas.ts'
|
|
12
|
+
|
|
13
|
+
// ============================================================================
|
|
14
|
+
// Types
|
|
15
|
+
// ============================================================================
|
|
16
|
+
|
|
17
|
+
/** Kinds of session update that a parsed CLI output line can be emitted as. */
|
|
18
|
+
export type SessionUpdateType = 'thought' | 'tool_call' | 'message' | 'plan'
|
|
19
|
+
|
|
20
|
+
/**
 * Parsed session update from CLI output.
 *
 * @remarks
 * As populated by the parsers in this file:
 * - `type` - Session update kind (the mapping's `emitAs`, or the passthrough-mapped type)
 * - `content`, `title`, `status` - Optional strings extracted from the event
 * - `raw` - The parsed JSON event, or the individual array item when the
 *   update came from a wildcard match
 */
|
|
21
|
+
export type ParsedUpdate = {
|
|
22
|
+
type: SessionUpdateType
|
|
23
|
+
content?: string
|
|
24
|
+
title?: string
|
|
25
|
+
status?: string
|
|
26
|
+
raw: unknown
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/**
 * Result extraction from CLI output.
 *
 * @remarks
 * - `isResult` - Always `true`; discriminates this variant from `NotResult`
 * - `content` - Text extracted from the result event (empty string when the
 *   content path yields nothing)
 * - `raw` - The parsed JSON event the result was taken from
 */
|
|
30
|
+
export type ParsedResult = {
|
|
31
|
+
isResult: true
|
|
32
|
+
content: string
|
|
33
|
+
raw: unknown
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/**
 * Marker returned when a line is not the final result, i.e. it is not valid
 * JSON or does not satisfy the configured result match.
 */
|
|
37
|
+
export type NotResult = {
|
|
38
|
+
isResult: false
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/** Outcome of checking a line for the final result; discriminated by `isResult`. */
|
|
42
|
+
export type ResultParseResult = ParsedResult | NotResult
|
|
43
|
+
|
|
44
|
+
// ============================================================================
|
|
45
|
+
// JSONPath Implementation
|
|
46
|
+
// ============================================================================
|
|
47
|
+
|
|
48
|
+
/**
 * Extracts a value from an object using a simple JSONPath expression.
 *
 * @remarks
 * Supports:
 * - `$.field` - Root field access
 * - `$.nested.field` - Nested field access
 * - `$.array[0]` - Array index access
 * - `$.grid[1][0]` - Chained array index access
 * - `$.array[*]` - Array wildcard (returns all items)
 * - `$.array[0].field` - Combined array and field access
 * - `'literal'` - Literal string values (single quotes)
 *
 * Only own properties are resolved, so names inherited from the prototype
 * chain (`constructor`, `toString`, ...) never produce a value.
 *
 * A wildcard ends the traversal: the array reached at that point is returned
 * as-is and any remaining segments are ignored, leaving per-item handling to
 * the caller.
 *
 * @param obj - Object to extract from
 * @param path - JSONPath expression
 * @returns Extracted value, array of values (for wildcard), or undefined
 */
export const jsonPath = (obj: unknown, path: string): unknown => {
  // Literal strings (e.g., "'pending'") need both an opening and a closing quote,
  // so a lone "'" is not a literal.
  if (path.length >= 2 && path.startsWith("'") && path.endsWith("'")) {
    return path.slice(1, -1)
  }

  // Everything else must be a JSONPath expression (e.g., "$.type")
  if (!path.startsWith('$.')) {
    return undefined
  }

  // Parse the path into segments:
  //   "message.content[0].text" -> ["message", "content", 0, "text"]
  //   "message.content[*].type" -> ["message", "content", "*", "type"]
  //   "grid[1][0]"              -> ["grid", 1, 0]
  const segments: (string | number)[] = []

  for (const part of path.slice(2).split('.')) {
    if (!part) continue

    // Optional property name followed by one or more "[n]" / "[*]" suffixes
    const bracketed = /^([^[]*)((?:\[(?:\d+|\*)\])+)$/.exec(part)
    if (!bracketed) {
      segments.push(part)
      continue
    }

    const [, propName = '', brackets = ''] = bracketed
    if (propName) {
      segments.push(propName)
    }
    // "[1][0]" -> "1][0" -> ["1", "0"]
    for (const token of brackets.slice(1, -1).split('][')) {
      segments.push(token === '*' ? '*' : Number.parseInt(token, 10))
    }
  }

  let current: unknown = obj

  for (const segment of segments) {
    if (current === null || current === undefined) {
      return undefined
    }

    if (segment === '*') {
      // Array wildcard - return the array as-is for further processing
      return Array.isArray(current) ? current : undefined
    }

    if (typeof segment === 'number') {
      // Array index access
      if (!Array.isArray(current)) {
        return undefined
      }
      current = current[segment]
      continue
    }

    // Property access, restricted to own properties so prototype members
    // cannot masquerade as event data
    if (typeof current !== 'object' || !Object.prototype.hasOwnProperty.call(current, segment)) {
      return undefined
    }
    current = (current as Record<string, unknown>)[segment]
  }

  return current
}
|
|
142
|
+
|
|
143
|
+
/**
 * Extracts a string value from an object using JSONPath.
 *
 * @remarks
 * Primitives are converted with `String()`. Objects and arrays are serialized
 * as JSON, because `String()` would collapse them to `"[object Object]"` (or a
 * lossy comma-joined list) and discard the extracted data.
 *
 * @param obj - Object to extract from
 * @param path - JSONPath expression
 * @returns String value, or undefined when the path resolves to null/undefined
 */
export const jsonPathString = (obj: unknown, path: string): string | undefined => {
  const value = jsonPath(obj, path)
  if (value === undefined || value === null) {
    return undefined
  }

  if (typeof value === 'object') {
    try {
      // JSON.stringify returns undefined for values it cannot represent
      return JSON.stringify(value) ?? String(value)
    } catch {
      // Not serializable (e.g. a circular structure) - keep the best-effort conversion
      return String(value)
    }
  }

  return String(value)
}
|
|
157
|
+
|
|
158
|
+
// ============================================================================
|
|
159
|
+
// Output Parser Factory
|
|
160
|
+
// ============================================================================
|
|
161
|
+
|
|
162
|
+
/**
 * Parse line using passthrough mode.
 *
 * @remarks
 * Passthrough mode directly maps the agent's type field to session update types.
 * Simpler than JSONPath for agents with well-structured output.
 *
 * Resolution order for the emitted type:
 * 1. An explicit entry in `typeMap.typeValues` for the event's type value
 * 2. The event's type value itself, when it already is a valid session update type
 *
 * Lines that are not valid JSON, or whose JSON is not a plain object
 * (`null`, primitives, arrays), are ignored.
 *
 * @param line - JSON string from CLI stdout
 * @param typeMap - Passthrough type mapping configuration
 * @returns Parsed update or null if no mapping matches
 */
const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpdate | null => {
  let parsed: unknown
  try {
    parsed = JSON.parse(line)
  } catch {
    return null
  }

  // JSON.parse can legitimately yield null, primitives or arrays; indexing
  // null would throw, so only plain objects are treated as events.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null
  }
  const event = parsed as Record<string, unknown>

  const typeField = typeMap.typeField ?? 'type'
  const eventType = event[typeField]

  if (typeof eventType !== 'string') {
    return null
  }

  // Look up an explicit mapping, using an own-property check so inherited keys
  // such as "constructor" are never mistaken for configured mappings.
  const typeValues = typeMap.typeValues as Record<string, SessionUpdateType> | undefined
  const mappedType =
    typeValues && Object.prototype.hasOwnProperty.call(typeValues, eventType) ? typeValues[eventType] : undefined

  let type: SessionUpdateType
  if (mappedType) {
    type = mappedType
  } else {
    // No explicit mapping - accept the event type only if it is already a valid session type
    const validTypes = ['thought', 'tool_call', 'message', 'plan'] as const
    if (!validTypes.includes(eventType as (typeof validTypes)[number])) {
      return null
    }
    type = eventType as SessionUpdateType
  }

  return {
    type,
    content: typeof event.content === 'string' ? event.content : undefined,
    // Prefer `name` over `title` when both are strings
    title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
    status: typeof event.status === 'string' ? event.status : undefined,
    raw: event,
  }
}
|
|
216
|
+
|
|
217
|
+
/**
 * Creates an output parser from adapter configuration.
 *
 * @remarks
 * The returned parser exposes two functions:
 * - `parseLine` turns one stdout line into zero, one, or several session updates
 * - `parseResult` reports whether a line is the final result and extracts its content
 *
 * `parseLine` supports two modes:
 * - 'jsonpath' (default): each `outputEvents` mapping is tried in order; the
 *   first mapping that produces at least one update wins
 * - 'passthrough': direct type mapping, used only when a `passthroughTypeMap`
 *   is configured
 *
 * @param config - Headless adapter configuration
 * @returns Parser functions for individual lines
 */
export const createOutputParser = (config: HeadlessAdapterConfig) => {
  const { result, outputMode = 'jsonpath', outputEvents = [], passthroughTypeMap } = config

  /** True for anything except null and undefined. */
  const isPresent = (value: unknown): boolean => value !== undefined && value !== null

  /**
   * Builds a ParsedUpdate from a matched event, applying the mapping's
   * optional content/title/status extraction paths to that event.
   */
  const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => {
    const update: ParsedUpdate = { type: mapping.emitAs, raw: event }
    const extract = mapping.extract
    if (!extract) {
      return update
    }

    if (extract.content) update.content = jsonPathString(event, extract.content)
    if (extract.title) update.title = jsonPathString(event, extract.title)
    if (extract.status) update.status = jsonPathString(event, extract.status)
    return update
  }

  /**
   * Decides whether one item of a wildcard result satisfies the expected value:
   * '*' accepts any non-null item, objects carrying a `type` key are compared
   * by that key, and everything else is compared directly.
   */
  const itemMatches = (item: unknown, expected: unknown): boolean => {
    if (expected === '*') {
      return isPresent(item)
    }
    if (typeof item === 'object' && item !== null && 'type' in item) {
      return (item as Record<string, unknown>).type === expected
    }
    return item === expected
  }

  /**
   * Parses a single JSON line from CLI output.
   *
   * @param line - JSON string from CLI stdout
   * @returns Parsed update, array of updates (for wildcard matches), or null if no mapping matches
   */
  const parseLine = (line: string): ParsedUpdate | ParsedUpdate[] | null => {
    // Passthrough mode only applies when its type map is configured
    if (outputMode === 'passthrough' && passthroughTypeMap) {
      return parsePassthrough(line, passthroughTypeMap)
    }

    // JSONPath mode (default) needs at least one mapping
    if (!outputEvents?.length) {
      return null
    }

    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      // Lines that are not valid JSON are skipped
      return null
    }

    for (const mapping of outputEvents) {
      const expected = mapping.match.value
      const matchValue = jsonPath(event, mapping.match.path)

      if (Array.isArray(matchValue)) {
        // Wildcard path (e.g., $.message.content[*]): one update per matching item,
        // each built from the item rather than from the whole event
        const updates = matchValue
          .filter((item) => itemMatches(item, expected))
          .map((item) => createUpdate(item, mapping))
        if (updates.length > 0) {
          return updates
        }
        continue
      }

      // Single value: '*' accepts any non-null value, otherwise require equality
      const matched = expected === '*' ? isPresent(matchValue) : matchValue === expected
      if (matched) {
        return createUpdate(event, mapping)
      }
    }

    return null
  }

  /**
   * Checks if a JSON line represents the final result.
   *
   * @param line - JSON string from CLI stdout
   * @returns Result extraction or indication that it's not a result
   */
  const parseResult = (line: string): ResultParseResult => {
    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      return { isResult: false }
    }

    const matchValue = jsonPath(event, result.matchPath)
    // A configured "*" accepts any non-null value at the match path
    const isMatch = result.matchValue === '*' ? isPresent(matchValue) : matchValue === result.matchValue
    if (!isMatch) {
      return { isResult: false }
    }

    return {
      isResult: true,
      content: jsonPathString(event, result.contentPath) ?? '',
      raw: event,
    }
  }

  return { parseLine, parseResult }
}
|
|
364
|
+
|
|
365
|
+
/** Object returned by `createOutputParser`, exposing `parseLine` and `parseResult`. */
|
|
366
|
+
export type OutputParser = ReturnType<typeof createOutputParser>
|