@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,366 @@
1
+ /**
2
+ * Generic output parser for headless CLI agents.
3
+ *
4
+ * @remarks
5
+ * Uses schema-defined mappings to convert CLI JSON output into session updates.
6
+ * Supports JSONPath-like expressions for matching and extraction.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import type { HeadlessAdapterConfig, OutputEventMapping, PassthroughTypeMap } from './headless.schemas.ts'
12
+
13
+ // ============================================================================
14
+ // Types
15
+ // ============================================================================
16
+
17
/**
 * Kinds of session update the parser can emit.
 */
export type SessionUpdateType = 'thought' | 'tool_call' | 'message' | 'plan'

/**
 * One session update built from a line (or a single array item) of CLI output.
 */
export type ParsedUpdate = {
  /** Kind of update being emitted */
  type: SessionUpdateType
  /** Text content, present when one was extracted */
  content?: string
  /** Title, present when one was extracted */
  title?: string
  /** Status string, present when one was extracted */
  status?: string
  /** The parsed event (or array item) the update was built from */
  raw: unknown
}

/**
 * Outcome for a line that was recognised as the final result.
 */
export type ParsedResult = {
  /** Discriminant: the line is a result */
  isResult: true
  /** Extracted result text */
  content: string
  /** The parsed event the result was taken from */
  raw: unknown
}

/**
 * Outcome for a line that is not the final result.
 */
export type NotResult = {
  /** Discriminant: the line is not a result */
  isResult: false
}

/**
 * Either a recognised result or the "not a result" marker; discriminate on `isResult`.
 */
export type ResultParseResult = ParsedResult | NotResult
43
+
44
+ // ============================================================================
45
+ // JSONPath Implementation
46
+ // ============================================================================
47
+
48
/**
 * Extracts a value from an object using a simple JSONPath expression.
 *
 * @remarks
 * Supports:
 * - `$.field` - Root field access
 * - `$.nested.field` - Nested field access
 * - `$.array[0]` - Array index access
 * - `$.grid[0][1]` - Chained array index access
 * - `$.array[*]` - Array wildcard (returns the array itself)
 * - `$.array[0].field` - Combined array and field access
 * - `'literal'` - Literal string values (single quotes)
 *
 * A wildcard ends the traversal: the array found at that position is returned
 * as-is and any segments written after `[*]` are ignored, so per-item matching
 * is left to the caller.
 *
 * @param obj - Object to extract from
 * @param path - JSONPath expression
 * @returns Extracted value, the array itself (for wildcard), or undefined when
 * the path is unsupported or cannot be resolved
 */
export const jsonPath = (obj: unknown, path: string): unknown => {
  // A literal needs both an opening and a closing quote, so a lone "'" is not one
  if (path.length >= 2 && path.startsWith("'") && path.endsWith("'")) {
    return path.slice(1, -1)
  }

  // Everything else must be a "$."-rooted expression
  if (!path.startsWith('$.')) {
    return undefined
  }

  // "name[0]", "[0]", "name[*]", "name[0][1]": optional property name followed by one or more brackets
  const bracketed = /^([^[]*)((?:\[(?:\d+|\*)\])+)$/

  // e.g. "message.content[0].text" -> ["message", "content", 0, "text"]
  // e.g. "message.content[*].type" -> ["message", "content", "*", "type"]
  const segments: (string | number)[] = []

  for (const part of path.slice(2).split('.')) {
    if (!part) continue

    const match = bracketed.exec(part)
    if (!match) {
      // Plain property name (or an unsupported bracket form, kept verbatim)
      segments.push(part)
      continue
    }

    const [, propName = '', brackets = ''] = match
    if (propName) {
      segments.push(propName)
    }
    // "[0][1]" -> "0][1" -> ["0", "1"]
    for (const token of brackets.slice(1, -1).split('][')) {
      segments.push(token === '*' ? '*' : parseInt(token, 10))
    }
  }

  let current: unknown = obj

  for (const segment of segments) {
    if (current === null || current === undefined) {
      return undefined
    }

    if (segment === '*') {
      // Wildcard: hand the array back untouched; remaining segments are not applied
      return Array.isArray(current) ? current : undefined
    }

    if (typeof segment === 'number') {
      if (!Array.isArray(current)) {
        return undefined
      }
      current = current[segment]
      continue
    }

    if (typeof current !== 'object') {
      return undefined
    }
    current = (current as Record<string, unknown>)[segment]
  }

  return current
}
142
+
143
/**
 * Extracts a string value from an object using JSONPath.
 *
 * @remarks
 * Primitives are converted with `String()`. Objects and arrays are serialised
 * as JSON so their contents survive the conversion instead of collapsing to
 * `[object Object]`; if serialisation is not possible the `String()` form is
 * used as a fallback.
 *
 * @param obj - Object to extract from
 * @param path - JSONPath expression
 * @returns String value, or undefined when the path resolves to null/undefined
 */
export const jsonPathString = (obj: unknown, path: string): string | undefined => {
  const value = jsonPath(obj, path)
  if (value === undefined || value === null) {
    return undefined
  }

  if (typeof value === 'object') {
    try {
      // JSON.stringify can yield undefined at runtime (e.g. toJSON returning undefined)
      const serialized: string | undefined = JSON.stringify(value)
      return serialized ?? String(value)
    } catch {
      // Circular structures or BigInt members cannot be serialised
      return String(value)
    }
  }

  return String(value)
}
157
+
158
+ // ============================================================================
159
+ // Output Parser Factory
160
+ // ============================================================================
161
+
162
/**
 * Parse line using passthrough mode.
 *
 * @remarks
 * Passthrough mode reads the event's type field (`typeMap.typeField`, default
 * `'type'`) and resolves it to a session update type, first through
 * `typeMap.typeValues` and otherwise by accepting the value directly when it
 * already is a session update type. `content` and `status` are copied from the
 * event when they are strings; `title` is taken from `name`, falling back to
 * `title`.
 *
 * @param line - JSON string from CLI stdout
 * @param typeMap - Passthrough type mapping configuration
 * @returns Parsed update, or null when the line is not a JSON object or no mapping matches
 */
const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpdate | null => {
  let parsed: unknown
  try {
    parsed = JSON.parse(line)
  } catch {
    return null
  }

  // Valid JSON is not always an object: `null` would throw on property access below
  if (typeof parsed !== 'object' || parsed === null) {
    return null
  }
  const event = parsed as Record<string, unknown>

  const eventType = event[typeMap.typeField ?? 'type']
  if (typeof eventType !== 'string') {
    return null
  }

  // Own-property lookup so inherited names such as "toString" never count as a mapping
  const typeValues = typeMap.typeValues as Record<string, SessionUpdateType> | undefined
  const mappedType =
    typeValues && Object.prototype.hasOwnProperty.call(typeValues, eventType) ? typeValues[eventType] : undefined

  // Without an explicit mapping, accept the event type only if it already is a session update type
  const validTypes: readonly string[] = ['thought', 'tool_call', 'message', 'plan']
  const type = mappedType || (validTypes.includes(eventType) ? (eventType as SessionUpdateType) : undefined)
  if (!type) {
    return null
  }

  const asString = (value: unknown): string | undefined => (typeof value === 'string' ? value : undefined)

  return {
    type,
    content: asString(event.content),
    title: asString(event.name) ?? asString(event.title),
    status: asString(event.status),
    raw: event,
  }
}
216
+
217
/**
 * Builds the line parsers for a headless adapter configuration.
 *
 * @remarks
 * Two functions are returned:
 * - `parseLine` turns one stdout line into zero, one or several session updates
 * - `parseResult` decides whether a line is the final result and extracts its content
 *
 * `parseLine` works in one of two modes:
 * - 'jsonpath' (default): tries each `outputEvents` mapping in order and emits
 *   updates for the first one that matches
 * - 'passthrough': used when `outputMode` is 'passthrough' and a
 *   `passthroughTypeMap` is configured; the event's type field is mapped directly
 *
 * @param config - Headless adapter configuration
 * @returns Object exposing `parseLine` and `parseResult`
 */
export const createOutputParser = (config: HeadlessAdapterConfig) => {
  const { result, outputMode = 'jsonpath', outputEvents = [], passthroughTypeMap } = config

  /** True for anything except null and undefined. */
  const isPresent = (value: unknown): boolean => value !== undefined && value !== null

  /**
   * Builds a ParsedUpdate for a matched event, filling in whichever of
   * content/title/status the mapping provides extraction paths for.
   */
  const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => {
    const update: ParsedUpdate = { type: mapping.emitAs, raw: event }
    const { extract } = mapping
    if (!extract) {
      return update
    }

    for (const field of ['content', 'title', 'status'] as const) {
      const path = extract[field]
      if (path) {
        update[field] = jsonPathString(event, path)
      }
    }
    return update
  }

  /**
   * Decides whether one item of a wildcard array satisfies the expected value:
   * '*' accepts any present item, objects carrying a `type` property are
   * compared by that property, everything else by strict equality.
   */
  const itemMatches = (item: unknown, expected: unknown): boolean => {
    if (expected === '*') {
      return isPresent(item)
    }
    if (typeof item === 'object' && item !== null && 'type' in item) {
      return (item as Record<string, unknown>).type === expected
    }
    return item === expected
  }

  /**
   * Parses a single JSON line from CLI output.
   *
   * @param line - JSON string from CLI stdout
   * @returns Parsed update, array of updates (when the match path yields an array), or null if nothing matches
   */
  const parseLine = (line: string): ParsedUpdate | ParsedUpdate[] | null => {
    if (outputMode === 'passthrough' && passthroughTypeMap) {
      return parsePassthrough(line, passthroughTypeMap)
    }

    // JSONPath mode needs at least one mapping to do anything
    if (!outputEvents || outputEvents.length === 0) {
      return null
    }

    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      // Lines that are not JSON are skipped
      return null
    }

    for (const mapping of outputEvents) {
      const expected = mapping.match.value
      const matchValue = jsonPath(event, mapping.match.path)

      if (Array.isArray(matchValue)) {
        // Array result (e.g. $.message.content[*]): one update per matching item,
        // each built from the item itself rather than the whole event
        const updates = matchValue
          .filter((item) => itemMatches(item, expected))
          .map((item) => createUpdate(item, mapping))
        if (updates.length > 0) {
          return updates
        }
        continue
      }

      // Single value: '*' accepts anything present, otherwise strict equality
      const matched = expected === '*' ? isPresent(matchValue) : matchValue === expected
      if (matched) {
        return createUpdate(event, mapping)
      }
    }

    return null
  }

  /**
   * Checks if a JSON line represents the final result.
   *
   * @param line - JSON string from CLI stdout
   * @returns Result with extracted content (empty string when none is found), or `{ isResult: false }`
   */
  const parseResult = (line: string): ResultParseResult => {
    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      return { isResult: false }
    }

    const matchValue = jsonPath(event, result.matchPath)
    // "*" accepts any present value; anything else must match exactly
    const matches = result.matchValue === '*' ? isPresent(matchValue) : matchValue === result.matchValue
    if (!matches) {
      return { isResult: false }
    }

    return {
      isResult: true,
      content: jsonPathString(event, result.contentPath) ?? '',
      raw: event,
    }
  }

  return {
    parseLine,
    parseResult,
  }
}

/** Shape of the object returned by `createOutputParser` */
export type OutputParser = ReturnType<typeof createOutputParser>