@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test fixture: Python grader script using stdin/stdout JSON protocol.
4
+ """
5
+
6
+ import json
7
+ import sys
8
+
9
def main():
    """Grade one case read from stdin and print the verdict as JSON on stdout.

    Reads a single JSON object from stdin and uses its ``output`` and ``hint``
    keys. The case passes when no hint is supplied (missing, null, or empty)
    or when the hint occurs in the output, compared case-insensitively.

    Prints a JSON object with ``pass`` (bool), ``score`` (1.0 or 0.0) and
    ``reasoning`` (a short string).
    """
    data = json.load(sys.stdin)

    # `or ""` covers both a missing key and an explicit JSON null, so a null
    # output is graded as empty text instead of raising AttributeError.
    output = (data.get("output") or "").lower()
    hint = (data.get("hint") or "").lower()

    # An empty hint means there is nothing to look for, so the case passes.
    pass_result = hint in output if hint else True

    result = {
        "pass": pass_result,
        "score": 1.0 if pass_result else 0.0,
        "reasoning": "Contains expected" if pass_result else "Missing expected",
    }

    print(json.dumps(result))


if __name__ == "__main__":
    main()
@@ -0,0 +1,14 @@
1
+ /**
2
+ * Test fixture: TypeScript grader module.
3
+ */
4
+
5
+ import type { Grader } from '../../schemas.ts'
6
+
7
/**
 * Grades a case by looking for `hint` inside `output`, ignoring letter case.
 *
 * @remarks
 * A falsy `hint` (absent or empty) always passes. The `input` field is
 * accepted but not used.
 *
 * @returns `pass`, a `score` of 1.0 or 0.0, and a short `reasoning` string.
 */
export const grade: Grader = async ({ input: _input, output, hint }) => {
  let pass = true
  if (hint) {
    const haystack = output.toLowerCase()
    pass = haystack.includes(hint.toLowerCase())
  }

  const score = pass ? 1.0 : 0.0
  const reasoning = pass ? 'Contains expected text' : 'Missing expected text'
  return { pass, score, reasoning }
}
+ }
@@ -0,0 +1,153 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import { join } from 'node:path'
3
+ import { loadGrader } from '../grader-loader.ts'
4
+
5
+ const fixturesDir = join(import.meta.dir, 'fixtures')
6
+
7
+ // ============================================================================
8
+ // Module Graders (TypeScript/JavaScript)
9
+ // ============================================================================
10
+
11
/**
 * Module graders: a `.ts` fixture is loaded with loadGrader and the returned
 * grader is called directly; bad or missing modules must reject on load.
 */
describe('loadGrader - module graders', () => {
  test('loads TypeScript grader module', async () => {
    const modulePath = join(fixturesDir, 'grader-module.ts')
    const grader = await loadGrader(modulePath)

    const { pass, score, reasoning } = await grader({
      input: 'What is 2+2?',
      output: 'The answer is 4',
      hint: '4',
    })

    expect(pass).toBe(true)
    expect(score).toBe(1.0)
    expect(reasoning).toBe('Contains expected text')
  })

  test('fails when module does not export grade function', async () => {
    const pending = loadGrader(join(fixturesDir, 'grader-bad-module.ts'))

    await expect(pending).rejects.toThrow("Grader module must export a 'grade' function")
  })

  test('fails when module does not exist', async () => {
    const pending = loadGrader(join(fixturesDir, 'nonexistent.ts'))

    await expect(pending).rejects.toThrow('Grader not found')
  })
})
36
+
37
+ // ============================================================================
38
+ // Executable Graders (Python, etc.)
39
+ // ============================================================================
40
+
41
/**
 * Executable graders: `.py` fixtures are loaded with loadGrader and the
 * returned grader is awaited. Covers a passing and a failing verdict, a
 * non-zero exit, invalid JSON output, and a missing file.
 */
describe('loadGrader - executable graders', () => {
  // Path of the well-behaved Python fixture shared by the first two tests.
  const pythonGrader = join(fixturesDir, 'grader-exec.py')

  test('loads and executes Python grader', async () => {
    const grader = await loadGrader(pythonGrader)

    const { pass, score, reasoning } = await grader({
      input: 'What is 2+2?',
      output: 'The answer is 4',
      hint: '4',
    })

    expect(pass).toBe(true)
    expect(score).toBe(1.0)
    expect(reasoning).toBe('Contains expected')
  })

  test('Python grader returns pass=false when expected not in output', async () => {
    const grader = await loadGrader(pythonGrader)

    const verdict = await grader({
      input: 'What is 2+2?',
      output: 'I do not know',
      hint: '4',
    })

    expect(verdict.pass).toBe(false)
    expect(verdict.score).toBe(0.0)
  })

  test('throws when executable exits with non-zero code', async () => {
    const grader = await loadGrader(join(fixturesDir, 'grader-exec-fail.py'))

    // Loading succeeds; the failure only surfaces when the grader is run.
    const run = grader({ input: 'test', output: 'test' })

    await expect(run).rejects.toThrow('Grader exited with code 1')
  })

  test('throws when executable outputs invalid JSON', async () => {
    const grader = await loadGrader(join(fixturesDir, 'grader-exec-invalid.py'))

    const run = grader({ input: 'test', output: 'test' })

    await expect(run).rejects.toThrow('Grader output is not valid JSON')
  })

  test('fails when executable does not exist', async () => {
    const pending = loadGrader(join(fixturesDir, 'nonexistent.py'))

    await expect(pending).rejects.toThrow('Grader not found')
  })
})
95
+
96
+ // ============================================================================
97
+ // Extension Detection
98
+ // ============================================================================
99
+
100
/**
 * Extension detection: both a `.ts` and a `.py` fixture must load into a
 * callable grader.
 */
describe('loadGrader - extension detection', () => {
  test('detects .ts as module', async () => {
    const tsPath = join(fixturesDir, 'grader-module.ts')
    const loaded = await loadGrader(tsPath)

    // Reaching this line without a throw means it was loaded as a module (not executed)
    expect(loaded).toBeInstanceOf(Function)
  })

  test('detects .py as executable', async () => {
    const pyPath = join(fixturesDir, 'grader-exec.py')
    const loaded = await loadGrader(pyPath)

    expect(loaded).toBeInstanceOf(Function)
  })
})
112
+
113
+ // ============================================================================
114
+ // Trajectory Support
115
+ // ============================================================================
116
+
117
/**
 * Trajectory support: a grader call that includes a `trajectory` array must
 * still produce a passing verdict, for both module and executable graders.
 *
 * These tests assert only on the verdict; they do not inspect whether the
 * grader actually read the trajectory.
 */
describe('loadGrader - trajectory support', () => {
  // Builds a fresh two-step trajectory (one message, one tool call) per test.
  const buildTrajectory = () => [
    { type: 'message' as const, content: 'Hello', timestamp: 0 },
    { type: 'tool_call' as const, name: 'read', status: 'completed', timestamp: 100 },
  ]

  // Test title paired with the fixture file it loads.
  const cases = [
    ['passes trajectory to module grader', 'grader-module.ts'],
    ['passes trajectory to executable grader', 'grader-exec.py'],
  ] as const

  for (const [title, fixture] of cases) {
    test(title, async () => {
      const grader = await loadGrader(join(fixturesDir, fixture))
      const trajectory = buildTrajectory()

      const verdict = await grader({
        input: 'test',
        output: 'The answer is 4',
        hint: '4',
        trajectory,
      })

      expect(verdict.pass).toBe(true)
    })
  }
})
@@ -0,0 +1,142 @@
1
import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { runSchemas } from '../schemas-cli.ts'
3
+
4
+ // ============================================================================
5
+ // runSchemas
6
+ // ============================================================================
7
+
8
/**
 * Tests for runSchemas: listing schema names, returning one or all schemas as
 * objects, and writing schemas to a single file or to one file per schema.
 */
describe('runSchemas', () => {
  // Scratch directory under the OS temp dir rather than a hard-coded /tmp path.
  // The process id keeps concurrent test runs on one machine from deleting
  // each other's output.
  const testOutputDir = join(tmpdir(), `agent-eval-harness-test-schemas-${process.pid}`)

  // Removes the scratch directory; a failing `rm` is tolerated via .nothrow().
  const removeOutputDir = async () => {
    await Bun.$`rm -rf ${testOutputDir}`.nothrow()
  }

  // Start every test from a clean slate and leave nothing behind afterwards.
  beforeEach(removeOutputDir)
  afterEach(removeOutputDir)

  describe('list mode', () => {
    test('returns array of schema names', async () => {
      const result = await runSchemas({ list: true })
      expect(Array.isArray(result)).toBe(true)
      const names = result as string[]
      expect(names).toContain('PromptCase')
      expect(names).toContain('CaptureResult')
      expect(names).toContain('GraderResult')
    })
  })

  describe('single schema mode', () => {
    test('returns single schema by name', async () => {
      const result = await runSchemas({ schemaName: 'PromptCase', json: true })
      expect(typeof result).toBe('object')
      const schemas = result as Record<string, object>
      expect(schemas.PromptCase).toBeDefined()
      expect(schemas.PromptCase).toHaveProperty('$schema')
      expect(schemas.PromptCase).toHaveProperty('title', 'PromptCase')
    })

    test('writes schema to file when outputPath provided', async () => {
      // File name matches the schema written to it (GraderResult).
      const outputPath = join(testOutputDir, 'grader-result.json')
      await Bun.$`mkdir -p ${testOutputDir}`

      await runSchemas({
        schemaName: 'GraderResult',
        outputPath,
      })

      const content = await Bun.file(outputPath).text()
      const schema = JSON.parse(content)
      expect(schema.title).toBe('GraderResult')
      expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
    })
  })

  describe('all schemas mode', () => {
    test('returns all schemas as object', async () => {
      const result = await runSchemas({ json: true })
      expect(typeof result).toBe('object')
      const schemas = result as Record<string, object>

      // Check a sampling of expected schemas
      expect(schemas.PromptCase).toBeDefined()
      expect(schemas.CaptureResult).toBeDefined()
      expect(schemas.GraderResult).toBeDefined()
      expect(schemas.TrajectoryStep).toBeDefined()
      expect(schemas.Session).toBeDefined()
    })

    test('writes all schemas to single file', async () => {
      const outputPath = join(testOutputDir, 'all-schemas.json')
      await Bun.$`mkdir -p ${testOutputDir}`

      await runSchemas({
        json: true,
        outputPath,
      })

      const content = await Bun.file(outputPath).text()
      const schemas = JSON.parse(content)
      expect(schemas.PromptCase).toBeDefined()
      expect(schemas.CaptureResult).toBeDefined()
    })

    test('splits schemas into separate files', async () => {
      // NOTE(review): the directory is not created here, so this presumably
      // relies on runSchemas creating it in split mode; confirm.
      await runSchemas({
        json: true,
        split: true,
        outputPath: testOutputDir,
      })

      // Check that individual files were created
      const promptCasePath = join(testOutputDir, 'PromptCase.json')
      const promptCaseExists = await Bun.file(promptCasePath).exists()
      const captureResultExists = await Bun.file(join(testOutputDir, 'CaptureResult.json')).exists()
      const graderResultExists = await Bun.file(join(testOutputDir, 'GraderResult.json')).exists()

      expect(promptCaseExists).toBe(true)
      expect(captureResultExists).toBe(true)
      expect(graderResultExists).toBe(true)

      // Verify content
      const promptCaseContent = await Bun.file(promptCasePath).text()
      const promptCaseSchema = JSON.parse(promptCaseContent)
      expect(promptCaseSchema.title).toBe('PromptCase')
    })
  })

  describe('schema content validation', () => {
    test('PromptCase schema has correct structure', async () => {
      const result = await runSchemas({ schemaName: 'PromptCase', json: true })
      const schemas = result as Record<string, object>
      const schema = schemas.PromptCase as Record<string, unknown>

      expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
      expect(schema.title).toBe('PromptCase')
      expect(schema.type).toBe('object')

      // Check properties exist
      const properties = schema.properties as Record<string, unknown>
      expect(properties).toBeDefined()
      expect(properties.id).toBeDefined()
      expect(properties.input).toBeDefined()
    })

    test('GraderResult schema has correct constraints', async () => {
      const result = await runSchemas({ schemaName: 'GraderResult', json: true })
      const schemas = result as Record<string, object>
      const schema = schemas.GraderResult as Record<string, unknown>

      expect(schema.type).toBe('object')
      const properties = schema.properties as Record<string, Record<string, unknown>>
      expect(properties.pass).toBeDefined()
      expect(properties.score).toBeDefined()
      expect(properties.pass?.type).toBe('boolean')
      expect(properties.score?.type).toBe('number')
      expect(properties.score?.minimum).toBe(0)
      expect(properties.score?.maximum).toBe(1)
    })
  })
})