@plaited/agent-eval-harness 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -184,11 +184,68 @@ Key fields:
184
184
 
185
185
  ## Graders
186
186
 
187
- Graders score agent outputs. The harness supports two types:
187
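+ Graders score agent outputs. The harness supports two grader types (TypeScript/JavaScript modules and executable scripts) and two grading approaches (git-based outcome grading and output-based grading):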
188
188
 
189
- ### TypeScript/JavaScript Graders
189
+ ### Git-Based Outcome Grading (Recommended for Coding Agents)
190
190
 
191
- Export a `grade` function:
191
+ **Grade outcomes, not paths.** Use git to detect actual environmental changes:
192
+
193
+ ```typescript
194
+ import type { Grader } from '@plaited/agent-eval-harness/schemas'
195
+ import { resolve } from 'node:path'
196
+
197
+ export const grade: Grader = async ({ output, hint, cwd }) => {
198
+ // Validate cwd to prevent command injection
199
+ const isValidPath = (path: string): boolean => {
200
+ const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
201
+ if (dangerousChars.test(path)) return false
202
+ if (path.includes('..') || path.startsWith('-')) return false
203
+ return true
204
+ }
205
+
206
+ if (!cwd || !isValidPath(cwd)) {
207
+ return {
208
+ pass: false,
209
+ score: 0,
210
+ reasoning: 'Invalid working directory path'
211
+ }
212
+ }
213
+
214
+ const safeCwd = resolve(cwd)
215
+
216
+ // Detect file changes using git
217
+ const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
218
+ const filesCreated = status
219
+ .split('\n')
220
+ .filter(line => line.startsWith('??'))
221
+ .map(line => line.slice(3).trim())
222
+
223
+ // Run tests to verify outcome
224
+ const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
225
+ const testsPassed = testResult.exitCode === 0
226
+
227
+ return {
228
+ pass: filesCreated.length > 0 && testsPassed,
229
+ score: testsPassed ? 1.0 : 0.0,
230
+ reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
231
+ outcome: { // Optional: structured data for analysis
232
+ filesCreated,
233
+ testsPassed,
234
+ type: 'file_creation_with_tests'
235
+ }
236
+ }
237
+ }
238
+ ```
239
+
240
+ **Benefits:**
241
+ - Detects actual file changes, test results, build success
242
+ - Works universally in any git repo, any language
243
+ - Returns structured `outcome` data for downstream analysis
244
+ - Zero configuration required
245
+
246
+ ### Output-Based Grading (General Purpose)
247
+
248
+ For non-coding tasks or when git is unavailable:
192
249
 
193
250
  ```typescript
194
251
  import type { Grader } from '@plaited/agent-eval-harness/schemas'
@@ -215,11 +272,62 @@ Any executable script using stdin/stdout JSON protocol:
215
272
  #!/usr/bin/env python3
216
273
  import json
217
274
  import sys
275
+ import subprocess
276
+ import re
277
+ import os
218
278
 
219
279
  data = json.load(sys.stdin)
220
280
  output = data["output"].lower()
221
281
  hint = (data.get("hint") or "").lower()
222
-
282
+ cwd = data.get("cwd")
283
+
284
+ # Validate cwd to prevent command injection
285
+ def is_valid_path(path):
286
+ if not path:
287
+ return False
288
+ # Reject shell metacharacters
289
+ if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
290
+ return False
291
+ # Reject directory traversal and option injection
292
+ if '..' in path or path.startswith('-'):
293
+ return False
294
+ return True
295
+
296
+ # Git-based grading if cwd is provided
297
+ if cwd:
298
+ if not is_valid_path(cwd):
299
+ print(json.dumps({
300
+ "pass": False,
301
+ "score": 0.0,
302
+ "reasoning": "Invalid working directory path"
303
+ }))
304
+ sys.exit(0)
305
+
306
+ safe_cwd = os.path.abspath(cwd)
307
+
308
+ try:
309
+ result = subprocess.run(
310
+ ["git", "-C", safe_cwd, "status", "--porcelain"],
311
+ capture_output=True, text=True, check=True
312
+ )
313
+ files_created = [
314
+ line[3:].strip()
315
+ for line in result.stdout.split('\n')
316
+ if line.startswith('??')
317
+ ]
318
+ has_changes = len(files_created) > 0
319
+ print(json.dumps({
320
+ "pass": has_changes,
321
+ "score": 1.0 if has_changes else 0.0,
322
+ "reasoning": f"Files created: {', '.join(files_created)}",
323
+ "outcome": {"filesCreated": files_created, "type": "git_check"}
324
+ }))
325
+ sys.exit(0)
326
+ except subprocess.CalledProcessError:
327
+ # Fall back to output-based grading
328
+ pass
329
+
330
+ # Output-based grading fallback
223
331
  pass_result = hint in output if hint else True
224
332
  print(json.dumps({
225
333
  "pass": pass_result,
@@ -234,11 +342,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
234
342
  ```
235
343
 
236
344
  **Protocol:**
237
- - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
238
- - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
345
+ - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
346
+ - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
347
+ - `cwd` and `outcome` are optional fields
239
348
 
240
349
  ## Downstream Integration
241
350
 
351
+ The harness outputs standard JSONL. When a grader returns the optional `outcome` field, it is copied onto the result as a top-level `outcome` field, so downstream tools can query it directly:
352
+
242
353
  ```bash
243
354
  # Filter failures
244
355
  cat results.jsonl | jq 'select(.score.pass == false)'
@@ -246,10 +357,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
246
357
  # Extract tool usage patterns
247
358
  cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
248
359
 
360
+ # Analyze outcomes from git-based graders
361
+ cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
362
+ cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
363
+ cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
364
+
249
365
  # Use with your scoring pipeline
250
366
  cat results.jsonl | your-scoring-script.ts
251
367
  ```
252
368
 
369
+ ### Outcome Field
370
+
371
+ Git-based graders can return structured `outcome` data:
372
+
373
+ ```json
374
+ {
375
+ "id": "fix-tests",
376
+ "input": "Fix the failing authentication tests",
377
+ "output": "I fixed the auth tests by...",
378
+ "score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
379
+ "outcome": {
380
+ "testsPassed": true,
381
+ "filesModified": ["src/auth.ts", "src/auth.spec.ts"],
382
+ "exitCode": 0,
383
+ "type": "test_execution"
384
+ }
385
+ }
386
+ ```
387
+
388
+ This enables rich analysis across evaluations without re-parsing trajectories.
389
+
253
390
  ## Development
254
391
 
255
392
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.6.2",
3
+ "version": "0.7.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
225
225
 
226
226
  // Apply grader if provided
227
227
  if (grader) {
228
- result.score = await grader({
228
+ const graderResult = await grader({
229
229
  input: promptCase.input,
230
230
  output,
231
231
  hint: promptCase.hint,
232
232
  trajectory,
233
233
  metadata: promptCase.metadata,
234
+ cwd: session.cwd,
234
235
  })
236
+
237
+ result.score = graderResult
238
+
239
+ // Merge outcome from grader if present
240
+ if (graderResult.outcome) {
241
+ result.outcome = graderResult.outcome
242
+ }
235
243
  }
236
244
 
237
245
  // Clean up session
@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
217
217
  hint: promptCase.hint,
218
218
  trajectory,
219
219
  metadata: promptCase.metadata,
220
+ cwd: session.cwd,
220
221
  })
221
222
  entry.pass = graderResult.pass
222
223
  entry.score = graderResult.score
223
224
  entry.reasoning = graderResult.reasoning
225
+
226
+ // Merge outcome from grader if present
227
+ if (graderResult.outcome) {
228
+ entry.outcome = graderResult.outcome
229
+ }
224
230
  }
225
231
 
226
232
  trialEntries.push(entry)
@@ -52,6 +52,7 @@ export const runGrade = async (
52
52
  hint: extracted.hint,
53
53
  trajectory: extracted.trajectory,
54
54
  metadata: extracted.metadata,
55
+ cwd: extracted.cwd,
55
56
  })
56
57
 
57
58
  const graded: GradedResult = {
@@ -59,6 +60,11 @@ export const runGrade = async (
59
60
  score,
60
61
  }
61
62
 
63
+ // Merge outcome from grader if present
64
+ if (score.outcome) {
65
+ graded.outcome = score.outcome
66
+ }
67
+
62
68
  const icon = score.pass ? '✓' : '✗'
63
69
  logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress)
64
70
 
@@ -62,6 +62,8 @@ export type ExtractedResult = {
62
62
  toolErrors: boolean
63
63
  /** Optional metadata from original prompt */
64
64
  metadata?: Record<string, unknown>
65
+ /** Working directory path (optional, for git-based grading) */
66
+ cwd?: string
65
67
  /** Timing metadata */
66
68
  timing: {
67
69
  start: number
@@ -77,10 +79,13 @@ export type ExtractedResult = {
77
79
  *
78
80
  * @remarks
79
81
  * Adds grader score to extracted result.
82
+ * Outcome field is merged from grader result if present.
80
83
  */
81
84
  export type GradedResult = ExtractedResult & {
82
85
  /** Grader score */
83
86
  score: GraderResult
87
+ /** Outcome data from grader (if grader returned outcome) */
88
+ outcome?: Record<string, unknown>
84
89
  }
85
90
 
86
91
  /**
@@ -47,6 +47,7 @@ const resolvePath = (path: string): string => {
47
47
  * The metadata field contains arbitrary key-value pairs from the original
48
48
  * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
49
49
  * category-specific grading logic or filter calibration samples.
50
+ * The cwd field provides the working directory path for git-based outcome detection.
50
51
  */
51
52
  type ExecGraderInput = {
52
53
  input: string | string[]
@@ -54,6 +55,7 @@ type ExecGraderInput = {
54
55
  hint?: string
55
56
  trajectory?: TrajectoryStep[]
56
57
  metadata?: Record<string, unknown>
58
+ cwd?: string
57
59
  }
58
60
 
59
61
  /**
@@ -73,6 +75,8 @@ const createExecGrader = (execPath: string): Grader => {
73
75
  output: params.output,
74
76
  hint: params.hint,
75
77
  trajectory: params.trajectory,
78
+ metadata: params.metadata,
79
+ cwd: params.cwd,
76
80
  }
77
81
 
78
82
  const inputJson = JSON.stringify(input)
@@ -209,6 +209,7 @@ export type PromptCase = z.infer<typeof PromptCaseSchema>
209
209
  *
210
210
  * @remarks
211
211
  * Result returned by user-provided grader functions.
212
+ * - `outcome`: Optional structured outcome data detected by the grader
212
213
  */
213
214
  export const GraderResultSchema = z.object({
214
215
  /** Whether the output passes the evaluation criteria */
@@ -217,6 +218,8 @@ export const GraderResultSchema = z.object({
217
218
  score: z.number().min(0).max(1),
218
219
  /** Optional explanation for the score */
219
220
  reasoning: z.string().optional(),
221
+ /** Optional outcome data (e.g., files created, tests passed) */
222
+ outcome: z.record(z.string(), z.unknown()).optional(),
220
223
  })
221
224
 
222
225
  /** Grader result type */
@@ -230,6 +233,7 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
230
233
  * - `input` is the original prompt (string or array for multi-turn)
231
234
  * - `hint` provides grader context (renamed from `expected`)
232
235
  * - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
236
+ * - `cwd` is the working directory path (optional, enables git-based outcome detection)
233
237
  */
234
238
  export type Grader = (params: {
235
239
  input: string | string[]
@@ -237,6 +241,7 @@ export type Grader = (params: {
237
241
  hint?: string
238
242
  trajectory?: TrajectoryStep[]
239
243
  metadata?: Record<string, unknown>
244
+ cwd?: string
240
245
  }) => Promise<GraderResult>
241
246
 
242
247
  // ============================================================================
@@ -375,6 +380,7 @@ export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
375
380
  * - `input` can be string (single turn) or string[] (multi-turn)
376
381
  * - `hint` provides grader context (renamed from `expected`)
377
382
  * - `toolErrors` replaces misleading `status: 'passed'|'failed'`
383
+ * - `outcome` is merged from grader result if grader returns outcome data
378
384
  * Real pass/fail determination comes from your grader.
379
385
  */
380
386
  export const CaptureResultSchema = z.object({
@@ -398,6 +404,8 @@ export const CaptureResultSchema = z.object({
398
404
  errors: z.array(z.string()).optional(),
399
405
  /** Grader score (if grader was provided) */
400
406
  score: GraderResultSchema.optional(),
407
+ /** Outcome data from grader (if grader provided and returned outcome) */
408
+ outcome: z.record(z.string(), z.unknown()).optional(),
401
409
  })
402
410
 
403
411
  /** Capture result type */
@@ -449,6 +457,8 @@ export const TrialEntrySchema = z.object({
449
457
  score: z.number().optional(),
450
458
  /** Grader reasoning (if grader provided) */
451
459
  reasoning: z.string().optional(),
460
+ /** Outcome data from grader (if grader provided and returned outcome) */
461
+ outcome: z.record(z.string(), z.unknown()).optional(),
452
462
  })
453
463
 
454
464
  /** Trial entry type */
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Test fixture: Git-based grader that detects file changes.
3
+ *
4
+ * @remarks
5
+ * This grader uses git to detect environmental outcomes instead of just
6
+ * checking output text. It demonstrates the "grade outcomes, not paths" principle.
7
+ *
8
+ * SECURITY NOTE: This fixture validates the cwd parameter to prevent command injection.
9
+ * When implementing your own git-based graders, always validate paths from untrusted sources.
10
+ * The cwd parameter should only come from trusted sources (process.cwd(), CLI flags, etc.).
11
+ */
12
+
13
+ import { resolve } from 'node:path'
14
+ import type { Grader } from '../../schemas.ts'
15
+
16
+ /**
17
+ * Validates that a path is safe to use in shell commands.
18
+ *
19
+ * @remarks
20
+ * Rejects paths containing shell metacharacters or suspicious patterns
21
+ * that could be used for command injection.
22
+ *
23
+ * @param path - The path to validate
24
+ * @returns True if path appears safe, false otherwise
25
+ */
26
+ const isValidPath = (path: string): boolean => {
27
+ // Reject paths with shell metacharacters that could enable command injection
28
+ const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
29
+ if (dangerousChars.test(path)) {
30
+ return false
31
+ }
32
+
33
+ // Reject paths with suspicious patterns
34
+ if (path.includes('..') || path.startsWith('-')) {
35
+ return false
36
+ }
37
+
38
+ return true
39
+ }
40
+
41
+ export const grade: Grader = async ({ output: _output, hint, cwd }) => {
42
+ // If no cwd provided, fail early: git-based grading requires a working directory
43
+ if (!cwd) {
44
+ return {
45
+ pass: false,
46
+ score: 0,
47
+ reasoning: 'No working directory provided',
48
+ }
49
+ }
50
+
51
+ // SECURITY: Validate cwd to prevent command injection
52
+ if (!isValidPath(cwd)) {
53
+ return {
54
+ pass: false,
55
+ score: 0,
56
+ reasoning: 'Invalid working directory path (contains suspicious characters)',
57
+ }
58
+ }
59
+
60
+ // Normalize to an absolute path ('..' segments were already rejected by isValidPath)
61
+ const safeCwd = resolve(cwd)
62
+
63
+ // Check if we're in a git repo
64
+ const isGit = await Bun.$`git -C ${safeCwd} rev-parse --git-dir 2>/dev/null`.nothrow()
65
+
66
+ if (isGit.exitCode !== 0) {
67
+ return {
68
+ pass: false,
69
+ score: 0,
70
+ reasoning: 'Not a git repository',
71
+ }
72
+ }
73
+
74
+ // Detect what files were created/modified using git
75
+ // Note: This detects untracked (??) and modified (M) files.
76
+ // Staged (A), renamed (R), deleted (D) files are not included in this example.
77
+ const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
78
+
79
+ const filesCreated = status
80
+ .split('\n')
81
+ .filter((line) => line.startsWith('??')) // ?? = untracked files
82
+ .map((line) => line.slice(3).trim())
83
+ .filter(Boolean)
84
+
85
+ const filesModified = status
86
+ .split('\n')
87
+ .filter((line) => line.startsWith(' M') || line.startsWith('M ')) // M = modified
88
+ .map((line) => line.slice(3).trim())
89
+ .filter(Boolean)
90
+
91
+ const hasChanges = filesCreated.length > 0 || filesModified.length > 0
92
+
93
+ // If hint is provided, check if any changed file matches the hint
94
+ let matchesHint = true
95
+ if (hint) {
96
+ const allChangedFiles = [...filesCreated, ...filesModified]
97
+ matchesHint = allChangedFiles.some((file) => file.toLowerCase().includes(hint.toLowerCase()))
98
+ }
99
+
100
+ const pass = hasChanges && matchesHint
101
+
102
+ return {
103
+ pass,
104
+ score: pass ? 1.0 : hasChanges ? 0.5 : 0.0,
105
+ reasoning: pass
106
+ ? `Files changed: ${[...filesCreated, ...filesModified].join(', ')}`
107
+ : hasChanges
108
+ ? 'File changes do not match hint'
109
+ : 'No file changes detected',
110
+ outcome: {
111
+ filesCreated,
112
+ filesModified,
113
+ type: 'git_status_check',
114
+ },
115
+ }
116
+ }
@@ -0,0 +1,222 @@
1
+ /**
2
+ * Tests for git-based grader fixture.
3
+ *
4
+ * @remarks
5
+ * Verifies that graders can use git to detect environmental outcomes
6
+ * and return structured outcome data.
7
+ */
8
+
9
+ import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
10
+ import { mkdtemp, rm } from 'node:fs/promises'
11
+ import { tmpdir } from 'node:os'
12
+ import { join } from 'node:path'
13
+ import type { Grader } from '../schemas.ts'
14
+
15
+ describe('Git-based grader', () => {
16
+ let tempDir: string
17
+ let grader: Grader
18
+
19
+ beforeEach(async () => {
20
+ // Create temporary directory
21
+ tempDir = await mkdtemp(join(tmpdir(), 'git-grader-test-'))
22
+
23
+ // Initialize git repo
24
+ await Bun.$`git -C ${tempDir} init`.quiet()
25
+ await Bun.$`git -C ${tempDir} config user.email "test@test.com"`.quiet()
26
+ await Bun.$`git -C ${tempDir} config user.name "Test User"`.quiet()
27
+
28
+ // Load the git-based grader
29
+ const module = await import('./fixtures/grader-git.ts')
30
+ grader = module.grade
31
+ })
32
+
33
+ afterEach(async () => {
34
+ // Clean up temporary directory
35
+ await rm(tempDir, { recursive: true, force: true })
36
+ })
37
+
38
+ test('detects newly created files', async () => {
39
+ // Create a new file (untracked)
40
+ await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button>Click</button>')
41
+
42
+ const result = await grader({
43
+ input: 'Create a button component',
44
+ output: 'I created Button.tsx',
45
+ hint: 'button',
46
+ cwd: tempDir,
47
+ })
48
+
49
+ expect(result.pass).toBe(true)
50
+ expect(result.score).toBe(1.0)
51
+ expect(result.reasoning).toContain('button.tsx')
52
+ expect(result.outcome).toBeDefined()
53
+ expect(result.outcome?.filesCreated).toEqual(['button.tsx'])
54
+ expect(result.outcome?.type).toBe('git_status_check')
55
+ })
56
+
57
+ test('detects modified files', async () => {
58
+ // Create and commit a file
59
+ await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 1 }')
60
+ await Bun.$`git -C ${tempDir} add config.ts`.quiet()
61
+ await Bun.$`git -C ${tempDir} commit -m "Initial commit"`.quiet()
62
+
63
+ // Modify the file
64
+ await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 2 }')
65
+
66
+ const result = await grader({
67
+ input: 'Update config value',
68
+ output: 'I updated the config',
69
+ hint: 'config',
70
+ cwd: tempDir,
71
+ })
72
+
73
+ expect(result.pass).toBe(true)
74
+ expect(result.score).toBe(1.0)
75
+ expect(result.reasoning).toContain('config.ts')
76
+ expect(result.outcome).toBeDefined()
77
+ expect(result.outcome?.filesModified).toEqual(['config.ts'])
78
+ expect(result.outcome?.type).toBe('git_status_check')
79
+ })
80
+
81
+ test('fails when no changes detected', async () => {
82
+ // No files created or modified
83
+ const result = await grader({
84
+ input: 'Create a button component',
85
+ output: 'I created a button component',
86
+ cwd: tempDir,
87
+ })
88
+
89
+ expect(result.pass).toBe(false)
90
+ expect(result.score).toBe(0)
91
+ expect(result.reasoning).toContain('No file changes detected')
92
+ expect(result.outcome).toBeDefined()
93
+ expect(result.outcome?.filesCreated).toEqual([])
94
+ expect(result.outcome?.filesModified).toEqual([])
95
+ })
96
+
97
+ test('partial score when changes do not match hint', async () => {
98
+ // Create a file that does not match the hint
99
+ await Bun.write(join(tempDir, 'unrelated.ts'), 'export const foo = 1')
100
+
101
+ const result = await grader({
102
+ input: 'Create a button component',
103
+ output: 'I created something',
104
+ hint: 'button',
105
+ cwd: tempDir,
106
+ })
107
+
108
+ expect(result.pass).toBe(false)
109
+ expect(result.score).toBe(0.5) // Has changes but doesn't match hint
110
+ expect(result.reasoning).toContain('do not match hint')
111
+ expect(result.outcome?.filesCreated).toEqual(['unrelated.ts'])
112
+ })
113
+
114
+ test('handles missing cwd parameter', async () => {
115
+ const result = await grader({
116
+ input: 'Create a button component',
117
+ output: 'I created a button',
118
+ hint: 'button',
119
+ // cwd not provided
120
+ })
121
+
122
+ expect(result.pass).toBe(false)
123
+ expect(result.score).toBe(0)
124
+ expect(result.reasoning).toBe('No working directory provided')
125
+ })
126
+
127
+ test('handles non-git directory', async () => {
128
+ // Create a non-git temp directory
129
+ const nonGitDir = await mkdtemp(join(tmpdir(), 'non-git-test-'))
130
+
131
+ try {
132
+ const result = await grader({
133
+ input: 'Create a button component',
134
+ output: 'I created a button',
135
+ cwd: nonGitDir,
136
+ })
137
+
138
+ expect(result.pass).toBe(false)
139
+ expect(result.score).toBe(0)
140
+ expect(result.reasoning).toBe('Not a git repository')
141
+ } finally {
142
+ await rm(nonGitDir, { recursive: true, force: true })
143
+ }
144
+ })
145
+
146
+ test('works without hint parameter', async () => {
147
+ // Create a file
148
+ await Bun.write(join(tempDir, 'any-file.ts'), 'export const x = 1')
149
+
150
+ const result = await grader({
151
+ input: 'Create a file',
152
+ output: 'I created a file',
153
+ cwd: tempDir,
154
+ // hint not provided
155
+ })
156
+
157
+ expect(result.pass).toBe(true)
158
+ expect(result.score).toBe(1.0)
159
+ expect(result.reasoning).toContain('any-file.ts')
160
+ expect(result.outcome?.filesCreated).toEqual(['any-file.ts'])
161
+ })
162
+
163
+ test('returns structured outcome for downstream analysis', async () => {
164
+ // Create multiple files
165
+ await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button />')
166
+ await Bun.write(join(tempDir, 'input.tsx'), 'export const Input = () => <input />')
167
+
168
+ const result = await grader({
169
+ input: 'Create UI components',
170
+ output: 'I created Button and Input components',
171
+ cwd: tempDir,
172
+ })
173
+
174
+ expect(result.outcome).toBeDefined()
175
+ expect(result.outcome?.type).toBe('git_status_check')
176
+ expect(result.outcome?.filesCreated).toBeInstanceOf(Array)
177
+ expect(result.outcome?.filesCreated).toHaveLength(2)
178
+ expect(result.outcome?.filesCreated).toContain('button.tsx')
179
+ expect(result.outcome?.filesCreated).toContain('input.tsx')
180
+ expect(result.outcome?.filesModified).toEqual([])
181
+ })
182
+
183
+ test('rejects path with command injection attempt', async () => {
184
+ const result = await grader({
185
+ input: 'Create a file',
186
+ output: 'Created file',
187
+ cwd: '/tmp/test; rm -rf /', // Command injection attempt
188
+ })
189
+
190
+ expect(result.pass).toBe(false)
191
+ expect(result.score).toBe(0)
192
+ expect(result.reasoning).toContain('Invalid working directory path')
193
+ })
194
+
195
+ test('rejects path with directory traversal', async () => {
196
+ const result = await grader({
197
+ input: 'Create a file',
198
+ output: 'Created file',
199
+ cwd: '/tmp/../../../etc', // Directory traversal
200
+ })
201
+
202
+ expect(result.pass).toBe(false)
203
+ expect(result.score).toBe(0)
204
+ expect(result.reasoning).toContain('Invalid working directory path')
205
+ })
206
+
207
+ test('rejects path with shell metacharacters', async () => {
208
+ const dangerousPaths = ['/tmp/test$(whoami)', '/tmp/test`id`', '/tmp/test|cat', '/tmp/test&echo', '/tmp/test>out']
209
+
210
+ for (const path of dangerousPaths) {
211
+ const result = await grader({
212
+ input: 'Create a file',
213
+ output: 'Created file',
214
+ cwd: path,
215
+ })
216
+
217
+ expect(result.pass).toBe(false)
218
+ expect(result.score).toBe(0)
219
+ expect(result.reasoning).toContain('Invalid working directory path')
220
+ }
221
+ })
222
+ })