@plaited/agent-eval-harness 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +143 -6
- package/package.json +1 -1
- package/src/commands/capture.ts +9 -1
- package/src/commands/trials.ts +6 -0
- package/src/pipeline/grade.ts +6 -0
- package/src/pipeline/pipeline.types.ts +5 -0
- package/src/schemas/grader-loader.ts +4 -0
- package/src/schemas/schemas.ts +10 -0
- package/src/schemas/tests/fixtures/grader-git.ts +116 -0
- package/src/schemas/tests/grader-git.spec.ts +222 -0
package/README.md
CHANGED
|
@@ -184,11 +184,68 @@ Key fields:
|
|
|
184
184
|
|
|
185
185
|
## Graders
|
|
186
186
|
|
|
187
|
-
Graders score agent outputs. The harness supports two types:
|
|
187
|
+
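Graders score agent outputs. The harness supports two grader types (TypeScript modules and executable scripts) and two grading approaches (git-based outcome grading and output-based grading):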
|
|
188
188
|
|
|
189
|
-
###
|
|
189
|
+
### Git-Based Outcome Grading (Recommended for Coding Agents)
|
|
190
190
|
|
|
191
|
-
|
|
191
|
+
**Grade outcomes, not paths.** Use git to detect actual environmental changes:
|
|
192
|
+
|
|
193
|
+
```typescript
|
|
194
|
+
import type { Grader } from '@plaited/agent-eval-harness/schemas'
|
|
195
|
+
import { resolve } from 'node:path'
|
|
196
|
+
|
|
197
|
+
export const grade: Grader = async ({ output, hint, cwd }) => {
|
|
198
|
+
// Validate cwd to prevent command injection
|
|
199
|
+
const isValidPath = (path: string): boolean => {
|
|
200
|
+
const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
|
|
201
|
+
if (dangerousChars.test(path)) return false
|
|
202
|
+
if (path.includes('..') || path.startsWith('-')) return false
|
|
203
|
+
return true
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
if (!cwd || !isValidPath(cwd)) {
|
|
207
|
+
return {
|
|
208
|
+
pass: false,
|
|
209
|
+
score: 0,
|
|
210
|
+
reasoning: 'Invalid working directory path'
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
const safeCwd = resolve(cwd)
|
|
215
|
+
|
|
216
|
+
// Detect file changes using git
|
|
217
|
+
const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
|
|
218
|
+
const filesCreated = status
|
|
219
|
+
.split('\n')
|
|
220
|
+
.filter(line => line.startsWith('??'))
|
|
221
|
+
.map(line => line.slice(3).trim())
|
|
222
|
+
|
|
223
|
+
// Run tests to verify outcome
|
|
224
|
+
const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
|
|
225
|
+
const testsPassed = testResult.exitCode === 0
|
|
226
|
+
|
|
227
|
+
return {
|
|
228
|
+
pass: filesCreated.length > 0 && testsPassed,
|
|
229
|
+
score: testsPassed ? 1.0 : 0.0,
|
|
230
|
+
reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
|
|
231
|
+
outcome: { // Optional: structured data for analysis
|
|
232
|
+
filesCreated,
|
|
233
|
+
testsPassed,
|
|
234
|
+
type: 'file_creation_with_tests'
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
**Benefits:**
|
|
241
|
+
- Detects actual file changes, test results, build success
|
|
242
|
+
- Works universally in any git repo, any language
|
|
243
|
+
- Returns structured `outcome` data for downstream analysis
|
|
244
|
+
- Zero configuration required
|
|
245
|
+
|
|
246
|
+
### Output-Based Grading (General Purpose)
|
|
247
|
+
|
|
248
|
+
For non-coding tasks or when git is unavailable:
|
|
192
249
|
|
|
193
250
|
```typescript
|
|
194
251
|
import type { Grader } from '@plaited/agent-eval-harness/schemas'
|
|
@@ -215,11 +272,62 @@ Any executable script using stdin/stdout JSON protocol:
|
|
|
215
272
|
#!/usr/bin/env python3
|
|
216
273
|
import json
|
|
217
274
|
import sys
|
|
275
|
+
import subprocess
|
|
276
|
+
import re
|
|
277
|
+
import os
|
|
218
278
|
|
|
219
279
|
data = json.load(sys.stdin)
|
|
220
280
|
output = data["output"].lower()
|
|
221
281
|
hint = (data.get("hint") or "").lower()
|
|
222
|
-
|
|
282
|
+
cwd = data.get("cwd")
|
|
283
|
+
|
|
284
|
+
# Validate cwd to prevent command injection
|
|
285
|
+
def is_valid_path(path):
|
|
286
|
+
if not path:
|
|
287
|
+
return False
|
|
288
|
+
# Reject shell metacharacters
|
|
289
|
+
if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
|
|
290
|
+
return False
|
|
291
|
+
# Reject directory traversal and option injection
|
|
292
|
+
if '..' in path or path.startswith('-'):
|
|
293
|
+
return False
|
|
294
|
+
return True
|
|
295
|
+
|
|
296
|
+
# Git-based grading if cwd is provided
|
|
297
|
+
if cwd:
|
|
298
|
+
if not is_valid_path(cwd):
|
|
299
|
+
print(json.dumps({
|
|
300
|
+
"pass": False,
|
|
301
|
+
"score": 0.0,
|
|
302
|
+
"reasoning": "Invalid working directory path"
|
|
303
|
+
}))
|
|
304
|
+
sys.exit(0)
|
|
305
|
+
|
|
306
|
+
safe_cwd = os.path.abspath(cwd)
|
|
307
|
+
|
|
308
|
+
try:
|
|
309
|
+
result = subprocess.run(
|
|
310
|
+
["git", "-C", safe_cwd, "status", "--porcelain"],
|
|
311
|
+
capture_output=True, text=True, check=True
|
|
312
|
+
)
|
|
313
|
+
files_created = [
|
|
314
|
+
line[3:].strip()
|
|
315
|
+
for line in result.stdout.split('\n')
|
|
316
|
+
if line.startswith('??')
|
|
317
|
+
]
|
|
318
|
+
has_changes = len(files_created) > 0
|
|
319
|
+
print(json.dumps({
|
|
320
|
+
"pass": has_changes,
|
|
321
|
+
"score": 1.0 if has_changes else 0.0,
|
|
322
|
+
"reasoning": f"Files created: {', '.join(files_created)}",
|
|
323
|
+
"outcome": {"filesCreated": files_created, "type": "git_check"}
|
|
324
|
+
}))
|
|
325
|
+
sys.exit(0)
|
|
326
|
+
except subprocess.CalledProcessError:
|
|
327
|
+
# Fall back to output-based grading
|
|
328
|
+
pass
|
|
329
|
+
|
|
330
|
+
# Output-based grading fallback
|
|
223
331
|
pass_result = hint in output if hint else True
|
|
224
332
|
print(json.dumps({
|
|
225
333
|
"pass": pass_result,
|
|
@@ -234,11 +342,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
|
|
|
234
342
|
```
|
|
235
343
|
|
|
236
344
|
**Protocol:**
|
|
237
|
-
- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
|
|
238
|
-
- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
|
|
345
|
+
- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
|
|
346
|
+
- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
|
|
347
|
+
- `cwd` (input) and `outcome` (output) are optional fields
|
|
239
348
|
|
|
240
349
|
## Downstream Integration
|
|
241
350
|
|
|
351
|
+
The harness outputs standard JSONL. When a grader returns the optional `outcome` field, it is copied onto the result as a top-level `outcome` key, which enables powerful downstream analysis:
|
|
352
|
+
|
|
242
353
|
```bash
|
|
243
354
|
# Filter failures
|
|
244
355
|
cat results.jsonl | jq 'select(.score.pass == false)'
|
|
@@ -246,10 +357,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
|
|
|
246
357
|
# Extract tool usage patterns
|
|
247
358
|
cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
|
|
248
359
|
|
|
360
|
+
# Analyze outcomes from git-based graders
|
|
361
|
+
cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
|
|
362
|
+
cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
|
|
363
|
+
cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
|
|
364
|
+
|
|
249
365
|
# Use with your scoring pipeline
|
|
250
366
|
cat results.jsonl | your-scoring-script.ts
|
|
251
367
|
```
|
|
252
368
|
|
|
369
|
+
### Outcome Field
|
|
370
|
+
|
|
371
|
+
Git-based graders can return structured `outcome` data:
|
|
372
|
+
|
|
373
|
+
```json
|
|
374
|
+
{
|
|
375
|
+
"id": "fix-tests",
|
|
376
|
+
"input": "Fix the failing authentication tests",
|
|
377
|
+
"output": "I fixed the auth tests by...",
|
|
378
|
+
"score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
|
|
379
|
+
"outcome": {
|
|
380
|
+
"testsPassed": true,
|
|
381
|
+
"filesModified": ["src/auth.ts", "src/auth.spec.ts"],
|
|
382
|
+
"exitCode": 0,
|
|
383
|
+
"type": "test_execution"
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
This enables rich analysis across evaluations without re-parsing trajectories.
|
|
389
|
+
|
|
253
390
|
## Development
|
|
254
391
|
|
|
255
392
|
```bash
|
package/package.json
CHANGED
package/src/commands/capture.ts
CHANGED
|
@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
225
225
|
|
|
226
226
|
// Apply grader if provided
|
|
227
227
|
if (grader) {
|
|
228
|
-
|
|
228
|
+
const graderResult = await grader({
|
|
229
229
|
input: promptCase.input,
|
|
230
230
|
output,
|
|
231
231
|
hint: promptCase.hint,
|
|
232
232
|
trajectory,
|
|
233
233
|
metadata: promptCase.metadata,
|
|
234
|
+
cwd: session.cwd,
|
|
234
235
|
})
|
|
236
|
+
|
|
237
|
+
result.score = graderResult
|
|
238
|
+
|
|
239
|
+
// Merge outcome from grader if present
|
|
240
|
+
if (graderResult.outcome) {
|
|
241
|
+
result.outcome = graderResult.outcome
|
|
242
|
+
}
|
|
235
243
|
}
|
|
236
244
|
|
|
237
245
|
// Clean up session
|
package/src/commands/trials.ts
CHANGED
|
@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
217
217
|
hint: promptCase.hint,
|
|
218
218
|
trajectory,
|
|
219
219
|
metadata: promptCase.metadata,
|
|
220
|
+
cwd: session.cwd,
|
|
220
221
|
})
|
|
221
222
|
entry.pass = graderResult.pass
|
|
222
223
|
entry.score = graderResult.score
|
|
223
224
|
entry.reasoning = graderResult.reasoning
|
|
225
|
+
|
|
226
|
+
// Merge outcome from grader if present
|
|
227
|
+
if (graderResult.outcome) {
|
|
228
|
+
entry.outcome = graderResult.outcome
|
|
229
|
+
}
|
|
224
230
|
}
|
|
225
231
|
|
|
226
232
|
trialEntries.push(entry)
|
package/src/pipeline/grade.ts
CHANGED
|
@@ -52,6 +52,7 @@ export const runGrade = async (
|
|
|
52
52
|
hint: extracted.hint,
|
|
53
53
|
trajectory: extracted.trajectory,
|
|
54
54
|
metadata: extracted.metadata,
|
|
55
|
+
cwd: extracted.cwd,
|
|
55
56
|
})
|
|
56
57
|
|
|
57
58
|
const graded: GradedResult = {
|
|
@@ -59,6 +60,11 @@ export const runGrade = async (
|
|
|
59
60
|
score,
|
|
60
61
|
}
|
|
61
62
|
|
|
63
|
+
// Merge outcome from grader if present
|
|
64
|
+
if (score.outcome) {
|
|
65
|
+
graded.outcome = score.outcome
|
|
66
|
+
}
|
|
67
|
+
|
|
62
68
|
const icon = score.pass ? '✓' : '✗'
|
|
63
69
|
logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress)
|
|
64
70
|
|
|
@@ -62,6 +62,8 @@ export type ExtractedResult = {
|
|
|
62
62
|
toolErrors: boolean
|
|
63
63
|
/** Optional metadata from original prompt */
|
|
64
64
|
metadata?: Record<string, unknown>
|
|
65
|
+
/** Working directory path (optional, for git-based grading) */
|
|
66
|
+
cwd?: string
|
|
65
67
|
/** Timing metadata */
|
|
66
68
|
timing: {
|
|
67
69
|
start: number
|
|
@@ -77,10 +79,13 @@ export type ExtractedResult = {
|
|
|
77
79
|
*
|
|
78
80
|
* @remarks
|
|
79
81
|
* Adds grader score to extracted result.
|
|
82
|
+
* Outcome field is merged from grader result if present.
|
|
80
83
|
*/
|
|
81
84
|
export type GradedResult = ExtractedResult & {
|
|
82
85
|
/** Grader score */
|
|
83
86
|
score: GraderResult
|
|
87
|
+
/** Outcome data from grader (if grader returned outcome) */
|
|
88
|
+
outcome?: Record<string, unknown>
|
|
84
89
|
}
|
|
85
90
|
|
|
86
91
|
/**
|
|
@@ -47,6 +47,7 @@ const resolvePath = (path: string): string => {
|
|
|
47
47
|
* The metadata field contains arbitrary key-value pairs from the original
|
|
48
48
|
* prompt JSONL (e.g., category, difficulty, tags). Use this to implement
|
|
49
49
|
* category-specific grading logic or filter calibration samples.
|
|
50
|
+
* The cwd field provides the working directory path for git-based outcome detection.
|
|
50
51
|
*/
|
|
51
52
|
type ExecGraderInput = {
|
|
52
53
|
input: string | string[]
|
|
@@ -54,6 +55,7 @@ type ExecGraderInput = {
|
|
|
54
55
|
hint?: string
|
|
55
56
|
trajectory?: TrajectoryStep[]
|
|
56
57
|
metadata?: Record<string, unknown>
|
|
58
|
+
cwd?: string
|
|
57
59
|
}
|
|
58
60
|
|
|
59
61
|
/**
|
|
@@ -73,6 +75,8 @@ const createExecGrader = (execPath: string): Grader => {
|
|
|
73
75
|
output: params.output,
|
|
74
76
|
hint: params.hint,
|
|
75
77
|
trajectory: params.trajectory,
|
|
78
|
+
metadata: params.metadata,
|
|
79
|
+
cwd: params.cwd,
|
|
76
80
|
}
|
|
77
81
|
|
|
78
82
|
const inputJson = JSON.stringify(input)
|
package/src/schemas/schemas.ts
CHANGED
|
@@ -209,6 +209,7 @@ export type PromptCase = z.infer<typeof PromptCaseSchema>
|
|
|
209
209
|
*
|
|
210
210
|
* @remarks
|
|
211
211
|
* Result returned by user-provided grader functions.
|
|
212
|
+
* - `outcome`: Optional structured outcome data detected by the grader
|
|
212
213
|
*/
|
|
213
214
|
export const GraderResultSchema = z.object({
|
|
214
215
|
/** Whether the output passes the evaluation criteria */
|
|
@@ -217,6 +218,8 @@ export const GraderResultSchema = z.object({
|
|
|
217
218
|
score: z.number().min(0).max(1),
|
|
218
219
|
/** Optional explanation for the score */
|
|
219
220
|
reasoning: z.string().optional(),
|
|
221
|
+
/** Optional outcome data (e.g., files created, tests passed) */
|
|
222
|
+
outcome: z.record(z.string(), z.unknown()).optional(),
|
|
220
223
|
})
|
|
221
224
|
|
|
222
225
|
/** Grader result type */
|
|
@@ -230,6 +233,7 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
|
|
|
230
233
|
* - `input` is the original prompt (string or array for multi-turn)
|
|
231
234
|
* - `hint` provides grader context (renamed from `expected`)
|
|
232
235
|
* - `metadata` contains arbitrary key-value pairs from the original prompt JSONL
|
|
236
|
+
* - `cwd` is the working directory path (optional, enables git-based outcome detection)
|
|
233
237
|
*/
|
|
234
238
|
export type Grader = (params: {
|
|
235
239
|
input: string | string[]
|
|
@@ -237,6 +241,7 @@ export type Grader = (params: {
|
|
|
237
241
|
hint?: string
|
|
238
242
|
trajectory?: TrajectoryStep[]
|
|
239
243
|
metadata?: Record<string, unknown>
|
|
244
|
+
cwd?: string
|
|
240
245
|
}) => Promise<GraderResult>
|
|
241
246
|
|
|
242
247
|
// ============================================================================
|
|
@@ -375,6 +380,7 @@ export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
|
|
|
375
380
|
* - `input` can be string (single turn) or string[] (multi-turn)
|
|
376
381
|
* - `hint` provides grader context (renamed from `expected`)
|
|
377
382
|
* - `toolErrors` replaces misleading `status: 'passed'|'failed'`
|
|
383
|
+
* - `outcome` is merged from grader result if grader returns outcome data
|
|
378
384
|
* Real pass/fail determination comes from your grader.
|
|
379
385
|
*/
|
|
380
386
|
export const CaptureResultSchema = z.object({
|
|
@@ -398,6 +404,8 @@ export const CaptureResultSchema = z.object({
|
|
|
398
404
|
errors: z.array(z.string()).optional(),
|
|
399
405
|
/** Grader score (if grader was provided) */
|
|
400
406
|
score: GraderResultSchema.optional(),
|
|
407
|
+
/** Outcome data from grader (if grader provided and returned outcome) */
|
|
408
|
+
outcome: z.record(z.string(), z.unknown()).optional(),
|
|
401
409
|
})
|
|
402
410
|
|
|
403
411
|
/** Capture result type */
|
|
@@ -449,6 +457,8 @@ export const TrialEntrySchema = z.object({
|
|
|
449
457
|
score: z.number().optional(),
|
|
450
458
|
/** Grader reasoning (if grader provided) */
|
|
451
459
|
reasoning: z.string().optional(),
|
|
460
|
+
/** Outcome data from grader (if grader provided and returned outcome) */
|
|
461
|
+
outcome: z.record(z.string(), z.unknown()).optional(),
|
|
452
462
|
})
|
|
453
463
|
|
|
454
464
|
/** Trial entry type */
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test fixture: Git-based grader that detects file changes.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* This grader uses git to detect environmental outcomes instead of just
|
|
6
|
+
* checking output text. It demonstrates the "grade outcomes, not paths" principle.
|
|
7
|
+
*
|
|
8
|
+
* SECURITY NOTE: This fixture validates the cwd parameter to prevent command injection.
|
|
9
|
+
* When implementing your own git-based graders, always validate paths from untrusted sources.
|
|
10
|
+
* The cwd parameter should only come from trusted sources (process.cwd(), CLI flags, etc.).
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { resolve } from 'node:path'
|
|
14
|
+
import type { Grader } from '../../schemas.ts'
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Validates that a path is safe to use in shell commands.
|
|
18
|
+
*
|
|
19
|
+
* @remarks
|
|
20
|
+
* Rejects paths containing shell metacharacters or suspicious patterns
|
|
21
|
+
* that could be used for command injection.
|
|
22
|
+
*
|
|
23
|
+
* @param path - The path to validate
|
|
24
|
+
* @returns True if path appears safe, false otherwise
|
|
25
|
+
*/
|
|
26
|
+
const isValidPath = (path: string): boolean => {
|
|
27
|
+
// Reject paths with shell metacharacters that could enable command injection
|
|
28
|
+
const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
|
|
29
|
+
if (dangerousChars.test(path)) {
|
|
30
|
+
return false
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// Reject paths with suspicious patterns
|
|
34
|
+
if (path.includes('..') || path.startsWith('-')) {
|
|
35
|
+
return false
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return true
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export const grade: Grader = async ({ output: _output, hint, cwd }) => {
|
|
42
|
+
// If no cwd provided, fail early: git-based grading requires a working directory
|
|
43
|
+
if (!cwd) {
|
|
44
|
+
return {
|
|
45
|
+
pass: false,
|
|
46
|
+
score: 0,
|
|
47
|
+
reasoning: 'No working directory provided',
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// SECURITY: Validate cwd to prevent command injection
|
|
52
|
+
if (!isValidPath(cwd)) {
|
|
53
|
+
return {
|
|
54
|
+
pass: false,
|
|
55
|
+
score: 0,
|
|
56
|
+
reasoning: 'Invalid working directory path (contains suspicious characters)',
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// Resolve to an absolute, normalized path (paths containing '..' were already rejected above)
|
|
61
|
+
const safeCwd = resolve(cwd)
|
|
62
|
+
|
|
63
|
+
// Check if we're in a git repo
|
|
64
|
+
const isGit = await Bun.$`git -C ${safeCwd} rev-parse --git-dir 2>/dev/null`.nothrow()
|
|
65
|
+
|
|
66
|
+
if (isGit.exitCode !== 0) {
|
|
67
|
+
return {
|
|
68
|
+
pass: false,
|
|
69
|
+
score: 0,
|
|
70
|
+
reasoning: 'Not a git repository',
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Detect what files were created/modified using git
|
|
75
|
+
// Note: This detects untracked (??) and modified (' M' unstaged, 'M ' staged) files.
|
|
76
|
+
// Added (A), renamed (R), deleted (D) files are not included in this example.
|
|
77
|
+
const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
|
|
78
|
+
|
|
79
|
+
const filesCreated = status
|
|
80
|
+
.split('\n')
|
|
81
|
+
.filter((line) => line.startsWith('??')) // ?? = untracked files
|
|
82
|
+
.map((line) => line.slice(3).trim())
|
|
83
|
+
.filter(Boolean)
|
|
84
|
+
|
|
85
|
+
const filesModified = status
|
|
86
|
+
.split('\n')
|
|
87
|
+
.filter((line) => line.startsWith(' M') || line.startsWith('M ')) // M = modified
|
|
88
|
+
.map((line) => line.slice(3).trim())
|
|
89
|
+
.filter(Boolean)
|
|
90
|
+
|
|
91
|
+
const hasChanges = filesCreated.length > 0 || filesModified.length > 0
|
|
92
|
+
|
|
93
|
+
// If hint is provided, check if any changed file matches the hint
|
|
94
|
+
let matchesHint = true
|
|
95
|
+
if (hint) {
|
|
96
|
+
const allChangedFiles = [...filesCreated, ...filesModified]
|
|
97
|
+
matchesHint = allChangedFiles.some((file) => file.toLowerCase().includes(hint.toLowerCase()))
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const pass = hasChanges && matchesHint
|
|
101
|
+
|
|
102
|
+
return {
|
|
103
|
+
pass,
|
|
104
|
+
score: pass ? 1.0 : hasChanges ? 0.5 : 0.0,
|
|
105
|
+
reasoning: pass
|
|
106
|
+
? `Files changed: ${[...filesCreated, ...filesModified].join(', ')}`
|
|
107
|
+
: hasChanges
|
|
108
|
+
? 'File changes do not match hint'
|
|
109
|
+
: 'No file changes detected',
|
|
110
|
+
outcome: {
|
|
111
|
+
filesCreated,
|
|
112
|
+
filesModified,
|
|
113
|
+
type: 'git_status_check',
|
|
114
|
+
},
|
|
115
|
+
}
|
|
116
|
+
}
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for git-based grader fixture.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Verifies that graders can use git to detect environmental outcomes
|
|
6
|
+
* and return structured outcome data.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
|
|
10
|
+
import { mkdtemp, rm } from 'node:fs/promises'
|
|
11
|
+
import { tmpdir } from 'node:os'
|
|
12
|
+
import { join } from 'node:path'
|
|
13
|
+
import type { Grader } from '../schemas.ts'
|
|
14
|
+
|
|
15
|
+
describe('Git-based grader', () => {
|
|
16
|
+
let tempDir: string
|
|
17
|
+
let grader: Grader
|
|
18
|
+
|
|
19
|
+
beforeEach(async () => {
|
|
20
|
+
// Create temporary directory
|
|
21
|
+
tempDir = await mkdtemp(join(tmpdir(), 'git-grader-test-'))
|
|
22
|
+
|
|
23
|
+
// Initialize git repo
|
|
24
|
+
await Bun.$`git -C ${tempDir} init`.quiet()
|
|
25
|
+
await Bun.$`git -C ${tempDir} config user.email "test@test.com"`.quiet()
|
|
26
|
+
await Bun.$`git -C ${tempDir} config user.name "Test User"`.quiet()
|
|
27
|
+
|
|
28
|
+
// Load the git-based grader
|
|
29
|
+
const module = await import('./fixtures/grader-git.ts')
|
|
30
|
+
grader = module.grade
|
|
31
|
+
})
|
|
32
|
+
|
|
33
|
+
afterEach(async () => {
|
|
34
|
+
// Clean up temporary directory
|
|
35
|
+
await rm(tempDir, { recursive: true, force: true })
|
|
36
|
+
})
|
|
37
|
+
|
|
38
|
+
test('detects newly created files', async () => {
|
|
39
|
+
// Create a new file (untracked)
|
|
40
|
+
await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button>Click</button>')
|
|
41
|
+
|
|
42
|
+
const result = await grader({
|
|
43
|
+
input: 'Create a button component',
|
|
44
|
+
output: 'I created Button.tsx',
|
|
45
|
+
hint: 'button',
|
|
46
|
+
cwd: tempDir,
|
|
47
|
+
})
|
|
48
|
+
|
|
49
|
+
expect(result.pass).toBe(true)
|
|
50
|
+
expect(result.score).toBe(1.0)
|
|
51
|
+
expect(result.reasoning).toContain('button.tsx')
|
|
52
|
+
expect(result.outcome).toBeDefined()
|
|
53
|
+
expect(result.outcome?.filesCreated).toEqual(['button.tsx'])
|
|
54
|
+
expect(result.outcome?.type).toBe('git_status_check')
|
|
55
|
+
})
|
|
56
|
+
|
|
57
|
+
test('detects modified files', async () => {
|
|
58
|
+
// Create and commit a file
|
|
59
|
+
await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 1 }')
|
|
60
|
+
await Bun.$`git -C ${tempDir} add config.ts`.quiet()
|
|
61
|
+
await Bun.$`git -C ${tempDir} commit -m "Initial commit"`.quiet()
|
|
62
|
+
|
|
63
|
+
// Modify the file
|
|
64
|
+
await Bun.write(join(tempDir, 'config.ts'), 'export const config = { value: 2 }')
|
|
65
|
+
|
|
66
|
+
const result = await grader({
|
|
67
|
+
input: 'Update config value',
|
|
68
|
+
output: 'I updated the config',
|
|
69
|
+
hint: 'config',
|
|
70
|
+
cwd: tempDir,
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
expect(result.pass).toBe(true)
|
|
74
|
+
expect(result.score).toBe(1.0)
|
|
75
|
+
expect(result.reasoning).toContain('config.ts')
|
|
76
|
+
expect(result.outcome).toBeDefined()
|
|
77
|
+
expect(result.outcome?.filesModified).toEqual(['config.ts'])
|
|
78
|
+
expect(result.outcome?.type).toBe('git_status_check')
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
test('fails when no changes detected', async () => {
|
|
82
|
+
// No files created or modified
|
|
83
|
+
const result = await grader({
|
|
84
|
+
input: 'Create a button component',
|
|
85
|
+
output: 'I created a button component',
|
|
86
|
+
cwd: tempDir,
|
|
87
|
+
})
|
|
88
|
+
|
|
89
|
+
expect(result.pass).toBe(false)
|
|
90
|
+
expect(result.score).toBe(0)
|
|
91
|
+
expect(result.reasoning).toContain('No file changes detected')
|
|
92
|
+
expect(result.outcome).toBeDefined()
|
|
93
|
+
expect(result.outcome?.filesCreated).toEqual([])
|
|
94
|
+
expect(result.outcome?.filesModified).toEqual([])
|
|
95
|
+
})
|
|
96
|
+
|
|
97
|
+
test('partial score when changes do not match hint', async () => {
|
|
98
|
+
// Create a file that does not match the hint
|
|
99
|
+
await Bun.write(join(tempDir, 'unrelated.ts'), 'export const foo = 1')
|
|
100
|
+
|
|
101
|
+
const result = await grader({
|
|
102
|
+
input: 'Create a button component',
|
|
103
|
+
output: 'I created something',
|
|
104
|
+
hint: 'button',
|
|
105
|
+
cwd: tempDir,
|
|
106
|
+
})
|
|
107
|
+
|
|
108
|
+
expect(result.pass).toBe(false)
|
|
109
|
+
expect(result.score).toBe(0.5) // Has changes but doesn't match hint
|
|
110
|
+
expect(result.reasoning).toContain('do not match hint')
|
|
111
|
+
expect(result.outcome?.filesCreated).toEqual(['unrelated.ts'])
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
test('handles missing cwd parameter', async () => {
|
|
115
|
+
const result = await grader({
|
|
116
|
+
input: 'Create a button component',
|
|
117
|
+
output: 'I created a button',
|
|
118
|
+
hint: 'button',
|
|
119
|
+
// cwd not provided
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
expect(result.pass).toBe(false)
|
|
123
|
+
expect(result.score).toBe(0)
|
|
124
|
+
expect(result.reasoning).toBe('No working directory provided')
|
|
125
|
+
})
|
|
126
|
+
|
|
127
|
+
test('handles non-git directory', async () => {
|
|
128
|
+
// Create a non-git temp directory
|
|
129
|
+
const nonGitDir = await mkdtemp(join(tmpdir(), 'non-git-test-'))
|
|
130
|
+
|
|
131
|
+
try {
|
|
132
|
+
const result = await grader({
|
|
133
|
+
input: 'Create a button component',
|
|
134
|
+
output: 'I created a button',
|
|
135
|
+
cwd: nonGitDir,
|
|
136
|
+
})
|
|
137
|
+
|
|
138
|
+
expect(result.pass).toBe(false)
|
|
139
|
+
expect(result.score).toBe(0)
|
|
140
|
+
expect(result.reasoning).toBe('Not a git repository')
|
|
141
|
+
} finally {
|
|
142
|
+
await rm(nonGitDir, { recursive: true, force: true })
|
|
143
|
+
}
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
test('works without hint parameter', async () => {
|
|
147
|
+
// Create a file
|
|
148
|
+
await Bun.write(join(tempDir, 'any-file.ts'), 'export const x = 1')
|
|
149
|
+
|
|
150
|
+
const result = await grader({
|
|
151
|
+
input: 'Create a file',
|
|
152
|
+
output: 'I created a file',
|
|
153
|
+
cwd: tempDir,
|
|
154
|
+
// hint not provided
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
expect(result.pass).toBe(true)
|
|
158
|
+
expect(result.score).toBe(1.0)
|
|
159
|
+
expect(result.reasoning).toContain('any-file.ts')
|
|
160
|
+
expect(result.outcome?.filesCreated).toEqual(['any-file.ts'])
|
|
161
|
+
})
|
|
162
|
+
|
|
163
|
+
test('returns structured outcome for downstream analysis', async () => {
|
|
164
|
+
// Create multiple files
|
|
165
|
+
await Bun.write(join(tempDir, 'button.tsx'), 'export const Button = () => <button />')
|
|
166
|
+
await Bun.write(join(tempDir, 'input.tsx'), 'export const Input = () => <input />')
|
|
167
|
+
|
|
168
|
+
const result = await grader({
|
|
169
|
+
input: 'Create UI components',
|
|
170
|
+
output: 'I created Button and Input components',
|
|
171
|
+
cwd: tempDir,
|
|
172
|
+
})
|
|
173
|
+
|
|
174
|
+
expect(result.outcome).toBeDefined()
|
|
175
|
+
expect(result.outcome?.type).toBe('git_status_check')
|
|
176
|
+
expect(result.outcome?.filesCreated).toBeInstanceOf(Array)
|
|
177
|
+
expect(result.outcome?.filesCreated).toHaveLength(2)
|
|
178
|
+
expect(result.outcome?.filesCreated).toContain('button.tsx')
|
|
179
|
+
expect(result.outcome?.filesCreated).toContain('input.tsx')
|
|
180
|
+
expect(result.outcome?.filesModified).toEqual([])
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
test('rejects path with command injection attempt', async () => {
|
|
184
|
+
const result = await grader({
|
|
185
|
+
input: 'Create a file',
|
|
186
|
+
output: 'Created file',
|
|
187
|
+
cwd: '/tmp/test; rm -rf /', // Command injection attempt
|
|
188
|
+
})
|
|
189
|
+
|
|
190
|
+
expect(result.pass).toBe(false)
|
|
191
|
+
expect(result.score).toBe(0)
|
|
192
|
+
expect(result.reasoning).toContain('Invalid working directory path')
|
|
193
|
+
})
|
|
194
|
+
|
|
195
|
+
test('rejects path with directory traversal', async () => {
|
|
196
|
+
const result = await grader({
|
|
197
|
+
input: 'Create a file',
|
|
198
|
+
output: 'Created file',
|
|
199
|
+
cwd: '/tmp/../../../etc', // Directory traversal
|
|
200
|
+
})
|
|
201
|
+
|
|
202
|
+
expect(result.pass).toBe(false)
|
|
203
|
+
expect(result.score).toBe(0)
|
|
204
|
+
expect(result.reasoning).toContain('Invalid working directory path')
|
|
205
|
+
})
|
|
206
|
+
|
|
207
|
+
test('rejects path with shell metacharacters', async () => {
|
|
208
|
+
const dangerousPaths = ['/tmp/test$(whoami)', '/tmp/test`id`', '/tmp/test|cat', '/tmp/test&echo', '/tmp/test>out']
|
|
209
|
+
|
|
210
|
+
for (const path of dangerousPaths) {
|
|
211
|
+
const result = await grader({
|
|
212
|
+
input: 'Create a file',
|
|
213
|
+
output: 'Created file',
|
|
214
|
+
cwd: path,
|
|
215
|
+
})
|
|
216
|
+
|
|
217
|
+
expect(result.pass).toBe(false)
|
|
218
|
+
expect(result.score).toBe(0)
|
|
219
|
+
expect(result.reasoning).toContain('Invalid working directory path')
|
|
220
|
+
}
|
|
221
|
+
})
|
|
222
|
+
})
|