snapeval 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/engine/grader.js +67 -10
- package/dist/src/engine/grader.js.map +1 -1
- package/package.json +1 -1
- package/plugin.json +1 -1
- package/src/engine/grader.ts +77 -13
|
@@ -37,10 +37,10 @@ ${output}
|
|
|
37
37
|
ASSERTIONS TO GRADE:
|
|
38
38
|
${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
|
|
39
39
|
|
|
40
|
-
Respond with JSON only:
|
|
40
|
+
Respond with valid JSON only. IMPORTANT: Escape any double quotes inside string values with a backslash (\\"). Do not use unescaped double quotes inside evidence text.
|
|
41
41
|
{
|
|
42
42
|
"results": [
|
|
43
|
-
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output
|
|
43
|
+
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output — escape any double quotes>"}
|
|
44
44
|
]
|
|
45
45
|
}`;
|
|
46
46
|
}
|
|
@@ -75,23 +75,83 @@ function runScript(scriptName, outputDir, scriptsDir) {
|
|
|
75
75
|
return { text: `script:${scriptName}`, passed: false, evidence };
|
|
76
76
|
}
|
|
77
77
|
}
|
|
78
|
+
function stripAnsi(text) {
|
|
79
|
+
// eslint-disable-next-line no-control-regex
|
|
80
|
+
return text.replace(/\x1b\[[0-9;]*m/g, '');
|
|
81
|
+
}
|
|
78
82
|
function extractJSON(text) {
|
|
83
|
+
const clean = stripAnsi(text);
|
|
79
84
|
// Try JSON-tagged fence first, then bare fence, then raw text
|
|
80
|
-
const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
|
|
85
|
+
const jsonFence = clean.match(/```json\s*([\s\S]*?)```/);
|
|
81
86
|
if (jsonFence)
|
|
82
87
|
return jsonFence[1].trim();
|
|
83
88
|
// Try parsing raw text as JSON before falling back to any fence
|
|
84
|
-
const trimmed = text.trim();
|
|
89
|
+
const trimmed = clean.trim();
|
|
85
90
|
try {
|
|
86
91
|
JSON.parse(trimmed);
|
|
87
92
|
return trimmed;
|
|
88
93
|
}
|
|
89
94
|
catch { /* not raw JSON */ }
|
|
90
|
-
const anyFence = text.match(/```\s*([\s\S]*?)```/);
|
|
95
|
+
const anyFence = clean.match(/```\s*([\s\S]*?)```/);
|
|
91
96
|
if (anyFence)
|
|
92
97
|
return anyFence[1].trim();
|
|
98
|
+
// Last resort: find first { or [ and extract to its matching close
|
|
99
|
+
const start = trimmed.search(/[{[]/);
|
|
100
|
+
if (start >= 0) {
|
|
101
|
+
const sub = trimmed.slice(start);
|
|
102
|
+
try {
|
|
103
|
+
JSON.parse(sub);
|
|
104
|
+
return sub;
|
|
105
|
+
}
|
|
106
|
+
catch { /* not valid from here */ }
|
|
107
|
+
}
|
|
93
108
|
return trimmed;
|
|
94
109
|
}
|
|
110
|
+
function parseGraderResponse(response) {
|
|
111
|
+
const jsonText = extractJSON(response);
|
|
112
|
+
const parsed = JSON.parse(jsonText);
|
|
113
|
+
const items = Array.isArray(parsed) ? parsed : parsed.results;
|
|
114
|
+
if (!Array.isArray(items))
|
|
115
|
+
throw new Error('No results array in grader response');
|
|
116
|
+
return items.map((r) => ({
|
|
117
|
+
text: r.text ?? '',
|
|
118
|
+
passed: Boolean(r.passed),
|
|
119
|
+
evidence: r.evidence ?? '',
|
|
120
|
+
}));
|
|
121
|
+
}
|
|
122
|
+
async function gradeLLMAssertions(prompt, assertions, runDir, inference) {
|
|
123
|
+
// Step 1: Grade assertions
|
|
124
|
+
const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
|
|
125
|
+
try {
|
|
126
|
+
return parseGraderResponse(response);
|
|
127
|
+
}
|
|
128
|
+
catch (firstErr) {
|
|
129
|
+
// Step 2: Validation loop — send malformed JSON back to LLM to fix
|
|
130
|
+
const debugPath = path.join(runDir, 'grader-debug.txt');
|
|
131
|
+
fs.writeFileSync(debugPath, `--- original response ---\n${response}\n--- parse error ---\n${firstErr}\n`);
|
|
132
|
+
const fixPrompt = `The following JSON is malformed. Fix it so it is valid JSON. Return ONLY the corrected JSON, nothing else.
|
|
133
|
+
|
|
134
|
+
Error: ${firstErr}
|
|
135
|
+
|
|
136
|
+
Malformed JSON:
|
|
137
|
+
${extractJSON(response)}`;
|
|
138
|
+
try {
|
|
139
|
+
const fixedResponse = await inference.chat([{ role: 'user', content: fixPrompt }], { temperature: 0, responseFormat: 'json' });
|
|
140
|
+
const results = parseGraderResponse(fixedResponse);
|
|
141
|
+
fs.appendFileSync(debugPath, `\n--- fix succeeded ---\n${extractJSON(fixedResponse)}\n`);
|
|
142
|
+
return results;
|
|
143
|
+
}
|
|
144
|
+
catch (fixErr) {
|
|
145
|
+
// Step 3: Both attempts failed — fail gracefully
|
|
146
|
+
fs.appendFileSync(debugPath, `\n--- fix also failed ---\n${fixErr}\n`);
|
|
147
|
+
return assertions.map((text) => ({
|
|
148
|
+
text,
|
|
149
|
+
passed: false,
|
|
150
|
+
evidence: `Grading failed: LLM returned malformed JSON that could not be repaired. See ${debugPath}`,
|
|
151
|
+
}));
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
95
155
|
export async function gradeAssertions(assertions, output, runDir, inference, scriptsDir) {
|
|
96
156
|
if (assertions.length === 0)
|
|
97
157
|
return null;
|
|
@@ -112,11 +172,8 @@ export async function gradeAssertions(assertions, output, runDir, inference, scr
|
|
|
112
172
|
}
|
|
113
173
|
if (llmAssertions.length > 0) {
|
|
114
174
|
const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
|
|
115
|
-
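const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
|
|
116
|
-
const parsed = JSON.parse(extractJSON(response));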
|
117
|
-
for (const r of parsed.results) {
|
|
118
|
-
results.push({ text: r.text, passed: Boolean(r.passed), evidence: r.evidence });
|
|
119
|
-
}
|
|
175
|
+
const graded = await gradeLLMAssertions(prompt, llmAssertions, runDir, inference);
|
|
176
|
+
results.push(...graded);
|
|
120
177
|
}
|
|
121
178
|
const passed = results.filter(r => r.passed).length;
|
|
122
179
|
const failed = results.filter(r => !r.passed).length;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEA
AgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,
|
|
1
|
+
{"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEA
AgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,4CAA4C;IAC5C,OAAO,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC7C,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC9B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACzD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;IAC7B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACpD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,mEAAmE;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACrC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACjC,IAAI,CAAC;YAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO,GAAG,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,yBAAyB,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,MAAM,QAAQ,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;IAC9D,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAClF,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;QAC5B,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,EAAE;QAClB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QACzB,QAAQ,EAAE,CAAC,CAAC,QAAQ,IAAI,EAAE;KAC3B,CAAC,CAAC,CAAC;AACN,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,MAAc,EACd,UAAoB,EACpB,MAAc,EACd,SAA2B;IAE3B,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MA
AM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;IAEF,IAAI,CAAC;QACH,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;IAAC,OAAO,QAAQ,EAAE,CAAC;QAClB,mEAAmE;QACnE,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,kBAAkB,CAAC,CAAC;QACxD,EAAE,CAAC,aAAa,CAAC,SAAS,EAAE,8BAA8B,QAAQ,0BAA0B,QAAQ,IAAI,CAAC,CAAC;QAE1G,MAAM,SAAS,GAAG;;SAEb,QAAQ;;;EAGf,WAAW,CAAC,QAAQ,CAAC,EAAE,CAAC;QAEtB,IAAI,CAAC;YACH,MAAM,aAAa,GAAG,MAAM,SAAS,CAAC,IAAI,CACxC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,EACtC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;YACF,MAAM,OAAO,GAAG,mBAAmB,CAAC,aAAa,CAAC,CAAC;YACnD,EAAE,CAAC,cAAc,CAAC,SAAS,EAAE,4BAA4B,WAAW,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YACzF,OAAO,OAAO,CAAC;QACjB,CAAC;QAAC,OAAO,MAAM,EAAE,CAAC;YAChB,iDAAiD;YACjD,EAAE,CAAC,cAAc,CAAC,SAAS,EAAE,8BAA8B,MAAM,IAAI,CAAC,CAAC;YACvE,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC/B,IAAI;gBACJ,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,+EAA+E,SAAS,EAAE;aACrG,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe
,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,MAAM,EAAE,aAAa,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;IAC1B,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
package/package.json
CHANGED
package/plugin.json
CHANGED
package/src/engine/grader.ts
CHANGED
|
@@ -45,10 +45,10 @@ ${output}
|
|
|
45
45
|
ASSERTIONS TO GRADE:
|
|
46
46
|
${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
|
|
47
47
|
|
|
48
|
-
Respond with JSON only:
|
|
48
|
+
Respond with valid JSON only. IMPORTANT: Escape any double quotes inside string values with a backslash (\\"). Do not use unescaped double quotes inside evidence text.
|
|
49
49
|
{
|
|
50
50
|
"results": [
|
|
51
|
-
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output
|
|
51
|
+
{"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output — escape any double quotes>"}
|
|
52
52
|
]
|
|
53
53
|
}`;
|
|
54
54
|
}
|
|
@@ -85,18 +85,88 @@ function runScript(
|
|
|
85
85
|
}
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
+
function stripAnsi(text: string): string {
|
|
89
|
+
// eslint-disable-next-line no-control-regex
|
|
90
|
+
return text.replace(/\x1b\[[0-9;]*m/g, '');
|
|
91
|
+
}
|
|
92
|
+
|
|
88
93
|
function extractJSON(text: string): string {
|
|
94
|
+
const clean = stripAnsi(text);
|
|
89
95
|
// Try JSON-tagged fence first, then bare fence, then raw text
|
|
90
|
-
const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
|
|
96
|
+
const jsonFence = clean.match(/```json\s*([\s\S]*?)```/);
|
|
91
97
|
if (jsonFence) return jsonFence[1].trim();
|
|
92
98
|
// Try parsing raw text as JSON before falling back to any fence
|
|
93
|
-
const trimmed = text.trim();
|
|
99
|
+
const trimmed = clean.trim();
|
|
94
100
|
try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
|
|
95
|
-
const anyFence = text.match(/```\s*([\s\S]*?)```/);
|
|
101
|
+
const anyFence = clean.match(/```\s*([\s\S]*?)```/);
|
|
96
102
|
if (anyFence) return anyFence[1].trim();
|
|
103
|
+
// Last resort: find first { or [ and extract to its matching close
|
|
104
|
+
const start = trimmed.search(/[{[]/);
|
|
105
|
+
if (start >= 0) {
|
|
106
|
+
const sub = trimmed.slice(start);
|
|
107
|
+
try { JSON.parse(sub); return sub; } catch { /* not valid from here */ }
|
|
108
|
+
}
|
|
97
109
|
return trimmed;
|
|
98
110
|
}
|
|
99
111
|
|
|
112
|
+
function parseGraderResponse(response: string): AssertionResult[] {
|
|
113
|
+
const jsonText = extractJSON(response);
|
|
114
|
+
const parsed = JSON.parse(jsonText);
|
|
115
|
+
const items = Array.isArray(parsed) ? parsed : parsed.results;
|
|
116
|
+
if (!Array.isArray(items)) throw new Error('No results array in grader response');
|
|
117
|
+
return items.map((r: any) => ({
|
|
118
|
+
text: r.text ?? '',
|
|
119
|
+
passed: Boolean(r.passed),
|
|
120
|
+
evidence: r.evidence ?? '',
|
|
121
|
+
}));
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
async function gradeLLMAssertions(
|
|
125
|
+
prompt: string,
|
|
126
|
+
assertions: string[],
|
|
127
|
+
runDir: string,
|
|
128
|
+
inference: InferenceAdapter,
|
|
129
|
+
): Promise<AssertionResult[]> {
|
|
130
|
+
// Step 1: Grade assertions
|
|
131
|
+
const response = await inference.chat(
|
|
132
|
+
[{ role: 'user', content: prompt }],
|
|
133
|
+
{ temperature: 0, responseFormat: 'json' }
|
|
134
|
+
);
|
|
135
|
+
|
|
136
|
+
try {
|
|
137
|
+
return parseGraderResponse(response);
|
|
138
|
+
} catch (firstErr) {
|
|
139
|
+
// Step 2: Validation loop — send malformed JSON back to LLM to fix
|
|
140
|
+
const debugPath = path.join(runDir, 'grader-debug.txt');
|
|
141
|
+
fs.writeFileSync(debugPath, `--- original response ---\n${response}\n--- parse error ---\n${firstErr}\n`);
|
|
142
|
+
|
|
143
|
+
const fixPrompt = `The following JSON is malformed. Fix it so it is valid JSON. Return ONLY the corrected JSON, nothing else.
|
|
144
|
+
|
|
145
|
+
Error: ${firstErr}
|
|
146
|
+
|
|
147
|
+
Malformed JSON:
|
|
148
|
+
${extractJSON(response)}`;
|
|
149
|
+
|
|
150
|
+
try {
|
|
151
|
+
const fixedResponse = await inference.chat(
|
|
152
|
+
[{ role: 'user', content: fixPrompt }],
|
|
153
|
+
{ temperature: 0, responseFormat: 'json' }
|
|
154
|
+
);
|
|
155
|
+
const results = parseGraderResponse(fixedResponse);
|
|
156
|
+
fs.appendFileSync(debugPath, `\n--- fix succeeded ---\n${extractJSON(fixedResponse)}\n`);
|
|
157
|
+
return results;
|
|
158
|
+
} catch (fixErr) {
|
|
159
|
+
// Step 3: Both attempts failed — fail gracefully
|
|
160
|
+
fs.appendFileSync(debugPath, `\n--- fix also failed ---\n${fixErr}\n`);
|
|
161
|
+
return assertions.map((text) => ({
|
|
162
|
+
text,
|
|
163
|
+
passed: false,
|
|
164
|
+
evidence: `Grading failed: LLM returned malformed JSON that could not be repaired. See ${debugPath}`,
|
|
165
|
+
}));
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
100
170
|
export async function gradeAssertions(
|
|
101
171
|
assertions: string[],
|
|
102
172
|
output: HarnessRunResult,
|
|
@@ -125,14 +195,8 @@ export async function gradeAssertions(
|
|
|
125
195
|
|
|
126
196
|
if (llmAssertions.length > 0) {
|
|
127
197
|
const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
|
|
128
|
-
const response = await inference.chat(
|
|
129
|
-
[{ role: 'user', content: prompt }],
|
130
|
-
{ temperature: 0, responseFormat: 'json' }
|
|
131
|
-
);
|
|
132
|
-
const parsed = JSON.parse(extractJSON(response));
|
|
133
|
-
for (const r of parsed.results) {
|
|
134
|
-
results.push({ text: r.text, passed: Boolean(r.passed), evidence: r.evidence });
|
|
135
|
-
}
|
|
198
|
+
const graded = await gradeLLMAssertions(prompt, llmAssertions, runDir, inference);
|
|
199
|
+
results.push(...graded);
|
|
136
200
|
}
|
|
137
201
|
|
|
138
202
|
const passed = results.filter(r => r.passed).length;
|