snapeval 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,10 +37,10 @@ ${output}
37
37
  ASSERTIONS TO GRADE:
38
38
  ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
39
39
 
40
- Respond with JSON only:
40
+ Respond with valid JSON only. IMPORTANT: Escape any double quotes inside string values with a backslash (\\"). Do not use unescaped double quotes inside evidence text.
41
41
  {
42
42
  "results": [
43
- {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
43
+ {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output escape any double quotes>"}
44
44
  ]
45
45
  }`;
46
46
  }
@@ -75,23 +75,83 @@ function runScript(scriptName, outputDir, scriptsDir) {
75
75
  return { text: `script:${scriptName}`, passed: false, evidence };
76
76
  }
77
77
  }
78
+ function stripAnsi(text) {
79
+ // eslint-disable-next-line no-control-regex
80
+ return text.replace(/\x1b\[[0-9;]*m/g, '');
81
+ }
78
82
  function extractJSON(text) {
83
+ const clean = stripAnsi(text);
79
84
  // Try JSON-tagged fence first, then bare fence, then raw text
80
- const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
85
+ const jsonFence = clean.match(/```json\s*([\s\S]*?)```/);
81
86
  if (jsonFence)
82
87
  return jsonFence[1].trim();
83
88
  // Try parsing raw text as JSON before falling back to any fence
84
- const trimmed = text.trim();
89
+ const trimmed = clean.trim();
85
90
  try {
86
91
  JSON.parse(trimmed);
87
92
  return trimmed;
88
93
  }
89
94
  catch { /* not raw JSON */ }
90
- const anyFence = text.match(/```\s*([\s\S]*?)```/);
95
+ const anyFence = clean.match(/```\s*([\s\S]*?)```/);
91
96
  if (anyFence)
92
97
  return anyFence[1].trim();
98
+ // Last resort: find first { or [ and extract to its matching close
99
+ const start = trimmed.search(/[{[]/);
100
+ if (start >= 0) {
101
+ const sub = trimmed.slice(start);
102
+ try {
103
+ JSON.parse(sub);
104
+ return sub;
105
+ }
106
+ catch { /* not valid from here */ }
107
+ }
93
108
  return trimmed;
94
109
  }
110
+ function parseGraderResponse(response) {
111
+ const jsonText = extractJSON(response);
112
+ const parsed = JSON.parse(jsonText);
113
+ const items = Array.isArray(parsed) ? parsed : parsed.results;
114
+ if (!Array.isArray(items))
115
+ throw new Error('No results array in grader response');
116
+ return items.map((r) => ({
117
+ text: r.text ?? '',
118
+ passed: Boolean(r.passed),
119
+ evidence: r.evidence ?? '',
120
+ }));
121
+ }
122
+ async function gradeLLMAssertions(prompt, assertions, runDir, inference) {
123
+ // Step 1: Grade assertions
124
+ const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
125
+ try {
126
+ return parseGraderResponse(response);
127
+ }
128
+ catch (firstErr) {
129
+ // Step 2: Validation loop — send malformed JSON back to LLM to fix
130
+ const debugPath = path.join(runDir, 'grader-debug.txt');
131
+ fs.writeFileSync(debugPath, `--- original response ---\n${response}\n--- parse error ---\n${firstErr}\n`);
132
+ const fixPrompt = `The following JSON is malformed. Fix it so it is valid JSON. Return ONLY the corrected JSON, nothing else.
133
+
134
+ Error: ${firstErr}
135
+
136
+ Malformed JSON:
137
+ ${extractJSON(response)}`;
138
+ try {
139
+ const fixedResponse = await inference.chat([{ role: 'user', content: fixPrompt }], { temperature: 0, responseFormat: 'json' });
140
+ const results = parseGraderResponse(fixedResponse);
141
+ fs.appendFileSync(debugPath, `\n--- fix succeeded ---\n${extractJSON(fixedResponse)}\n`);
142
+ return results;
143
+ }
144
+ catch (fixErr) {
145
+ // Step 3: Both attempts failed — fail gracefully
146
+ fs.appendFileSync(debugPath, `\n--- fix also failed ---\n${fixErr}\n`);
147
+ return assertions.map((text) => ({
148
+ text,
149
+ passed: false,
150
+ evidence: `Grading failed: LLM returned malformed JSON that could not be repaired. See ${debugPath}`,
151
+ }));
152
+ }
153
+ }
154
+ }
95
155
  export async function gradeAssertions(assertions, output, runDir, inference, scriptsDir) {
96
156
  if (assertions.length === 0)
97
157
  return null;
@@ -112,11 +172,8 @@ export async function gradeAssertions(assertions, output, runDir, inference, scr
112
172
  }
113
173
  if (llmAssertions.length > 0) {
114
174
  const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
115
- const response = await inference.chat([{ role: 'user', content: prompt }], { temperature: 0, responseFormat: 'json' });
116
- const parsed = JSON.parse(extractJSON(response));
117
- for (const r of parsed.results) {
118
- results.push({ text: r.text, passed: Boolean(r.passed), evidence: r.evidence });
119
- }
175
+ const graded = await gradeLLMAssertions(prompt, llmAssertions, runDir, inference);
176
+ results.push(...graded);
120
177
  }
121
178
  const passed = results.filter(r => r.passed).length;
122
179
  const failed = results.filter(r => !r.passed).length;
@@ -1 +1 @@
1
- {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACxD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACnD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC,CAAC;QACjD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
1
+ {"version":3,"file":"grader.js","sourceRoot":"","sources":["../../../src/engine/grader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAQlD,MAAM,mBAAmB,GAAG,4CAA4C,CAAC;AAEzE,SAAS,eAAe,CAAC,SAAiB,EAAE,MAAc;IACxD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACnD,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IACxB,MAAM,QAAQ,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAG,MAAM,KAAK,QAAQ,CAAC;IACnC,OAAO;QACL,IAAI,EAAE,SAAS;QACf,MAAM;QACN,QAAQ,EAAE,MAAM;YACd,CAAC,CAAC,iBAAiB,QAAQ,GAAG;YAC9B,CAAC,CAAC,cAAc,QAAQ,YAAY,MAAM,GAAG;KAChD,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,UAAoB,EAAE,MAAc,EAAE,KAAe;IAC/E,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,qBAAqB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,OAAO;;;;;;;;;;;;EAYP,MAAM;KACH,QAAQ;;;EAGX,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;EAOrD,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAChB,UAAkB,EAClB,SAAiB,EACjB,UAAkB;IAElB,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,UAAU,CAAC,CAAC;IACrD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,qBAAqB,UAAU,EAAE,EAAE,CAAC;IACtG,CAAC;IACD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,YAAY,CAAC,UAAU,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACpI,MAAM,QAAQ,GAAG,MAAM,IAAI,kBAAkB,UAAU,EAAE,CAAC;QAC1D,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAClE,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,8DAA8D;QAC9D,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC;QAClC,IAAI,QAAgB,CAAC;QACrB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAC1B,QAAQ,GAAG,sBAAsB,UAAU,qCAAqC,UAAU,EAAE,CAAC;QAC/F,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,gEAAgE;YAChE,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,MAAM,EAAE,CAAC;YAClB,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,QAAQ,GAAG,2BAA2B,GAAG,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAClE,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,UAAU,UAAU,EAAE,EAAE,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;IACnE,CAAC;AACH,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,4CAA4C;IAC5C,OAAO,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;AAC7C,CAAC;AAED,SAAS,WAAW,CAAC,IAAY;IAC/B,MAAM,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IAC9B,8DAA8D;IAC9D,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACzD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1C,gEAAgE;IAChE,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;IAC7B,IAAI,CAAC;QAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAAC,OAAO,OAAO,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IACpD,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACxC,mEAAmE;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACrC,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACf,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACjC,IAAI,CAAC;YAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO,GAAG,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,yBAAyB,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,mBAAmB,CAAC,QAAgB;IAC3C,MAAM,QAAQ,GAAG,WAAW,CAAC,QAAQ,CAAC,CAAC;IACvC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACpC,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;IAC9D,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAClF,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC;QAC5B,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,EAAE;QAClB,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QACzB,QAAQ,EAAE,CAAC,CAAC,QAAQ,IAAI,EAAE;KAC3B,CAAC,CAAC,CAAC;AACN,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,MAAc,EACd,UAAoB,EACpB,MAAc,EACd,SAA2B;IAE3B,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,IAAI,CACnC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,EACnC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;IAEF,IAAI,CAAC;QACH,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC;IAAC,OAAO,QAAQ,EAAE,CAAC;QAClB,mEAAmE;QACnE,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,kBAAkB,CAAC,CAAC;QACxD,EAAE,CAAC,aAAa,CAAC,SAAS,EAAE,8BAA8B,QAAQ,0BAA0B,QAAQ,IAAI,CAAC,CAAC;QAE1G,MAAM,SAAS,GAAG;;SAEb,QAAQ;;;EAGf,WAAW,CAAC,QAAQ,CAAC,EAAE,CAAC;QAEtB,IAAI,CAAC;YACH,MAAM,aAAa,GAAG,MAAM,SAAS,CAAC,IAAI,CACxC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,EACtC,EAAE,WAAW,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,EAAE,CAC3C,CAAC;YACF,MAAM,OAAO,GAAG,mBAAmB,CAAC,aAAa,CAAC,CAAC;YACnD,EAAE,CAAC,cAAc,CAAC,SAAS,EAAE,4BAA4B,WAAW,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;YACzF,OAAO,OAAO,CAAC;QACjB,CAAC;QAAC,OAAO,MAAM,EAAE,CAAC;YAChB,iDAAiD;YACjD,EAAE,CAAC,cAAc,CAAC,SAAS,EAAE,8BAA8B,MAAM,IAAI,CAAC,CAAC;YACvE,OAAO,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;gBAC/B,IAAI;gBACJ,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,+EAA+E,SAAS,EAAE;aACrG,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,UAAoB,EACpB,MAAwB,EACxB,MAAc,EACd,SAA2B,EAC3B,UAAmB;IAEnB,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEzC,MAAM,gBAAgB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC;IACzE,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACxG,MAAM,aAAa,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACvG,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,gBAAgB,EAAE,CAAC;QACzC,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC/C,MAAM,GAAG,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,MAAM,SAAS,IAAI,eAAe,EAAE,CAAC;QACxC,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;QACtD,IAAI,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,kBAAkB,CAAC,aAAa,EAAE,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;QAC3E,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,MAAM,EAAE,aAAa,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;IAC1B,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACrD,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC;IAE7B,MAAM,OAAO,GAAkB;QAC7B,iBAAiB,EAAE,OAAO;QAC1B,OAAO,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE;KAC9E,CAAC;IAEF,EAAE,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAEtF,OAAO,OAAO,CAAC;AACjB,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "2.1.1",
3
+ "version": "2.1.2",
4
4
  "description": "Harness-agnostic eval runner for agentskills.io skills",
5
5
  "type": "module",
6
6
  "bin": {
package/plugin.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "snapeval",
3
- "version": "2.1.1",
3
+ "version": "2.1.2",
4
4
  "description": "Semantic snapshot testing for AI skills. Zero assertions. AI-driven. Free inference.",
5
5
  "author": "Matan Tsach",
6
6
  "license": "MIT",
@@ -45,10 +45,10 @@ ${output}
45
45
  ASSERTIONS TO GRADE:
46
46
  ${assertions.map((a, i) => `${i + 1}. ${a}`).join('\n')}
47
47
 
48
- Respond with JSON only:
48
+ Respond with valid JSON only. IMPORTANT: Escape any double quotes inside string values with a backslash (\\"). Do not use unescaped double quotes inside evidence text.
49
49
  {
50
50
  "results": [
51
- {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output supporting your verdict>"}
51
+ {"text": "<assertion text>", "passed": true/false, "evidence": "<quote from output escape any double quotes>"}
52
52
  ]
53
53
  }`;
54
54
  }
@@ -85,18 +85,88 @@ function runScript(
85
85
  }
86
86
  }
87
87
 
88
+ function stripAnsi(text: string): string {
89
+ // eslint-disable-next-line no-control-regex
90
+ return text.replace(/\x1b\[[0-9;]*m/g, '');
91
+ }
92
+
88
93
  function extractJSON(text: string): string {
94
+ const clean = stripAnsi(text);
89
95
  // Try JSON-tagged fence first, then bare fence, then raw text
90
- const jsonFence = text.match(/```json\s*([\s\S]*?)```/);
96
+ const jsonFence = clean.match(/```json\s*([\s\S]*?)```/);
91
97
  if (jsonFence) return jsonFence[1].trim();
92
98
  // Try parsing raw text as JSON before falling back to any fence
93
- const trimmed = text.trim();
99
+ const trimmed = clean.trim();
94
100
  try { JSON.parse(trimmed); return trimmed; } catch { /* not raw JSON */ }
95
- const anyFence = text.match(/```\s*([\s\S]*?)```/);
101
+ const anyFence = clean.match(/```\s*([\s\S]*?)```/);
96
102
  if (anyFence) return anyFence[1].trim();
103
+ // Last resort: find first { or [ and extract to its matching close
104
+ const start = trimmed.search(/[{[]/);
105
+ if (start >= 0) {
106
+ const sub = trimmed.slice(start);
107
+ try { JSON.parse(sub); return sub; } catch { /* not valid from here */ }
108
+ }
97
109
  return trimmed;
98
110
  }
99
111
 
112
+ function parseGraderResponse(response: string): AssertionResult[] {
113
+ const jsonText = extractJSON(response);
114
+ const parsed = JSON.parse(jsonText);
115
+ const items = Array.isArray(parsed) ? parsed : parsed.results;
116
+ if (!Array.isArray(items)) throw new Error('No results array in grader response');
117
+ return items.map((r: any) => ({
118
+ text: r.text ?? '',
119
+ passed: Boolean(r.passed),
120
+ evidence: r.evidence ?? '',
121
+ }));
122
+ }
123
+
124
+ async function gradeLLMAssertions(
125
+ prompt: string,
126
+ assertions: string[],
127
+ runDir: string,
128
+ inference: InferenceAdapter,
129
+ ): Promise<AssertionResult[]> {
130
+ // Step 1: Grade assertions
131
+ const response = await inference.chat(
132
+ [{ role: 'user', content: prompt }],
133
+ { temperature: 0, responseFormat: 'json' }
134
+ );
135
+
136
+ try {
137
+ return parseGraderResponse(response);
138
+ } catch (firstErr) {
139
+ // Step 2: Validation loop — send malformed JSON back to LLM to fix
140
+ const debugPath = path.join(runDir, 'grader-debug.txt');
141
+ fs.writeFileSync(debugPath, `--- original response ---\n${response}\n--- parse error ---\n${firstErr}\n`);
142
+
143
+ const fixPrompt = `The following JSON is malformed. Fix it so it is valid JSON. Return ONLY the corrected JSON, nothing else.
144
+
145
+ Error: ${firstErr}
146
+
147
+ Malformed JSON:
148
+ ${extractJSON(response)}`;
149
+
150
+ try {
151
+ const fixedResponse = await inference.chat(
152
+ [{ role: 'user', content: fixPrompt }],
153
+ { temperature: 0, responseFormat: 'json' }
154
+ );
155
+ const results = parseGraderResponse(fixedResponse);
156
+ fs.appendFileSync(debugPath, `\n--- fix succeeded ---\n${extractJSON(fixedResponse)}\n`);
157
+ return results;
158
+ } catch (fixErr) {
159
+ // Step 3: Both attempts failed — fail gracefully
160
+ fs.appendFileSync(debugPath, `\n--- fix also failed ---\n${fixErr}\n`);
161
+ return assertions.map((text) => ({
162
+ text,
163
+ passed: false,
164
+ evidence: `Grading failed: LLM returned malformed JSON that could not be repaired. See ${debugPath}`,
165
+ }));
166
+ }
167
+ }
168
+ }
169
+
100
170
  export async function gradeAssertions(
101
171
  assertions: string[],
102
172
  output: HarnessRunResult,
@@ -125,14 +195,8 @@ export async function gradeAssertions(
125
195
 
126
196
  if (llmAssertions.length > 0) {
127
197
  const prompt = buildGradingPrompt(llmAssertions, output.raw, output.files);
128
- const response = await inference.chat(
129
- [{ role: 'user', content: prompt }],
130
- { temperature: 0, responseFormat: 'json' }
131
- );
132
- const parsed = JSON.parse(extractJSON(response));
133
- for (const r of parsed.results) {
134
- results.push({ text: r.text, passed: Boolean(r.passed), evidence: r.evidence });
135
- }
198
+ const graded = await gradeLLMAssertions(prompt, llmAssertions, runDir, inference);
199
+ results.push(...graded);
136
200
  }
137
201
 
138
202
  const passed = results.filter(r => r.passed).length;