@artemiskit/core 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +116 -0
- package/dist/adapters/types.d.ts +8 -1
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +39 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/index.d.ts +5 -0
- package/dist/cost/index.d.ts.map +1 -0
- package/dist/cost/pricing.d.ts +67 -0
- package/dist/cost/pricing.d.ts.map +1 -0
- package/dist/evaluators/combined.d.ts +10 -0
- package/dist/evaluators/combined.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +4 -0
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/inline.d.ts +22 -0
- package/dist/evaluators/inline.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/evaluators/not-contains.d.ts +10 -0
- package/dist/evaluators/not-contains.d.ts.map +1 -0
- package/dist/evaluators/similarity.d.ts +16 -0
- package/dist/evaluators/similarity.d.ts.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13212 -12018
- package/dist/scenario/discovery.d.ts +72 -0
- package/dist/scenario/discovery.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +1 -0
- package/dist/scenario/index.d.ts.map +1 -1
- package/dist/scenario/schema.d.ts +1253 -9
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +62 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +8 -1
- package/src/artifacts/types.ts +39 -0
- package/src/cost/index.ts +14 -0
- package/src/cost/pricing.ts +450 -0
- package/src/evaluators/combined.test.ts +172 -0
- package/src/evaluators/combined.ts +95 -0
- package/src/evaluators/index.ts +12 -0
- package/src/evaluators/inline.test.ts +409 -0
- package/src/evaluators/inline.ts +393 -0
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/not-contains.test.ts +105 -0
- package/src/evaluators/not-contains.ts +45 -0
- package/src/evaluators/similarity.test.ts +333 -0
- package/src/evaluators/similarity.ts +258 -0
- package/src/index.ts +3 -0
- package/src/scenario/discovery.test.ts +153 -0
- package/src/scenario/discovery.ts +277 -0
- package/src/scenario/index.ts +1 -0
- package/src/scenario/schema.ts +47 -2
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +162 -2
- package/src/storage/types.ts +73 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inline custom matcher evaluator
|
|
3
|
+
* Allows users to define simple matching expressions directly in YAML
|
|
4
|
+
*
|
|
5
|
+
* Supports a safe subset of JavaScript-like expressions:
|
|
6
|
+
* - String methods: includes, startsWith, endsWith, length, toLowerCase, toUpperCase, trim
|
|
7
|
+
* - Comparisons: ==, !=, >, <, >=, <=
|
|
8
|
+
* - Logical operators: &&, ||, !
|
|
9
|
+
* - Regex matching: response.match(/pattern/)
|
|
10
|
+
* - JSON parsing: JSON.parse(response)
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import type { Expected } from '../scenario/schema';
|
|
14
|
+
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
15
|
+
|
|
16
|
+
/**
 * Split a string on a literal delimiter, but only where the delimiter occurs
 * outside of single- or double-quoted segments.
 *
 * A quote character preceded by an odd number of backslashes is treated as
 * escaped and neither opens nor closes a quoted segment. An even number of
 * backslashes (for example `\\"`) means the backslashes escape each other and
 * the quote is a real quote.
 *
 * @param str - Text to split.
 * @param delimiter - Literal separator. An empty delimiter yields the input
 *   unchanged as a single part (it would otherwise match everywhere without
 *   consuming any input).
 * @returns The pieces between delimiters, in order. Always contains at least
 *   one element; pieces are returned untrimmed.
 */
function splitOutsideQuotes(str: string, delimiter: string): string[] {
  // Guard: an empty delimiter matches at every index and never advances.
  if (delimiter.length === 0) {
    return [str];
  }

  const parts: string[] = [];
  let current = '';
  let inSingleQuote = false;
  let inDoubleQuote = false;
  let i = 0;

  while (i < str.length) {
    const char = str[i];

    // Delimiters only count while we are not inside any quoted segment.
    if (!inSingleQuote && !inDoubleQuote && str.startsWith(delimiter, i)) {
      parts.push(current);
      current = '';
      i += delimiter.length;
      continue;
    }

    if (char === "'" || char === '"') {
      // Count the run of backslashes directly before the quote: only an odd
      // run actually escapes it.
      let backslashes = 0;
      for (let j = i - 1; j >= 0 && str[j] === '\\'; j--) {
        backslashes++;
      }
      const escaped = backslashes % 2 === 1;

      if (!escaped) {
        if (char === "'" && !inDoubleQuote) {
          inSingleQuote = !inSingleQuote;
        } else if (char === '"' && !inSingleQuote) {
          inDoubleQuote = !inDoubleQuote;
        }
      }
    }

    current += char;
    i++;
  }

  parts.push(current);
  return parts;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Values an inline expression may refer to while it is being evaluated.
 * An instance is built once per response by `InlineEvaluator.evaluate`.
 */
interface ExpressionContext {
  /** The raw response text being checked. */
  response: string;
  /** Reference value taken from the expectation; `undefined` when none is set. */
  expected: string | undefined;
  /** Character count of `response` (`response.length`). */
  length: number;
  /** `response` split on whitespace runs, with empty tokens dropped. */
  words: string[];
  /** `response` split on `\n`, with blank (whitespace-only) lines dropped. */
  lines: string[];
  /** Result of `JSON.parse(response)`, or `null` when parsing failed. */
  json: unknown | null;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Evaluate a safe inline expression against a response context.
 *
 * No code is ever executed: the expression is matched against a fixed set of
 * recognised patterns (see the pattern comments in `evaluatePattern`).
 * Patterns may be combined with `&&` and `||`; as in JavaScript, `&&` binds
 * tighter than `||`, so `a || b && c` means `a || (b && c)`. Parentheses are
 * not supported.
 *
 * Scoring: a single pattern scores 1 when it holds and 0 otherwise, an `&&`
 * group scores the average of its operands, and an `||` group scores the
 * maximum of its operands.
 *
 * @param expression - Expression text; surrounding whitespace is ignored.
 * @param context - Values the expression may refer to.
 * @returns Whether the expression holds, plus its score in the range 0..1.
 * @throws Error whose message starts with `Expression evaluation failed: `
 *   when the expression matches no supported pattern or contains an invalid
 *   regular expression. The prefix is applied once, regardless of nesting.
 */
function evaluateExpression(
  expression: string,
  context: ExpressionContext
): { result: boolean; score: number } {
  try {
    return evaluateCompound(expression.trim(), context);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Expression evaluation failed: ${message}`);
  }
}

/**
 * Evaluate an expression that may contain `||` / `&&` outside of quotes.
 * Splits on `||` first so that `&&` groups are evaluated as units.
 */
function evaluateCompound(
  expr: string,
  context: ExpressionContext
): { result: boolean; score: number } {
  // Lowest precedence first: the operands of `||` may themselves hold `&&`.
  const orParts = splitOutsideQuotes(expr, '||');
  if (orParts.length > 1) {
    const results = orParts.map((part) => evaluateCompound(part.trim(), context));
    return {
      result: results.some((r) => r.result),
      score: Math.max(...results.map((r) => r.score)),
    };
  }

  const andParts = splitOutsideQuotes(expr, '&&');
  if (andParts.length > 1) {
    const results = andParts.map((part) => evaluateCompound(part.trim(), context));
    return {
      result: results.every((r) => r.result),
      score: results.reduce((sum, r) => sum + r.score, 0) / results.length,
    };
  }

  return evaluatePattern(expr, context);
}

/** Convert a boolean outcome into the `{ result, score }` pair (1 or 0). */
function toOutcome(result: boolean): { result: boolean; score: number } {
  return { result, score: result ? 1 : 0 };
}

/** Apply a comparison operator to two numbers; unknown operators yield false. */
function compareNumbers(left: number, op: string, right: number): boolean {
  switch (op) {
    case '>':
      return left > right;
    case '<':
      return left < right;
    case '>=':
      return left >= right;
    case '<=':
      return left <= right;
    case '==':
      return left === right;
    case '!=':
      return left !== right;
    default:
      return false;
  }
}

/**
 * Evaluate a single (non-compound) pattern.
 * Throws `Unsupported expression pattern: …` when nothing matches.
 */
function evaluatePattern(
  expr: string,
  context: ExpressionContext
): { result: boolean; score: number } {
  const { response, expected, json } = context;

  // Pattern: response.includes("text")
  const includesMatch = expr.match(/^response\.includes\s*\(\s*["'](.+?)["']\s*\)$/);
  if (includesMatch) {
    return toOutcome(response.includes(includesMatch[1]));
  }

  // Pattern: !response.includes("text")
  const notIncludesMatch = expr.match(/^!\s*response\.includes\s*\(\s*["'](.+?)["']\s*\)$/);
  if (notIncludesMatch) {
    return toOutcome(!response.includes(notIncludesMatch[1]));
  }

  // Pattern: response.startsWith("text")
  const startsWithMatch = expr.match(/^response\.startsWith\s*\(\s*["'](.+?)["']\s*\)$/);
  if (startsWithMatch) {
    return toOutcome(response.startsWith(startsWithMatch[1]));
  }

  // Pattern: response.endsWith("text")
  const endsWithMatch = expr.match(/^response\.endsWith\s*\(\s*["'](.+?)["']\s*\)$/);
  if (endsWithMatch) {
    return toOutcome(response.endsWith(endsWithMatch[1]));
  }

  // Pattern: response.toLowerCase().includes("text") — the needle is
  // lower-cased too, so the check is fully case-insensitive.
  const lowerIncludesMatch = expr.match(
    /^response\.toLowerCase\s*\(\s*\)\.includes\s*\(\s*["'](.+?)["']\s*\)$/
  );
  if (lowerIncludesMatch) {
    return toOutcome(response.toLowerCase().includes(lowerIncludesMatch[1].toLowerCase()));
  }

  // Pattern: response.match(/regex/flags) — an invalid regex throws here.
  const regexMatch = expr.match(/^response\.match\s*\(\s*\/(.+?)\/([gimsuy]*)\s*\)$/);
  if (regexMatch) {
    return toOutcome(new RegExp(regexMatch[1], regexMatch[2]).test(response));
  }

  // Pattern: !response.match(/regex/flags)
  const notRegexMatch = expr.match(/^!\s*response\.match\s*\(\s*\/(.+?)\/([gimsuy]*)\s*\)$/);
  if (notRegexMatch) {
    return toOutcome(!new RegExp(notRegexMatch[1], notRegexMatch[2]).test(response));
  }

  // Patterns: length OP N, words.length OP N, lines.length OP N
  const sizeMatch = expr.match(
    /^(length|words\.length|lines\.length)\s*(>=|<=|>|<|==|!=)\s*(\d+)$/
  );
  if (sizeMatch) {
    const sizes: Record<string, number> = {
      length: context.length,
      'words.length': context.words.length,
      'lines.length': context.lines.length,
    };
    const threshold = Number.parseInt(sizeMatch[3], 10);
    return toOutcome(compareNumbers(sizes[sizeMatch[1]], sizeMatch[2], threshold));
  }

  // Pattern: json.path OP "string" | number | true | false | null
  const jsonFieldMatch = expr.match(
    /^json\.(\w+(?:\.\w+)*)\s*(>=|<=|>|<|==|!=)\s*(?:["'](.+?)["']|(\d+(?:\.\d+)?)|(true|false|null))$/
  );
  if (jsonFieldMatch && json !== null) {
    const [, path, op, quoted, numeric, literal] = jsonFieldMatch;

    // Decide the comparison value from the capture group that matched, so a
    // quoted "true"/"false"/"null" stays a string instead of being coerced.
    let compareValue: unknown;
    if (quoted !== undefined) {
      compareValue = quoted;
    } else if (numeric !== undefined) {
      compareValue = Number.parseFloat(numeric);
    } else {
      compareValue = literal === 'true' ? true : literal === 'false' ? false : null;
    }

    // Walk the dotted path through own properties only (never the prototype
    // chain); any missing step makes the field value undefined.
    let fieldValue: unknown = json;
    for (const key of path.split('.')) {
      if (
        typeof fieldValue === 'object' &&
        fieldValue !== null &&
        Object.prototype.hasOwnProperty.call(fieldValue, key)
      ) {
        fieldValue = (fieldValue as Record<string, unknown>)[key];
      } else {
        fieldValue = undefined;
        break;
      }
    }

    if (op === '==') {
      return toOutcome(fieldValue === compareValue);
    }
    if (op === '!=') {
      return toOutcome(fieldValue !== compareValue);
    }
    // Ordering comparisons only apply when the field itself is a number.
    return toOutcome(
      typeof fieldValue === 'number' && compareNumbers(fieldValue, op, compareValue as number)
    );
  }

  // Pattern: json != null (check if the response parsed as JSON)
  if (expr === 'json != null') {
    return toOutcome(json !== null);
  }

  // Pattern: response == expected
  if (expr === 'response == expected' && expected !== undefined) {
    return toOutcome(response === expected);
  }

  // Pattern: response.trim() == expected
  if (expr === 'response.trim() == expected' && expected !== undefined) {
    return toOutcome(response.trim() === expected);
  }

  // Unknown expression pattern
  throw new Error(`Unsupported expression pattern: ${expr}`);
}
|
|
310
|
+
|
|
311
|
+
export class InlineEvaluator implements Evaluator {
  readonly type = 'inline';

  /**
   * Check a response against an inline expression expectation.
   *
   * Builds the expression context from the response (length, words, non-blank
   * lines and, when the response parses, its JSON value) and evaluates
   * `expected.expression` against it. Evaluation errors are not rethrown;
   * they are reported as a failed result whose reason carries the message.
   *
   * @param response - Response text to check.
   * @param expected - Expectation; must have `type === 'inline'`.
   * @param _context - Unused.
   * @returns The evaluation result with pass flag, score, reason and details.
   * @throws Error when `expected.type` is not `'inline'`.
   */
  async evaluate(
    response: string,
    expected: Expected,
    _context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'inline') {
      throw new Error('Invalid expected type for InlineEvaluator');
    }

    const { expression, value: expectedValue } = expected;

    // A response that is not valid JSON simply leaves `parsedJson` as null.
    let parsedJson: unknown = null;
    try {
      parsedJson = JSON.parse(response);
    } catch {
      // Not valid JSON, that's fine
    }

    const wordList = response.split(/\s+/).filter((word) => word.length > 0);
    const lineList = response.split('\n').filter((line) => line.trim().length > 0);

    const exprContext: ExpressionContext = {
      response,
      expected: expectedValue,
      length: response.length,
      words: wordList,
      lines: lineList,
      json: parsedJson,
    };

    let outcome: { result: boolean; score: number };
    try {
      outcome = evaluateExpression(expression, exprContext);
    } catch (error) {
      const message = (error as Error).message;
      return {
        passed: false,
        score: 0,
        reason: `Inline matcher error: ${message}`,
        details: {
          expression,
          error: message,
        },
      };
    }

    const verdict = outcome.result
      ? `Expression passed: ${expression}`
      : `Expression failed: ${expression}`;

    return {
      passed: outcome.result,
      score: outcome.score,
      reason: verdict,
      details: {
        expression,
        expectedValue,
        responseLength: response.length,
        wordCount: wordList.length,
        lineCount: lineList.length,
        isValidJson: parsedJson !== null,
      },
    };
  }
}
|
|
372
|
+
|
|
373
|
+
/**
 * Human-readable list of the expression patterns the inline evaluator
 * understands. Intended for documentation and help output only; the
 * evaluator does not read this list when matching expressions.
 */
export const SUPPORTED_EXPRESSIONS = [
  'response.includes("text")',
  '!response.includes("text")',
  'response.startsWith("text")',
  'response.endsWith("text")',
  'response.toLowerCase().includes("text")',
  'response.match(/regex/)',
  '!response.match(/regex/)',
  'length > N / length < N / length >= N / length <= N / length == N',
  'words.length > N',
  'lines.length > N',
  'json.field == "value"',
  'json.field > N',
  'json != null',
  'response == expected',
  'response.trim() == expected',
  'expression1 && expression2',
  'expression1 || expression2',
];
|
|
@@ -5,22 +5,27 @@
|
|
|
5
5
|
import type { Expected } from '../scenario/schema';
|
|
6
6
|
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
7
7
|
|
|
8
|
-
const GRADER_PROMPT = `You are
|
|
8
|
+
const GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
RUBRIC:
|
|
11
11
|
{{rubric}}
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
RESPONSE TO EVALUATE:
|
|
14
14
|
{{response}}
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
Score the response from 0.0 to 1.0 based on the rubric.
|
|
18
|
-
Be objective and consistent in your scoring.
|
|
16
|
+
TASK: Score the response from 0.0 to 1.0 based on the rubric above.
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
{"score":
|
|
18
|
+
OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
|
|
19
|
+
{"score":0.0,"reason":"explanation"}
|
|
22
20
|
|
|
23
|
-
|
|
21
|
+
RULES:
|
|
22
|
+
- Output ONLY valid JSON, no markdown, no code blocks, no extra text
|
|
23
|
+
- "score" must be a number between 0.0 and 1.0
|
|
24
|
+
- "reason" must be a brief string explaining the score
|
|
25
|
+
- Do NOT wrap in \`\`\`json or any formatting
|
|
26
|
+
- Your entire response must be parseable by JSON.parse()
|
|
27
|
+
|
|
28
|
+
JSON OUTPUT:`;
|
|
24
29
|
|
|
25
30
|
export class LLMGraderEvaluator implements Evaluator {
|
|
26
31
|
readonly type = 'llm_grader';
|
|
@@ -44,11 +49,13 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
44
49
|
);
|
|
45
50
|
|
|
46
51
|
try {
|
|
52
|
+
// Note: Some models (like o1, o3, gpt-5-mini, reasoning models) only support temperature=1
|
|
53
|
+
// We omit temperature to let the API use its default for maximum compatibility
|
|
54
|
+
// Use higher maxTokens for reasoning models which use tokens for internal "thinking"
|
|
47
55
|
const result = await context.client.generate({
|
|
48
56
|
prompt,
|
|
49
57
|
model: expected.model,
|
|
50
|
-
|
|
51
|
-
maxTokens: 200,
|
|
58
|
+
maxTokens: 1000,
|
|
52
59
|
});
|
|
53
60
|
|
|
54
61
|
const parsed = this.parseGraderResponse(result.text);
|
|
@@ -76,9 +83,25 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
76
83
|
}
|
|
77
84
|
|
|
78
85
|
private parseGraderResponse(text: string): { score: number; reason?: string } {
|
|
79
|
-
|
|
86
|
+
// Clean up the response - remove markdown code blocks if present
|
|
87
|
+
const cleanedText = text
|
|
88
|
+
.replace(/```json\s*/gi, '')
|
|
89
|
+
.replace(/```\s*/g, '')
|
|
90
|
+
.trim();
|
|
91
|
+
|
|
92
|
+
// Try to find JSON object in the response
|
|
93
|
+
const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
|
|
94
|
+
|
|
80
95
|
if (!jsonMatch) {
|
|
81
|
-
|
|
96
|
+
// Fallback: try to extract score from plain text patterns like "Score: 0.8" or "0.85"
|
|
97
|
+
const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
|
|
98
|
+
if (scoreMatch) {
|
|
99
|
+
const score = Number(scoreMatch[1]);
|
|
100
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
101
|
+
return { score, reason: cleanedText };
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
|
|
82
105
|
}
|
|
83
106
|
|
|
84
107
|
try {
|
|
@@ -94,6 +117,15 @@ export class LLMGraderEvaluator implements Evaluator {
|
|
|
94
117
|
reason: parsed.reason,
|
|
95
118
|
};
|
|
96
119
|
} catch (error) {
|
|
120
|
+
// If JSON parsing fails, try extracting score directly
|
|
121
|
+
const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
|
|
122
|
+
if (scoreMatch) {
|
|
123
|
+
const score = Number(scoreMatch[1]);
|
|
124
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
125
|
+
const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
|
|
126
|
+
return { score, reason: reasonMatch?.[1] };
|
|
127
|
+
}
|
|
128
|
+
}
|
|
97
129
|
throw new Error(`Failed to parse grader response: ${(error as Error).message}`);
|
|
98
130
|
}
|
|
99
131
|
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for NotContainsEvaluator
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test';
|
|
6
|
+
import { NotContainsEvaluator } from './not-contains';
|
|
7
|
+
|
|
8
|
+
describe('NotContainsEvaluator', () => {
  const evaluator = new NotContainsEvaluator();

  // The `not_contains` variant of whatever expectation type the evaluator accepts.
  type NotContainsExpected = Extract<
    Parameters<NotContainsEvaluator['evaluate']>[1],
    { type: 'not_contains' }
  >;

  // Forbidden list shared by most cases.
  const PRIMARY_COLORS = ['red', 'blue', 'yellow'];

  // Run the evaluator with a `not_contains` expectation built from the arguments.
  const run = (text: string, values: string[], mode: NotContainsExpected['mode']) =>
    evaluator.evaluate(text, { type: 'not_contains', values, mode });

  test('passes when no forbidden values are present (mode: all)', async () => {
    const { passed, score } = await run('The colors are green and purple.', PRIMARY_COLORS, 'all');
    expect(passed).toBe(true);
    expect(score).toBe(1);
  });

  test('fails when any forbidden value is present (mode: all)', async () => {
    const { passed, score } = await run('The colors are red and green.', PRIMARY_COLORS, 'all');
    expect(passed).toBe(false);
    expect(score).toBeCloseTo(0.67, 1);
  });

  test('fails when all forbidden values are present (mode: all)', async () => {
    const { passed, score } = await run(
      'The colors are red, blue, and yellow.',
      PRIMARY_COLORS,
      'all'
    );
    expect(passed).toBe(false);
    expect(score).toBe(0);
  });

  test('passes when at least one forbidden value is absent (mode: any)', async () => {
    const { passed, score } = await run('The colors are red and blue.', PRIMARY_COLORS, 'any');
    expect(passed).toBe(true);
    expect(score).toBeCloseTo(0.33, 1);
  });

  test('fails when all forbidden values are present (mode: any)', async () => {
    const { passed, score } = await run(
      'I have red, blue, and yellow paint.',
      PRIMARY_COLORS,
      'any'
    );
    expect(passed).toBe(false);
    expect(score).toBe(0);
  });

  test('is case insensitive', async () => {
    const { passed } = await run('GREEN PURPLE ORANGE', PRIMARY_COLORS, 'all');
    expect(passed).toBe(true);
  });

  test('detects values case insensitively', async () => {
    const { passed } = await run('I have RED paint', ['red'], 'all');
    expect(passed).toBe(false);
  });

  test('handles empty values array', async () => {
    const { passed, score } = await run('Any response text here', [], 'all');
    expect(passed).toBe(true);
    expect(score).toBe(1);
  });

  test('provides detailed results in details field', async () => {
    const { details } = await run('The answer is red and green.', ['red', 'blue'], 'all');
    expect(details).toEqual({
      mode: 'all',
      results: [
        { value: 'red', found: true },
        { value: 'blue', found: false },
      ],
      notFoundCount: 1,
      totalCount: 2,
    });
  });
});
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Not Contains evaluator - checks if response does NOT contain specific values
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { Expected } from '../scenario/schema';
|
|
6
|
+
import type { Evaluator, EvaluatorResult } from './types';
|
|
7
|
+
|
|
8
|
+
export class NotContainsEvaluator implements Evaluator {
  readonly type = 'not_contains';

  /**
   * Check that a response does NOT contain the forbidden values.
   *
   * Matching is a case-insensitive substring test. In mode `'all'` every
   * value must be absent; in any other mode at least one value must be
   * absent. An empty `values` list passes in either mode, since there is
   * nothing that could be violated. The score is the fraction of values that
   * are absent (1 for an empty list).
   *
   * @param response - Response text to check.
   * @param expected - Expectation; must have `type === 'not_contains'`.
   * @returns Pass flag, score, reason and per-value details.
   * @throws Error when `expected.type` is not `'not_contains'`.
   */
  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'not_contains') {
      throw new Error('Invalid expected type for NotContainsEvaluator');
    }

    const normalizedResponse = response.toLowerCase();
    const results = expected.values.map((value) => ({
      value,
      found: normalizedResponse.includes(value.toLowerCase()),
    }));

    const totalCount = results.length;
    const notFoundCount = results.filter((r) => !r.found).length;

    // mode: 'all' means ALL values must be absent
    // mode: 'any' means AT LEAST ONE value must be absent
    // An empty list passes either way, keeping `passed` consistent with the
    // score of 1 reported for it.
    const passed =
      totalCount === 0 ||
      (expected.mode === 'all' ? notFoundCount === totalCount : notFoundCount > 0);

    const score = totalCount > 0 ? notFoundCount / totalCount : 1;

    return {
      passed,
      score,
      reason: passed
        ? `Correctly absent: ${notFoundCount}/${totalCount} values (mode: ${expected.mode})`
        : `Found forbidden values (mode: ${expected.mode})`,
      details: {
        mode: expected.mode,
        results,
        notFoundCount,
        totalCount,
      },
    };
  }
}
|