@artemiskit/core 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CHANGELOG.md +116 -0
  2. package/dist/adapters/types.d.ts +8 -1
  3. package/dist/adapters/types.d.ts.map +1 -1
  4. package/dist/artifacts/types.d.ts +39 -0
  5. package/dist/artifacts/types.d.ts.map +1 -1
  6. package/dist/cost/index.d.ts +5 -0
  7. package/dist/cost/index.d.ts.map +1 -0
  8. package/dist/cost/pricing.d.ts +67 -0
  9. package/dist/cost/pricing.d.ts.map +1 -0
  10. package/dist/evaluators/combined.d.ts +10 -0
  11. package/dist/evaluators/combined.d.ts.map +1 -0
  12. package/dist/evaluators/index.d.ts +4 -0
  13. package/dist/evaluators/index.d.ts.map +1 -1
  14. package/dist/evaluators/inline.d.ts +22 -0
  15. package/dist/evaluators/inline.d.ts.map +1 -0
  16. package/dist/evaluators/llm-grader.d.ts.map +1 -1
  17. package/dist/evaluators/not-contains.d.ts +10 -0
  18. package/dist/evaluators/not-contains.d.ts.map +1 -0
  19. package/dist/evaluators/similarity.d.ts +16 -0
  20. package/dist/evaluators/similarity.d.ts.map +1 -0
  21. package/dist/index.d.ts +1 -0
  22. package/dist/index.d.ts.map +1 -1
  23. package/dist/index.js +13212 -12018
  24. package/dist/scenario/discovery.d.ts +72 -0
  25. package/dist/scenario/discovery.d.ts.map +1 -0
  26. package/dist/scenario/index.d.ts +1 -0
  27. package/dist/scenario/index.d.ts.map +1 -1
  28. package/dist/scenario/schema.d.ts +1253 -9
  29. package/dist/scenario/schema.d.ts.map +1 -1
  30. package/dist/storage/local.d.ts +44 -2
  31. package/dist/storage/local.d.ts.map +1 -1
  32. package/dist/storage/types.d.ts +62 -0
  33. package/dist/storage/types.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/adapters/types.ts +8 -1
  36. package/src/artifacts/types.ts +39 -0
  37. package/src/cost/index.ts +14 -0
  38. package/src/cost/pricing.ts +450 -0
  39. package/src/evaluators/combined.test.ts +172 -0
  40. package/src/evaluators/combined.ts +95 -0
  41. package/src/evaluators/index.ts +12 -0
  42. package/src/evaluators/inline.test.ts +409 -0
  43. package/src/evaluators/inline.ts +393 -0
  44. package/src/evaluators/llm-grader.ts +45 -13
  45. package/src/evaluators/not-contains.test.ts +105 -0
  46. package/src/evaluators/not-contains.ts +45 -0
  47. package/src/evaluators/similarity.test.ts +333 -0
  48. package/src/evaluators/similarity.ts +258 -0
  49. package/src/index.ts +3 -0
  50. package/src/scenario/discovery.test.ts +153 -0
  51. package/src/scenario/discovery.ts +277 -0
  52. package/src/scenario/index.ts +1 -0
  53. package/src/scenario/schema.ts +47 -2
  54. package/src/storage/local.test.ts +243 -0
  55. package/src/storage/local.ts +162 -2
  56. package/src/storage/types.ts +73 -0
@@ -0,0 +1,393 @@
1
+ /**
2
+ * Inline custom matcher evaluator
3
+ * Allows users to define simple matching expressions directly in YAML
4
+ *
5
+ * Supports a safe subset of JavaScript-like expressions:
6
+ * - String methods: includes, startsWith, endsWith, length, toLowerCase, toUpperCase, trim
7
+ * - Comparisons: ==, !=, >, <, >=, <=
8
+ * - Logical operators: &&, ||, !
9
+ * - Regex matching: response.match(/pattern/)
10
+ * - JSON parsing: JSON.parse(response)
11
+ */
12
+
13
+ import type { Expected } from '../scenario/schema';
14
+ import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
15
+
16
/**
 * Split a string on a delimiter, but only where the delimiter occurs outside
 * of single- or double-quoted sections.
 *
 * A quote character is treated as escaped (it neither opens nor closes a
 * quoted section) when it is preceded by an odd number of backslashes. An even
 * number of backslashes escape each other, so the quote that follows is live.
 *
 * @param str - Text to split.
 * @param delimiter - Literal separator to split on.
 * @returns The untrimmed pieces between delimiters; always at least one
 *   element. An empty delimiter yields `[str]`.
 */
function splitOutsideQuotes(str: string, delimiter: string): string[] {
  // An empty delimiter would match at every index without advancing the cursor.
  if (delimiter.length === 0) {
    return [str];
  }

  const parts: string[] = [];
  let current = '';
  let inSingleQuote = false;
  let inDoubleQuote = false;
  let i = 0;

  while (i < str.length) {
    // Check for the delimiter first so it is never copied into a part.
    if (!inSingleQuote && !inDoubleQuote && str.startsWith(delimiter, i)) {
      parts.push(current);
      current = '';
      i += delimiter.length;
      continue;
    }

    const char = str[i];
    if (char === "'" || char === '"') {
      // Count the run of backslashes immediately before the quote; only an odd
      // run actually escapes the quote character itself.
      let backslashes = 0;
      for (let j = i - 1; j >= 0 && str[j] === '\\'; j--) {
        backslashes++;
      }

      if (backslashes % 2 === 0) {
        if (char === "'" && !inDoubleQuote) {
          inSingleQuote = !inSingleQuote;
        } else if (char === '"' && !inSingleQuote) {
          inDoubleQuote = !inDoubleQuote;
        }
      }
    }

    current += char;
    i++;
  }

  parts.push(current);
  return parts;
}
51
+
52
/**
 * Safe expression context with allowed variables and functions
 */
interface ExpressionContext {
  /** Raw response text under evaluation. */
  response: string;
  /** Optional value that `response == expected` style expressions compare against. */
  expected: string | undefined;
  /** Numeric value exposed to expressions as `length`. */
  length: number;
  /** Collection whose size is exposed as `words.length`. */
  words: string[];
  /** Collection whose size is exposed as `lines.length`. */
  lines: string[];
  /** Parsed JSON value for `json.*` expressions; `null` means no JSON is available. */
  json: unknown | null;
}

/** Prefix added to every error thrown by {@link evaluateExpression}. */
const EXPRESSION_ERROR_PREFIX = 'Expression evaluation failed: ';

/** Apply one of the supported comparison operators to two numbers. */
function compareNumeric(left: number, op: string, right: number): boolean {
  switch (op) {
    case '>':
      return left > right;
    case '<':
      return left < right;
    case '>=':
      return left >= right;
    case '<=':
      return left <= right;
    case '==':
      return left === right;
    case '!=':
      return left !== right;
    default:
      return false;
  }
}

/** Convert a boolean verdict into the `{ result, score }` shape (score 1 or 0). */
function booleanOutcome(result: boolean): { result: boolean; score: number } {
  return { result, score: result ? 1 : 0 };
}

/**
 * Evaluate a safe expression against the given context.
 *
 * Only a fixed set of patterns is recognised; nothing is passed to `eval`.
 * `||` is split before `&&` so that `&&` binds tighter, matching JavaScript
 * precedence (`a || b && c` is `a || (b && c)`). Parentheses are not supported.
 *
 * @param expression - Expression text, e.g. `response.includes("ok") && length > 3`.
 * @param context - Variables the expression may reference.
 * @returns `result` is the boolean verdict. `score` is 1/0 for a single
 *   pattern, the mean of the operand scores for `&&`, and the maximum operand
 *   score for `||`.
 * @throws Error prefixed with "Expression evaluation failed: " when the
 *   expression matches no supported pattern or a regex literal is invalid.
 */
function evaluateExpression(
  expression: string,
  context: ExpressionContext
): { result: boolean; score: number } {
  const { response, expected, length, words, lines, json } = context;

  // Normalize the expression
  const expr = expression.trim();

  try {
    // Lowest-precedence operator first: every `||` operand may itself contain `&&`.
    const orParts = splitOutsideQuotes(expr, '||');
    if (orParts.length > 1) {
      const outcomes = orParts.map((part) => evaluateExpression(part.trim(), context));
      return {
        result: outcomes.some((o) => o.result),
        score: Math.max(...outcomes.map((o) => o.score)),
      };
    }

    const andParts = splitOutsideQuotes(expr, '&&');
    if (andParts.length > 1) {
      const outcomes = andParts.map((part) => evaluateExpression(part.trim(), context));
      return {
        result: outcomes.every((o) => o.result),
        score: outcomes.reduce((sum, o) => sum + o.score, 0) / outcomes.length,
      };
    }

    // Pattern: response.includes("text")
    const includesMatch = expr.match(/^response\.includes\s*\(\s*["'](.+?)["']\s*\)$/);
    if (includesMatch) {
      return booleanOutcome(response.includes(includesMatch[1]));
    }

    // Pattern: !response.includes("text")
    const notIncludesMatch = expr.match(/^!\s*response\.includes\s*\(\s*["'](.+?)["']\s*\)$/);
    if (notIncludesMatch) {
      return booleanOutcome(!response.includes(notIncludesMatch[1]));
    }

    // Pattern: response.startsWith("text")
    const startsWithMatch = expr.match(/^response\.startsWith\s*\(\s*["'](.+?)["']\s*\)$/);
    if (startsWithMatch) {
      return booleanOutcome(response.startsWith(startsWithMatch[1]));
    }

    // Pattern: response.endsWith("text")
    const endsWithMatch = expr.match(/^response\.endsWith\s*\(\s*["'](.+?)["']\s*\)$/);
    if (endsWithMatch) {
      return booleanOutcome(response.endsWith(endsWithMatch[1]));
    }

    // Pattern: response.toLowerCase().includes("text")
    const lowerIncludesMatch = expr.match(
      /^response\.toLowerCase\s*\(\s*\)\.includes\s*\(\s*["'](.+?)["']\s*\)$/
    );
    if (lowerIncludesMatch) {
      // The needle is lowercased too, so the check is fully case-insensitive.
      return booleanOutcome(response.toLowerCase().includes(lowerIncludesMatch[1].toLowerCase()));
    }

    // Pattern: response.match(/regex/)
    const regexMatch = expr.match(/^response\.match\s*\(\s*\/(.+?)\/([gimsuy]*)\s*\)$/);
    if (regexMatch) {
      return booleanOutcome(new RegExp(regexMatch[1], regexMatch[2]).test(response));
    }

    // Pattern: !response.match(/regex/)
    const notRegexMatch = expr.match(/^!\s*response\.match\s*\(\s*\/(.+?)\/([gimsuy]*)\s*\)$/);
    if (notRegexMatch) {
      return booleanOutcome(!new RegExp(notRegexMatch[1], notRegexMatch[2]).test(response));
    }

    // Pattern: length <op> N, words.length <op> N, lines.length <op> N
    const sizeMatch = expr.match(
      /^(length|words\.length|lines\.length)\s*(>=|<=|>|<|==|!=)\s*(\d+)$/
    );
    if (sizeMatch) {
      const subject = sizeMatch[1];
      const actual =
        subject === 'length' ? length : subject === 'words.length' ? words.length : lines.length;
      return booleanOutcome(compareNumeric(actual, sizeMatch[2], Number.parseInt(sizeMatch[3], 10)));
    }

    // Pattern: json.field == "value" or json.field == value
    const jsonFieldMatch = expr.match(
      /^json\.(\w+(?:\.\w+)*)\s*(>=|<=|>|<|==|!=)\s*(?:["'](.+?)["']|(\d+(?:\.\d+)?)|(true|false|null))$/
    );
    if (jsonFieldMatch && json !== null) {
      const [, path, op, quoted, numeric, keyword] = jsonFieldMatch;

      // Decide the literal's type from the capture group that matched, so a
      // quoted "true"/"false"/"null" stays a string instead of being coerced.
      let compareValue: unknown;
      if (quoted !== undefined) {
        compareValue = quoted;
      } else if (numeric !== undefined) {
        compareValue = Number.parseFloat(numeric);
      } else {
        compareValue = keyword === 'null' ? null : keyword === 'true';
      }

      // Navigate the JSON path using own properties only, so prototype members
      // such as `constructor` are never treated as data.
      let fieldValue: unknown = json;
      for (const key of path.split('.')) {
        if (
          typeof fieldValue === 'object' &&
          fieldValue !== null &&
          Object.prototype.hasOwnProperty.call(fieldValue, key)
        ) {
          fieldValue = (fieldValue as Record<string, unknown>)[key];
        } else {
          fieldValue = undefined;
          break;
        }
      }

      let result: boolean;
      if (op === '==') {
        result = fieldValue === compareValue;
      } else if (op === '!=') {
        result = fieldValue !== compareValue;
      } else {
        // Ordering comparisons only succeed when the field itself is a number.
        result =
          typeof fieldValue === 'number' &&
          compareNumeric(fieldValue, op, compareValue as number);
      }
      return booleanOutcome(result);
    }

    // Pattern: json != null (check if valid JSON)
    if (expr === 'json != null') {
      return booleanOutcome(json !== null);
    }

    // Pattern: response == expected
    if (expr === 'response == expected' && expected !== undefined) {
      return booleanOutcome(response === expected);
    }

    // Pattern: response.trim() == expected
    if (expr === 'response.trim() == expected' && expected !== undefined) {
      return booleanOutcome(response.trim() === expected);
    }

    // Unknown expression pattern
    throw new Error(`Unsupported expression pattern: ${expr}`);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Errors from nested operands already carry the prefix; do not stack it.
    throw new Error(
      message.startsWith(EXPRESSION_ERROR_PREFIX) ? message : `${EXPRESSION_ERROR_PREFIX}${message}`
    );
  }
}
310
+
311
/**
 * Evaluator for `inline` expectations: runs a restricted matching expression
 * against the response text.
 */
export class InlineEvaluator implements Evaluator {
  readonly type = 'inline';

  /**
   * Evaluate `response` with the expression carried by `expected`.
   *
   * @param response - Response text to check.
   * @param expected - Expectation; its `type` must be `'inline'`.
   * @param _context - Unused.
   * @returns The verdict and score from the expression. If evaluating the
   *   expression throws, a failed result with score 0 and the error message.
   * @throws Error when `expected.type` is not `'inline'`.
   */
  async evaluate(
    response: string,
    expected: Expected,
    _context?: EvaluatorContext
  ): Promise<EvaluatorResult> {
    if (expected.type !== 'inline') {
      throw new Error('Invalid expected type for InlineEvaluator');
    }

    const { expression, value: expectedValue } = expected;

    // A response that is not valid JSON simply leaves `json` as null.
    const json = ((): unknown => {
      try {
        return JSON.parse(response);
      } catch {
        return null;
      }
    })();

    const nonEmptyWords = response.split(/\s+/).filter((word) => word.length > 0);
    const nonBlankLines = response.split('\n').filter((line) => line.trim().length > 0);

    const context: ExpressionContext = {
      response,
      expected: expectedValue,
      length: response.length,
      words: nonEmptyWords,
      lines: nonBlankLines,
      json,
    };

    let outcome: { result: boolean; score: number };
    try {
      outcome = evaluateExpression(expression, context);
    } catch (error) {
      const message = (error as Error).message;
      return {
        passed: false,
        score: 0,
        reason: `Inline matcher error: ${message}`,
        details: {
          expression,
          error: message,
        },
      };
    }

    const verdict = outcome.result ? 'passed' : 'failed';
    return {
      passed: outcome.result,
      score: outcome.score,
      reason: `Expression ${verdict}: ${expression}`,
      details: {
        expression,
        expectedValue,
        responseLength: response.length,
        wordCount: nonEmptyWords.length,
        lineCount: nonBlankLines.length,
        isValidJson: json !== null,
      },
    };
  }
}
372
+
373
/**
 * List of supported expression patterns for documentation.
 *
 * Entries are human-readable templates, not expressions to be evaluated.
 */
export const SUPPORTED_EXPRESSIONS = [
  'response.includes("text")',
  '!response.includes("text")',
  'response.startsWith("text")',
  'response.endsWith("text")',
  'response.toLowerCase().includes("text")',
  'response.match(/regex/)',
  '!response.match(/regex/)',
  'length > N / length < N / length >= N / length <= N / length == N',
  'words.length > N',
  'lines.length > N',
  'json.field == "value"',
  'json.field > N',
  'json != null',
  'response == expected',
  'response.trim() == expected',
  'expression1 && expression2',
  'expression1 || expression2',
];
@@ -5,22 +5,27 @@
5
5
  import type { Expected } from '../scenario/schema';
6
6
  import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
7
7
 
8
- const GRADER_PROMPT = `You are an evaluator grading an AI response based on a rubric.
8
+ const GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
9
9
 
10
- ## RUBRIC
10
+ RUBRIC:
11
11
  {{rubric}}
12
12
 
13
- ## RESPONSE TO EVALUATE
13
+ RESPONSE TO EVALUATE:
14
14
  {{response}}
15
15
 
16
- ## INSTRUCTIONS
17
- Score the response from 0.0 to 1.0 based on the rubric.
18
- Be objective and consistent in your scoring.
16
+ TASK: Score the response from 0.0 to 1.0 based on the rubric above.
19
17
 
20
- Respond with ONLY a JSON object in this exact format:
21
- {"score": <number between 0 and 1>, "reason": "<brief explanation of score>"}
18
+ OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
19
+ {"score":0.0,"reason":"explanation"}
22
20
 
23
- Do not include any other text, markdown, or formatting.`;
21
+ RULES:
22
+ - Output ONLY valid JSON, no markdown, no code blocks, no extra text
23
+ - "score" must be a number between 0.0 and 1.0
24
+ - "reason" must be a brief string explaining the score
25
+ - Do NOT wrap in \`\`\`json or any formatting
26
+ - Your entire response must be parseable by JSON.parse()
27
+
28
+ JSON OUTPUT:`;
24
29
 
25
30
  export class LLMGraderEvaluator implements Evaluator {
26
31
  readonly type = 'llm_grader';
@@ -44,11 +49,13 @@ export class LLMGraderEvaluator implements Evaluator {
44
49
  );
45
50
 
46
51
  try {
52
+ // Note: Some models (like o1, o3, gpt-5-mini, reasoning models) only support temperature=1
53
+ // We omit temperature to let the API use its default for maximum compatibility
54
+ // Use higher maxTokens for reasoning models which use tokens for internal "thinking"
47
55
  const result = await context.client.generate({
48
56
  prompt,
49
57
  model: expected.model,
50
- temperature: 0,
51
- maxTokens: 200,
58
+ maxTokens: 1000,
52
59
  });
53
60
 
54
61
  const parsed = this.parseGraderResponse(result.text);
@@ -76,9 +83,25 @@ export class LLMGraderEvaluator implements Evaluator {
76
83
  }
77
84
 
78
85
  private parseGraderResponse(text: string): { score: number; reason?: string } {
79
- const jsonMatch = text.match(/\{[\s\S]*?\}/);
86
+ // Clean up the response - remove markdown code blocks if present
87
+ const cleanedText = text
88
+ .replace(/```json\s*/gi, '')
89
+ .replace(/```\s*/g, '')
90
+ .trim();
91
+
92
+ // Try to find JSON object in the response
93
+ const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
94
+
80
95
  if (!jsonMatch) {
81
- throw new Error('No JSON found in grader response');
96
+ // Fallback: try to extract score from plain text patterns like "Score: 0.8" or "0.85"
97
+ const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
98
+ if (scoreMatch) {
99
+ const score = Number(scoreMatch[1]);
100
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
101
+ return { score, reason: cleanedText };
102
+ }
103
+ }
104
+ throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
82
105
  }
83
106
 
84
107
  try {
@@ -94,6 +117,15 @@ export class LLMGraderEvaluator implements Evaluator {
94
117
  reason: parsed.reason,
95
118
  };
96
119
  } catch (error) {
120
+ // If JSON parsing fails, try extracting score directly
121
+ const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
122
+ if (scoreMatch) {
123
+ const score = Number(scoreMatch[1]);
124
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
125
+ const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
126
+ return { score, reason: reasonMatch?.[1] };
127
+ }
128
+ }
97
129
  throw new Error(`Failed to parse grader response: ${(error as Error).message}`);
98
130
  }
99
131
  }
@@ -0,0 +1,105 @@
1
+ /**
2
+ * Tests for NotContainsEvaluator
3
+ */
4
+
5
+ import { describe, expect, test } from 'bun:test';
6
+ import { NotContainsEvaluator } from './not-contains';
7
+
8
describe('NotContainsEvaluator', () => {
  const evaluator = new NotContainsEvaluator();

  // Forbidden list shared by most cases below.
  const primaryColors = ['red', 'blue', 'yellow'];

  // Run the evaluator with a `not_contains` expectation built from the arguments.
  const check = (response: string, values: string[], mode: 'all' | 'any') =>
    evaluator.evaluate(response, { type: 'not_contains', values, mode });

  test('passes when no forbidden values are present (mode: all)', async () => {
    const { passed, score } = await check('The colors are green and purple.', primaryColors, 'all');
    expect(passed).toBe(true);
    expect(score).toBe(1);
  });

  test('fails when any forbidden value is present (mode: all)', async () => {
    const { passed, score } = await check('The colors are red and green.', primaryColors, 'all');
    expect(passed).toBe(false);
    expect(score).toBeCloseTo(0.67, 1);
  });

  test('fails when all forbidden values are present (mode: all)', async () => {
    const { passed, score } = await check(
      'The colors are red, blue, and yellow.',
      primaryColors,
      'all'
    );
    expect(passed).toBe(false);
    expect(score).toBe(0);
  });

  test('passes when at least one forbidden value is absent (mode: any)', async () => {
    const { passed, score } = await check('The colors are red and blue.', primaryColors, 'any');
    expect(passed).toBe(true);
    expect(score).toBeCloseTo(0.33, 1);
  });

  test('fails when all forbidden values are present (mode: any)', async () => {
    const { passed, score } = await check(
      'I have red, blue, and yellow paint.',
      primaryColors,
      'any'
    );
    expect(passed).toBe(false);
    expect(score).toBe(0);
  });

  test('is case insensitive', async () => {
    const { passed } = await check('GREEN PURPLE ORANGE', primaryColors, 'all');
    expect(passed).toBe(true);
  });

  test('detects values case insensitively', async () => {
    const { passed } = await check('I have RED paint', ['red'], 'all');
    expect(passed).toBe(false);
  });

  test('handles empty values array', async () => {
    const { passed, score } = await check('Any response text here', [], 'all');
    expect(passed).toBe(true);
    expect(score).toBe(1);
  });

  test('provides detailed results in details field', async () => {
    const { details } = await check('The answer is red and green.', ['red', 'blue'], 'all');
    expect(details).toEqual({
      mode: 'all',
      results: [
        { value: 'red', found: true },
        { value: 'blue', found: false },
      ],
      notFoundCount: 1,
      totalCount: 2,
    });
  });
});
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Not Contains evaluator - checks if response does NOT contain specific values
3
+ */
4
+
5
+ import type { Expected } from '../scenario/schema';
6
+ import type { Evaluator, EvaluatorResult } from './types';
7
+
8
/**
 * Evaluator for `not_contains` expectations: checks that forbidden values do
 * not appear in the response. Matching is a case-insensitive substring test.
 */
export class NotContainsEvaluator implements Evaluator {
  readonly type = 'not_contains';

  /**
   * Check `response` for the forbidden values listed in `expected`.
   *
   * @param response - Response text to scan.
   * @param expected - Expectation; its `type` must be `'not_contains'`.
   * @returns `passed` follows the mode: `'all'` requires every value to be
   *   absent, any other mode requires at least one to be absent. `score` is
   *   the fraction of values that are absent. An empty value list passes with
   *   score 1 in every mode.
   * @throws Error when `expected.type` is not `'not_contains'`.
   */
  async evaluate(response: string, expected: Expected): Promise<EvaluatorResult> {
    if (expected.type !== 'not_contains') {
      throw new Error('Invalid expected type for NotContainsEvaluator');
    }

    const normalizedResponse = response.toLowerCase();
    const results = expected.values.map((value) => ({
      value,
      found: normalizedResponse.includes(value.toLowerCase()),
    }));

    const totalCount = results.length;
    const notFoundCount = results.filter((r) => !r.found).length;

    // mode: 'all' means ALL values must be absent
    // mode: 'any' means AT LEAST ONE value must be absent
    // An empty list forbids nothing, so it passes regardless of mode; this
    // keeps `passed` in agreement with the score of 1 reported for that case.
    const passed =
      totalCount === 0 ||
      (expected.mode === 'all' ? notFoundCount === totalCount : notFoundCount > 0);

    const score = totalCount > 0 ? notFoundCount / totalCount : 1;

    return {
      passed,
      score,
      reason: passed
        ? `Correctly absent: ${notFoundCount}/${totalCount} values (mode: ${expected.mode})`
        : `Found forbidden values (mode: ${expected.mode})`,
      details: {
        mode: expected.mode,
        results,
        notFoundCount,
        totalCount,
      },
    };
  }
}