@artemiskit/core 0.1.6 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +116 -0
- package/dist/adapters/types.d.ts +8 -1
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +39 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/index.d.ts +5 -0
- package/dist/cost/index.d.ts.map +1 -0
- package/dist/cost/pricing.d.ts +67 -0
- package/dist/cost/pricing.d.ts.map +1 -0
- package/dist/evaluators/combined.d.ts +10 -0
- package/dist/evaluators/combined.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +4 -0
- package/dist/evaluators/index.d.ts.map +1 -1
- package/dist/evaluators/inline.d.ts +22 -0
- package/dist/evaluators/inline.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/evaluators/not-contains.d.ts +10 -0
- package/dist/evaluators/not-contains.d.ts.map +1 -0
- package/dist/evaluators/similarity.d.ts +16 -0
- package/dist/evaluators/similarity.d.ts.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +13212 -12018
- package/dist/scenario/discovery.d.ts +72 -0
- package/dist/scenario/discovery.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +1 -0
- package/dist/scenario/index.d.ts.map +1 -1
- package/dist/scenario/schema.d.ts +1253 -9
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +62 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +8 -1
- package/src/artifacts/types.ts +39 -0
- package/src/cost/index.ts +14 -0
- package/src/cost/pricing.ts +450 -0
- package/src/evaluators/combined.test.ts +172 -0
- package/src/evaluators/combined.ts +95 -0
- package/src/evaluators/index.ts +12 -0
- package/src/evaluators/inline.test.ts +409 -0
- package/src/evaluators/inline.ts +393 -0
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/not-contains.test.ts +105 -0
- package/src/evaluators/not-contains.ts +45 -0
- package/src/evaluators/similarity.test.ts +333 -0
- package/src/evaluators/similarity.ts +258 -0
- package/src/index.ts +3 -0
- package/src/scenario/discovery.test.ts +153 -0
- package/src/scenario/discovery.ts +277 -0
- package/src/scenario/index.ts +1 -0
- package/src/scenario/schema.ts +47 -2
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +162 -2
- package/src/storage/types.ts +73 -0
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for semantic similarity evaluator
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test';
|
|
6
|
+
import { SimilarityEvaluator } from './similarity';
|
|
7
|
+
import type { EvaluatorContext } from './types';
|
|
8
|
+
|
|
9
|
+
describe('SimilarityEvaluator', () => {
  // Shared evaluator instance under test (constructed once for all cases).
  const evaluator = new SimilarityEvaluator();

  test('has correct type', () => {
    expect(evaluator.type).toBe('similarity');
  });

  // Passing a non-'similarity' expectation is a programmer error and must throw.
  test('throws on invalid expected type', async () => {
    await expect(
      evaluator.evaluate('response', { type: 'exact', value: 'test' } as any)
    ).rejects.toThrow('Invalid expected type');
  });

  // Without a ModelClient neither embeddings nor LLM comparison can run,
  // so the evaluator reports a failed result with method 'unavailable'.
  test('fails when no client is provided', async () => {
    const result = await evaluator.evaluate('The capital of France is Paris', {
      type: 'similarity',
      value: 'Paris is the capital of France',
      threshold: 0.75,
    });

    expect(result.passed).toBe(false);
    expect(result.score).toBe(0);
    expect(result.reason).toContain('requires a ModelClient');
    expect(result.details?.method).toBe('unavailable');
  });

  test('uses embedding-based similarity when embed is available', async () => {
    // Mock client with embedding support
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        embed: async (text: string) => {
          // Simulate semantic embeddings where similar texts have similar vectors
          if (text.toLowerCase().includes('paris') && text.toLowerCase().includes('france')) {
            return [0.9, 0.3, 0.1, 0.4];
          }
          if (text.toLowerCase().includes('paris') || text.toLowerCase().includes('france')) {
            return [0.85, 0.35, 0.15, 0.38];
          }
          return [0.1, 0.8, 0.5, 0.2];
        },
        generate: async () => ({
          id: '',
          model: '',
          text: '',
          tokens: { prompt: 0, completion: 0, total: 0 },
          latencyMs: 0,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'The capital of France is Paris',
      {
        type: 'similarity',
        value: 'Paris is the capital city of France',
        threshold: 0.75,
      },
      mockContext
    );

    expect(result.passed).toBe(true);
    expect(result.score).toBeGreaterThan(0.75);
    expect(result.details?.method).toBe('embedding');
    expect(result.reason).toContain('embedding');
  });

  // Orthogonal mock embeddings give ~0 similarity, so the check must fail.
  test('uses high threshold correctly', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        embed: async (text: string) => {
          // Return dissimilar vectors for different texts
          if (text.includes('capital')) {
            return [1.0, 0.0, 0.0, 0.0];
          }
          return [0.0, 1.0, 0.0, 0.0]; // Orthogonal vector = 0 similarity
        },
        generate: async () => ({
          id: '',
          model: '',
          text: '',
          tokens: { prompt: 0, completion: 0, total: 0 },
          latencyMs: 0,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'Some text about weather',
      {
        type: 'similarity',
        value: 'Related text about capitals',
        threshold: 0.5, // Even moderate threshold should fail
      },
      mockContext
    );

    expect(result.passed).toBe(false);
    expect(result.score).toBeLessThan(0.5);
  });

  // Auto mode: a throwing embed() must fall back to LLM-based scoring.
  test('falls back to LLM when embedding fails', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        embed: async () => {
          throw new Error('Embedding model not available');
        },
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: '{"score": 0.85, "reason": "Both texts describe Paris as the capital of France"}',
          tokens: { prompt: 100, completion: 20, total: 120 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'The capital of France is Paris',
      {
        type: 'similarity',
        value: 'Paris is the capital city of France',
        threshold: 0.75,
      },
      mockContext
    );

    expect(result.passed).toBe(true);
    expect(result.score).toBe(0.85);
    expect(result.details?.method).toBe('llm');
    expect(result.reason).toContain('LLM');
  });

  // Auto mode with a client that has no embed() at all goes straight to the LLM.
  test('uses LLM when embed is not available', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        // No embed method
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: '{"score": 0.92, "reason": "Semantically equivalent statements"}',
          tokens: { prompt: 100, completion: 20, total: 120 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'The weather is nice today',
      {
        type: 'similarity',
        value: "It's a beautiful day outside",
        threshold: 0.8,
      },
      mockContext
    );

    expect(result.passed).toBe(true);
    expect(result.score).toBe(0.92);
    expect(result.details?.method).toBe('llm');
  });

  // Unparseable LLM output must yield a failed result, not an exception.
  test('handles invalid LLM response gracefully', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: 'This is not valid JSON',
          tokens: { prompt: 100, completion: 20, total: 120 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'Some text',
      {
        type: 'similarity',
        value: 'Some other text',
        threshold: 0.75,
      },
      mockContext
    );

    expect(result.passed).toBe(false);
    expect(result.score).toBe(0);
    expect(result.details?.method).toBe('failed');
  });

  test('clamps score to 0-1 range', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: '{"score": 1.5, "reason": "Invalid score"}', // Score > 1
          tokens: { prompt: 100, completion: 20, total: 120 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'Text',
      {
        type: 'similarity',
        value: 'Text',
        threshold: 0.5,
      },
      mockContext
    );

    expect(result.score).toBe(1); // Clamped to 1
  });

  // 0.74 sits just under the 0.75 default, pinning the default threshold value.
  test('uses default threshold of 0.75', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: '{"score": 0.74, "reason": "Just below threshold"}',
          tokens: { prompt: 100, completion: 20, total: 120 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'Text A',
      {
        type: 'similarity',
        value: 'Text B',
        // No threshold specified, should use default 0.75
      } as any,
      mockContext
    );

    expect(result.passed).toBe(false);
    expect(result.score).toBe(0.74);
  });

  // The parser must find the JSON object even when the model wraps it in prose.
  test('extracts JSON from LLM response with surrounding text', async () => {
    const mockContext: EvaluatorContext = {
      client: {
        provider: 'mock',
        generate: async () => ({
          id: 'test',
          model: 'mock',
          text: 'Here is my analysis:\n\n{"score": 0.88, "reason": "Very similar meaning"}\n\nHope this helps!',
          tokens: { prompt: 100, completion: 50, total: 150 },
          latencyMs: 100,
        }),
        capabilities: async () => ({
          streaming: false,
          functionCalling: false,
          toolUse: false,
          maxContext: 4096,
        }),
      },
    };

    const result = await evaluator.evaluate(
      'Hello world',
      {
        type: 'similarity',
        value: 'Hello, world!',
        threshold: 0.8,
      },
      mockContext
    );

    expect(result.passed).toBe(true);
    expect(result.score).toBe(0.88);
  });
});
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic similarity evaluator
|
|
3
|
+
* Uses vector embeddings for semantic similarity matching when available,
|
|
4
|
+
* falls back to LLM-based semantic comparison otherwise.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import type { Expected } from '../scenario/schema';
|
|
8
|
+
import type { Evaluator, EvaluatorContext, EvaluatorResult } from './types';
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Cosine similarity between two vectors
|
|
12
|
+
*/
|
|
13
|
+
function cosineSimilarity(vecA: number[], vecB: number[]): number {
|
|
14
|
+
if (vecA.length !== vecB.length) {
|
|
15
|
+
throw new Error('Vector dimensions must match');
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
let dotProduct = 0;
|
|
19
|
+
let magnitudeA = 0;
|
|
20
|
+
let magnitudeB = 0;
|
|
21
|
+
|
|
22
|
+
for (let i = 0; i < vecA.length; i++) {
|
|
23
|
+
dotProduct += vecA[i] * vecB[i];
|
|
24
|
+
magnitudeA += vecA[i] * vecA[i];
|
|
25
|
+
magnitudeB += vecB[i] * vecB[i];
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
magnitudeA = Math.sqrt(magnitudeA);
|
|
29
|
+
magnitudeB = Math.sqrt(magnitudeB);
|
|
30
|
+
|
|
31
|
+
if (magnitudeA === 0 || magnitudeB === 0) {
|
|
32
|
+
return 0;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return dotProduct / (magnitudeA * magnitudeB);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Normalize similarity score to 0-1 range
|
|
40
|
+
* Cosine similarity can be -1 to 1, we map it to 0 to 1
|
|
41
|
+
*/
|
|
42
|
+
function normalizeSimilarity(similarity: number): number {
|
|
43
|
+
return (similarity + 1) / 2;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Prompt template for LLM-based similarity scoring. The `{{expected}}` and
// `{{response}}` placeholders are substituted via String.replace before the
// prompt is sent; the model is instructed to answer with a single JSON object
// so the caller can locate it with a regex and parse it with JSON.parse.
// NOTE: this is runtime text — any wording change alters model behavior.
const LLM_SIMILARITY_PROMPT = `You are a semantic similarity evaluator. Compare the semantic meaning of two texts and rate their similarity.

Text A (Reference):
"""
{{expected}}
"""

Text B (Response):
"""
{{response}}
"""

Rate the semantic similarity between these texts on a scale from 0.0 to 1.0:
- 1.0: Identical meaning, same information conveyed
- 0.8-0.9: Very similar meaning, minor differences in phrasing
- 0.6-0.7: Similar topic and general meaning, some differences in detail
- 0.4-0.5: Related topics but different focus or conclusions
- 0.2-0.3: Loosely related, different meanings
- 0.0-0.1: Completely unrelated or contradictory

Respond with ONLY a JSON object in this exact format:
{"score": <number between 0 and 1>, "reason": "<brief 1-sentence explanation>"}`;
|
|
68
|
+
|
|
69
|
+
export class SimilarityEvaluator implements Evaluator {
|
|
70
|
+
readonly type = 'similarity';
|
|
71
|
+
|
|
72
|
+
async evaluate(
|
|
73
|
+
response: string,
|
|
74
|
+
expected: Expected,
|
|
75
|
+
context?: EvaluatorContext
|
|
76
|
+
): Promise<EvaluatorResult> {
|
|
77
|
+
if (expected.type !== 'similarity') {
|
|
78
|
+
throw new Error('Invalid expected type for SimilarityEvaluator');
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
const threshold = expected.threshold ?? 0.75;
|
|
82
|
+
const expectedValue = expected.value;
|
|
83
|
+
const mode = expected.mode; // 'embedding' | 'llm' | undefined (auto)
|
|
84
|
+
|
|
85
|
+
// If mode is explicitly 'llm', skip embedding and go straight to LLM
|
|
86
|
+
if (mode === 'llm') {
|
|
87
|
+
return this.evaluateWithLLM(response, expectedValue, expected.model, threshold, context);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// If mode is 'embedding' or auto (undefined), try embedding first
|
|
91
|
+
if (mode === 'embedding' || mode === undefined) {
|
|
92
|
+
// Check if embedding is available
|
|
93
|
+
if (context?.client?.embed) {
|
|
94
|
+
try {
|
|
95
|
+
const embeddingModel = expected.embeddingModel;
|
|
96
|
+
const [responseEmbedding, expectedEmbedding] = await Promise.all([
|
|
97
|
+
context.client.embed(response, embeddingModel),
|
|
98
|
+
context.client.embed(expectedValue, embeddingModel),
|
|
99
|
+
]);
|
|
100
|
+
|
|
101
|
+
const rawSimilarity = cosineSimilarity(responseEmbedding, expectedEmbedding);
|
|
102
|
+
// For semantic embeddings, cosine similarity is typically 0-1 for similar texts
|
|
103
|
+
// We use raw similarity directly if positive, otherwise normalize
|
|
104
|
+
const similarity =
|
|
105
|
+
rawSimilarity >= 0 ? rawSimilarity : normalizeSimilarity(rawSimilarity);
|
|
106
|
+
const passed = similarity >= threshold;
|
|
107
|
+
|
|
108
|
+
return {
|
|
109
|
+
passed,
|
|
110
|
+
score: similarity,
|
|
111
|
+
reason: `Semantic similarity (embedding${embeddingModel ? `: ${embeddingModel}` : ''}): ${(similarity * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%)`,
|
|
112
|
+
details: {
|
|
113
|
+
method: 'embedding',
|
|
114
|
+
embeddingModel: embeddingModel || 'default',
|
|
115
|
+
similarity,
|
|
116
|
+
threshold,
|
|
117
|
+
expected: expectedValue.slice(0, 200),
|
|
118
|
+
actual: response.slice(0, 200),
|
|
119
|
+
},
|
|
120
|
+
};
|
|
121
|
+
} catch (error) {
|
|
122
|
+
// If mode is explicitly 'embedding', fail instead of falling back
|
|
123
|
+
if (mode === 'embedding') {
|
|
124
|
+
return {
|
|
125
|
+
passed: false,
|
|
126
|
+
score: 0,
|
|
127
|
+
reason: `Embedding evaluation failed: ${(error as Error).message}`,
|
|
128
|
+
details: {
|
|
129
|
+
error: (error as Error).message,
|
|
130
|
+
method: 'embedding',
|
|
131
|
+
embeddingModel: expected.embeddingModel || 'default',
|
|
132
|
+
},
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
// Auto mode: fall through to LLM-based evaluation
|
|
136
|
+
console.warn(`Embedding failed, falling back to LLM: ${(error as Error).message}`);
|
|
137
|
+
}
|
|
138
|
+
} else if (mode === 'embedding') {
|
|
139
|
+
// Explicitly requested embedding mode but no embed function available
|
|
140
|
+
return {
|
|
141
|
+
passed: false,
|
|
142
|
+
score: 0,
|
|
143
|
+
reason:
|
|
144
|
+
'Embedding mode requested but no embedding function available. Ensure the provider supports embeddings.',
|
|
145
|
+
details: {
|
|
146
|
+
error: 'No embed function available on client',
|
|
147
|
+
method: 'embedding',
|
|
148
|
+
embeddingModel: expected.embeddingModel || 'not-configured',
|
|
149
|
+
},
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Fall back to LLM-based semantic comparison (auto mode only reaches here if embedding failed/unavailable)
|
|
155
|
+
return this.evaluateWithLLM(response, expectedValue, expected.model, threshold, context);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
/**
|
|
159
|
+
* Evaluate similarity using LLM-based comparison
|
|
160
|
+
*/
|
|
161
|
+
private async evaluateWithLLM(
|
|
162
|
+
response: string,
|
|
163
|
+
expectedValue: string,
|
|
164
|
+
model: string | undefined,
|
|
165
|
+
threshold: number,
|
|
166
|
+
context?: EvaluatorContext
|
|
167
|
+
): Promise<EvaluatorResult> {
|
|
168
|
+
if (!context?.client) {
|
|
169
|
+
return {
|
|
170
|
+
passed: false,
|
|
171
|
+
score: 0,
|
|
172
|
+
reason: 'Similarity evaluation requires a ModelClient (for embeddings or LLM comparison)',
|
|
173
|
+
details: {
|
|
174
|
+
error: 'No ModelClient provided in context',
|
|
175
|
+
method: 'unavailable',
|
|
176
|
+
},
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
try {
|
|
181
|
+
const prompt = LLM_SIMILARITY_PROMPT.replace('{{expected}}', expectedValue).replace(
|
|
182
|
+
'{{response}}',
|
|
183
|
+
response
|
|
184
|
+
);
|
|
185
|
+
|
|
186
|
+
// Note: Some models (like o1, o3, reasoning models) only support temperature=1
|
|
187
|
+
// We omit temperature to let the API use its default
|
|
188
|
+
const result = await context.client.generate({
|
|
189
|
+
prompt,
|
|
190
|
+
model,
|
|
191
|
+
maxTokens: 150,
|
|
192
|
+
});
|
|
193
|
+
|
|
194
|
+
// Parse JSON response - handle various formats including reasoning model outputs
|
|
195
|
+
// Try to find the JSON object containing score and reason
|
|
196
|
+
const jsonMatch = result.text.match(
|
|
197
|
+
/\{\s*"score"\s*:\s*[\d.]+\s*,\s*"reason"\s*:\s*"[^"]*"\s*\}/
|
|
198
|
+
);
|
|
199
|
+
const fallbackMatch = result.text.match(/\{[\s\S]*?"score"[\s\S]*?"reason"[\s\S]*?\}/);
|
|
200
|
+
const matchText = jsonMatch?.[0] || fallbackMatch?.[0];
|
|
201
|
+
|
|
202
|
+
if (!matchText) {
|
|
203
|
+
// Try to extract score from plain text as last resort
|
|
204
|
+
const scoreMatch = result.text.match(/(?:score|similarity)[:\s]*(\d*\.?\d+)/i);
|
|
205
|
+
if (scoreMatch) {
|
|
206
|
+
const extractedScore = Number.parseFloat(scoreMatch[1]);
|
|
207
|
+
const normalizedScore = extractedScore > 1 ? extractedScore / 100 : extractedScore;
|
|
208
|
+
const passed = normalizedScore >= threshold;
|
|
209
|
+
return {
|
|
210
|
+
passed,
|
|
211
|
+
score: normalizedScore,
|
|
212
|
+
reason: `Semantic similarity (LLM${model ? `: ${model}` : ''}): ${(normalizedScore * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%)`,
|
|
213
|
+
details: {
|
|
214
|
+
method: 'llm',
|
|
215
|
+
model: model || 'default',
|
|
216
|
+
similarity: normalizedScore,
|
|
217
|
+
threshold,
|
|
218
|
+
expected: expectedValue.slice(0, 200),
|
|
219
|
+
actual: response.slice(0, 200),
|
|
220
|
+
llmReason: 'Extracted from plain text response',
|
|
221
|
+
rawResponse: result.text.slice(0, 500),
|
|
222
|
+
},
|
|
223
|
+
};
|
|
224
|
+
}
|
|
225
|
+
throw new Error('Invalid LLM response format - could not extract score');
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
const parsed = JSON.parse(matchText) as { score: number; reason: string };
|
|
229
|
+
const similarity = Math.max(0, Math.min(1, parsed.score));
|
|
230
|
+
const passed = similarity >= threshold;
|
|
231
|
+
|
|
232
|
+
return {
|
|
233
|
+
passed,
|
|
234
|
+
score: similarity,
|
|
235
|
+
reason: `Semantic similarity (LLM${model ? `: ${model}` : ''}): ${(similarity * 100).toFixed(1)}% - ${parsed.reason}`,
|
|
236
|
+
details: {
|
|
237
|
+
method: 'llm',
|
|
238
|
+
model: model || 'default',
|
|
239
|
+
similarity,
|
|
240
|
+
threshold,
|
|
241
|
+
expected: expectedValue.slice(0, 200),
|
|
242
|
+
actual: response.slice(0, 200),
|
|
243
|
+
llmReason: parsed.reason,
|
|
244
|
+
},
|
|
245
|
+
};
|
|
246
|
+
} catch (error) {
|
|
247
|
+
return {
|
|
248
|
+
passed: false,
|
|
249
|
+
score: 0,
|
|
250
|
+
reason: `Similarity evaluation failed: ${(error as Error).message}`,
|
|
251
|
+
details: {
|
|
252
|
+
error: (error as Error).message,
|
|
253
|
+
method: 'failed',
|
|
254
|
+
},
|
|
255
|
+
};
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|