@assay-ai/core 0.2.0-beta → 0.3.0-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +576 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +69 -1
- package/dist/index.d.ts +69 -1
- package/dist/index.js +568 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -32,6 +32,7 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
AnswerRelevancyMetric: () => AnswerRelevancyMetric,
|
|
34
34
|
AnthropicProvider: () => AnthropicProvider,
|
|
35
|
+
AzureOpenAIProvider: () => AzureOpenAIProvider,
|
|
35
36
|
BaseLLMProvider: () => BaseLLMProvider,
|
|
36
37
|
BaseMetric: () => BaseMetric,
|
|
37
38
|
BiasMetric: () => BiasMetric,
|
|
@@ -39,14 +40,21 @@ __export(index_exports, {
|
|
|
39
40
|
ContextualPrecisionMetric: () => ContextualPrecisionMetric,
|
|
40
41
|
ContextualRecallMetric: () => ContextualRecallMetric,
|
|
41
42
|
ContextualRelevancyMetric: () => ContextualRelevancyMetric,
|
|
43
|
+
ConversationCompletenessMetric: () => ConversationCompletenessMetric,
|
|
42
44
|
ExactMatchMetric: () => ExactMatchMetric,
|
|
43
45
|
FaithfulnessMetric: () => FaithfulnessMetric,
|
|
44
46
|
GEval: () => GEval,
|
|
47
|
+
GeminiProvider: () => GeminiProvider,
|
|
48
|
+
GoalAccuracyMetric: () => GoalAccuracyMetric,
|
|
45
49
|
HallucinationMetric: () => HallucinationMetric,
|
|
46
50
|
JsonCorrectnessMetric: () => JsonCorrectnessMetric,
|
|
51
|
+
KnowledgeRetentionMetric: () => KnowledgeRetentionMetric,
|
|
47
52
|
OllamaProvider: () => OllamaProvider,
|
|
48
53
|
OpenAIProvider: () => OpenAIProvider,
|
|
54
|
+
RoleAdherenceMetric: () => RoleAdherenceMetric,
|
|
49
55
|
SummarizationMetric: () => SummarizationMetric,
|
|
56
|
+
TaskCompletionMetric: () => TaskCompletionMetric,
|
|
57
|
+
ToolCorrectnessMetric: () => ToolCorrectnessMetric,
|
|
50
58
|
ToxicityMetric: () => ToxicityMetric,
|
|
51
59
|
assertEval: () => assertEval,
|
|
52
60
|
createLimiter: () => createLimiter,
|
|
@@ -314,6 +322,102 @@ var OllamaProvider = class extends BaseLLMProvider {
|
|
|
314
322
|
}
|
|
315
323
|
};
|
|
316
324
|
|
|
325
|
+
// src/providers/gemini.ts
|
|
326
|
+
var DEFAULT_MODEL4 = "gemini-2.0-flash";
/**
 * LLM provider that talks to the Google Gemini `generateContent` REST
 * endpoint through the global `fetch`.
 *
 * Reads `this.modelName` and `this.temperature`, which are not defined in this
 * class (presumably set by `BaseLLMProvider` from the config — confirm there).
 */
var GeminiProvider = class extends BaseLLMProvider {
  apiKey;
  /**
   * @param {object} [config] - Provider options. `config.apiKey` takes
   *   precedence over the `GOOGLE_API_KEY` environment variable.
   * @throws {Error} When neither `config.apiKey` nor `GOOGLE_API_KEY` is set.
   */
  constructor(config = {}) {
    super(config, DEFAULT_MODEL4);
    const key = config.apiKey ?? process.env.GOOGLE_API_KEY;
    if (!key) {
      throw new Error(
        "Google API key is required. Set GOOGLE_API_KEY env var or pass apiKey in config."
      );
    }
    this.apiKey = key;
  }
  /** @returns {string} Identifier of this provider. */
  get providerName() {
    return "gemini";
  }
  /**
   * Sends a single-turn prompt to Gemini and returns the generated text.
   *
   * @param {string} prompt - Text sent as the only content part.
   * @returns {Promise<string>} Concatenated text of the first candidate.
   * @throws {Error} On a non-2xx response or when no text is returned.
   */
  async generate(prompt) {
    // The key travels in a header rather than the query string so it cannot
    // end up in URL logs, proxies, or error traces that print the request URL.
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(this.modelName)}:generateContent`;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-goog-api-key": this.apiKey
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { temperature: this.temperature }
      })
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Gemini request failed (${response.status}): ${errorText}`);
    }
    const data = await response.json();
    const parts = data.candidates?.[0]?.content?.parts;
    // A candidate may carry several parts; join every text part instead of
    // silently dropping everything after the first one.
    const content = Array.isArray(parts)
      ? parts.map((part) => part?.text).filter((text) => typeof text === "string").join("")
      : "";
    if (!content) {
      throw new Error("Gemini returned an empty response");
    }
    return content;
  }
};
|
|
364
|
+
|
|
365
|
+
// src/providers/azure-openai.ts
|
|
366
|
+
var DEFAULT_API_VERSION = "2024-08-01-preview";
/**
 * LLM provider for Azure OpenAI deployments, built on the `openai` package
 * (loaded lazily on the first `generate` call).
 *
 * Reads `this.temperature` and `this.maxTokens`, which are not defined in this
 * class (presumably set by `BaseLLMProvider` — confirm there).
 */
var AzureOpenAIProvider = class extends BaseLLMProvider {
  client;
  endpoint;
  apiKey;
  deploymentName;
  apiVersion;
  /**
   * @param {object} [azureConfig] - `endpoint`, `apiKey`, `deploymentName`
   *   and `apiVersion`; the first three fall back to the
   *   `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY` and
   *   `AZURE_OPENAI_DEPLOYMENT` environment variables.
   * @throws {Error} When the endpoint or the API key cannot be resolved.
   */
  constructor(azureConfig = {}) {
    super(
      azureConfig,
      azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o"
    );
    this.azureConfig = azureConfig;
    this.endpoint = azureConfig.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT ?? "";
    this.apiKey = azureConfig.apiKey ?? process.env.AZURE_OPENAI_API_KEY ?? "";
    this.deploymentName = azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o";
    this.apiVersion = azureConfig.apiVersion ?? DEFAULT_API_VERSION;
    if (!this.endpoint) {
      throw new Error(
        "Azure OpenAI endpoint is required. Set AZURE_OPENAI_ENDPOINT env var or pass endpoint in config."
      );
    }
    if (!this.apiKey) {
      throw new Error(
        "Azure OpenAI API key is required. Set AZURE_OPENAI_API_KEY env var or pass apiKey in config."
      );
    }
  }
  /** @returns {string} Identifier of this provider. */
  get providerName() {
    return "azure-openai";
  }
  /**
   * Builds the options object handed to the `openai` client constructor.
   *
   * @returns {{apiKey: string, baseURL: string, defaultQuery: object, defaultHeaders: object}}
   */
  buildClientOptions() {
    // Drop trailing slashes so "https://x.openai.azure.com/" does not produce "//openai/...".
    const root = this.endpoint.replace(/\/+$/, "");
    return {
      apiKey: this.apiKey,
      baseURL: `${root}/openai/deployments/${encodeURIComponent(this.deploymentName)}`,
      defaultQuery: { "api-version": this.apiVersion },
      // Azure key authentication expects the key in the `api-key` header.
      defaultHeaders: { "api-key": this.apiKey }
    };
  }
  /**
   * Sends a single user message to the deployment and returns the reply text.
   *
   * @param {string} prompt - Content of the user message.
   * @returns {Promise<string>} Content of the first choice.
   * @throws {Error} When the first choice has no content.
   */
  async generate(prompt) {
    if (!this.client) {
      const { default: OpenAI } = await import("openai");
      this.client = new OpenAI(this.buildClientOptions());
    }
    const openai = this.client;
    const response = await openai.chat.completions.create({
      model: this.deploymentName,
      messages: [{ role: "user", content: prompt }],
      temperature: this.temperature,
      max_tokens: this.maxTokens
    });
    const content = response.choices?.[0]?.message?.content;
    if (!content) {
      throw new Error("Azure OpenAI returned an empty response");
    }
    return content;
  }
};
|
|
420
|
+
|
|
317
421
|
// src/providers/index.ts
|
|
318
422
|
var NoopProvider = class extends BaseLLMProvider {
|
|
319
423
|
constructor() {
|
|
@@ -331,6 +435,8 @@ function resolveProvider(provider) {
|
|
|
331
435
|
if (typeof process !== "undefined" && process.env) {
|
|
332
436
|
if (process.env.OPENAI_API_KEY) return new OpenAIProvider();
|
|
333
437
|
if (process.env.ANTHROPIC_API_KEY) return new AnthropicProvider();
|
|
438
|
+
if (process.env.GOOGLE_API_KEY) return new GeminiProvider();
|
|
439
|
+
if (process.env.AZURE_OPENAI_API_KEY) return new AzureOpenAIProvider();
|
|
334
440
|
}
|
|
335
441
|
return new NoopProvider();
|
|
336
442
|
}
|
|
@@ -342,6 +448,9 @@ function resolveProvider(provider) {
|
|
|
342
448
|
if (provider.startsWith("claude-")) {
|
|
343
449
|
return new AnthropicProvider({ model: provider });
|
|
344
450
|
}
|
|
451
|
+
if (provider.startsWith("gemini-")) {
|
|
452
|
+
return new GeminiProvider({ model: provider });
|
|
453
|
+
}
|
|
345
454
|
return new OllamaProvider({ model: provider });
|
|
346
455
|
}
|
|
347
456
|
return new NoopProvider();
|
|
@@ -2070,6 +2179,465 @@ var JsonCorrectnessMetric = class extends BaseMetric {
|
|
|
2070
2179
|
}
|
|
2071
2180
|
};
|
|
2072
2181
|
|
|
2182
|
+
// src/metrics/tool-correctness.ts
|
|
2183
|
+
/**
 * Returns a copy of `value` whose object keys are sorted recursively, so two
 * structurally equal parameter objects serialize to the same JSON text
 * regardless of the order in which their keys were written.
 */
function canonicalizeToolParameters(value) {
  if (Array.isArray(value)) {
    return value.map(canonicalizeToolParameters);
  }
  if (value !== null && typeof value === "object") {
    if (typeof value.toJSON === "function") {
      return canonicalizeToolParameters(value.toJSON());
    }
    return Object.fromEntries(
      Object.keys(value).sort().map((key) => [key, canonicalizeToolParameters(value[key])])
    );
  }
  return value;
}
/**
 * Deterministic (no LLM) metric: the fraction of `expectedTools` that appear
 * in `toolsCalled`, matched by name and, when `matchParameters` is enabled,
 * by input parameters as well.
 */
var ToolCorrectnessMetric = class extends BaseMetric {
  name = "Tool Correctness";
  requiredFields = ["toolsCalled", "expectedTools"];
  requiresProvider = false;
  matchParameters;
  /**
   * @param {object} [config] - Metric options; `matchParameters` (default
   *   `false`) additionally requires the input parameters to be equal.
   */
  constructor(config) {
    super({ ...config, provider: void 0 });
    this.matchParameters = config?.matchParameters ?? false;
  }
  /**
   * @param {object} testCase - Must carry `toolsCalled` and `expectedTools` arrays.
   * @returns {Promise<object>} Whatever `this.buildResult` produces for the score.
   */
  async measure(testCase) {
    this.validate(testCase);
    const start = performance.now();
    const toolsCalled = testCase.toolsCalled;
    const expectedTools = testCase.expectedTools;
    if (expectedTools.length === 0) {
      return this.buildResult(1, "No expected tools specified \u2014 trivially correct.", start);
    }
    // Group calls by name so a tool invoked several times is checked against
    // every invocation, not only the first one.
    const callsByName = new Map();
    for (const call of toolsCalled) {
      const bucket = callsByName.get(call.name);
      if (bucket) {
        bucket.push(call);
      } else {
        callsByName.set(call.name, [call]);
      }
    }
    const serialize = (params) => JSON.stringify(canonicalizeToolParameters(params));
    let matchCount = 0;
    for (const expected of expectedTools) {
      const candidates = callsByName.get(expected.name);
      if (!candidates) {
        continue;
      }
      if (!this.matchParameters) {
        matchCount++;
        continue;
      }
      const wanted = serialize(expected.inputParameters);
      if (candidates.some((call) => serialize(call.inputParameters) === wanted)) {
        matchCount++;
      }
    }
    let score = matchCount / expectedTools.length;
    score = this.applyStrictMode(score);
    const reason = score === 1 ? "All expected tools were called correctly." : `${matchCount} of ${expectedTools.length} expected tools were called correctly.`;
    return this.buildResult(score, reason, start, {
      matchCount,
      expectedCount: expectedTools.length,
      calledCount: toolsCalled.length
    });
  }
};
|
|
2225
|
+
|
|
2226
|
+
// src/metrics/task-completion.ts
|
|
2227
|
+
var import_zod11 = require("zod");
|
|
2228
|
+
|
|
2229
|
+
// src/templates/task-completion.ts
|
|
2230
|
+
/** Prompt builder for the task-completion judge. */
var TaskCompletionTemplate = {
  /**
   * Renders the judge prompt asking for a 1-5 score and a reason as JSON.
   *
   * @param {string} input - The task given to the agent.
   * @param {string} actualOutput - What the agent produced.
   * @returns {string} The complete prompt text.
   */
  evaluate(input, actualOutput) {
    const prompt = `You are an expert evaluator. Given an input task and the AI agent's actual output, evaluate how well the agent completed the task.

Score the task completion on a scale of 1 to 5:
- 1: The agent completely failed to address the task.
- 2: The agent partially addressed the task but missed critical aspects.
- 3: The agent addressed the main aspects of the task but with notable gaps.
- 4: The agent mostly completed the task with only minor issues.
- 5: The agent fully and correctly completed the task.

**
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.

Expected JSON format:
{
"score": <1-5>,
"reason": "<concise explanation for the score>"
}
**

Input Task:
${input}

Actual Output:
${actualOutput}

JSON:
`;
    return prompt;
  }
};
|
|
2261
|
+
|
|
2262
|
+
// src/metrics/task-completion.ts
|
|
2263
|
+
// Shape the judge must return: a score within [1, 5] plus a textual reason.
var evaluationSchema2 = import_zod11.z.object({
  score: import_zod11.z.number().min(1).max(5),
  reason: import_zod11.z.string()
});
/**
 * LLM-judged metric: asks the provider to rate task completion on a 1-5 scale
 * and rescales that rating to the 0-1 range.
 */
var TaskCompletionMetric = class extends BaseMetric {
  name = "Task Completion";
  requiredFields = ["input", "actualOutput"];
  /**
   * @param {object} testCase - Provides `input` and `actualOutput`.
   * @returns {Promise<object>} Result of `this.buildResult`, with the raw 1-5 score attached.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const verdict = await this.provider.generateJSON(
      TaskCompletionTemplate.evaluate(testCase.input, testCase.actualOutput),
      evaluationSchema2
    );
    const { score: rawScore, reason: llmReason } = verdict;
    // Map 1..5 onto 0..1 before strict mode is applied.
    const normalized = this.applyStrictMode((rawScore - 1) / 4);
    let reason = void 0;
    if (this.includeReason) {
      reason = llmReason;
    }
    return this.buildResult(normalized, reason, startedAt, { rawScore });
  }
};
|
|
2283
|
+
|
|
2284
|
+
// src/metrics/goal-accuracy.ts
|
|
2285
|
+
var import_zod12 = require("zod");
|
|
2286
|
+
|
|
2287
|
+
// src/templates/goal-accuracy.ts
|
|
2288
|
+
/** Prompt builder for the goal-accuracy judge. */
var GoalAccuracyTemplate = {
  /**
   * Renders the judge prompt comparing the actual output with the expected goal.
   *
   * @param {string} input - The task given to the agent.
   * @param {string} actualOutput - What the agent produced.
   * @param {string} expectedOutput - The expected goal or outcome.
   * @returns {string} The complete prompt text.
   */
  evaluate(input, actualOutput, expectedOutput) {
    const prompt = `You are an expert evaluator. Given an input task, the AI agent's actual output, and the expected goal/outcome, evaluate how accurately the agent's output achieves the expected goal.

Score the goal accuracy on a scale of 1 to 5:
- 1: The output completely fails to achieve the expected goal.
- 2: The output partially achieves the goal but misses critical elements.
- 3: The output achieves the main goal but with notable inaccuracies or omissions.
- 4: The output mostly achieves the goal with only minor deviations.
- 5: The output fully and accurately achieves the expected goal.

**
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.

Expected JSON format:
{
"score": <1-5>,
"reason": "<concise explanation for the score>"
}
**

Input Task:
${input}

Expected Goal/Outcome:
${expectedOutput}

Actual Output:
${actualOutput}

JSON:
`;
    return prompt;
  }
};
|
|
2322
|
+
|
|
2323
|
+
// src/metrics/goal-accuracy.ts
|
|
2324
|
+
// Shape the judge must return: a score within [1, 5] plus a textual reason.
var evaluationSchema3 = import_zod12.z.object({
  score: import_zod12.z.number().min(1).max(5),
  reason: import_zod12.z.string()
});
/**
 * LLM-judged metric: asks the provider how accurately the output achieves the
 * expected outcome on a 1-5 scale and rescales that rating to the 0-1 range.
 */
var GoalAccuracyMetric = class extends BaseMetric {
  name = "Goal Accuracy";
  requiredFields = ["input", "actualOutput", "expectedOutput"];
  /**
   * @param {object} testCase - Provides `input`, `actualOutput` and `expectedOutput`.
   * @returns {Promise<object>} Result of `this.buildResult`, with the raw 1-5 score attached.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const verdict = await this.provider.generateJSON(
      GoalAccuracyTemplate.evaluate(testCase.input, testCase.actualOutput, testCase.expectedOutput),
      evaluationSchema3
    );
    const { score: rawScore, reason: llmReason } = verdict;
    // Map 1..5 onto 0..1 before strict mode is applied.
    const normalized = this.applyStrictMode((rawScore - 1) / 4);
    let reason = void 0;
    if (this.includeReason) {
      reason = llmReason;
    }
    return this.buildResult(normalized, reason, startedAt, { rawScore });
  }
};
|
|
2348
|
+
|
|
2349
|
+
// src/metrics/conversation-completeness.ts
|
|
2350
|
+
var import_zod13 = require("zod");
|
|
2351
|
+
|
|
2352
|
+
// src/templates/conversation-completeness.ts
|
|
2353
|
+
/** Prompt builders for the conversation-completeness judge. */
var ConversationCompletenessTemplate = {
  /**
   * Renders the judge prompt for a whole conversation.
   *
   * @param {Array<{role: string, content: string}>} conversation - Turns, rendered as "[ROLE]: content" lines.
   * @param {string} scenario - Scenario the conversation takes place in.
   * @param {string} expectedOutcome - Outcome the conversation should reach.
   * @param {string} latestInput - Most recent user input.
   * @param {string} latestOutput - Most recent assistant response.
   * @returns {string} The complete prompt text.
   */
  evaluate(conversation, scenario, expectedOutcome, latestInput, latestOutput) {
    const renderTurn = ({ role, content }) => `[${role.toUpperCase()}]: ${content}`;
    const history = conversation.map((turn) => renderTurn(turn)).join("\n");
    return `You are evaluating whether a conversation successfully achieved its intended goal. Review the full conversation in the context of the stated scenario and expected outcome.

**Scenario:**
${scenario}

**Expected Outcome:**
${expectedOutcome}

**Conversation History:**
${history}

**Latest User Input:**
${latestInput}

**Latest Assistant Response:**
${latestOutput}

Score the conversation completeness on a scale of 1-5:
- 5: Fully complete \u2014 the conversation has completely achieved the expected outcome with all goals met.
- 4: Mostly complete \u2014 the primary goal is achieved, with minor aspects left unaddressed.
- 3: Partially complete \u2014 some progress toward the goal, but significant parts remain unresolved.
- 2: Mostly incomplete \u2014 minimal progress toward the expected outcome.
- 1: Not complete \u2014 the conversation made no meaningful progress toward the goal.

**
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.

Example JSON:
{
"score": 4,
"reason": "The conversation successfully helped the user book a flight, but did not confirm the seat preference as expected."
}
**

JSON:
`;
  },
  /**
   * Renders a follow-up prompt asking only for a reason behind a given score.
   *
   * @param {number} score - Raw score on the 1-5 scale.
   * @param {number} normalizedScore - The same score after normalization.
   * @returns {string} The complete prompt text.
   */
  generateReason(score, normalizedScore) {
    const prompt = `Given the conversation completeness score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the conversation achieved its intended goal.

**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The score is ${normalizedScore} because <your_reason>."
}
**

JSON:
`;
    return prompt;
  }
};
|
|
2408
|
+
|
|
2409
|
+
// src/metrics/conversation-completeness.ts
|
|
2410
|
+
// Shape the judge must return: a score within [1, 5] plus a textual reason.
var evaluationSchema4 = import_zod13.z.object({
  score: import_zod13.z.number().min(1).max(5),
  reason: import_zod13.z.string()
});
/**
 * LLM-judged metric: asks the provider whether a conversation reached its
 * expected outcome on a 1-5 scale and rescales that rating to the 0-1 range.
 */
var ConversationCompletenessMetric = class extends BaseMetric {
  name = "Conversation Completeness";
  requiredFields = ["input", "actualOutput"];
  /**
   * @param {object} testCase - Provides `input`, `actualOutput` and a
   *   `conversation` with `turns`, `scenario` and `expectedOutcome`.
   * @returns {Promise<object>} Result of `this.buildResult`, with the raw 1-5 score attached.
   * @throws {Error} When the conversation, its scenario/expected outcome, or its turns are missing.
   */
  async measure(testCase) {
    this.validate(testCase);
    const start = performance.now();
    const conversation = testCase.conversation;
    if (!conversation) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" field on the test case.`
      );
    }
    if (!conversation.scenario || !conversation.expectedOutcome) {
      throw new Error(
        `[${this.name}] This metric requires "scenario" and "expectedOutcome" on the conversation.`
      );
    }
    // The template maps over the turns; fail here with a clear message instead
    // of an opaque TypeError from inside the template.
    if (!Array.isArray(conversation.turns)) {
      throw new Error(
        `[${this.name}] This metric requires a "turns" array on the conversation.`
      );
    }
    const evaluation = await this.provider.generateJSON(
      ConversationCompletenessTemplate.evaluate(
        conversation.turns,
        conversation.scenario,
        conversation.expectedOutcome,
        testCase.input,
        testCase.actualOutput
      ),
      evaluationSchema4
    );
    // Map 1..5 onto 0..1 before strict mode is applied.
    let score = (evaluation.score - 1) / 4;
    score = this.applyStrictMode(score);
    const reason = this.includeReason ? evaluation.reason : void 0;
    return this.buildResult(score, reason, start, {
      rawScore: evaluation.score
    });
  }
};
|
|
2451
|
+
|
|
2452
|
+
// src/metrics/knowledge-retention.ts
|
|
2453
|
+
var import_zod14 = require("zod");
|
|
2454
|
+
|
|
2455
|
+
// src/templates/knowledge-retention.ts
|
|
2456
|
+
/** Prompt builders for the knowledge-retention judge. */
var KnowledgeRetentionTemplate = {
  /**
   * Renders the judge prompt for the latest assistant response given the history.
   *
   * @param {Array<{role: string, content: string}>} conversation - Turns, rendered as "[ROLE]: content" lines.
   * @param {string} latestInput - Most recent user input.
   * @param {string} latestOutput - Most recent assistant response.
   * @returns {string} The complete prompt text.
   */
  evaluate(conversation, latestInput, latestOutput) {
    const renderTurn = ({ role, content }) => `[${role.toUpperCase()}]: ${content}`;
    const history = conversation.map((turn) => renderTurn(turn)).join("\n");
    return `You are evaluating whether a chatbot retains and correctly uses knowledge from earlier turns in a conversation. The chatbot should remember facts, preferences, names, and other details mentioned previously and apply them consistently.

Review the full conversation history, then evaluate the LATEST assistant response for knowledge retention.

**Conversation History:**
${history}

**Latest User Input:**
${latestInput}

**Latest Assistant Response:**
${latestOutput}

Score the knowledge retention on a scale of 1-5:
- 5: Perfect retention \u2014 all previously mentioned facts, preferences, and details are correctly remembered and applied.
- 4: Good retention \u2014 most information is remembered, with only minor omissions.
- 3: Moderate retention \u2014 some important details from earlier turns are forgotten or misapplied.
- 2: Poor retention \u2014 significant information is forgotten, leading to inconsistent or contradictory responses.
- 1: No retention \u2014 the assistant appears to have no memory of previous conversation turns.

**
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.

Example JSON:
{
"score": 4,
"reason": "The assistant correctly remembered the user's name and preference for vegetarian food from earlier turns, but forgot the specific restaurant they discussed."
}
**

JSON:
`;
  },
  /**
   * Renders a follow-up prompt asking only for a reason behind a given score.
   *
   * @param {number} score - Raw score on the 1-5 scale.
   * @param {number} normalizedScore - The same score after normalization.
   * @returns {string} The complete prompt text.
   */
  generateReason(score, normalizedScore) {
    const prompt = `Given the knowledge retention score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain what the chatbot remembered or forgot from earlier conversation turns.

**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The score is ${normalizedScore} because <your_reason>."
}
**

JSON:
`;
    return prompt;
  }
};
|
|
2507
|
+
|
|
2508
|
+
// src/metrics/knowledge-retention.ts
|
|
2509
|
+
// Shape the judge must return: a score within [1, 5] plus a textual reason.
var evaluationSchema5 = import_zod14.z.object({
  score: import_zod14.z.number().min(1).max(5),
  reason: import_zod14.z.string()
});
/**
 * LLM-judged metric: asks the provider how well the latest response retains
 * knowledge from earlier turns on a 1-5 scale and rescales that rating to the
 * 0-1 range.
 */
var KnowledgeRetentionMetric = class extends BaseMetric {
  name = "Knowledge Retention";
  requiredFields = ["input", "actualOutput"];
  /**
   * @param {object} testCase - Provides `input`, `actualOutput` and a
   *   `conversation` whose `turns` array holds at least two entries.
   * @returns {Promise<object>} Result of `this.buildResult`, with the raw 1-5 score attached.
   * @throws {Error} When the conversation is missing or has fewer than two turns.
   */
  async measure(testCase) {
    this.validate(testCase);
    const start = performance.now();
    const turns = testCase.conversation?.turns;
    // A missing or non-array `turns` is reported with the same descriptive
    // error as a too-short conversation rather than a TypeError on `.length`.
    if (!Array.isArray(turns) || turns.length < 2) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" with at least 2 turns.`
      );
    }
    const evaluation = await this.provider.generateJSON(
      KnowledgeRetentionTemplate.evaluate(
        turns,
        testCase.input,
        testCase.actualOutput
      ),
      evaluationSchema5
    );
    // Map 1..5 onto 0..1 before strict mode is applied.
    let score = (evaluation.score - 1) / 4;
    score = this.applyStrictMode(score);
    const reason = this.includeReason ? evaluation.reason : void 0;
    return this.buildResult(score, reason, start, {
      rawScore: evaluation.score
    });
  }
};
|
|
2543
|
+
|
|
2544
|
+
// src/metrics/role-adherence.ts
|
|
2545
|
+
var import_zod15 = require("zod");
|
|
2546
|
+
|
|
2547
|
+
// src/templates/role-adherence.ts
|
|
2548
|
+
/** Prompt builders for the role-adherence judge. */
var RoleAdherenceTemplate = {
  /**
   * Renders the judge prompt for how well the assistant stays in its assigned role.
   *
   * @param {Array<{role: string, content: string}>} conversation - Turns, rendered as "[ROLE]: content" lines.
   * @param {string} chatbotRole - Description of the role assigned to the chatbot.
   * @param {string} latestInput - Most recent user input.
   * @param {string} latestOutput - Most recent assistant response.
   * @returns {string} The complete prompt text.
   */
  evaluate(conversation, chatbotRole, latestInput, latestOutput) {
    const renderTurn = ({ role, content }) => `[${role.toUpperCase()}]: ${content}`;
    const history = conversation.map((turn) => renderTurn(turn)).join("\n");
    return `You are evaluating whether a chatbot consistently stays in character and adheres to its assigned role throughout a conversation. The chatbot should maintain its persona, tone, expertise level, and behavioral boundaries as defined by its role.

**Assigned Role:**
${chatbotRole}

**Conversation History:**
${history}

**Latest User Input:**
${latestInput}

**Latest Assistant Response:**
${latestOutput}

Score the role adherence on a scale of 1-5:
- 5: Perfect adherence \u2014 the assistant fully embodies the assigned role in tone, knowledge, boundaries, and behavior throughout.
- 4: Good adherence \u2014 the assistant mostly stays in character, with only minor deviations.
- 3: Moderate adherence \u2014 the assistant sometimes breaks character or responds in ways inconsistent with the role.
- 2: Poor adherence \u2014 the assistant frequently acts outside the defined role, breaking immersion.
- 1: No adherence \u2014 the assistant completely ignores the assigned role.

**
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.

Example JSON:
{
"score": 4,
"reason": "The assistant maintained a professional customer service tone throughout, but briefly used technical jargon that a support agent would typically avoid."
}
**

JSON:
`;
  },
  /**
   * Renders a follow-up prompt asking only for a reason behind a given score.
   *
   * @param {number} score - Raw score on the 1-5 scale.
   * @param {number} normalizedScore - The same score after normalization.
   * @returns {string} The complete prompt text.
   */
  generateReason(score, normalizedScore) {
    const prompt = `Given the role adherence score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the chatbot stayed in its assigned character.

**
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{
"reason": "The score is ${normalizedScore} because <your_reason>."
}
**

JSON:
`;
    return prompt;
  }
};
|
|
2600
|
+
|
|
2601
|
+
// src/metrics/role-adherence.ts
|
|
2602
|
+
// Shape the judge must return: a score within [1, 5] plus a textual reason.
var evaluationSchema6 = import_zod15.z.object({
  score: import_zod15.z.number().min(1).max(5),
  reason: import_zod15.z.string()
});
/**
 * LLM-judged metric: asks the provider how well the assistant adhered to its
 * assigned role on a 1-5 scale and rescales that rating to the 0-1 range.
 */
var RoleAdherenceMetric = class extends BaseMetric {
  name = "Role Adherence";
  requiredFields = ["input", "actualOutput"];
  /**
   * @param {object} testCase - Provides `input`, `actualOutput` and a
   *   `conversation` with `turns` and `chatbotRole`.
   * @returns {Promise<object>} Result of `this.buildResult`, with the raw 1-5 score attached.
   * @throws {Error} When the conversation, its `chatbotRole`, or its turns are missing.
   */
  async measure(testCase) {
    this.validate(testCase);
    const start = performance.now();
    const conversation = testCase.conversation;
    if (!conversation) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" field on the test case.`
      );
    }
    if (!conversation.chatbotRole) {
      throw new Error(`[${this.name}] This metric requires "chatbotRole" on the conversation.`);
    }
    // The template maps over the turns; fail here with a clear message instead
    // of an opaque TypeError from inside the template.
    if (!Array.isArray(conversation.turns)) {
      throw new Error(
        `[${this.name}] This metric requires a "turns" array on the conversation.`
      );
    }
    const evaluation = await this.provider.generateJSON(
      RoleAdherenceTemplate.evaluate(
        conversation.turns,
        conversation.chatbotRole,
        testCase.input,
        testCase.actualOutput
      ),
      evaluationSchema6
    );
    // Map 1..5 onto 0..1 before strict mode is applied.
    let score = (evaluation.score - 1) / 4;
    score = this.applyStrictMode(score);
    const reason = this.includeReason ? evaluation.reason : void 0;
    return this.buildResult(score, reason, start, {
      rawScore: evaluation.score
    });
  }
};
|
|
2640
|
+
|
|
2073
2641
|
// src/config.ts
|
|
2074
2642
|
var cachedConfig = null;
|
|
2075
2643
|
async function resolveConfig(overrides = {}) {
|
|
@@ -2412,6 +2980,7 @@ function meanAveragePrecision(relevances) {
|
|
|
2412
2980
|
0 && (module.exports = {
|
|
2413
2981
|
AnswerRelevancyMetric,
|
|
2414
2982
|
AnthropicProvider,
|
|
2983
|
+
AzureOpenAIProvider,
|
|
2415
2984
|
BaseLLMProvider,
|
|
2416
2985
|
BaseMetric,
|
|
2417
2986
|
BiasMetric,
|
|
@@ -2419,14 +2988,21 @@ function meanAveragePrecision(relevances) {
|
|
|
2419
2988
|
ContextualPrecisionMetric,
|
|
2420
2989
|
ContextualRecallMetric,
|
|
2421
2990
|
ContextualRelevancyMetric,
|
|
2991
|
+
ConversationCompletenessMetric,
|
|
2422
2992
|
ExactMatchMetric,
|
|
2423
2993
|
FaithfulnessMetric,
|
|
2424
2994
|
GEval,
|
|
2995
|
+
GeminiProvider,
|
|
2996
|
+
GoalAccuracyMetric,
|
|
2425
2997
|
HallucinationMetric,
|
|
2426
2998
|
JsonCorrectnessMetric,
|
|
2999
|
+
KnowledgeRetentionMetric,
|
|
2427
3000
|
OllamaProvider,
|
|
2428
3001
|
OpenAIProvider,
|
|
3002
|
+
RoleAdherenceMetric,
|
|
2429
3003
|
SummarizationMetric,
|
|
3004
|
+
TaskCompletionMetric,
|
|
3005
|
+
ToolCorrectnessMetric,
|
|
2430
3006
|
ToxicityMetric,
|
|
2431
3007
|
assertEval,
|
|
2432
3008
|
createLimiter,
|