@assay-ai/core 0.2.0-beta → 0.3.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -250,6 +250,102 @@ var OllamaProvider = class extends BaseLLMProvider {
250
250
  }
251
251
  };
252
252
 
253
+ // src/providers/gemini.ts
254
/** Model name handed to the base provider as the default for Gemini. */
var DEFAULT_MODEL4 = "gemini-2.0-flash";

/**
 * LLM provider that calls the Google Gemini `generateContent` REST endpoint
 * through the global `fetch`.
 */
var GeminiProvider = class extends BaseLLMProvider {
  apiKey;

  /**
   * @param config Provider options. `apiKey` is used when present; otherwise
   *   the GOOGLE_API_KEY environment variable is consulted.
   * @throws {Error} When no API key can be resolved from either source.
   */
  constructor(config = {}) {
    super(config, DEFAULT_MODEL4);
    // Guard the `process` lookup so that environments without a Node-style
    // `process` global reach the descriptive error below instead of a
    // ReferenceError.
    const envKey = typeof process !== "undefined" ? process.env?.GOOGLE_API_KEY : void 0;
    const key = config.apiKey ?? envKey;
    if (!key) {
      throw new Error(
        "Google API key is required. Set GOOGLE_API_KEY env var or pass apiKey in config."
      );
    }
    this.apiKey = key;
  }

  /** Identifier reported for this provider. */
  get providerName() {
    return "gemini";
  }

  /**
   * Sends a single-turn prompt to Gemini and returns the generated text.
   *
   * @param prompt Text sent as the only content part of the request.
   * @returns The text of the first candidate, with all of its text parts concatenated.
   * @throws {Error} When the HTTP response is not OK or no text was returned.
   */
  async generate(prompt) {
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.modelName}:generateContent`;
    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // The key travels in a header rather than the query string so it does
        // not become part of URLs recorded by logs, proxies or error reports.
        "x-goog-api-key": this.apiKey
      },
      body: JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        // NOTE(review): other providers in this bundle forward `this.maxTokens`;
        // it is not forwarded here. Confirm whether that is intentional.
        generationConfig: { temperature: this.temperature }
      })
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Gemini request failed (${response.status}): ${errorText}`);
    }
    const data = await response.json();
    // A candidate may carry several parts; join every textual part instead of
    // reading only the first one.
    const parts = data.candidates?.[0]?.content?.parts;
    const content = Array.isArray(parts)
      ? parts
          .map((part) => part?.text)
          .filter((text) => typeof text === "string")
          .join("")
      : "";
    if (!content) {
      throw new Error("Gemini returned an empty response");
    }
    return content;
  }
};
292
+
293
+ // src/providers/azure-openai.ts
294
/** API version sent as the `api-version` query parameter when the config does not name one. */
var DEFAULT_API_VERSION = "2024-08-01-preview";

/**
 * LLM provider that talks to an Azure OpenAI deployment through the `openai`
 * client package, which is imported lazily on the first `generate` call.
 */
var AzureOpenAIProvider = class extends BaseLLMProvider {
  /**
   * @param azureConfig Provider options. `endpoint`, `apiKey` and
   *   `deploymentName` fall back to the AZURE_OPENAI_ENDPOINT,
   *   AZURE_OPENAI_API_KEY and AZURE_OPENAI_DEPLOYMENT environment variables;
   *   the deployment finally defaults to "gpt-4o".
   * @throws {Error} When the endpoint or the API key cannot be resolved.
   */
  constructor(azureConfig = {}) {
    super(
      azureConfig,
      azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o"
    );
    this.azureConfig = azureConfig;
    this.endpoint = azureConfig.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT ?? "";
    this.apiKey = azureConfig.apiKey ?? process.env.AZURE_OPENAI_API_KEY ?? "";
    this.deploymentName = azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o";
    this.apiVersion = azureConfig.apiVersion ?? DEFAULT_API_VERSION;
    if (!this.endpoint) {
      throw new Error(
        "Azure OpenAI endpoint is required. Set AZURE_OPENAI_ENDPOINT env var or pass endpoint in config."
      );
    }
    if (!this.apiKey) {
      throw new Error(
        "Azure OpenAI API key is required. Set AZURE_OPENAI_API_KEY env var or pass apiKey in config."
      );
    }
  }

  // Lazily created `openai` client; stays undefined until the first generate().
  client;
  endpoint;
  apiKey;
  deploymentName;
  apiVersion;

  /** Identifier reported for this provider. */
  get providerName() {
    return "azure-openai";
  }

  /**
   * Sends a single user message to the configured deployment.
   *
   * @param prompt Text sent as the only chat message (role "user").
   * @returns The message content of the first returned choice.
   * @throws {Error} When the first choice carries no content.
   */
  async generate(prompt) {
    if (!this.client) {
      const { default: OpenAI } = await import("openai");
      // Drop trailing slashes so an endpoint such as "https://x.openai.azure.com/"
      // does not produce a "//openai/..." path.
      const baseEndpoint = this.endpoint.replace(/\/+$/, "");
      this.client = new OpenAI({
        apiKey: this.apiKey,
        baseURL: `${baseEndpoint}/openai/deployments/${this.deploymentName}`,
        defaultQuery: { "api-version": this.apiVersion },
        // Azure OpenAI key authentication reads the key from the `api-key`
        // request header, so send it there in addition to the client's own
        // `apiKey` option.
        defaultHeaders: { "api-key": this.apiKey }
      });
    }
    const openai = this.client;
    const response = await openai.chat.completions.create({
      model: this.deploymentName,
      messages: [{ role: "user", content: prompt }],
      temperature: this.temperature,
      max_tokens: this.maxTokens
    });
    // Null-safe all the way down so a choice without a message is reported as
    // an empty response rather than a TypeError.
    const content = response.choices?.[0]?.message?.content;
    if (!content) {
      throw new Error("Azure OpenAI returned an empty response");
    }
    return content;
  }
};
348
+
253
349
  // src/providers/index.ts
254
350
  var NoopProvider = class extends BaseLLMProvider {
255
351
  constructor() {
@@ -267,6 +363,8 @@ function resolveProvider(provider) {
267
363
  if (typeof process !== "undefined" && process.env) {
268
364
  if (process.env.OPENAI_API_KEY) return new OpenAIProvider();
269
365
  if (process.env.ANTHROPIC_API_KEY) return new AnthropicProvider();
366
+ if (process.env.GOOGLE_API_KEY) return new GeminiProvider();
367
+ if (process.env.AZURE_OPENAI_API_KEY) return new AzureOpenAIProvider();
270
368
  }
271
369
  return new NoopProvider();
272
370
  }
@@ -278,6 +376,9 @@ function resolveProvider(provider) {
278
376
  if (provider.startsWith("claude-")) {
279
377
  return new AnthropicProvider({ model: provider });
280
378
  }
379
+ if (provider.startsWith("gemini-")) {
380
+ return new GeminiProvider({ model: provider });
381
+ }
281
382
  return new OllamaProvider({ model: provider });
282
383
  }
283
384
  return new NoopProvider();
@@ -2006,6 +2107,465 @@ var JsonCorrectnessMetric = class extends BaseMetric {
2006
2107
  }
2007
2108
  };
2008
2109
 
2110
+ // src/metrics/tool-correctness.ts
2111
/**
 * Serializes tool input parameters to JSON with the keys of every plain object
 * sorted, so two parameter objects that differ only in key order produce the
 * same string. Like `JSON.stringify`, it returns `undefined` for `undefined`.
 */
function canonicalToolParams(params) {
  return JSON.stringify(params, (_key, value) => {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return value;
    }
    // Null-prototype target so a literal "__proto__" key is copied as data.
    const ordered = Object.create(null);
    for (const key of Object.keys(value).sort()) {
      ordered[key] = value[key];
    }
    return ordered;
  });
}

/**
 * Deterministic metric (no LLM provider) that scores the fraction of expected
 * tools that appear among the tools actually called.
 */
var ToolCorrectnessMetric = class extends BaseMetric {
  name = "Tool Correctness";
  requiredFields = ["toolsCalled", "expectedTools"];
  requiresProvider = false;
  matchParameters;

  /**
   * @param config Metric options. `matchParameters` (default `false`) also
   *   requires a called tool's `inputParameters` to equal the expected ones.
   *   Any `provider` in the config is discarded.
   */
  constructor(config) {
    super({ ...config, provider: void 0 });
    this.matchParameters = config?.matchParameters ?? false;
  }

  /**
   * Compares `testCase.toolsCalled` against `testCase.expectedTools`.
   *
   * An expected tool counts as matched when some call has the same `name` and,
   * if `matchParameters` is on, the same `inputParameters` regardless of key
   * order. Every call with that name is considered, not only the first.
   *
   * @param testCase Test case carrying `toolsCalled` and `expectedTools` arrays.
   * @returns The value produced by `buildResult`; the score passed to it is
   *   `matchCount / expectedTools.length` after `applyStrictMode`, or 1 when
   *   no tools are expected.
   */
  async measure(testCase) {
    this.validate(testCase);
    const start = performance.now();
    const toolsCalled = testCase.toolsCalled;
    const expectedTools = testCase.expectedTools;
    if (expectedTools.length === 0) {
      return this.buildResult(1, "No expected tools specified \u2014 trivially correct.", start);
    }
    let matchCount = 0;
    for (const expected of expectedTools) {
      const sameNameCalls = toolsCalled.filter((t) => t.name === expected.name);
      if (sameNameCalls.length === 0) {
        continue;
      }
      if (!this.matchParameters) {
        matchCount++;
        continue;
      }
      const wanted = canonicalToolParams(expected.inputParameters);
      if (sameNameCalls.some((t) => canonicalToolParams(t.inputParameters) === wanted)) {
        matchCount++;
      }
    }
    let score = matchCount / expectedTools.length;
    score = this.applyStrictMode(score);
    const reason = score === 1 ? "All expected tools were called correctly." : `${matchCount} of ${expectedTools.length} expected tools were called correctly.`;
    return this.buildResult(score, reason, start, {
      matchCount,
      expectedCount: expectedTools.length,
      calledCount: toolsCalled.length
    });
  }
};
2153
+
2154
+ // src/metrics/task-completion.ts
2155
+ import { z as z11 } from "zod";
2156
+
2157
+ // src/templates/task-completion.ts
2158
/** Prompt builder for the task-completion judge. */
var TaskCompletionTemplate = {
  /**
   * Renders the prompt asking a judge model to rate, from 1 to 5, how well
   * the agent's output completed the input task and to answer as JSON with
   * "score" and "reason" keys.
   *
   * @param input The task given to the agent.
   * @param actualOutput The agent's output for that task.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  evaluate(input, actualOutput) {
    const promptLines = [
      `You are an expert evaluator. Given an input task and the AI agent's actual output, evaluate how well the agent completed the task.`,
      ``,
      `Score the task completion on a scale of 1 to 5:`,
      `- 1: The agent completely failed to address the task.`,
      `- 2: The agent partially addressed the task but missed critical aspects.`,
      `- 3: The agent addressed the main aspects of the task but with notable gaps.`,
      `- 4: The agent mostly completed the task with only minor issues.`,
      `- 5: The agent fully and correctly completed the task.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.`,
      ``,
      `Expected JSON format:`,
      `{`,
      `"score": <1-5>,`,
      `"reason": "<concise explanation for the score>"`,
      `}`,
      `**`,
      ``,
      `Input Task:`,
      `${input}`,
      ``,
      `Actual Output:`,
      `${actualOutput}`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  }
};
2189
+
2190
+ // src/metrics/task-completion.ts
2191
// Shape the judge's JSON reply must have: a numeric score within [1, 5] and a textual reason.
var evaluationSchema2 = z11.object({
  score: z11.number().min(1).max(5),
  reason: z11.string()
});

/**
 * LLM-judged metric rating how well `actualOutput` completes the task in `input`.
 */
var TaskCompletionMetric = class extends BaseMetric {
  name = "Task Completion";
  requiredFields = ["input", "actualOutput"];

  /**
   * Asks the provider for a 1-5 rating and maps it linearly onto 0-1.
   *
   * @param testCase Test case carrying `input` and `actualOutput`.
   * @returns The value produced by `buildResult`, with the judge's raw score
   *   passed as `rawScore` in the details object.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const verdict = await this.provider.generateJSON(
      TaskCompletionTemplate.evaluate(testCase.input, testCase.actualOutput),
      evaluationSchema2
    );
    const rawScore = verdict.score;
    const llmReason = verdict.reason;
    // 1 maps to 0 and 5 maps to 1.
    const normalized = this.applyStrictMode((rawScore - 1) / 4);
    let reason;
    if (this.includeReason) {
      reason = llmReason;
    }
    return this.buildResult(normalized, reason, startedAt, { rawScore });
  }
};
2211
+
2212
+ // src/metrics/goal-accuracy.ts
2213
+ import { z as z12 } from "zod";
2214
+
2215
+ // src/templates/goal-accuracy.ts
2216
/** Prompt builder for the goal-accuracy judge. */
var GoalAccuracyTemplate = {
  /**
   * Renders the prompt asking a judge model to rate, from 1 to 5, how
   * accurately the agent's output achieves the expected goal and to answer as
   * JSON with "score" and "reason" keys.
   *
   * @param input The task given to the agent.
   * @param actualOutput The agent's output for that task.
   * @param expectedOutput The expected goal or outcome.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  evaluate(input, actualOutput, expectedOutput) {
    const promptLines = [
      `You are an expert evaluator. Given an input task, the AI agent's actual output, and the expected goal/outcome, evaluate how accurately the agent's output achieves the expected goal.`,
      ``,
      `Score the goal accuracy on a scale of 1 to 5:`,
      `- 1: The output completely fails to achieve the expected goal.`,
      `- 2: The output partially achieves the goal but misses critical elements.`,
      `- 3: The output achieves the main goal but with notable inaccuracies or omissions.`,
      `- 4: The output mostly achieves the goal with only minor deviations.`,
      `- 5: The output fully and accurately achieves the expected goal.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.`,
      ``,
      `Expected JSON format:`,
      `{`,
      `"score": <1-5>,`,
      `"reason": "<concise explanation for the score>"`,
      `}`,
      `**`,
      ``,
      `Input Task:`,
      `${input}`,
      ``,
      `Expected Goal/Outcome:`,
      `${expectedOutput}`,
      ``,
      `Actual Output:`,
      `${actualOutput}`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  }
};
2250
+
2251
+ // src/metrics/goal-accuracy.ts
2252
// Shape the judge's JSON reply must have: a numeric score within [1, 5] and a textual reason.
var evaluationSchema3 = z12.object({
  score: z12.number().min(1).max(5),
  reason: z12.string()
});

/**
 * LLM-judged metric rating how accurately `actualOutput` achieves `expectedOutput`.
 */
var GoalAccuracyMetric = class extends BaseMetric {
  name = "Goal Accuracy";
  requiredFields = ["input", "actualOutput", "expectedOutput"];

  /**
   * Asks the provider for a 1-5 rating and maps it linearly onto 0-1.
   *
   * @param testCase Test case carrying `input`, `actualOutput` and `expectedOutput`.
   * @returns The value produced by `buildResult`, with the judge's raw score
   *   passed as `rawScore` in the details object.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const verdict = await this.provider.generateJSON(
      GoalAccuracyTemplate.evaluate(testCase.input, testCase.actualOutput, testCase.expectedOutput),
      evaluationSchema3
    );
    const rawScore = verdict.score;
    const llmReason = verdict.reason;
    // 1 maps to 0 and 5 maps to 1.
    const normalized = this.applyStrictMode((rawScore - 1) / 4);
    let reason;
    if (this.includeReason) {
      reason = llmReason;
    }
    return this.buildResult(normalized, reason, startedAt, { rawScore });
  }
};
2276
+
2277
+ // src/metrics/conversation-completeness.ts
2278
+ import { z as z13 } from "zod";
2279
+
2280
+ // src/templates/conversation-completeness.ts
2281
/** Prompt builders for the conversation-completeness judge. */
var ConversationCompletenessTemplate = {
  /**
   * Renders the prompt asking a judge model to rate, from 1 to 5, whether the
   * conversation achieved the expected outcome for the given scenario.
   *
   * @param conversation Turns, each with `role` and `content`; rendered one
   *   per line as "[ROLE]: content" with the role upper-cased.
   * @param scenario Description of the scenario.
   * @param expectedOutcome Outcome the conversation was meant to reach.
   * @param latestInput Most recent user input.
   * @param latestOutput Most recent assistant response.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  evaluate(conversation, scenario, expectedOutcome, latestInput, latestOutput) {
    const renderTurn = (turn) => `[${turn.role.toUpperCase()}]: ${turn.content}`;
    const history = conversation.map(renderTurn).join("\n");
    const promptLines = [
      `You are evaluating whether a conversation successfully achieved its intended goal. Review the full conversation in the context of the stated scenario and expected outcome.`,
      ``,
      `**Scenario:**`,
      `${scenario}`,
      ``,
      `**Expected Outcome:**`,
      `${expectedOutcome}`,
      ``,
      `**Conversation History:**`,
      `${history}`,
      ``,
      `**Latest User Input:**`,
      `${latestInput}`,
      ``,
      `**Latest Assistant Response:**`,
      `${latestOutput}`,
      ``,
      `Score the conversation completeness on a scale of 1-5:`,
      `- 5: Fully complete \u2014 the conversation has completely achieved the expected outcome with all goals met.`,
      `- 4: Mostly complete \u2014 the primary goal is achieved, with minor aspects left unaddressed.`,
      `- 3: Partially complete \u2014 some progress toward the goal, but significant parts remain unresolved.`,
      `- 2: Mostly incomplete \u2014 minimal progress toward the expected outcome.`,
      `- 1: Not complete \u2014 the conversation made no meaningful progress toward the goal.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.`,
      ``,
      `Example JSON:`,
      `{`,
      `"score": 4,`,
      `"reason": "The conversation successfully helped the user book a flight, but did not confirm the seat preference as expected."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  },

  /**
   * Renders the prompt asking a judge model to justify an already-computed score.
   *
   * @param score Raw score, shown as "<score>/5".
   * @param normalizedScore Normalized score, shown twice in the prompt.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  generateReason(score, normalizedScore) {
    const promptLines = [
      `Given the conversation completeness score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the conversation achieved its intended goal.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.`,
      `Example JSON:`,
      `{`,
      `"reason": "The score is ${normalizedScore} because <your_reason>."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  }
};
2336
+
2337
+ // src/metrics/conversation-completeness.ts
2338
// Shape the judge's JSON reply must have: a numeric score within [1, 5] and a textual reason.
var evaluationSchema4 = z13.object({
  score: z13.number().min(1).max(5),
  reason: z13.string()
});

/**
 * LLM-judged metric rating whether a conversation reached its expected outcome.
 */
var ConversationCompletenessMetric = class extends BaseMetric {
  name = "Conversation Completeness";
  requiredFields = ["input", "actualOutput"];

  /**
   * Asks the provider for a 1-5 rating of the conversation and maps it
   * linearly onto 0-1.
   *
   * @param testCase Test case carrying `input`, `actualOutput` and a
   *   `conversation` with `turns`, `scenario` and `expectedOutcome`.
   * @returns The value produced by `buildResult`, with the judge's raw score
   *   passed as `rawScore` in the details object.
   * @throws {Error} When `conversation`, `scenario` or `expectedOutcome` is missing.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const convo = testCase.conversation;
    if (!convo) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" field on the test case.`
      );
    }
    const hasGoal = convo.scenario && convo.expectedOutcome;
    if (!hasGoal) {
      throw new Error(
        `[${this.name}] This metric requires "scenario" and "expectedOutcome" on the conversation.`
      );
    }
    const verdict = await this.provider.generateJSON(
      ConversationCompletenessTemplate.evaluate(
        convo.turns,
        convo.scenario,
        convo.expectedOutcome,
        testCase.input,
        testCase.actualOutput
      ),
      evaluationSchema4
    );
    // 1 maps to 0 and 5 maps to 1.
    const normalized = this.applyStrictMode((verdict.score - 1) / 4);
    const reason = this.includeReason ? verdict.reason : void 0;
    return this.buildResult(normalized, reason, startedAt, {
      rawScore: verdict.score
    });
  }
};
2379
+
2380
+ // src/metrics/knowledge-retention.ts
2381
+ import { z as z14 } from "zod";
2382
+
2383
+ // src/templates/knowledge-retention.ts
2384
/** Prompt builders for the knowledge-retention judge. */
var KnowledgeRetentionTemplate = {
  /**
   * Renders the prompt asking a judge model to rate, from 1 to 5, how well
   * the latest assistant response retains knowledge from earlier turns.
   *
   * @param conversation Turns, each with `role` and `content`; rendered one
   *   per line as "[ROLE]: content" with the role upper-cased.
   * @param latestInput Most recent user input.
   * @param latestOutput Most recent assistant response.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  evaluate(conversation, latestInput, latestOutput) {
    const renderTurn = (turn) => `[${turn.role.toUpperCase()}]: ${turn.content}`;
    const history = conversation.map(renderTurn).join("\n");
    const promptLines = [
      `You are evaluating whether a chatbot retains and correctly uses knowledge from earlier turns in a conversation. The chatbot should remember facts, preferences, names, and other details mentioned previously and apply them consistently.`,
      ``,
      `Review the full conversation history, then evaluate the LATEST assistant response for knowledge retention.`,
      ``,
      `**Conversation History:**`,
      `${history}`,
      ``,
      `**Latest User Input:**`,
      `${latestInput}`,
      ``,
      `**Latest Assistant Response:**`,
      `${latestOutput}`,
      ``,
      `Score the knowledge retention on a scale of 1-5:`,
      `- 5: Perfect retention \u2014 all previously mentioned facts, preferences, and details are correctly remembered and applied.`,
      `- 4: Good retention \u2014 most information is remembered, with only minor omissions.`,
      `- 3: Moderate retention \u2014 some important details from earlier turns are forgotten or misapplied.`,
      `- 2: Poor retention \u2014 significant information is forgotten, leading to inconsistent or contradictory responses.`,
      `- 1: No retention \u2014 the assistant appears to have no memory of previous conversation turns.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.`,
      ``,
      `Example JSON:`,
      `{`,
      `"score": 4,`,
      `"reason": "The assistant correctly remembered the user's name and preference for vegetarian food from earlier turns, but forgot the specific restaurant they discussed."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  },

  /**
   * Renders the prompt asking a judge model to justify an already-computed score.
   *
   * @param score Raw score, shown as "<score>/5".
   * @param normalizedScore Normalized score, shown twice in the prompt.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  generateReason(score, normalizedScore) {
    const promptLines = [
      `Given the knowledge retention score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain what the chatbot remembered or forgot from earlier conversation turns.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.`,
      `Example JSON:`,
      `{`,
      `"reason": "The score is ${normalizedScore} because <your_reason>."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  }
};
2435
+
2436
+ // src/metrics/knowledge-retention.ts
2437
// Shape the judge's JSON reply must have: a numeric score within [1, 5] and a textual reason.
var evaluationSchema5 = z14.object({
  score: z14.number().min(1).max(5),
  reason: z14.string()
});

/**
 * LLM-judged metric rating how well the latest response retains knowledge
 * from earlier conversation turns.
 */
var KnowledgeRetentionMetric = class extends BaseMetric {
  name = "Knowledge Retention";
  requiredFields = ["input", "actualOutput"];

  /**
   * Asks the provider for a 1-5 rating and maps it linearly onto 0-1.
   *
   * @param testCase Test case carrying `input`, `actualOutput` and a
   *   `conversation` whose `turns` has at least two entries.
   * @returns The value produced by `buildResult`, with the judge's raw score
   *   passed as `rawScore` in the details object.
   * @throws {Error} When `conversation` is missing or has fewer than 2 turns.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const convo = testCase.conversation;
    const tooShort = convo ? convo.turns.length < 2 : true;
    if (tooShort) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" with at least 2 turns.`
      );
    }
    const verdict = await this.provider.generateJSON(
      KnowledgeRetentionTemplate.evaluate(convo.turns, testCase.input, testCase.actualOutput),
      evaluationSchema5
    );
    // 1 maps to 0 and 5 maps to 1.
    const normalized = this.applyStrictMode((verdict.score - 1) / 4);
    const reason = this.includeReason ? verdict.reason : void 0;
    return this.buildResult(normalized, reason, startedAt, {
      rawScore: verdict.score
    });
  }
};
2471
+
2472
+ // src/metrics/role-adherence.ts
2473
+ import { z as z15 } from "zod";
2474
+
2475
+ // src/templates/role-adherence.ts
2476
/** Prompt builders for the role-adherence judge. */
var RoleAdherenceTemplate = {
  /**
   * Renders the prompt asking a judge model to rate, from 1 to 5, how
   * consistently the chatbot adheres to its assigned role.
   *
   * @param conversation Turns, each with `role` and `content`; rendered one
   *   per line as "[ROLE]: content" with the role upper-cased.
   * @param chatbotRole Description of the role assigned to the chatbot.
   * @param latestInput Most recent user input.
   * @param latestOutput Most recent assistant response.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  evaluate(conversation, chatbotRole, latestInput, latestOutput) {
    const renderTurn = (turn) => `[${turn.role.toUpperCase()}]: ${turn.content}`;
    const history = conversation.map(renderTurn).join("\n");
    const promptLines = [
      `You are evaluating whether a chatbot consistently stays in character and adheres to its assigned role throughout a conversation. The chatbot should maintain its persona, tone, expertise level, and behavioral boundaries as defined by its role.`,
      ``,
      `**Assigned Role:**`,
      `${chatbotRole}`,
      ``,
      `**Conversation History:**`,
      `${history}`,
      ``,
      `**Latest User Input:**`,
      `${latestInput}`,
      ``,
      `**Latest Assistant Response:**`,
      `${latestOutput}`,
      ``,
      `Score the role adherence on a scale of 1-5:`,
      `- 5: Perfect adherence \u2014 the assistant fully embodies the assigned role in tone, knowledge, boundaries, and behavior throughout.`,
      `- 4: Good adherence \u2014 the assistant mostly stays in character, with only minor deviations.`,
      `- 3: Moderate adherence \u2014 the assistant sometimes breaks character or responds in ways inconsistent with the role.`,
      `- 2: Poor adherence \u2014 the assistant frequently acts outside the defined role, breaking immersion.`,
      `- 1: No adherence \u2014 the assistant completely ignores the assigned role.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.`,
      ``,
      `Example JSON:`,
      `{`,
      `"score": 4,`,
      `"reason": "The assistant maintained a professional customer service tone throughout, but briefly used technical jargon that a support agent would typically avoid."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  },

  /**
   * Renders the prompt asking a judge model to justify an already-computed score.
   *
   * @param score Raw score, shown as "<score>/5".
   * @param normalizedScore Normalized score, shown twice in the prompt.
   * @returns The prompt text, ending with a "JSON:" line and a newline.
   */
  generateReason(score, normalizedScore) {
    const promptLines = [
      `Given the role adherence score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the chatbot stayed in its assigned character.`,
      ``,
      `**`,
      `IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.`,
      `Example JSON:`,
      `{`,
      `"reason": "The score is ${normalizedScore} because <your_reason>."`,
      `}`,
      `**`,
      ``,
      `JSON:`,
      ``
    ];
    return promptLines.join("\n");
  }
};
2528
+
2529
+ // src/metrics/role-adherence.ts
2530
// Shape the judge's JSON reply must have: a numeric score within [1, 5] and a textual reason.
var evaluationSchema6 = z15.object({
  score: z15.number().min(1).max(5),
  reason: z15.string()
});

/**
 * LLM-judged metric rating how well the chatbot adheres to its assigned role.
 */
var RoleAdherenceMetric = class extends BaseMetric {
  name = "Role Adherence";
  requiredFields = ["input", "actualOutput"];

  /**
   * Asks the provider for a 1-5 rating and maps it linearly onto 0-1.
   *
   * @param testCase Test case carrying `input`, `actualOutput` and a
   *   `conversation` with `turns` and `chatbotRole`.
   * @returns The value produced by `buildResult`, with the judge's raw score
   *   passed as `rawScore` in the details object.
   * @throws {Error} When `conversation` or its `chatbotRole` is missing.
   */
  async measure(testCase) {
    this.validate(testCase);
    const startedAt = performance.now();
    const convo = testCase.conversation;
    if (!convo) {
      throw new Error(
        `[${this.name}] This metric requires a "conversation" field on the test case.`
      );
    }
    if (!convo.chatbotRole) {
      throw new Error(`[${this.name}] This metric requires "chatbotRole" on the conversation.`);
    }
    const verdict = await this.provider.generateJSON(
      RoleAdherenceTemplate.evaluate(
        convo.turns,
        convo.chatbotRole,
        testCase.input,
        testCase.actualOutput
      ),
      evaluationSchema6
    );
    // 1 maps to 0 and 5 maps to 1.
    const normalized = this.applyStrictMode((verdict.score - 1) / 4);
    const reason = this.includeReason ? verdict.reason : void 0;
    return this.buildResult(normalized, reason, startedAt, {
      rawScore: verdict.score
    });
  }
};
2568
+
2009
2569
  // src/config.ts
2010
2570
  var cachedConfig = null;
2011
2571
  async function resolveConfig(overrides = {}) {
@@ -2347,6 +2907,7 @@ function meanAveragePrecision(relevances) {
2347
2907
  export {
2348
2908
  AnswerRelevancyMetric,
2349
2909
  AnthropicProvider,
2910
+ AzureOpenAIProvider,
2350
2911
  BaseLLMProvider,
2351
2912
  BaseMetric,
2352
2913
  BiasMetric,
@@ -2354,14 +2915,21 @@ export {
2354
2915
  ContextualPrecisionMetric,
2355
2916
  ContextualRecallMetric,
2356
2917
  ContextualRelevancyMetric,
2918
+ ConversationCompletenessMetric,
2357
2919
  ExactMatchMetric,
2358
2920
  FaithfulnessMetric,
2359
2921
  GEval,
2922
+ GeminiProvider,
2923
+ GoalAccuracyMetric,
2360
2924
  HallucinationMetric,
2361
2925
  JsonCorrectnessMetric,
2926
+ KnowledgeRetentionMetric,
2362
2927
  OllamaProvider,
2363
2928
  OpenAIProvider,
2929
+ RoleAdherenceMetric,
2364
2930
  SummarizationMetric,
2931
+ TaskCompletionMetric,
2932
+ ToolCorrectnessMetric,
2365
2933
  ToxicityMetric,
2366
2934
  assertEval,
2367
2935
  createLimiter,