npm - @assay-ai/core - Versions diffs - 0.2.0-beta → 0.3.0-beta - Mend

@assay-ai/core 0.2.0-beta → 0.3.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -32,6 +32,7 @@ var index_exports = {};
 __export(index_exports, {
   AnswerRelevancyMetric: () => AnswerRelevancyMetric,
   AnthropicProvider: () => AnthropicProvider,
+  AzureOpenAIProvider: () => AzureOpenAIProvider,
   BaseLLMProvider: () => BaseLLMProvider,
   BaseMetric: () => BaseMetric,
   BiasMetric: () => BiasMetric,
@@ -39,14 +40,21 @@ __export(index_exports, {
   ContextualPrecisionMetric: () => ContextualPrecisionMetric,
   ContextualRecallMetric: () => ContextualRecallMetric,
   ContextualRelevancyMetric: () => ContextualRelevancyMetric,
+  ConversationCompletenessMetric: () => ConversationCompletenessMetric,
   ExactMatchMetric: () => ExactMatchMetric,
   FaithfulnessMetric: () => FaithfulnessMetric,
   GEval: () => GEval,
+  GeminiProvider: () => GeminiProvider,
+  GoalAccuracyMetric: () => GoalAccuracyMetric,
   HallucinationMetric: () => HallucinationMetric,
   JsonCorrectnessMetric: () => JsonCorrectnessMetric,
+  KnowledgeRetentionMetric: () => KnowledgeRetentionMetric,
   OllamaProvider: () => OllamaProvider,
   OpenAIProvider: () => OpenAIProvider,
+  RoleAdherenceMetric: () => RoleAdherenceMetric,
   SummarizationMetric: () => SummarizationMetric,
+  TaskCompletionMetric: () => TaskCompletionMetric,
+  ToolCorrectnessMetric: () => ToolCorrectnessMetric,
   ToxicityMetric: () => ToxicityMetric,
   assertEval: () => assertEval,
   createLimiter: () => createLimiter,
@@ -314,6 +322,102 @@ var OllamaProvider = class extends BaseLLMProvider {
   }
 };
+// src/providers/gemini.ts
+var DEFAULT_MODEL4 = "gemini-2.0-flash";
+var GeminiProvider = class extends BaseLLMProvider {
+  apiKey;
+  constructor(config = {}) {
+    super(config, DEFAULT_MODEL4);
+    const key = config.apiKey ?? process.env.GOOGLE_API_KEY;
+    if (!key) {
+      throw new Error(
+        "Google API key is required. Set GOOGLE_API_KEY env var or pass apiKey in config."
+      );
+    }
+    this.apiKey = key;
+  }
+  get providerName() {
+    return "gemini";
+  }
+  async generate(prompt) {
+    const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.modelName}:generateContent?key=${this.apiKey}`;
+    const response = await fetch(url, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        contents: [{ parts: [{ text: prompt }] }],
+        generationConfig: { temperature: this.temperature }
+      })
+    });
+    if (!response.ok) {
+      const errorText = await response.text();
+      throw new Error(`Gemini request failed (${response.status}): ${errorText}`);
+    }
+    const data = await response.json();
+    const content = data.candidates?.[0]?.content?.parts?.[0]?.text;
+    if (!content) {
+      throw new Error("Gemini returned an empty response");
+    }
+    return content;
+  }
+};
+// src/providers/azure-openai.ts
+var DEFAULT_API_VERSION = "2024-08-01-preview";
+var AzureOpenAIProvider = class extends BaseLLMProvider {
+  constructor(azureConfig = {}) {
+    super(
+      azureConfig,
+      azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o"
+    );
+    this.azureConfig = azureConfig;
+    this.endpoint = azureConfig.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT ?? "";
+    this.apiKey = azureConfig.apiKey ?? process.env.AZURE_OPENAI_API_KEY ?? "";
+    this.deploymentName = azureConfig.deploymentName ?? process.env.AZURE_OPENAI_DEPLOYMENT ?? "gpt-4o";
+    this.apiVersion = azureConfig.apiVersion ?? DEFAULT_API_VERSION;
+    if (!this.endpoint) {
+      throw new Error(
+        "Azure OpenAI endpoint is required. Set AZURE_OPENAI_ENDPOINT env var or pass endpoint in config."
+      );
+    }
+    if (!this.apiKey) {
+      throw new Error(
+        "Azure OpenAI API key is required. Set AZURE_OPENAI_API_KEY env var or pass apiKey in config."
+      );
+    }
+  }
+  client;
+  endpoint;
+  apiKey;
+  deploymentName;
+  apiVersion;
+  get providerName() {
+    return "azure-openai";
+  }
+  async generate(prompt) {
+    if (!this.client) {
+      const { default: OpenAI } = await import("openai");
+      this.client = new OpenAI({
+        apiKey: this.apiKey,
+        baseURL: `${this.endpoint}/openai/deployments/${this.deploymentName}`,
+        defaultQuery: { "api-version": this.apiVersion }
+      });
+    }
+    const openai = this.client;
+    const response = await openai.chat.completions.create({
+      model: this.deploymentName,
+      messages: [{ role: "user", content: prompt }],
+      temperature: this.temperature,
+      max_tokens: this.maxTokens
+    });
+    const content = response.choices[0]?.message.content;
+    if (!content) {
+      throw new Error("Azure OpenAI returned an empty response");
+    }
+    return content;
+  }
+};
 // src/providers/index.ts
 var NoopProvider = class extends BaseLLMProvider {
   constructor() {
@@ -331,6 +435,8 @@ function resolveProvider(provider) {
     if (typeof process !== "undefined" && process.env) {
       if (process.env.OPENAI_API_KEY) return new OpenAIProvider();
       if (process.env.ANTHROPIC_API_KEY) return new AnthropicProvider();
+      if (process.env.GOOGLE_API_KEY) return new GeminiProvider();
+      if (process.env.AZURE_OPENAI_API_KEY) return new AzureOpenAIProvider();
     }
     return new NoopProvider();
   }
@@ -342,6 +448,9 @@ function resolveProvider(provider) {
     if (provider.startsWith("claude-")) {
       return new AnthropicProvider({ model: provider });
     }
+    if (provider.startsWith("gemini-")) {
+      return new GeminiProvider({ model: provider });
+    }
     return new OllamaProvider({ model: provider });
   }
   return new NoopProvider();
@@ -2070,6 +2179,465 @@ var JsonCorrectnessMetric = class extends BaseMetric {
   }
 };
+// src/metrics/tool-correctness.ts
+var ToolCorrectnessMetric = class extends BaseMetric {
+  name = "Tool Correctness";
+  requiredFields = ["toolsCalled", "expectedTools"];
+  requiresProvider = false;
+  matchParameters;
+  constructor(config) {
+    super({ ...config, provider: void 0 });
+    this.matchParameters = config?.matchParameters ?? false;
+  }
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    const toolsCalled = testCase.toolsCalled;
+    const expectedTools = testCase.expectedTools;
+    if (expectedTools.length === 0) {
+      return this.buildResult(1, "No expected tools specified \u2014 trivially correct.", start);
+    }
+    const calledNames = new Set(toolsCalled.map((t) => t.name));
+    let matchCount = 0;
+    for (const expected of expectedTools) {
+      if (!calledNames.has(expected.name)) {
+        continue;
+      }
+      if (this.matchParameters) {
+        const calledTool = toolsCalled.find((t) => t.name === expected.name);
+        if (calledTool && JSON.stringify(calledTool.inputParameters) === JSON.stringify(expected.inputParameters)) {
+          matchCount++;
+        }
+      } else {
+        matchCount++;
+      }
+    }
+    let score = matchCount / expectedTools.length;
+    score = this.applyStrictMode(score);
+    const reason = score === 1 ? "All expected tools were called correctly." : `${matchCount} of ${expectedTools.length} expected tools were called correctly.`;
+    return this.buildResult(score, reason, start, {
+      matchCount,
+      expectedCount: expectedTools.length,
+      calledCount: toolsCalled.length
+    });
+  }
+};
+// src/metrics/task-completion.ts
+var import_zod11 = require("zod");
+// src/templates/task-completion.ts
+var TaskCompletionTemplate = {
+  evaluate(input, actualOutput) {
+    return `You are an expert evaluator. Given an input task and the AI agent's actual output, evaluate how well the agent completed the task.
+Score the task completion on a scale of 1 to 5:
+- 1: The agent completely failed to address the task.
+- 2: The agent partially addressed the task but missed critical aspects.
+- 3: The agent addressed the main aspects of the task but with notable gaps.
+- 4: The agent mostly completed the task with only minor issues.
+- 5: The agent fully and correctly completed the task.
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.
+Expected JSON format:
+{
+    "score": <1-5>,
+    "reason": "<concise explanation for the score>"
+}
+**
+Input Task:
+${input}
+Actual Output:
+${actualOutput}
+JSON:
+`;
+  }
+};
+// src/metrics/task-completion.ts
+var evaluationSchema2 = import_zod11.z.object({
+  score: import_zod11.z.number().min(1).max(5),
+  reason: import_zod11.z.string()
+});
+var TaskCompletionMetric = class extends BaseMetric {
+  name = "Task Completion";
+  requiredFields = ["input", "actualOutput"];
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    const { score: rawScore, reason: llmReason } = await this.provider.generateJSON(
+      TaskCompletionTemplate.evaluate(testCase.input, testCase.actualOutput),
+      evaluationSchema2
+    );
+    let score = (rawScore - 1) / 4;
+    score = this.applyStrictMode(score);
+    const reason = this.includeReason ? llmReason : void 0;
+    return this.buildResult(score, reason, start, { rawScore });
+  }
+};
+// src/metrics/goal-accuracy.ts
+var import_zod12 = require("zod");
+// src/templates/goal-accuracy.ts
+var GoalAccuracyTemplate = {
+  evaluate(input, actualOutput, expectedOutput) {
+    return `You are an expert evaluator. Given an input task, the AI agent's actual output, and the expected goal/outcome, evaluate how accurately the agent's output achieves the expected goal.
+Score the goal accuracy on a scale of 1 to 5:
+- 1: The output completely fails to achieve the expected goal.
+- 2: The output partially achieves the goal but misses critical elements.
+- 3: The output achieves the main goal but with notable inaccuracies or omissions.
+- 4: The output mostly achieves the goal with only minor deviations.
+- 5: The output fully and accurately achieves the expected goal.
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation are needed outside of the JSON. Ensure all strings are properly closed. Repair any invalid JSON before you output it.
+Expected JSON format:
+{
+    "score": <1-5>,
+    "reason": "<concise explanation for the score>"
+}
+**
+Input Task:
+${input}
+Expected Goal/Outcome:
+${expectedOutput}
+Actual Output:
+${actualOutput}
+JSON:
+`;
+  }
+};
+// src/metrics/goal-accuracy.ts
+var evaluationSchema3 = import_zod12.z.object({
+  score: import_zod12.z.number().min(1).max(5),
+  reason: import_zod12.z.string()
+});
+var GoalAccuracyMetric = class extends BaseMetric {
+  name = "Goal Accuracy";
+  requiredFields = ["input", "actualOutput", "expectedOutput"];
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    const { score: rawScore, reason: llmReason } = await this.provider.generateJSON(
+      GoalAccuracyTemplate.evaluate(
+        testCase.input,
+        testCase.actualOutput,
+        testCase.expectedOutput
+      ),
+      evaluationSchema3
+    );
+    let score = (rawScore - 1) / 4;
+    score = this.applyStrictMode(score);
+    const reason = this.includeReason ? llmReason : void 0;
+    return this.buildResult(score, reason, start, { rawScore });
+  }
+};
+// src/metrics/conversation-completeness.ts
+var import_zod13 = require("zod");
+// src/templates/conversation-completeness.ts
+var ConversationCompletenessTemplate = {
+  evaluate(conversation, scenario, expectedOutcome, latestInput, latestOutput) {
+    const formattedTurns = conversation.map((t) => `[${t.role.toUpperCase()}]: ${t.content}`).join("\n");
+    return `You are evaluating whether a conversation successfully achieved its intended goal. Review the full conversation in the context of the stated scenario and expected outcome.
+**Scenario:**
+${scenario}
+**Expected Outcome:**
+${expectedOutcome}
+**Conversation History:**
+${formattedTurns}
+**Latest User Input:**
+${latestInput}
+**Latest Assistant Response:**
+${latestOutput}
+Score the conversation completeness on a scale of 1-5:
+- 5: Fully complete \u2014 the conversation has completely achieved the expected outcome with all goals met.
+- 4: Mostly complete \u2014 the primary goal is achieved, with minor aspects left unaddressed.
+- 3: Partially complete \u2014 some progress toward the goal, but significant parts remain unresolved.
+- 2: Mostly incomplete \u2014 minimal progress toward the expected outcome.
+- 1: Not complete \u2014 the conversation made no meaningful progress toward the goal.
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.
+Example JSON:
+{
+    "score": 4,
+    "reason": "The conversation successfully helped the user book a flight, but did not confirm the seat preference as expected."
+}
+**
+JSON:
+`;
+  },
+  generateReason(score, normalizedScore) {
+    return `Given the conversation completeness score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the conversation achieved its intended goal.
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{
+    "reason": "The score is ${normalizedScore} because <your_reason>."
+}
+**
+JSON:
+`;
+  }
+};
+// src/metrics/conversation-completeness.ts
+var evaluationSchema4 = import_zod13.z.object({
+  score: import_zod13.z.number().min(1).max(5),
+  reason: import_zod13.z.string()
+});
+var ConversationCompletenessMetric = class extends BaseMetric {
+  name = "Conversation Completeness";
+  requiredFields = ["input", "actualOutput"];
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    if (!testCase.conversation) {
+      throw new Error(
+        `[${this.name}] This metric requires a "conversation" field on the test case.`
+      );
+    }
+    if (!testCase.conversation.scenario || !testCase.conversation.expectedOutcome) {
+      throw new Error(
+        `[${this.name}] This metric requires "scenario" and "expectedOutcome" on the conversation.`
+      );
+    }
+    const evaluation = await this.provider.generateJSON(
+      ConversationCompletenessTemplate.evaluate(
+        testCase.conversation.turns,
+        testCase.conversation.scenario,
+        testCase.conversation.expectedOutcome,
+        testCase.input,
+        testCase.actualOutput
+      ),
+      evaluationSchema4
+    );
+    let score = (evaluation.score - 1) / 4;
+    score = this.applyStrictMode(score);
+    let reason;
+    if (this.includeReason) {
+      reason = evaluation.reason;
+    }
+    return this.buildResult(score, reason, start, {
+      rawScore: evaluation.score
+    });
+  }
+};
+// src/metrics/knowledge-retention.ts
+var import_zod14 = require("zod");
+// src/templates/knowledge-retention.ts
+var KnowledgeRetentionTemplate = {
+  evaluate(conversation, latestInput, latestOutput) {
+    const formattedTurns = conversation.map((t) => `[${t.role.toUpperCase()}]: ${t.content}`).join("\n");
+    return `You are evaluating whether a chatbot retains and correctly uses knowledge from earlier turns in a conversation. The chatbot should remember facts, preferences, names, and other details mentioned previously and apply them consistently.
+Review the full conversation history, then evaluate the LATEST assistant response for knowledge retention.
+**Conversation History:**
+${formattedTurns}
+**Latest User Input:**
+${latestInput}
+**Latest Assistant Response:**
+${latestOutput}
+Score the knowledge retention on a scale of 1-5:
+- 5: Perfect retention \u2014 all previously mentioned facts, preferences, and details are correctly remembered and applied.
+- 4: Good retention \u2014 most information is remembered, with only minor omissions.
+- 3: Moderate retention \u2014 some important details from earlier turns are forgotten or misapplied.
+- 2: Poor retention \u2014 significant information is forgotten, leading to inconsistent or contradictory responses.
+- 1: No retention \u2014 the assistant appears to have no memory of previous conversation turns.
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.
+Example JSON:
+{
+    "score": 4,
+    "reason": "The assistant correctly remembered the user's name and preference for vegetarian food from earlier turns, but forgot the specific restaurant they discussed."
+}
+**
+JSON:
+`;
+  },
+  generateReason(score, normalizedScore) {
+    return `Given the knowledge retention score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain what the chatbot remembered or forgot from earlier conversation turns.
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{
+    "reason": "The score is ${normalizedScore} because <your_reason>."
+}
+**
+JSON:
+`;
+  }
+};
+// src/metrics/knowledge-retention.ts
+var evaluationSchema5 = import_zod14.z.object({
+  score: import_zod14.z.number().min(1).max(5),
+  reason: import_zod14.z.string()
+});
+var KnowledgeRetentionMetric = class extends BaseMetric {
+  name = "Knowledge Retention";
+  requiredFields = ["input", "actualOutput"];
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    if (!testCase.conversation || testCase.conversation.turns.length < 2) {
+      throw new Error(
+        `[${this.name}] This metric requires a "conversation" with at least 2 turns.`
+      );
+    }
+    const evaluation = await this.provider.generateJSON(
+      KnowledgeRetentionTemplate.evaluate(
+        testCase.conversation.turns,
+        testCase.input,
+        testCase.actualOutput
+      ),
+      evaluationSchema5
+    );
+    let score = (evaluation.score - 1) / 4;
+    score = this.applyStrictMode(score);
+    let reason;
+    if (this.includeReason) {
+      reason = evaluation.reason;
+    }
+    return this.buildResult(score, reason, start, {
+      rawScore: evaluation.score
+    });
+  }
+};
+// src/metrics/role-adherence.ts
+var import_zod15 = require("zod");
+// src/templates/role-adherence.ts
+var RoleAdherenceTemplate = {
+  evaluate(conversation, chatbotRole, latestInput, latestOutput) {
+    const formattedTurns = conversation.map((t) => `[${t.role.toUpperCase()}]: ${t.content}`).join("\n");
+    return `You are evaluating whether a chatbot consistently stays in character and adheres to its assigned role throughout a conversation. The chatbot should maintain its persona, tone, expertise level, and behavioral boundaries as defined by its role.
+**Assigned Role:**
+${chatbotRole}
+**Conversation History:**
+${formattedTurns}
+**Latest User Input:**
+${latestInput}
+**Latest Assistant Response:**
+${latestOutput}
+Score the role adherence on a scale of 1-5:
+- 5: Perfect adherence \u2014 the assistant fully embodies the assigned role in tone, knowledge, boundaries, and behavior throughout.
+- 4: Good adherence \u2014 the assistant mostly stays in character, with only minor deviations.
+- 3: Moderate adherence \u2014 the assistant sometimes breaks character or responds in ways inconsistent with the role.
+- 2: Poor adherence \u2014 the assistant frequently acts outside the defined role, breaking immersion.
+- 1: No adherence \u2014 the assistant completely ignores the assigned role.
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "score" and "reason" keys. No words or explanation outside the JSON.
+Example JSON:
+{
+    "score": 4,
+    "reason": "The assistant maintained a professional customer service tone throughout, but briefly used technical jargon that a support agent would typically avoid."
+}
+**
+JSON:
+`;
+  },
+  generateReason(score, normalizedScore) {
+    return `Given the role adherence score of ${score}/5 (normalized: ${normalizedScore}), provide a CONCISE reason for this score. Explain how well the chatbot stayed in its assigned character.
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{
+    "reason": "The score is ${normalizedScore} because <your_reason>."
+}
+**
+JSON:
+`;
+  }
+};
+// src/metrics/role-adherence.ts
+var evaluationSchema6 = import_zod15.z.object({
+  score: import_zod15.z.number().min(1).max(5),
+  reason: import_zod15.z.string()
+});
+var RoleAdherenceMetric = class extends BaseMetric {
+  name = "Role Adherence";
+  requiredFields = ["input", "actualOutput"];
+  async measure(testCase) {
+    this.validate(testCase);
+    const start = performance.now();
+    if (!testCase.conversation) {
+      throw new Error(
+        `[${this.name}] This metric requires a "conversation" field on the test case.`
+      );
+    }
+    if (!testCase.conversation.chatbotRole) {
+      throw new Error(`[${this.name}] This metric requires "chatbotRole" on the conversation.`);
+    }
+    const evaluation = await this.provider.generateJSON(
+      RoleAdherenceTemplate.evaluate(
+        testCase.conversation.turns,
+        testCase.conversation.chatbotRole,
+        testCase.input,
+        testCase.actualOutput
+      ),
+      evaluationSchema6
+    );
+    let score = (evaluation.score - 1) / 4;
+    score = this.applyStrictMode(score);
+    let reason;
+    if (this.includeReason) {
+      reason = evaluation.reason;
+    }
+    return this.buildResult(score, reason, start, {
+      rawScore: evaluation.score
+    });
+  }
+};
 // src/config.ts
 var cachedConfig = null;
 async function resolveConfig(overrides = {}) {
@@ -2412,6 +2980,7 @@ function meanAveragePrecision(relevances) {
 0 && (module.exports = {
   AnswerRelevancyMetric,
   AnthropicProvider,
+  AzureOpenAIProvider,
   BaseLLMProvider,
   BaseMetric,
   BiasMetric,
@@ -2419,14 +2988,21 @@ function meanAveragePrecision(relevances) {
   ContextualPrecisionMetric,
   ContextualRecallMetric,
   ContextualRelevancyMetric,
+  ConversationCompletenessMetric,
   ExactMatchMetric,
   FaithfulnessMetric,
   GEval,
+  GeminiProvider,
+  GoalAccuracyMetric,
   HallucinationMetric,
   JsonCorrectnessMetric,
+  KnowledgeRetentionMetric,
   OllamaProvider,
   OpenAIProvider,
+  RoleAdherenceMetric,
   SummarizationMetric,
+  TaskCompletionMetric,
+  ToolCorrectnessMetric,
   ToxicityMetric,
   assertEval,
   createLimiter,