npm - @arizeai/phoenix-evals - Versions diffs - 0.2.0 → 0.2.2 - Mend

@arizeai/phoenix-evals 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92)

package/README.md +30 -1
package/dist/esm/llm/ClassificationEvaluator.d.ts +17 -0
package/dist/esm/llm/ClassificationEvaluator.d.ts.map +1 -0
package/dist/esm/llm/ClassificationEvaluator.js +33 -0
package/dist/esm/llm/ClassificationEvaluator.js.map +1 -0
package/dist/esm/llm/LLMEvaluator.d.ts +12 -0
package/dist/esm/llm/LLMEvaluator.d.ts.map +1 -0
package/dist/esm/llm/LLMEvaluator.js +13 -0
package/dist/esm/llm/LLMEvaluator.js.map +1 -0
package/dist/esm/llm/createClassificationEvaluator.d.ts +3 -2
package/dist/esm/llm/createClassificationEvaluator.d.ts.map +1 -1
package/dist/esm/llm/createClassificationEvaluator.js +2 -7
package/dist/esm/llm/createClassificationEvaluator.js.map +1 -1
package/dist/esm/llm/createClassifierFn.d.ts +1 -1
package/dist/esm/llm/createClassifierFn.d.ts.map +1 -1
package/dist/esm/llm/createClassifierFn.js.map +1 -1
package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts +5 -4
package/dist/esm/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
package/dist/esm/llm/createDocumentRelevancyEvaluator.js.map +1 -1
package/dist/esm/llm/createHallucinationEvaluator.d.ts +5 -4
package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
package/dist/esm/llm/generateClassification.d.ts +2 -2
package/dist/esm/llm/generateClassification.d.ts.map +1 -1
package/dist/esm/llm/generateClassification.js.map +1 -1
package/dist/esm/template/getTemplateVariables.d.ts +12 -0
package/dist/esm/template/getTemplateVariables.d.ts.map +1 -0
package/dist/esm/template/getTemplateVariables.js +18 -0
package/dist/esm/template/getTemplateVariables.js.map +1 -0
package/dist/esm/template/index.d.ts +1 -0
package/dist/esm/template/index.d.ts.map +1 -1
package/dist/esm/template/index.js +1 -0
package/dist/esm/template/index.js.map +1 -1
package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
package/dist/esm/types/evals.d.ts +13 -6
package/dist/esm/types/evals.d.ts.map +1 -1
package/dist/esm/types/otel.d.ts +2 -2
package/dist/esm/types/otel.d.ts.map +1 -1
package/dist/esm/types/prompts.d.ts +2 -16
package/dist/esm/types/prompts.d.ts.map +1 -1
package/dist/src/llm/ClassificationEvaluator.d.ts +17 -0
package/dist/src/llm/ClassificationEvaluator.d.ts.map +1 -0
package/dist/src/llm/ClassificationEvaluator.js +34 -0
package/dist/src/llm/ClassificationEvaluator.js.map +1 -0
package/dist/src/llm/LLMEvaluator.d.ts +12 -0
package/dist/src/llm/LLMEvaluator.d.ts.map +1 -0
package/dist/src/llm/LLMEvaluator.js +15 -0
package/dist/src/llm/LLMEvaluator.js.map +1 -0
package/dist/src/llm/createClassificationEvaluator.d.ts +3 -2
package/dist/src/llm/createClassificationEvaluator.d.ts.map +1 -1
package/dist/src/llm/createClassificationEvaluator.js +2 -7
package/dist/src/llm/createClassificationEvaluator.js.map +1 -1
package/dist/src/llm/createClassifierFn.d.ts +1 -1
package/dist/src/llm/createClassifierFn.d.ts.map +1 -1
package/dist/src/llm/createClassifierFn.js.map +1 -1
package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts +5 -4
package/dist/src/llm/createDocumentRelevancyEvaluator.d.ts.map +1 -1
package/dist/src/llm/createDocumentRelevancyEvaluator.js.map +1 -1
package/dist/src/llm/createHallucinationEvaluator.d.ts +5 -4
package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
package/dist/src/llm/generateClassification.d.ts +2 -2
package/dist/src/llm/generateClassification.d.ts.map +1 -1
package/dist/src/llm/generateClassification.js.map +1 -1
package/dist/src/template/getTemplateVariables.d.ts +12 -0
package/dist/src/template/getTemplateVariables.d.ts.map +1 -0
package/dist/src/template/getTemplateVariables.js +24 -0
package/dist/src/template/getTemplateVariables.js.map +1 -0
package/dist/src/template/index.d.ts +1 -0
package/dist/src/template/index.d.ts.map +1 -1
package/dist/src/template/index.js +1 -0
package/dist/src/template/index.js.map +1 -1
package/dist/src/types/evals.d.ts +13 -6
package/dist/src/types/evals.d.ts.map +1 -1
package/dist/src/types/otel.d.ts +2 -2
package/dist/src/types/otel.d.ts.map +1 -1
package/dist/src/types/prompts.d.ts +2 -16
package/dist/src/types/prompts.d.ts.map +1 -1
package/dist/tsconfig.tsbuildinfo +1 -1
package/package.json +3 -3
package/src/llm/ClassificationEvaluator.ts +40 -0
package/src/llm/LLMEvaluator.ts +22 -0
package/src/llm/createClassificationEvaluator.ts +7 -10
package/src/llm/createClassifierFn.ts +4 -4
package/src/llm/createDocumentRelevancyEvaluator.ts +11 -7
package/src/llm/createHallucinationEvaluator.ts +8 -7
package/src/llm/generateClassification.ts +16 -14
package/src/template/getTemplateVariables.ts +22 -0
package/src/template/index.ts +1 -0
package/src/types/evals.ts +18 -7
package/src/types/otel.ts +2 -2
package/src/types/prompts.ts +2 -17

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@arizeai/phoenix-evals",
-  "version": "0.2.0",
+  "version": "0.2.2",
   "description": "A library for running evaluations for AI use cases",
   "main": "dist/src/index.js",
   "module": "dist/esm/index.js",
@@ -50,7 +50,7 @@
     "typedoc": "^0.27.9",
     "typescript": "^5.8.2",
     "vitest": "^2.1.9",
-    "@arizeai/phoenix-client": "3.0.0"
+    "@arizeai/phoenix-client": "4.2.0"
   },
   "engines": {
     "node": ">=18"
@@ -65,7 +65,7 @@
     "clean": "rimraf dist",
     "prebuild": "pnpm run clean",
     "build": "tsc --build tsconfig.json tsconfig.esm.json && tsc-alias -p tsconfig.esm.json",
-    "postbuild": "echo '{\"type\": \"module\"}' > ./dist/esm/package.json && rimraf dist/test dist/examples",
+    "postbuild": "echo '{\"type\": \"module\"}' > ./dist/esm/package.json",
     "type:check": "tsc --noEmit",
     "test": "vitest --typecheck",
     "docs": "typedoc",

package/src/llm/ClassificationEvaluator.ts ADDED Viewed

@@ -0,0 +1,40 @@
+import {
+  CreateClassificationEvaluatorArgs,
+  EvaluatorFn,
+  Template,
+} from "../types";
+import { createClassifierFn } from "./createClassifierFn";
+import { LLMEvaluator } from "./LLMEvaluator";
+import { getTemplateVariables } from "../template";
+/**
+ * An LLM evaluator that performs evaluation via classification
+ */
+export class ClassificationEvaluator<
+  RecordType extends Record<string, unknown>,
+> extends LLMEvaluator<RecordType> {
+  readonly evaluatorFn: EvaluatorFn<RecordType>;
+  readonly promptTemplate: Template;
+  private _promptTemplateVariables: string[] | undefined;
+  constructor(args: CreateClassificationEvaluatorArgs) {
+    super(args);
+    this.promptTemplate = args.promptTemplate;
+    this.evaluatorFn = createClassifierFn<RecordType>(args);
+  }
+  evaluate = (example: RecordType) => {
+    return this.evaluatorFn(example);
+  };
+  /**
+   * List out the prompt template variables needed to perform evaluation
+   */
+  get promptTemplateVariables(): string[] {
+    // Use dynamic programming to see if it's computed already
+    if (!Array.isArray(this._promptTemplateVariables)) {
+      this._promptTemplateVariables = getTemplateVariables({
+        template: this.promptTemplate,
+      });
+    }
+    // Give a copy of the variables
+    return [...this._promptTemplateVariables];
+  }
+}

package/src/llm/LLMEvaluator.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import {
+  EvaluationResult,
+  Evaluator,
+  OptimizationDirection,
+  CreateEvaluatorArgs,
+} from "../types";
+/**
+ * Base class for llm evaluation metrics / scores
+ */
+export abstract class LLMEvaluator<RecordType extends Record<string, unknown>>
+  implements Evaluator<RecordType>
+{
+  readonly name: string;
+  readonly source = "LLM" as const;
+  readonly optimizationDirection?: OptimizationDirection;
+  constructor({ name, optimizationDirection }: CreateEvaluatorArgs) {
+    this.name = name;
+    this.optimizationDirection = optimizationDirection;
+  }
+  abstract evaluate(_example: RecordType): Promise<EvaluationResult>;
+}

package/src/llm/createClassificationEvaluator.ts CHANGED Viewed

@@ -1,13 +1,10 @@
-import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
-import { createClassifierFn } from "./createClassifierFn";
+import { CreateClassificationEvaluatorArgs } from "../types/evals";
+import { ClassificationEvaluator } from "./ClassificationEvaluator";
 export function createClassificationEvaluator<
-  ExampleType extends Record<string, unknown>,
->(args: CreateClassificationEvaluatorArgs): Evaluator<ExampleType> {
-  return {
-    name: args.name,
-    source: "LLM",
-    optimizationDirection: args.optimizationDirection,
-    evaluate: createClassifierFn(args),
-  };
+  RecordType extends Record<string, unknown>,
+>(
+  args: CreateClassificationEvaluatorArgs
+): ClassificationEvaluator<RecordType> {
+  return new ClassificationEvaluator<RecordType>(args);
 }

package/src/llm/createClassifierFn.ts CHANGED Viewed

@@ -24,12 +24,12 @@ function choicesToLabels(
 /**
  * A function that serves as a factory that will output a classification evaluator function
  */
-export function createClassifierFn<ExampleType extends Record<string, unknown>>(
-  args: CreateClassifierArgs
-): EvaluatorFn<ExampleType> {
+export function createClassifierFn<
+  RecordToEvaluate extends Record<string, unknown>,
+>(args: CreateClassifierArgs): EvaluatorFn<RecordToEvaluate> {
   const { model, choices, promptTemplate, ...rest } = args;
-  return async (args: ExampleType): Promise<EvaluationResult> => {
+  return async (args: RecordToEvaluate): Promise<EvaluationResult> => {
     const templateVariables = {
       ...args,
     };

package/src/llm/createDocumentRelevancyEvaluator.ts CHANGED Viewed

@@ -17,12 +17,13 @@ export interface DocumentRelevancyEvaluatorArgs
 }
 /**
- * An example to be evaluated by the document relevancy evaluator.
+ * A record to be evaluated by the document relevancy evaluator.
  */
-export type DocumentRelevancyExample = {
+export interface DocumentRelevancyEvaluationRecord {
   input: string;
   documentText: string;
-};
+  [key: string]: unknown;
+}
 /**
  * Creates a document relevancy evaluator function.
@@ -50,9 +51,12 @@ export type DocumentRelevancyExample = {
  * console.log(result.label); // "relevant" or "unrelated"
  * ```
  */
-export function createDocumentRelevancyEvaluator(
-  args: DocumentRelevancyEvaluatorArgs
-): Evaluator<DocumentRelevancyExample> {
+export function createDocumentRelevancyEvaluator<
+  RecordType extends Record<
+    string,
+    unknown
+  > = DocumentRelevancyEvaluationRecord,
+>(args: DocumentRelevancyEvaluatorArgs): Evaluator<RecordType> {
   const {
     choices = DOCUMENT_RELEVANCY_CHOICES,
     promptTemplate = DOCUMENT_RELEVANCY_TEMPLATE,
@@ -61,7 +65,7 @@ export function createDocumentRelevancyEvaluator(
     ...rest
   } = args;
-  return createClassificationEvaluator<DocumentRelevancyExample>({
+  return createClassificationEvaluator<RecordType>({
     ...args,
     promptTemplate,
     choices,

package/src/llm/createHallucinationEvaluator.ts CHANGED Viewed

@@ -1,9 +1,10 @@
-import { CreateClassificationEvaluatorArgs, Evaluator } from "../types/evals";
+import { CreateClassificationEvaluatorArgs } from "../types/evals";
 import {
   HALLUCINATION_TEMPLATE,
   HALLUCINATION_CHOICES,
 } from "../default_templates/HALLUCINATION_TEMPLATE";
 import { createClassificationEvaluator } from "./createClassificationEvaluator";
+import { ClassificationEvaluator } from "./ClassificationEvaluator";
 export interface HallucinationEvaluatorArgs
   extends Omit<
@@ -17,9 +18,9 @@ export interface HallucinationEvaluatorArgs
 }
 /**
- * An example to be evaluated by the hallucination evaluator.
+ * A record to be evaluated by the hallucination evaluator.
  */
-export type HallucinationExample = {
+export type HallucinationEvaluationRecord = {
   input: string;
   output: string;
   reference?: string;
@@ -31,9 +32,9 @@ export type HallucinationExample = {
  * @param args - The arguments for creating the hallucination evaluator.
  * @returns A function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
  */
-export function createHallucinationEvaluator(
-  args: HallucinationEvaluatorArgs
-): Evaluator<HallucinationExample> {
+export function createHallucinationEvaluator<
+  RecordType extends Record<string, unknown> = HallucinationEvaluationRecord,
+>(args: HallucinationEvaluatorArgs): ClassificationEvaluator<RecordType> {
   const {
     choices = HALLUCINATION_CHOICES,
     promptTemplate = HALLUCINATION_TEMPLATE,
@@ -41,7 +42,7 @@ export function createHallucinationEvaluator(
     name = "hallucination",
     ...rest
   } = args;
-  return createClassificationEvaluator<HallucinationExample>({
+  return createClassificationEvaluator<RecordType>({
     ...args,
     promptTemplate,
     choices,

package/src/llm/generateClassification.ts CHANGED Viewed

@@ -4,20 +4,22 @@ import type { WithPrompt } from "../types/prompts";
 import { generateObject } from "ai";
 import { z } from "zod";
 import { tracer } from "../telemetry";
-export interface ClassifyArgs extends WithLLM, WithPrompt, WithTelemetry {
-  /**
-   * The labels to classify the example into. E.x. ["correct", "incorrect"]
-   */
-  labels: [string, ...string[]];
-  /**
-   * The name of the schema for generating the label and explanation.
-   */
-  schemaName?: string;
-  /**
-   * The description of the schema for generating the label and explanation.
-   */
-  schemaDescription?: string;
-}
+export type ClassifyArgs = WithLLM &
+  WithTelemetry &
+  WithPrompt & {
+    /**
+     * The labels to classify the example into. E.x. ["correct", "incorrect"]
+     */
+    labels: [string, ...string[]];
+    /**
+     * The name of the schema for generating the label and explanation.
+     */
+    schemaName?: string;
+    /**
+     * The description of the schema for generating the label and explanation.
+     */
+    schemaDescription?: string;
+  };
 /**
  * A function that leverages an llm to perform a classification
  */

package/src/template/getTemplateVariables.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import { Template } from "../types/templating";
+import Mustache from "mustache";
+type GetTemplateVariableArgs = {
+  template: Template;
+};
+/**
+ * Parse out the template variables of a prompt
+ * @param {GetTemplateVariableArgs} args
+ * @returns {string[]} a list of prompt template variables
+ */
+export function getTemplateVariables(args: GetTemplateVariableArgs): string[] {
+  const { template } = args;
+  const templateSpans = Mustache.parse(template);
+  return templateSpans.reduce((acc, templateSpan) => {
+    const [spanType, value] = templateSpan;
+    if (spanType === "name" && typeof value === "string") {
+      acc = [...acc, value];
+    }
+    return acc;
+  }, [] as string[]);
+}

package/src/template/index.ts CHANGED Viewed

@@ -1 +1,2 @@
 export * from "./applyTemplate";
+export * from "./getTemplateVariables";

package/src/types/evals.ts CHANGED Viewed

@@ -15,6 +15,7 @@ export interface WithLLM {
   model: LanguageModel;
 }
+// eslint-disable-next-line @typescript-eslint/no-empty-object-type
 export interface LLMEvaluationArgs extends WithLLM {}
 /**
@@ -78,8 +79,7 @@ export interface CreateClassifierArgs extends WithTelemetry {
   promptTemplate: string;
 }
-export interface CreateClassificationEvaluatorArgs
-  extends CreateClassifierArgs {
+export interface CreateEvaluatorArgs {
   /**
    * The name of the metric that the evaluator produces
    * E.x. "correctness"
@@ -92,6 +92,10 @@ export interface CreateClassificationEvaluatorArgs
   optimizationDirection?: OptimizationDirection;
 }
+export interface CreateClassificationEvaluatorArgs
+  extends CreateClassifierArgs,
+    CreateEvaluatorArgs {}
 export type EvaluatorFn<ExampleType extends Record<string, unknown>> = (
   args: ExampleType
 ) => Promise<EvaluationResult>;
@@ -99,19 +103,18 @@ export type EvaluatorFn<ExampleType extends Record<string, unknown>> = (
 /**
  * The source of the evaluation
  */
-type EvaluationSource = "LLM" | "CODE";
+export type EvaluationSource = "LLM" | "CODE";
 /**
  * The direction to optimize the numeric evaluation score
  * E.x. "MAXIMIZE" means that the higher the score, the better the evaluation
  */
-type OptimizationDirection = "MAXIMIZE" | "MINIMIZE";
+export type OptimizationDirection = "MAXIMIZE" | "MINIMIZE";
 /**
- * The Base Evaluator interface
- * This is the interface that all evaluators must implement
+ * The description of an evaluator
  */
-export interface Evaluator<ExampleType extends Record<string, unknown>> {
+interface EvaluatorDescription {
   /**
    * The name of the evaluator / the metric that it measures
    */
@@ -125,6 +128,14 @@ export interface Evaluator<ExampleType extends Record<string, unknown>> {
    * E.x. "MAXIMIZE" means that the higher the score, the better the evaluation
    */
   optimizationDirection?: OptimizationDirection;
+}
+/**
+ * The Base Evaluator interface
+ * This is the interface that all evaluators must implement
+ */
+export interface Evaluator<ExampleType extends Record<string, unknown>>
+  extends EvaluatorDescription {
   /**
    * The function that evaluates the example
    */

package/src/types/otel.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { Tracer } from "@opentelemetry/api";
-export interface WithTelemetry {
+export type WithTelemetry = {
   telemetry?: {
     /**
      * Whether OpenTelemetry is enabled on the call.
@@ -14,4 +14,4 @@ export interface WithTelemetry {
      */
     tracer?: Tracer;
   };
-}
+};

package/src/types/prompts.ts CHANGED Viewed

@@ -1,22 +1,7 @@
-import { ModelMessage } from "ai";
+import type { Prompt } from "ai";
 /**
  * Prompt part of the AI function options for model generation.
  * It contains a system message, a simple text prompt, or a list of model messages.
  * Uses ModelMessage format compatible with AI SDK v5 generateObject function.
  */
-export interface WithPrompt {
-  /**
-   * System message to include in the prompt. Can be used with `prompt` or `messages`.
-   */
-  system?: string;
-  /**
-   * A simple text prompt. You can either use `prompt` or `messages` but not both.
-   */
-  prompt?: string;
-  /**
-   * A list of model messages. You can either use `prompt` or `messages` but not both.
-   * Uses ModelMessage format for compatibility with AI SDK v5.
-   */
-  messages?: Array<ModelMessage>;
-}
+export type WithPrompt = Prompt;