npm - @arizeai/phoenix-evals - Versions diffs - 0.7.0 → 0.9.0 - Mend

@arizeai/phoenix-evals 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107)

package/dist/esm/llm/createToolResponseHandlingEvaluator.js ADDED Viewed

@@ -0,0 +1,59 @@
+import { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
+import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
+/**
+ * Creates a tool response handling evaluator function.
+ *
+ * This function returns an evaluator that determines whether an AI agent properly
+ * handled a tool's response, including error handling, data extraction,
+ * transformation, and safe information disclosure.
+ *
+ * @param args - The arguments for creating the tool response handling evaluator. `args.name` and `args.optimizationDirection` are also optional and default to the matching fields of TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.
+ * @param args.model - The model to use for classification.
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
+ * @param args.promptTemplate - The prompt template to use (defaults to TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.template).
+ * @param args.telemetry - The telemetry to use for the evaluator.
+ *
+ * @returns An evaluator function that takes a {@link ToolResponseHandlingEvaluationRecord}
+ * and returns a classification result indicating whether the tool response handling
+ * is correct or incorrect.
+ *
+ * @example
+ * ```ts
+ * const evaluator = createToolResponseHandlingEvaluator({ model: openai("gpt-4o-mini") });
+ *
+ * // Example: Correct extraction from tool result
+ * const result = await evaluator.evaluate({
+ *   input: "What's the weather in Seattle?",
+ *   toolCall: 'get_weather(location="Seattle")',
+ *   toolResult: JSON.stringify({
+ *     temperature: 58,
+ *     unit: "fahrenheit",
+ *     conditions: "partly cloudy"
+ *   }),
+ *   output: "The weather in Seattle is 58°F and partly cloudy."
+ * });
+ * console.log(result.label); // "correct"
+ *
+ * // Example: Hallucinated data (incorrect)
+ * const resultHallucinated = await evaluator.evaluate({
+ *   input: "What restaurants are nearby?",
+ *   toolCall: 'search_restaurants(location="downtown")',
+ *   toolResult: JSON.stringify({
+ *     results: [{ name: "Cafe Luna", rating: 4.2 }]
+ *   }),
+ *   output: "I found Cafe Luna (4.2 stars) and Mario's Italian (4.8 stars) nearby."
+ * });
+ * console.log(resultHallucinated.label); // "incorrect" - Mario's was hallucinated
+ * ```
+ */
+export function createToolResponseHandlingEvaluator(args) {
+    const { choices = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
+    return createClassificationEvaluator({
+        ...rest, // every other caller-supplied option is forwarded unchanged
+        promptTemplate,
+        choices,
+        optimizationDirection,
+        name,
+    });
+}
+//# sourceMappingURL=createToolResponseHandlingEvaluator.js.map

package/dist/esm/llm/createToolResponseHandlingEvaluator.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"createToolResponseHandlingEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolResponseHandlingEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sDAAsD,EAAE,MAAM,oCAAoC,CAAC;AAI5G,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAsChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AACH,MAAM,UAAU,mCAAmC,CAIjD,IAAmD;IAEnD,MAAM,EACJ,OAAO,GAAG,sDAAsD,CAAC,OAAO,EACxE,cAAc,GAAG,sDAAsD,CAAC,QAAQ,EAChF,qBAAqB,GAAG,sDAAsD,CAAC,qBAAqB,EACpG,IAAI,GAAG,sDAAsD,CAAC,IAAI,EAClE,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}

package/dist/esm/llm/createToolSelectionEvaluator.d.ts ADDED Viewed

@@ -0,0 +1,64 @@
+import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
+import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
+export interface ToolSelectionEvaluatorArgs<RecordType extends Record<string, unknown> = ToolSelectionEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> { // the four keys omitted above are re-declared below as optional
+    optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
+    name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
+    choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
+    promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
+}
+/**
+ * A record to be evaluated by the tool selection evaluator; this is the default `RecordType` of {@link ToolSelectionEvaluatorArgs} and {@link createToolSelectionEvaluator}.
+ */
+export type ToolSelectionEvaluationRecord = {
+    /**
+     * The input query or conversation context.
+     */
+    input: string;
+    /**
+     * The available tools that the LLM could use.
+     */
+    availableTools: string;
+    /**
+     * The tool or tools selected by the LLM.
+     */
+    toolSelection: string;
+};
+/**
+ * Creates a tool selection evaluator function.
+ *
+ * This function returns an evaluator that determines whether the correct tool
+ * was selected for a given context. Unlike the tool invocation evaluator which
+ * checks if the tool was called correctly with proper arguments, this evaluator
+ * focuses on whether the right tool was chosen in the first place.
+ *
+ * The evaluator checks for:
+ * - Whether the LLM chose the best available tool for the user query
+ * - Whether the tool name exists in the available tools list
+ * - Whether the correct number of tools were selected for the task
+ * - Whether the tool selection is safe and appropriate
+ *
+ * @param args - The arguments for creating the tool selection evaluator. `args.name` and `args.optimizationDirection` are also optional and default to the matching fields of TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.
+ * @param args.model - The model to use for classification.
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
+ * @param args.promptTemplate - The prompt template to use (defaults to TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template).
+ * @param args.telemetry - The telemetry to use for the evaluator.
+ *
+ * @returns An evaluator function that takes a {@link ToolSelectionEvaluationRecord} and returns
+ * a classification result indicating whether the tool selection is correct or incorrect.
+ *
+ * @example
+ * ```ts
+ * const evaluator = createToolSelectionEvaluator({ model: openai("gpt-4o-mini") });
+ *
+ * const result = await evaluator.evaluate({
+ *   input: "User: What is the weather in San Francisco?",
+ *   availableTools: `WeatherTool: Get the current weather for a location.
+ * NewsTool: Stay connected to global events with our up-to-date news around the world.
+ * MusicTool: Create playlists, search for music, and check the latest music trends.`,
+ *   toolSelection: "WeatherTool"
+ * });
+ * console.log(result.label); // "correct" or "incorrect"
+ * ```
+ */
+export declare function createToolSelectionEvaluator<RecordType extends Record<string, unknown> = ToolSelectionEvaluationRecord>(args: ToolSelectionEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
+//# sourceMappingURL=createToolSelectionEvaluator.d.ts.map

package/dist/esm/llm/createToolSelectionEvaluator.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"createToolSelectionEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolSelectionEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,0BAA0B,CACzC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,CAC1E,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,6BAA6B,GAAG;IAC1C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,EAE1E,IAAI,EAAE,0BAA0B,CAAC,UAAU,CAAC,GAC3C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}

package/dist/esm/llm/createToolSelectionEvaluator.js ADDED Viewed

@@ -0,0 +1,50 @@
+import { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
+import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
+/**
+ * Creates a tool selection evaluator function.
+ *
+ * This function returns an evaluator that determines whether the correct tool
+ * was selected for a given context. Unlike the tool invocation evaluator which
+ * checks if the tool was called correctly with proper arguments, this evaluator
+ * focuses on whether the right tool was chosen in the first place.
+ *
+ * The evaluator checks for:
+ * - Whether the LLM chose the best available tool for the user query
+ * - Whether the tool name exists in the available tools list
+ * - Whether the correct number of tools were selected for the task
+ * - Whether the tool selection is safe and appropriate
+ *
+ * @param args - The arguments for creating the tool selection evaluator. `args.name` and `args.optimizationDirection` are also optional and default to the matching fields of TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.
+ * @param args.model - The model to use for classification.
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
+ * @param args.promptTemplate - The prompt template to use (defaults to TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template).
+ * @param args.telemetry - The telemetry to use for the evaluator.
+ *
+ * @returns An evaluator function that takes a {@link ToolSelectionEvaluationRecord} and returns
+ * a classification result indicating whether the tool selection is correct or incorrect.
+ *
+ * @example
+ * ```ts
+ * const evaluator = createToolSelectionEvaluator({ model: openai("gpt-4o-mini") });
+ *
+ * const result = await evaluator.evaluate({
+ *   input: "User: What is the weather in San Francisco?",
+ *   availableTools: `WeatherTool: Get the current weather for a location.
+ * NewsTool: Stay connected to global events with our up-to-date news around the world.
+ * MusicTool: Create playlists, search for music, and check the latest music trends.`,
+ *   toolSelection: "WeatherTool"
+ * });
+ * console.log(result.label); // "correct" or "incorrect"
+ * ```
+ */
+export function createToolSelectionEvaluator(args) {
+    const { choices = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
+    return createClassificationEvaluator({
+        ...rest, // every other caller-supplied option is forwarded unchanged
+        promptTemplate,
+        choices,
+        optimizationDirection,
+        name,
+    });
+}
+//# sourceMappingURL=createToolSelectionEvaluator.js.map

package/dist/esm/llm/createToolSelectionEvaluator.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"createToolSelectionEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolSelectionEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,8CAA8C,EAAE,MAAM,oCAAoC,CAAC;AAIpG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAgChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,MAAM,UAAU,4BAA4B,CAG1C,IAA4C;IAE5C,MAAM,EACJ,OAAO,GAAG,8CAA8C,CAAC,OAAO,EAChE,cAAc,GAAG,8CAA8C,CAAC,QAAQ,EACxE,qBAAqB,GAAG,8CAA8C,CAAC,qBAAqB,EAC5F,IAAI,GAAG,8CAA8C,CAAC,IAAI,EAC1D,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}

package/dist/esm/llm/index.d.ts CHANGED Viewed

@@ -1,9 +1,13 @@
-export * from "./generateClassification.js";
-export * from "./createClassifierFn.js";
+export * from "./ClassificationEvaluator.js";
 export * from "./createClassificationEvaluator.js";
-export * from "./createHallucinationEvaluator.js";
-export * from "./createDocumentRelevanceEvaluator.js";
+export * from "./createClassifierFn.js";
 export * from "./createCorrectnessEvaluator.js";
-export * from "./ClassificationEvaluator.js";
+export * from "./createDocumentRelevanceEvaluator.js";
+export * from "./createFaithfulnessEvaluator.js";
+export * from "./createHallucinationEvaluator.js";
+export * from "./createToolInvocationEvaluator.js";
+export * from "./createToolResponseHandlingEvaluator.js";
+export * from "./createToolSelectionEvaluator.js";
+export * from "./generateClassification.js";
 export * from "./LLMEvaluator.js";
 //# sourceMappingURL=index.d.ts.map

package/dist/esm/llm/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,~~0BAA0B~~,CAAC;~~AACzC~~,cAAc,sBAAsB,CAAC;AACrC,cAAc,~~iCAAiC~~,CAAC;~~AAChD~~,cAAc,gCAAgC,CAAC;AAC/C,cAAc,~~oCAAoC~~,CAAC;~~AACnD~~,cAAc,~~8BAA8B~~,CAAC;~~AAC7C~~,cAAc,~~2BAA2B~~,CAAC;~~AAC1C~~,cAAc,gBAAgB,CAAC"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}

package/dist/esm/llm/index.js CHANGED Viewed

@@ -1,9 +1,13 @@
-export * from "./generateClassification.js";
-export * from "./createClassifierFn.js";
+export * from "./ClassificationEvaluator.js";
 export * from "./createClassificationEvaluator.js";
-export * from "./createHallucinationEvaluator.js";
-export * from "./createDocumentRelevanceEvaluator.js";
+export * from "./createClassifierFn.js";
 export * from "./createCorrectnessEvaluator.js";
-export * from "./ClassificationEvaluator.js";
+export * from "./createDocumentRelevanceEvaluator.js";
+export * from "./createFaithfulnessEvaluator.js";
+export * from "./createHallucinationEvaluator.js"; // Deprecated: use createFaithfulnessEvaluator
+export * from "./createToolInvocationEvaluator.js";
+export * from "./createToolResponseHandlingEvaluator.js";
+export * from "./createToolSelectionEvaluator.js";
+export * from "./generateClassification.js";
 export * from "./LLMEvaluator.js";
 //# sourceMappingURL=index.js.map

package/dist/esm/llm/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,~~0BAA0B~~,CAAC;~~AACzC~~,cAAc,sBAAsB,CAAC;AACrC,cAAc,~~iCAAiC~~,CAAC;~~AAChD~~,cAAc,gCAAgC,CAAC;~~AAC/C~~,cAAc,~~oCAAoC~~,CAAC;~~AACnD~~,cAAc,~~8BAA8B~~,CAAC;~~AAC7C~~,cAAc,~~2BAA2B~~,CAAC;~~AAC1C~~,cAAc,gBAAgB,CAAC"}
1	+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC,CAAC,8CAA8C;AAC9F,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}