@arizeai/phoenix-evals 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  2. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  3. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +86 -0
  4. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  5. package/dist/esm/__generated__/default_templates/index.d.ts +1 -0
  6. package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
  7. package/dist/esm/__generated__/default_templates/index.js +1 -0
  8. package/dist/esm/__generated__/default_templates/index.js.map +1 -1
  9. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  10. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  11. package/dist/esm/llm/createToolResponseHandlingEvaluator.js +59 -0
  12. package/dist/esm/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  13. package/dist/esm/llm/createToolSelectionEvaluator.d.ts +64 -0
  14. package/dist/esm/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  15. package/dist/esm/llm/createToolSelectionEvaluator.js +50 -0
  16. package/dist/esm/llm/createToolSelectionEvaluator.js.map +1 -0
  17. package/dist/esm/llm/index.d.ts +2 -0
  18. package/dist/esm/llm/index.d.ts.map +1 -1
  19. package/dist/esm/llm/index.js +2 -0
  20. package/dist/esm/llm/index.js.map +1 -1
  21. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  22. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  23. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  24. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +89 -0
  25. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  26. package/dist/src/__generated__/default_templates/index.d.ts +1 -0
  27. package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
  28. package/dist/src/__generated__/default_templates/index.js +3 -1
  29. package/dist/src/__generated__/default_templates/index.js.map +1 -1
  30. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  31. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  32. package/dist/src/llm/createToolResponseHandlingEvaluator.js +70 -0
  33. package/dist/src/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  34. package/dist/src/llm/createToolSelectionEvaluator.d.ts +64 -0
  35. package/dist/src/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  36. package/dist/src/llm/createToolSelectionEvaluator.js +61 -0
  37. package/dist/src/llm/createToolSelectionEvaluator.js.map +1 -0
  38. package/dist/src/llm/index.d.ts +2 -0
  39. package/dist/src/llm/index.d.ts.map +1 -1
  40. package/dist/src/llm/index.js +2 -0
  41. package/dist/src/llm/index.js.map +1 -1
  42. package/dist/tsconfig.tsbuildinfo +1 -1
  43. package/package.json +1 -1
  44. package/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts +88 -0
  45. package/src/__generated__/default_templates/index.ts +1 -0
  46. package/src/llm/createToolResponseHandlingEvaluator.ts +109 -0
  47. package/src/llm/createToolSelectionEvaluator.ts +93 -0
  48. package/src/llm/index.ts +2 -0
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,sDAAsD,EAAE,6BAmFpE,CAAC"}
@@ -0,0 +1,86 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "tool_response_handling",
4
+ description: "For determining if an AI agent properly handled a tool's response, including error handling, data extraction, transformation, and safe information disclosure. Requires conversation context, the tool call(s), the tool result(s), and the agent's output.",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ You are an impartial judge evaluating an AI agent's handling of a tool's response. Your task is to determine whether the agent correctly processed the tool result to produce an appropriate output.
11
+
12
+ IMPORTANT - Scope of Evaluation:
13
+ - You are ONLY evaluating how the agent handled the tool response, NOT whether the right tool was selected or whether the tool was invoked correctly.
14
+ - This evaluation focuses on what happens AFTER the tool returns a result.
15
+
16
+ IMPORTANT - Multi-Tool Handling:
17
+ - The agent may make MULTIPLE tool calls in a single interaction. This is valid and expected.
18
+ - When multiple tools are called, evaluate how the agent handled ALL tool results together.
19
+ - Return "correct" only if the agent properly handled ALL tool results.
20
+ - Return "incorrect" if the agent mishandled ANY tool result.
21
+
22
+ IMPORTANT - Error Response Handling:
23
+ - Tool results may contain errors (rate limits, timeouts, not found, invalid arguments, etc.).
24
+ - The agent's output may include retries, follow-up tool calls, or a final response to the user.
25
+ - Evaluate the ENTIRE handling sequence, not just the final message.
26
+ - Appropriate error handling includes:
27
+ - Retrying on transient errors (rate limits, timeouts)
28
+ - Correcting arguments after invalid argument errors
29
+ - Informing the user appropriately when errors are not recoverable
30
+ - NOT making repeated identical calls that continue to fail
31
+
32
+ Criteria for CORRECT handling:
33
+ - Data is extracted accurately from the tool result (no hallucination of data that wasn't returned)
34
+ - Dates, numbers, and structured fields are properly transformed and formatted
35
+ - Results are accurately summarized to address the user's original query
36
+ - Error responses are handled appropriately (retries for transient errors, corrections for invalid arguments)
37
+ - No repeated identical calls after non-retryable errors
38
+ - No disclosure of sensitive/internal information (database credentials, internal URLs, PII, API keys, etc.)
39
+ - The agent's response actually uses the tool result rather than ignoring it
40
+
41
+ Criteria for INCORRECT handling:
42
+ - Hallucinated data: The output includes information not present in the tool result
43
+ - Misinterpretation: The meaning of the tool result is misrepresented or reversed
44
+ - Improper transformation: Dates, numbers, or structured data are incorrectly converted
45
+ - Missing retry: Failed to retry on retryable errors (rate limits, timeouts)
46
+ - Missing correction: Failed to correct arguments after invalid argument errors
47
+ - Futile retries: Repeated identical calls that continue to fail
48
+ - Information disclosure: Leaked sensitive information (credentials, internal URLs, PII)
49
+ - Ignored results: The agent's response doesn't incorporate the tool result
50
+ - Incomplete handling: Only some tool results are used when multiple tools were called
51
+
52
+ Before providing your final judgment, explain your reasoning and consider:
53
+ - Does the output accurately reflect what the tool returned?
54
+ - Are there any fabricated details not in the tool result?
55
+ - Were errors handled appropriately?
56
+ - Is sensitive information properly protected?
57
+ - Does the output actually address the user's query using the tool data?
58
+
59
+ <data>
60
+ <input>
61
+ {{input}}
62
+ </input>
63
+
64
+ <tool_call>
65
+ {{toolCall}}
66
+ </tool_call>
67
+
68
+ <tool_result>
69
+ {{toolResult}}
70
+ </tool_result>
71
+
72
+ <output>
73
+ {{output}}
74
+ </output>
75
+ </data>
76
+
77
+ Given the above data, did the agent handle the tool response correctly or incorrectly?
78
+ `,
79
+ },
80
+ ],
81
+ choices: {
82
+ "correct": 1,
83
+ "incorrect": 0
84
+ },
85
+ };
86
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,sDAAsD,GAAkC;IACnG,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,6PAA6P;IAC1Q,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAqEd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -3,5 +3,6 @@ export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_R
3
3
  export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
4
4
  export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
5
5
  export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
+ export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
7
  export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
7
8
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
@@ -4,5 +4,6 @@ export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_R
4
4
  export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
5
5
  export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
6
  export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
7
+ export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
7
8
  export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
8
9
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
@@ -0,0 +1,78 @@
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
+ export interface ToolResponseHandlingEvaluatorArgs<RecordType extends Record<string, unknown> = ToolResponseHandlingEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
4
+ optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
5
+ name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
6
+ choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
7
+ promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
8
+ }
9
+ /**
10
+ * A record to be evaluated by the tool response handling evaluator.
11
+ */
12
+ export type ToolResponseHandlingEvaluationRecord = {
13
+ /**
14
+ * The user query or conversation context.
15
+ */
16
+ input: string;
17
+ /**
18
+ * The tool invocation(s) made by the agent, including arguments.
19
+ */
20
+ toolCall: string;
21
+ /**
22
+ * The tool's response (data, errors, or partial results).
23
+ */
24
+ toolResult: string;
25
+ /**
26
+ * The agent's handling after receiving the tool result
27
+ * (may include retries, follow-ups, or final response).
28
+ */
29
+ output: string;
30
+ };
31
+ /**
32
+ * Creates a tool response handling evaluator function.
33
+ *
34
+ * This function returns an evaluator that determines whether an AI agent properly
35
+ * handled a tool's response, including error handling, data extraction,
36
+ * transformation, and safe information disclosure.
37
+ *
38
+ * @param args - The arguments for creating the tool response handling evaluator.
39
+ * @param args.model - The model to use for classification.
40
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
41
+ * @param args.promptTemplate - The prompt template to use.
42
+ * @param args.telemetry - The telemetry to use for the evaluator.
43
+ *
44
+ * @returns An evaluator function that takes a {@link ToolResponseHandlingEvaluationRecord}
45
+ * and returns a classification result indicating whether the tool response handling
46
+ * is correct or incorrect.
47
+ *
48
+ * @example
49
+ * ```ts
50
+ * const evaluator = createToolResponseHandlingEvaluator({ model: openai("gpt-4o-mini") });
51
+ *
52
+ * // Example: Correct extraction from tool result
53
+ * const result = await evaluator.evaluate({
54
+ * input: "What's the weather in Seattle?",
55
+ * toolCall: 'get_weather(location="Seattle")',
56
+ * toolResult: JSON.stringify({
57
+ * temperature: 58,
58
+ * unit: "fahrenheit",
59
+ * conditions: "partly cloudy"
60
+ * }),
61
+ * output: "The weather in Seattle is 58°F and partly cloudy."
62
+ * });
63
+ * console.log(result.label); // "correct"
64
+ *
65
+ * // Example: Hallucinated data (incorrect)
66
+ * const resultHallucinated = await evaluator.evaluate({
67
+ * input: "What restaurants are nearby?",
68
+ * toolCall: 'search_restaurants(location="downtown")',
69
+ * toolResult: JSON.stringify({
70
+ * results: [{ name: "Cafe Luna", rating: 4.2 }]
71
+ * }),
72
+ * output: "I found Cafe Luna (4.2 stars) and Mario's Italian (4.8 stars) nearby."
73
+ * });
74
+ * console.log(resultHallucinated.label); // "incorrect" - Mario's was hallucinated
75
+ * ```
76
+ */
77
+ export declare function createToolResponseHandlingEvaluator<RecordType extends Record<string, unknown> = ToolResponseHandlingEvaluationRecord>(args: ToolResponseHandlingEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
78
+ //# sourceMappingURL=createToolResponseHandlingEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolResponseHandlingEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolResponseHandlingEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,iCAAiC,CAChD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,oCAAoC,CACtC,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,oCAAoC,GAAG;IACjD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB;;OAEG;IACH,UAAU,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AACH,wBAAgB,mCAAmC,CACjD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,oCAAoC,EAEtC,IAAI,EAAE,iCAAiC,CAAC,UAAU,CAAC,GAClD,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -0,0 +1,59 @@
1
+ import { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
2
+ import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
3
+ /**
4
+ * Creates a tool response handling evaluator function.
5
+ *
6
+ * This function returns an evaluator that determines whether an AI agent properly
7
+ * handled a tool's response, including error handling, data extraction,
8
+ * transformation, and safe information disclosure.
9
+ *
10
+ * @param args - The arguments for creating the tool response handling evaluator.
11
+ * @param args.model - The model to use for classification.
12
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
13
+ * @param args.promptTemplate - The prompt template to use.
14
+ * @param args.telemetry - The telemetry to use for the evaluator.
15
+ *
16
+ * @returns An evaluator function that takes a {@link ToolResponseHandlingEvaluationRecord}
17
+ * and returns a classification result indicating whether the tool response handling
18
+ * is correct or incorrect.
19
+ *
20
+ * @example
21
+ * ```ts
22
+ * const evaluator = createToolResponseHandlingEvaluator({ model: openai("gpt-4o-mini") });
23
+ *
24
+ * // Example: Correct extraction from tool result
25
+ * const result = await evaluator.evaluate({
26
+ * input: "What's the weather in Seattle?",
27
+ * toolCall: 'get_weather(location="Seattle")',
28
+ * toolResult: JSON.stringify({
29
+ * temperature: 58,
30
+ * unit: "fahrenheit",
31
+ * conditions: "partly cloudy"
32
+ * }),
33
+ * output: "The weather in Seattle is 58°F and partly cloudy."
34
+ * });
35
+ * console.log(result.label); // "correct"
36
+ *
37
+ * // Example: Hallucinated data (incorrect)
38
+ * const resultHallucinated = await evaluator.evaluate({
39
+ * input: "What restaurants are nearby?",
40
+ * toolCall: 'search_restaurants(location="downtown")',
41
+ * toolResult: JSON.stringify({
42
+ * results: [{ name: "Cafe Luna", rating: 4.2 }]
43
+ * }),
44
+ * output: "I found Cafe Luna (4.2 stars) and Mario's Italian (4.8 stars) nearby."
45
+ * });
46
+ * console.log(resultHallucinated.label); // "incorrect" - Mario's was hallucinated
47
+ * ```
48
+ */
49
+ export function createToolResponseHandlingEvaluator(args) {
50
+ const { choices = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
51
+ return createClassificationEvaluator({
52
+ ...rest,
53
+ promptTemplate,
54
+ choices,
55
+ optimizationDirection,
56
+ name,
57
+ });
58
+ }
59
+ //# sourceMappingURL=createToolResponseHandlingEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolResponseHandlingEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolResponseHandlingEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,sDAAsD,EAAE,MAAM,oCAAoC,CAAC;AAI5G,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAsChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AACH,MAAM,UAAU,mCAAmC,CAIjD,IAAmD;IAEnD,MAAM,EACJ,OAAO,GAAG,sDAAsD,CAAC,OAAO,EACxE,cAAc,GAAG,sDAAsD,CAAC,QAAQ,EAChF,qBAAqB,GAAG,sDAAsD,CAAC,qBAAqB,EACpG,IAAI,GAAG,sDAAsD,CAAC,IAAI,EAClE,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,64 @@
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
+ export interface ToolSelectionEvaluatorArgs<RecordType extends Record<string, unknown> = ToolSelectionEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
4
+ optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
5
+ name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
6
+ choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
7
+ promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
8
+ }
9
+ /**
10
+ * A record to be evaluated by the tool selection evaluator.
11
+ */
12
+ export type ToolSelectionEvaluationRecord = {
13
+ /**
14
+ * The input query or conversation context.
15
+ */
16
+ input: string;
17
+ /**
18
+ * The available tools that the LLM could use.
19
+ */
20
+ availableTools: string;
21
+ /**
22
+ * The tool or tools selected by the LLM.
23
+ */
24
+ toolSelection: string;
25
+ };
26
+ /**
27
+ * Creates a tool selection evaluator function.
28
+ *
29
+ * This function returns an evaluator that determines whether the correct tool
30
+ * was selected for a given context. Unlike the tool invocation evaluator which
31
+ * checks if the tool was called correctly with proper arguments, this evaluator
32
+ * focuses on whether the right tool was chosen in the first place.
33
+ *
34
+ * The evaluator checks for:
35
+ * - Whether the LLM chose the best available tool for the user query
36
+ * - Whether the tool name exists in the available tools list
37
+ * - Whether the correct number of tools were selected for the task
38
+ * - Whether the tool selection is safe and appropriate
39
+ *
40
+ * @param args - The arguments for creating the tool selection evaluator.
41
+ * @param args.model - The model to use for classification.
42
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
43
+ * @param args.promptTemplate - The prompt template to use (defaults to `TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template`).
44
+ * @param args.telemetry - The telemetry to use for the evaluator.
45
+ *
46
+ * @returns An evaluator function that takes a {@link ToolSelectionEvaluationRecord} and returns
47
+ * a classification result indicating whether the tool selection is correct or incorrect.
48
+ *
49
+ * @example
50
+ * ```ts
51
+ * const evaluator = createToolSelectionEvaluator({ model: openai("gpt-4o-mini") });
52
+ *
53
+ * const result = await evaluator.evaluate({
54
+ * input: "User: What is the weather in San Francisco?",
55
+ * availableTools: `WeatherTool: Get the current weather for a location.
56
+ * NewsTool: Stay connected to global events with our up-to-date news around the world.
57
+ * MusicTool: Create playlists, search for music, and check the latest music trends.`,
58
+ * toolSelection: "WeatherTool"
59
+ * });
60
+ * console.log(result.label); // "correct" or "incorrect"
61
+ * ```
62
+ */
63
+ export declare function createToolSelectionEvaluator<RecordType extends Record<string, unknown> = ToolSelectionEvaluationRecord>(args: ToolSelectionEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
64
+ //# sourceMappingURL=createToolSelectionEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolSelectionEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolSelectionEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,0BAA0B,CACzC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,CAC1E,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,6BAA6B,GAAG;IAC1C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,EAE1E,IAAI,EAAE,0BAA0B,CAAC,UAAU,CAAC,GAC3C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -0,0 +1,50 @@
1
+ import { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
2
+ import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
3
+ /**
4
+ * Creates a tool selection evaluator function.
5
+ *
6
+ * This function returns an evaluator that determines whether the correct tool
7
+ * was selected for a given context. Unlike the tool invocation evaluator which
8
+ * checks if the tool was called correctly with proper arguments, this evaluator
9
+ * focuses on whether the right tool was chosen in the first place.
10
+ *
11
+ * The evaluator checks for:
12
+ * - Whether the LLM chose the best available tool for the user query
13
+ * - Whether the tool name exists in the available tools list
14
+ * - Whether the correct number of tools were selected for the task
15
+ * - Whether the tool selection is safe and appropriate
16
+ *
17
+ * @param args - The arguments for creating the tool selection evaluator.
18
+ * @param args.model - The model to use for classification.
19
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
20
+ * @param args.promptTemplate - The prompt template to use (defaults to `TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template`).
21
+ * @param args.telemetry - The telemetry to use for the evaluator.
22
+ *
23
+ * @returns An evaluator function that takes a {@link ToolSelectionEvaluationRecord} and returns
24
+ * a classification result indicating whether the tool selection is correct or incorrect.
25
+ *
26
+ * @example
27
+ * ```ts
28
+ * const evaluator = createToolSelectionEvaluator({ model: openai("gpt-4o-mini") });
29
+ *
30
+ * const result = await evaluator.evaluate({
31
+ * input: "User: What is the weather in San Francisco?",
32
+ * availableTools: `WeatherTool: Get the current weather for a location.
33
+ * NewsTool: Stay connected to global events with our up-to-date news around the world.
34
+ * MusicTool: Create playlists, search for music, and check the latest music trends.`,
35
+ * toolSelection: "WeatherTool"
36
+ * });
37
+ * console.log(result.label); // "correct" or "incorrect"
38
+ * ```
39
+ */
40
+ export function createToolSelectionEvaluator(args) {
41
+ const { choices = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
42
+ return createClassificationEvaluator({
43
+ ...rest,
44
+ promptTemplate,
45
+ choices,
46
+ optimizationDirection,
47
+ name,
48
+ });
49
+ }
50
+ //# sourceMappingURL=createToolSelectionEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolSelectionEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolSelectionEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,8CAA8C,EAAE,MAAM,oCAAoC,CAAC;AAIpG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAgChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAoCG;AACH,MAAM,UAAU,4BAA4B,CAG1C,IAA4C;IAE5C,MAAM,EACJ,OAAO,GAAG,8CAA8C,CAAC,OAAO,EAChE,cAAc,GAAG,8CAA8C,CAAC,QAAQ,EACxE,qBAAqB,GAAG,8CAA8C,CAAC,qBAAqB,EAC5F,IAAI,GAAG,8CAA8C,CAAC,IAAI,EAC1D,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -6,6 +6,8 @@ export * from "./createDocumentRelevanceEvaluator.js";
6
6
  export * from "./createFaithfulnessEvaluator.js";
7
7
  export * from "./createHallucinationEvaluator.js";
8
8
  export * from "./createToolInvocationEvaluator.js";
9
+ export * from "./createToolResponseHandlingEvaluator.js";
10
+ export * from "./createToolSelectionEvaluator.js";
9
11
  export * from "./generateClassification.js";
10
12
  export * from "./LLMEvaluator.js";
11
13
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,iCAAiC,CAAC;AAChD,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
@@ -6,6 +6,8 @@ export * from "./createDocumentRelevanceEvaluator.js";
6
6
  export * from "./createFaithfulnessEvaluator.js";
7
7
  export * from "./createHallucinationEvaluator.js"; // Deprecated: use createFaithfulnessEvaluator
8
8
  export * from "./createToolInvocationEvaluator.js";
9
+ export * from "./createToolResponseHandlingEvaluator.js";
10
+ export * from "./createToolSelectionEvaluator.js";
9
11
  export * from "./generateClassification.js";
10
12
  export * from "./LLMEvaluator.js";
11
13
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC,CAAC,8CAA8C;AAC9F,cAAc,iCAAiC,CAAC;AAChD,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,2BAA2B,CAAC;AAC1C,cAAc,iCAAiC,CAAC;AAChD,cAAc,sBAAsB,CAAC;AACrC,cAAc,8BAA8B,CAAC;AAC7C,cAAc,oCAAoC,CAAC;AACnD,cAAc,+BAA+B,CAAC;AAC9C,cAAc,gCAAgC,CAAC,CAAC,8CAA8C;AAC9F,cAAc,iCAAiC,CAAC;AAChD,cAAc,uCAAuC,CAAC;AACtD,cAAc,gCAAgC,CAAC;AAC/C,cAAc,0BAA0B,CAAC;AACzC,cAAc,gBAAgB,CAAC"}