@arizeai/phoenix-evals 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  2. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  3. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +33 -0
  4. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  5. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  6. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  7. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +30 -0
  8. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  9. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  10. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  11. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +86 -0
  12. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  13. package/dist/esm/__generated__/default_templates/index.d.ts +3 -0
  14. package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
  15. package/dist/esm/__generated__/default_templates/index.js +3 -0
  16. package/dist/esm/__generated__/default_templates/index.js.map +1 -1
  17. package/dist/esm/core/EvaluatorBase.d.ts.map +1 -1
  18. package/dist/esm/llm/createCorrectnessEvaluator.d.ts.map +1 -1
  19. package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
  20. package/dist/esm/llm/createDocumentRelevanceEvaluator.js.map +1 -1
  21. package/dist/esm/llm/createFaithfulnessEvaluator.d.ts +24 -0
  22. package/dist/esm/llm/createFaithfulnessEvaluator.d.ts.map +1 -0
  23. package/dist/esm/llm/createFaithfulnessEvaluator.js +19 -0
  24. package/dist/esm/llm/createFaithfulnessEvaluator.js.map +1 -0
  25. package/dist/esm/llm/createHallucinationEvaluator.d.ts +12 -0
  26. package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
  27. package/dist/esm/llm/createHallucinationEvaluator.js +17 -0
  28. package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
  29. package/dist/esm/llm/createToolInvocationEvaluator.d.ts +74 -0
  30. package/dist/esm/llm/createToolInvocationEvaluator.d.ts.map +1 -0
  31. package/dist/esm/llm/createToolInvocationEvaluator.js +60 -0
  32. package/dist/esm/llm/createToolInvocationEvaluator.js.map +1 -0
  33. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  34. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  35. package/dist/esm/llm/createToolResponseHandlingEvaluator.js +59 -0
  36. package/dist/esm/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  37. package/dist/esm/llm/createToolSelectionEvaluator.d.ts +64 -0
  38. package/dist/esm/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  39. package/dist/esm/llm/createToolSelectionEvaluator.js +50 -0
  40. package/dist/esm/llm/createToolSelectionEvaluator.js.map +1 -0
  41. package/dist/esm/llm/index.d.ts +9 -5
  42. package/dist/esm/llm/index.d.ts.map +1 -1
  43. package/dist/esm/llm/index.js +9 -5
  44. package/dist/esm/llm/index.js.map +1 -1
  45. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  46. package/dist/esm/types/evals.d.ts.map +1 -1
  47. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  48. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  49. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +36 -0
  50. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  51. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  52. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  53. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +33 -0
  54. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  55. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  56. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  57. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +89 -0
  58. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  59. package/dist/src/__generated__/default_templates/index.d.ts +3 -0
  60. package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
  61. package/dist/src/__generated__/default_templates/index.js +7 -1
  62. package/dist/src/__generated__/default_templates/index.js.map +1 -1
  63. package/dist/src/core/EvaluatorBase.d.ts.map +1 -1
  64. package/dist/src/llm/createCorrectnessEvaluator.d.ts.map +1 -1
  65. package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
  66. package/dist/src/llm/createDocumentRelevanceEvaluator.js.map +1 -1
  67. package/dist/src/llm/createFaithfulnessEvaluator.d.ts +24 -0
  68. package/dist/src/llm/createFaithfulnessEvaluator.d.ts.map +1 -0
  69. package/dist/src/llm/createFaithfulnessEvaluator.js +30 -0
  70. package/dist/src/llm/createFaithfulnessEvaluator.js.map +1 -0
  71. package/dist/src/llm/createHallucinationEvaluator.d.ts +12 -0
  72. package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
  73. package/dist/src/llm/createHallucinationEvaluator.js +17 -0
  74. package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
  75. package/dist/src/llm/createToolInvocationEvaluator.d.ts +74 -0
  76. package/dist/src/llm/createToolInvocationEvaluator.d.ts.map +1 -0
  77. package/dist/src/llm/createToolInvocationEvaluator.js +71 -0
  78. package/dist/src/llm/createToolInvocationEvaluator.js.map +1 -0
  79. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  80. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  81. package/dist/src/llm/createToolResponseHandlingEvaluator.js +70 -0
  82. package/dist/src/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  83. package/dist/src/llm/createToolSelectionEvaluator.d.ts +64 -0
  84. package/dist/src/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  85. package/dist/src/llm/createToolSelectionEvaluator.js +61 -0
  86. package/dist/src/llm/createToolSelectionEvaluator.js.map +1 -0
  87. package/dist/src/llm/index.d.ts +9 -5
  88. package/dist/src/llm/index.d.ts.map +1 -1
  89. package/dist/src/llm/index.js +9 -5
  90. package/dist/src/llm/index.js.map +1 -1
  91. package/dist/src/types/evals.d.ts.map +1 -1
  92. package/dist/tsconfig.tsbuildinfo +1 -1
  93. package/package.json +1 -1
  94. package/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +35 -0
  95. package/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +32 -0
  96. package/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts +88 -0
  97. package/src/__generated__/default_templates/index.ts +3 -0
  98. package/src/core/EvaluatorBase.ts +3 -3
  99. package/src/llm/createCorrectnessEvaluator.ts +3 -3
  100. package/src/llm/createDocumentRelevanceEvaluator.ts +7 -11
  101. package/src/llm/createFaithfulnessEvaluator.ts +52 -0
  102. package/src/llm/createHallucinationEvaluator.ts +25 -3
  103. package/src/llm/createToolInvocationEvaluator.ts +103 -0
  104. package/src/llm/createToolResponseHandlingEvaluator.ts +109 -0
  105. package/src/llm/createToolSelectionEvaluator.ts +93 -0
  106. package/src/llm/index.ts +9 -5
  107. package/src/types/evals.ts +5 -4
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,4CAA4C,EAAE,6BA8B1D,CAAC"}
@@ -0,0 +1,33 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "faithfulness",
4
+ description: "A specialized evaluator for detecting faithfulness in grounded LLM responses.",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is unfaithful to the facts. Your objective is to determine whether the response text contains factual information and is faithful to the context. An 'unfaithful' response refers to a response that is not based on the context or assumes information that is not available in the context. Your response should be a single word: either 'faithful' or 'unfaithful', and it should not include any other text or characters. 'unfaithful' indicates that the response provides factually inaccurate information to the query based on the context. 'faithful' indicates that the response to the question is correct relative to the context, and does not contain made up information. Please read the query and context carefully before determining your response.
11
+
12
+ <data>
13
+ <query>
14
+ {{input}}
15
+ </query>
16
+ <context>
17
+ {{context}}
18
+ </context>
19
+ <response>
20
+ {{output}}
21
+ </response>
22
+ </data>
23
+
24
+ Is the response above faithful or unfaithful based on the query and context?
25
+ `,
26
+ },
27
+ ],
28
+ choices: {
29
+ "faithful": 1,
30
+ "unfaithful": 0
31
+ },
32
+ };
33
+ //# sourceMappingURL=FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,4CAA4C,GAAkC;IACzF,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,+EAA+E;IAC5F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;CAgBd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,YAAY,EAAE,CAAC;KAChB;CACA,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,+CAA+C,EAAE,6BA2B7D,CAAC"}
@@ -0,0 +1,30 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "tool_invocation",
4
+ description: "For determining if a tool was invoked correctly with proper arguments, formatting, and safe content. Requires conversation context, available tool schemas, and the LLM's tool invocation(s).",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ You are an impartial judge evaluating an LLM's tool-calling behavior, specifically whether the LLM invoked a tool (or tools) correctly with valid arguments and proper formatting.
11
+ Your task: Determine whether the LLM's tool invocation(s) were correct or incorrect based on: - The full conversation context (including all previous turns, not just the most recent message) - The available tool schemas - The LLM's tool invocation(s) with arguments
12
+ IMPORTANT - Tool Invocation vs. Tool Selection: - You are ONLY evaluating the tool invocation, not the tool selection. - If the tool selection is incorrect or not relevant to the user's query, but the tool invocation is correct, return "correct". - If the tool selection is correct but the tool invocation is incorrect, return "incorrect".
13
+ IMPORTANT - Multi-Tool Invocations: - The LLM may invoke MULTIPLE tools in a single response. This is valid and expected for complex requests. - When multiple tools are invoked, evaluate EACH tool invocation independently. - Return "correct" only if ALL tool invocations are correct. - Return "incorrect" if ANY tool invocation has an error.
14
+ IMPORTANT - Conversation Context: - Read the entire conversation history carefully, not just the final user message. - Argument values may need to be extracted from EARLIER turns in the conversation (e.g., user mentions a location, date, or quantity in a previous message). - The LLM should use context from the full conversation to populate argument values correctly.
15
+ Criteria Return "correct" only when ALL of the following are true for EVERY tool invocation: - JSON is properly structured (if applicable). - All required fields/parameters are present. - No hallucinated or nonexistent fields (all fields exist in the tool schema). - Argument values match the user's intent from the conversation context (correct types, realistic values). - No unsafe content (e.g., PII like SSNs, credit card numbers, passwords) in arguments.
16
+ Return "incorrect" if ANY of the following are true for ANY tool invocation: - The invocation contains hallucinated or nonexistent fields not in the schema. - Required fields/parameters are missing. - JSON is improperly formatted or malformed. - Argument values are incorrect, hallucinated, or do not match user intent from the conversation. - Arguments contain unsafe content (e.g., PII, sensitive data that should not be passed).
17
+ Before providing your final judgment, explain your reasoning and consider: - How many tools were invoked? Evaluate each one. - Does each tool invocation match the schema for that tool? - Are all required parameters provided with appropriate values for each invocation? - Are there any extra fields that don't exist in the schema? - Looking at the FULL conversation: do the argument values accurately reflect what the user requested across all messages? - Is there any unsafe or sensitive content in any of the arguments? - Check that you are not evaluating the tool selection, only the tool invocation.
18
+ <data> <context> {{input}} </context>
19
+ <available_tools> {{availableTools}} </available_tools>
20
+ <tool_invocation> {{toolSelection}} </tool_invocation> </data>
21
+ Given the above data, is the tool invocation correct or incorrect?
22
+ `,
23
+ },
24
+ ],
25
+ choices: {
26
+ "correct": 1,
27
+ "incorrect": 0
28
+ },
29
+ };
30
+ //# sourceMappingURL=TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,+CAA+C,GAAkC;IAC5F,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE,+LAA+L;IAC5M,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;CAad;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,sDAAsD,EAAE,6BAmFpE,CAAC"}
@@ -0,0 +1,86 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "tool_response_handling",
4
+ description: "For determining if an AI agent properly handled a tool's response, including error handling, data extraction, transformation, and safe information disclosure. Requires conversation context, the tool call(s), the tool result(s), and the agent's output.",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ You are an impartial judge evaluating an AI agent's handling of a tool's response. Your task is to determine whether the agent correctly processed the tool result to produce an appropriate output.
11
+
12
+ IMPORTANT - Scope of Evaluation:
13
+ - You are ONLY evaluating how the agent handled the tool response, NOT whether the right tool was selected or whether the tool was invoked correctly.
14
+ - This evaluation focuses on what happens AFTER the tool returns a result.
15
+
16
+ IMPORTANT - Multi-Tool Handling:
17
+ - The agent may make MULTIPLE tool calls in a single interaction. This is valid and expected.
18
+ - When multiple tools are called, evaluate how the agent handled ALL tool results together.
19
+ - Return "correct" only if the agent properly handled ALL tool results.
20
+ - Return "incorrect" if the agent mishandled ANY tool result.
21
+
22
+ IMPORTANT - Error Response Handling:
23
+ - Tool results may contain errors (rate limits, timeouts, not found, invalid arguments, etc.).
24
+ - The agent's output may include retries, follow-up tool calls, or a final response to the user.
25
+ - Evaluate the ENTIRE handling sequence, not just the final message.
26
+ - Appropriate error handling includes:
27
+ - Retrying on transient errors (rate limits, timeouts)
28
+ - Correcting arguments after invalid argument errors
29
+ - Informing the user appropriately when errors are not recoverable
30
+ - NOT making repeated identical calls that continue to fail
31
+
32
+ Criteria for CORRECT handling:
33
+ - Data is extracted accurately from the tool result (no hallucination of data that wasn't returned)
34
+ - Dates, numbers, and structured fields are properly transformed and formatted
35
+ - Results are accurately summarized to address the user's original query
36
+ - Error responses are handled appropriately (retries for transient errors, corrections for invalid arguments)
37
+ - No repeated identical calls after non-retryable errors
38
+ - No disclosure of sensitive/internal information (database credentials, internal URLs, PII, API keys, etc.)
39
+ - The agent's response actually uses the tool result rather than ignoring it
40
+
41
+ Criteria for INCORRECT handling:
42
+ - Hallucinated data: The output includes information not present in the tool result
43
+ - Misinterpretation: The meaning of the tool result is misrepresented or reversed
44
+ - Improper transformation: Dates, numbers, or structured data are incorrectly converted
45
+ - Missing retry: Failed to retry on retryable errors (rate limits, timeouts)
46
+ - Missing correction: Failed to correct arguments after invalid argument errors
47
+ - Futile retries: Repeated identical calls that continue to fail
48
+ - Information disclosure: Leaked sensitive information (credentials, internal URLs, PII)
49
+ - Ignored results: The agent's response doesn't incorporate the tool result
50
+ - Incomplete handling: Only some tool results are used when multiple tools were called
51
+
52
+ Before providing your final judgment, explain your reasoning and consider:
53
+ - Does the output accurately reflect what the tool returned?
54
+ - Are there any fabricated details not in the tool result?
55
+ - Were errors handled appropriately?
56
+ - Is sensitive information properly protected?
57
+ - Does the output actually address the user's query using the tool data?
58
+
59
+ <data>
60
+ <input>
61
+ {{input}}
62
+ </input>
63
+
64
+ <tool_call>
65
+ {{toolCall}}
66
+ </tool_call>
67
+
68
+ <tool_result>
69
+ {{toolResult}}
70
+ </tool_result>
71
+
72
+ <output>
73
+ {{output}}
74
+ </output>
75
+ </data>
76
+
77
+ Given the above data, did the agent handle the tool response correctly or incorrectly?
78
+ `,
79
+ },
80
+ ],
81
+ choices: {
82
+ "correct": 1,
83
+ "incorrect": 0
84
+ },
85
+ };
86
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,sDAAsD,GAAkC;IACnG,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,6PAA6P;IAC1Q,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAqEd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -1,5 +1,8 @@
1
1
  export { CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
2
2
  export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js";
3
+ export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
3
4
  export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
5
+ export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
+ export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
4
7
  export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
5
8
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
@@ -1,6 +1,9 @@
1
1
  // This file is generated. Do not edit by hand.
2
2
  export { CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
3
3
  export { DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG } from "./DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js";
4
+ export { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "./FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js";
4
5
  export { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
+ export { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js";
7
+ export { TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js";
5
8
  export { TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG } from "./TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js";
6
9
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAE/C,OAAO,EAAE,2CAA2C,EAAE,MAAM,+CAA+C,CAAC;AAC5G,OAAO,EAAE,kDAAkD,EAAE,MAAM,sDAAsD,CAAC;AAC1H,OAAO,EAAE,4CAA4C,EAAE,MAAM,gDAAgD,CAAC;AAC9G,OAAO,EAAE,6CAA6C,EAAE,MAAM,iDAAiD,CAAC;AAChH,OAAO,EAAE,+CAA+C,EAAE,MAAM,mDAAmD,CAAC;AACpH,OAAO,EAAE,sDAAsD,EAAE,MAAM,0DAA0D,CAAC;AAClI,OAAO,EAAE,8CAA8C,EAAE,MAAM,kDAAkD,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"EvaluatorBase.d.ts","sourceRoot":"","sources":["../../../src/core/EvaluatorBase.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,kBAAkB,EAClB,qBAAqB,EACrB,eAAe,EAChB,MAAM,UAAU,CAAC;AAClB,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAE9C;;GAEG;AACH,8BAAsB,aAAa,CAAC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC5E,YAAW,kBAAkB,CAAC,UAAU,CAAC;IAEzC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC;IAC9B,QAAQ,CAAC,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IACvD,QAAQ,CAAC,YAAY,CAAC,EAAE,aAAa,CAAC,UAAU,CAAC,CAAC;IAClD,QAAQ,CAAC,SAAS,CAAC,EAAE,eAAe,CAAC;gBACzB,EACV,IAAI,EACJ,IAAI,EACJ,qBAAqB,EACrB,YAAY,EACZ,SAAS,GACV,EAAE,mBAAmB,CAAC,UAAU,CAAC;IAOlC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAElE;;OAEG;IACH,QAAQ,CAAC,gBAAgB,CACvB,YAAY,EAAE,aAAa,CAAC,UAAU,CAAC,GACtC,aAAa,CAAC,UAAU,CAAC;CAC7B"}
1
+ {"version":3,"file":"EvaluatorBase.d.ts","sourceRoot":"","sources":["../../../src/core/EvaluatorBase.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,mBAAmB,EACnB,cAAc,EACd,gBAAgB,EAChB,kBAAkB,EAClB,qBAAqB,EACrB,eAAe,EAChB,MAAM,UAAU,CAAC;AAClB,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAE9C;;GAEG;AACH,8BAAsB,aAAa,CACjC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAC1C,YAAW,kBAAkB,CAAC,UAAU,CAAC;IACzC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,IAAI,EAAE,cAAc,CAAC;IAC9B,QAAQ,CAAC,qBAAqB,CAAC,EAAE,qBAAqB,CAAC;IACvD,QAAQ,CAAC,YAAY,CAAC,EAAE,aAAa,CAAC,UAAU,CAAC,CAAC;IAClD,QAAQ,CAAC,SAAS,CAAC,EAAE,eAAe,CAAC;gBACzB,EACV,IAAI,EACJ,IAAI,EACJ,qBAAqB,EACrB,YAAY,EACZ,SAAS,GACV,EAAE,mBAAmB,CAAC,UAAU,CAAC;IAOlC,QAAQ,CAAC,QAAQ,CAAC,QAAQ,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAElE;;OAEG;IACH,QAAQ,CAAC,gBAAgB,CACvB,YAAY,EAAE,aAAa,CAAC,UAAU,CAAC,GACtC,aAAa,CAAC,UAAU,CAAC;CAC7B"}
@@ -1 +1 @@
1
- {"version":3,"file":"createCorrectnessEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createCorrectnessEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,wBAAwB,CACvC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,CACxE,SAAQ,IAAI,CACV,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACD,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,2BAA2B,GAAG;IACxC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,0BAA0B,CACxC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,EAExE,IAAI,EAAE,wBAAwB,CAAC,UAAU,CAAC,GACzC,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
1
+ {"version":3,"file":"createCorrectnessEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createCorrectnessEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,wBAAwB,CACvC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,CACxE,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,2BAA2B,GAAG;IACxC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAgB,0BAA0B,CACxC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,2BAA2B,EAExE,IAAI,EAAE,wBAAwB,CAAC,UAAU,CAAC,GACzC,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -1 +1 @@
1
- {"version":3,"file":"createDocumentRelevanceEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createDocumentRelevanceEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,8BAA8B,CAC7C,UAAU,SAAS,MAAM,CACvB,MAAM,EACN,OAAO,CACR,GAAG,iCAAiC,CACrC,SAAQ,IAAI,CACV,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACD,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IAChD,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAAC;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAgB,gCAAgC,CAC9C,UAAU,SAAS,MAAM,CACvB,MAAM,EACN,OAAO,CACR,GAAG,iCAAiC,EAErC,IAAI,EAAE,8BAA8B,CAAC,UAAU,CAAC,GAC/C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
1
+ {"version":3,"file":"createDocumentRelevanceEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createDocumentRelevanceEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,8BAA8B,CAC7C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,iCAAiC,CACnC,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,WAAW,iCAAiC;IAChD,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAAC;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAgB,gCAAgC,CAC9C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,iCAAiC,EAEnC,IAAI,EAAE,8BAA8B,CAAC,UAAU,CAAC,GAC/C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -1 +1 @@
1
- {"version":3,"file":"createDocumentRelevanceEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createDocumentRelevanceEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kDAAkD,EAAE,MAAM,oCAAoC,CAAC;AAIxG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AA0BhF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,UAAU,gCAAgC,CAM9C,IAAgD;IAEhD,MAAM,EACJ,OAAO,GAAG,kDAAkD,CAAC,OAAO,EACpE,cAAc,GAAG,kDAAkD,CAAC,QAAQ,EAC5E,qBAAqB,GAAG,kDAAkD,CAAC,qBAAqB,EAChG,IAAI,GAAG,kDAAkD,CAAC,IAAI,EAC9D,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
1
+ {"version":3,"file":"createDocumentRelevanceEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createDocumentRelevanceEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kDAAkD,EAAE,MAAM,oCAAoC,CAAC;AAIxG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAwBhF;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,UAAU,gCAAgC,CAI9C,IAAgD;IAEhD,MAAM,EACJ,OAAO,GAAG,kDAAkD,CAAC,OAAO,EACpE,cAAc,GAAG,kDAAkD,CAAC,QAAQ,EAC5E,qBAAqB,GAAG,kDAAkD,CAAC,qBAAqB,EAChG,IAAI,GAAG,kDAAkD,CAAC,IAAI,EAC9D,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,24 @@
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
+ export interface FaithfulnessEvaluatorArgs<RecordType extends Record<string, unknown> = FaithfulnessEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
4
+ optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
5
+ name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
6
+ choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
7
+ promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
8
+ }
9
+ /**
10
+ * A record to be evaluated by the faithfulness evaluator.
11
+ */
12
+ export type FaithfulnessEvaluationRecord = {
13
+ input: string;
14
+ output: string;
15
+ context?: string;
16
+ };
17
+ /**
18
+ * Creates a function that evaluates whether an answer is faithful or unfaithful based on a query and reference text.
19
+ *
20
+ * @param args - The arguments for creating the faithfulness evaluator.
21
+ * @returns A function that evaluates whether an answer is faithful or unfaithful based on a query and reference text.
22
+ */
23
+ export declare function createFaithfulnessEvaluator<RecordType extends Record<string, unknown> = FaithfulnessEvaluationRecord>(args: FaithfulnessEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
24
+ //# sourceMappingURL=createFaithfulnessEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createFaithfulnessEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createFaithfulnessEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,yBAAyB,CACxC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,4BAA4B,CACzE,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,4BAA4B,GAAG;IACzC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,CAAC;AACF;;;;;GAKG;AACH,wBAAgB,2BAA2B,CACzC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,4BAA4B,EAEzE,IAAI,EAAE,yBAAyB,CAAC,UAAU,CAAC,GAC1C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -0,0 +1,19 @@
1
+ import { FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
2
+ import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
3
+ /**
4
+ * Creates a function that evaluates whether an answer is faithful or unfaithful based on a query and reference text.
5
+ *
6
+ * @param args - The arguments for creating the faithfulness evaluator.
7
+ * @returns A function that evaluates whether an answer is faithful or unfaithful based on a query and reference text.
8
+ */
9
+ export function createFaithfulnessEvaluator(args) {
10
+ const { choices = FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
11
+ return createClassificationEvaluator({
12
+ ...rest,
13
+ promptTemplate,
14
+ choices,
15
+ optimizationDirection,
16
+ name,
17
+ });
18
+ }
19
+ //# sourceMappingURL=createFaithfulnessEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createFaithfulnessEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createFaithfulnessEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,4CAA4C,EAAE,MAAM,oCAAoC,CAAC;AAIlG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAsBhF;;;;;GAKG;AACH,MAAM,UAAU,2BAA2B,CAGzC,IAA2C;IAE3C,MAAM,EACJ,OAAO,GAAG,4CAA4C,CAAC,OAAO,EAC9D,cAAc,GAAG,4CAA4C,CAAC,QAAQ,EACtE,qBAAqB,GAAG,4CAA4C,CAAC,qBAAqB,EAC1F,IAAI,GAAG,4CAA4C,CAAC,IAAI,EACxD,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -1,3 +1,9 @@
1
+ /**
2
+ * @deprecated This evaluator is maintained for backwards compatibility.
3
+ * Please use createFaithfulnessEvaluator instead, which uses updated terminology:
4
+ * - 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
5
+ * - Maximizes score (1.0=faithful) instead of minimizing it
6
+ */
1
7
  import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
8
  import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
9
  export interface HallucinationEvaluatorArgs<RecordType extends Record<string, unknown> = HallucinationEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
@@ -15,8 +21,14 @@ export type HallucinationEvaluationRecord = {
15
21
  context?: string;
16
22
  };
17
23
  /**
24
+ * @deprecated Use createFaithfulnessEvaluator instead.
25
+ *
18
26
  * Creates a function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
19
27
  *
28
+ * Note: This is deprecated. Please use createFaithfulnessEvaluator which:
29
+ * - Uses 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
30
+ * - Maximizes the score (1.0 for faithful, 0.0 for unfaithful)
31
+ *
20
32
  * @param args - The arguments for creating the hallucination evaluator.
21
33
  * @returns A function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
22
34
  */
@@ -1 +1 @@
1
- {"version":3,"file":"createHallucinationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,0BAA0B,CACzC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,CAC1E,SAAQ,IAAI,CACV,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACD,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,6BAA6B,GAAG;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,CAAC;AACF;;;;;GAKG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,EAE1E,IAAI,EAAE,0BAA0B,CAAC,UAAU,CAAC,GAC3C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
1
+ {"version":3,"file":"createHallucinationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,0BAA0B,CACzC,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,CAC1E,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,6BAA6B,GAAG;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB,CAAC;AAEF;;;;;;;;;;;GAWG;AACH,wBAAgB,4BAA4B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,6BAA6B,EAE1E,IAAI,EAAE,0BAA0B,CAAC,UAAU,CAAC,GAC3C,uBAAuB,CAAC,UAAU,CAAC,CAuBrC"}
@@ -1,12 +1,29 @@
1
+ /**
2
+ * @deprecated This evaluator is maintained for backwards compatibility.
3
+ * Please use createFaithfulnessEvaluator instead, which uses updated terminology:
4
+ * - 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
5
+ * - Maximizes score (1.0=faithful) instead of minimizing it
6
+ */
1
7
  import { HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
2
8
  import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
3
9
  /**
10
+ * @deprecated Use createFaithfulnessEvaluator instead.
11
+ *
4
12
  * Creates a function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
5
13
  *
14
+ * Note: This is deprecated. Please use createFaithfulnessEvaluator which:
15
+ * - Uses 'faithful'/'unfaithful' labels instead of 'factual'/'hallucinated'
16
+ * - Maximizes the score (1.0 for faithful, 0.0 for unfaithful)
17
+ *
6
18
  * @param args - The arguments for creating the hallucination evaluator.
7
19
  * @returns A function that evaluates whether an answer is factual or hallucinated based on a query and reference text.
8
20
  */
9
21
  export function createHallucinationEvaluator(args) {
22
+ // eslint-disable-next-line no-console
23
+ console.warn("createHallucinationEvaluator is deprecated and will be removed in a future version. " +
24
+ "Please use createFaithfulnessEvaluator instead. The new evaluator uses " +
25
+ "'faithful'/'unfaithful' labels and maximizes score (1.0=faithful) instead of " +
26
+ "minimizing it (0.0=factual).");
10
27
  const { choices = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
11
28
  return createClassificationEvaluator({
12
29
  ...rest,
@@ -1 +1 @@
1
- {"version":3,"file":"createHallucinationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,6CAA6C,EAAE,MAAM,oCAAoC,CAAC;AAInG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAsBhF;;;;;GAKG;AACH,MAAM,UAAU,4BAA4B,CAG1C,IAA4C;IAE5C,MAAM,EACJ,OAAO,GAAG,6CAA6C,CAAC,OAAO,EAC/D,cAAc,GAAG,6CAA6C,CAAC,QAAQ,EACvE,qBAAqB,GAAG,6CAA6C,CAAC,qBAAqB,EAC3F,IAAI,GAAG,6CAA6C,CAAC,IAAI,EACzD,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
1
+ {"version":3,"file":"createHallucinationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createHallucinationEvaluator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,6CAA6C,EAAE,MAAM,oCAAoC,CAAC;AAInG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAuBhF;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,4BAA4B,CAG1C,IAA4C;IAE5C,sCAAsC;IACtC,OAAO,CAAC,IAAI,CACV,sFAAsF;QACpF,yEAAyE;QACzE,+EAA+E;QAC/E,8BAA8B,CACjC,CAAC;IAEF,MAAM,EACJ,OAAO,GAAG,6CAA6C,CAAC,OAAO,EAC/D,cAAc,GAAG,6CAA6C,CAAC,QAAQ,EACvE,qBAAqB,GAAG,6CAA6C,CAAC,qBAAqB,EAC3F,IAAI,GAAG,6CAA6C,CAAC,IAAI,EACzD,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,74 @@
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
+ export interface ToolInvocationEvaluatorArgs<RecordType extends Record<string, unknown> = ToolInvocationEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
4
+ optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
5
+ name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
6
+ choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
7
+ promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
8
+ }
9
+ /**
10
+ * A record to be evaluated by the tool invocation evaluator.
11
+ */
12
+ export type ToolInvocationEvaluationRecord = {
13
+ /**
14
+ * The input query or conversation context.
15
+ */
16
+ input: string;
17
+ /**
18
+ * The available tool schemas, either as JSON schema or human-readable format.
19
+ */
20
+ availableTools: string;
21
+ /**
22
+ * The tool invocation(s) made by the LLM, including arguments.
23
+ */
24
+ toolSelection: string;
25
+ };
26
+ /**
27
+ * Creates a tool invocation evaluator function.
28
+ *
29
+ * This function returns an evaluator that determines whether a tool was invoked
30
+ * correctly with proper arguments, formatting, and safe content.
31
+ *
32
+ * The evaluator checks for:
33
+ * - Properly structured JSON (if applicable)
34
+ * - All required fields/parameters present
35
+ * - No hallucinated or nonexistent fields
36
+ * - Argument values matching user query and schema expectations
37
+ * - No unsafe content (e.g., PII) in arguments
38
+ *
39
+ * @param args - The arguments for creating the tool invocation evaluator.
40
+ * @param args.model - The model to use for classification.
41
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
42
+ * @param args.promptTemplate - The prompt template to use (defaults to TOOL_INVOCATION_TEMPLATE).
43
+ * @param args.telemetry - The telemetry to use for the evaluator.
44
+ *
45
+ * @returns An evaluator function that takes a {@link ToolInvocationEvaluationRecord} and returns
46
+ * a classification result indicating whether the tool invocation is correct or incorrect.
47
+ *
48
+ * @example
49
+ * ```ts
50
+ * const evaluator = createToolInvocationEvaluator({ model: openai("gpt-4o-mini") });
51
+ *
52
+ * // Example with JSON schema format for available tools
53
+ * const result = await evaluator.evaluate({
54
+ * input: "User: Book a flight from NYC to LA for tomorrow",
55
+ * availableTools: JSON.stringify({
56
+ * name: "book_flight",
57
+ * description: "Book a flight between two cities",
58
+ * parameters: {
59
+ * type: "object",
60
+ * properties: {
61
+ * origin: { type: "string", description: "Departure city code" },
62
+ * destination: { type: "string", description: "Arrival city code" },
63
+ * date: { type: "string", description: "Flight date in YYYY-MM-DD" }
64
+ * },
65
+ * required: ["origin", "destination", "date"]
66
+ * }
67
+ * }),
68
+ * toolSelection: 'book_flight(origin="NYC", destination="LA", date="2024-01-15")'
69
+ * });
70
+ * console.log(result.label); // "correct" or "incorrect"
71
+ * ```
72
+ */
73
+ export declare function createToolInvocationEvaluator<RecordType extends Record<string, unknown> = ToolInvocationEvaluationRecord>(args: ToolInvocationEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
74
+ //# sourceMappingURL=createToolInvocationEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolInvocationEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolInvocationEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,2BAA2B,CAC1C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,8BAA8B,CAC3E,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,8BAA8B,GAAG;IAC3C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8CG;AACH,wBAAgB,6BAA6B,CAC3C,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,8BAA8B,EAE3E,IAAI,EAAE,2BAA2B,CAAC,UAAU,CAAC,GAC5C,uBAAuB,CAAC,UAAU,CAAC,CAerC"}
@@ -0,0 +1,60 @@
1
+ import { TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG } from "../__generated__/default_templates/index.js";
2
+ import { createClassificationEvaluator } from "./createClassificationEvaluator.js";
3
+ /**
4
+ * Creates a tool invocation evaluator function.
5
+ *
6
+ * This function returns an evaluator that determines whether a tool was invoked
7
+ * correctly with proper arguments, formatting, and safe content.
8
+ *
9
+ * The evaluator checks for:
10
+ * - Properly structured JSON (if applicable)
11
+ * - All required fields/parameters present
12
+ * - No hallucinated or nonexistent fields
13
+ * - Argument values matching user query and schema expectations
14
+ * - No unsafe content (e.g., PII) in arguments
15
+ *
16
+ * @param args - The arguments for creating the tool invocation evaluator.
17
+ * @param args.model - The model to use for classification.
18
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
19
+ * @param args.promptTemplate - The prompt template to use (defaults to TOOL_INVOCATION_TEMPLATE).
20
+ * @param args.telemetry - The telemetry to use for the evaluator.
21
+ *
22
+ * @returns An evaluator function that takes a {@link ToolInvocationEvaluationRecord} and returns
23
+ * a classification result indicating whether the tool invocation is correct or incorrect.
24
+ *
25
+ * @example
26
+ * ```ts
27
+ * const evaluator = createToolInvocationEvaluator({ model: openai("gpt-4o-mini") });
28
+ *
29
+ * // Example with JSON schema format for available tools
30
+ * const result = await evaluator.evaluate({
31
+ * input: "User: Book a flight from NYC to LA for tomorrow",
32
+ * availableTools: JSON.stringify({
33
+ * name: "book_flight",
34
+ * description: "Book a flight between two cities",
35
+ * parameters: {
36
+ * type: "object",
37
+ * properties: {
38
+ * origin: { type: "string", description: "Departure city code" },
39
+ * destination: { type: "string", description: "Arrival city code" },
40
+ * date: { type: "string", description: "Flight date in YYYY-MM-DD" }
41
+ * },
42
+ * required: ["origin", "destination", "date"]
43
+ * }
44
+ * }),
45
+ * toolSelection: 'book_flight(origin="NYC", destination="LA", date="2024-01-15")'
46
+ * });
47
+ * console.log(result.label); // "correct" or "incorrect"
48
+ * ```
49
+ */
50
+ export function createToolInvocationEvaluator(args) {
51
+ const { choices = TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.choices, promptTemplate = TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.template, optimizationDirection = TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.optimizationDirection, name = TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.name, ...rest } = args;
52
+ return createClassificationEvaluator({
53
+ ...rest,
54
+ promptTemplate,
55
+ choices,
56
+ optimizationDirection,
57
+ name,
58
+ });
59
+ }
60
+ //# sourceMappingURL=createToolInvocationEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolInvocationEvaluator.js","sourceRoot":"","sources":["../../../src/llm/createToolInvocationEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,+CAA+C,EAAE,MAAM,oCAAoC,CAAC;AAIrG,OAAO,EAAE,6BAA6B,EAAE,MAAM,iCAAiC,CAAC;AAgChF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8CG;AACH,MAAM,UAAU,6BAA6B,CAG3C,IAA6C;IAE7C,MAAM,EACJ,OAAO,GAAG,+CAA+C,CAAC,OAAO,EACjE,cAAc,GAAG,+CAA+C,CAAC,QAAQ,EACzE,qBAAqB,GAAG,+CAA+C,CAAC,qBAAqB,EAC7F,IAAI,GAAG,+CAA+C,CAAC,IAAI,EAC3D,GAAG,IAAI,EACR,GAAG,IAAI,CAAC;IACT,OAAO,6BAA6B,CAAa;QAC/C,GAAG,IAAI;QACP,cAAc;QACd,OAAO;QACP,qBAAqB;QACrB,IAAI;KACL,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,78 @@
1
+ import { CreateClassificationEvaluatorArgs } from "../types/evals.js";
2
+ import { ClassificationEvaluator } from "./ClassificationEvaluator.js";
3
+ export interface ToolResponseHandlingEvaluatorArgs<RecordType extends Record<string, unknown> = ToolResponseHandlingEvaluationRecord> extends Omit<CreateClassificationEvaluatorArgs<RecordType>, "promptTemplate" | "choices" | "optimizationDirection" | "name"> {
4
+ optimizationDirection?: CreateClassificationEvaluatorArgs<RecordType>["optimizationDirection"];
5
+ name?: CreateClassificationEvaluatorArgs<RecordType>["name"];
6
+ choices?: CreateClassificationEvaluatorArgs<RecordType>["choices"];
7
+ promptTemplate?: CreateClassificationEvaluatorArgs<RecordType>["promptTemplate"];
8
+ }
9
+ /**
10
+ * A record to be evaluated by the tool response handling evaluator.
11
+ */
12
+ export type ToolResponseHandlingEvaluationRecord = {
13
+ /**
14
+ * The user query or conversation context.
15
+ */
16
+ input: string;
17
+ /**
18
+ * The tool invocation(s) made by the agent, including arguments.
19
+ */
20
+ toolCall: string;
21
+ /**
22
+ * The tool's response (data, errors, or partial results).
23
+ */
24
+ toolResult: string;
25
+ /**
26
+ * The agent's handling after receiving the tool result
27
+ * (may include retries, follow-ups, or final response).
28
+ */
29
+ output: string;
30
+ };
31
+ /**
32
+ * Creates a tool response handling evaluator function.
33
+ *
34
+ * This function returns an evaluator that determines whether an AI agent properly
35
+ * handled a tool's response, including error handling, data extraction,
36
+ * transformation, and safe information disclosure.
37
+ *
38
+ * @param args - The arguments for creating the tool response handling evaluator.
39
+ * @param args.model - The model to use for classification.
40
+ * @param args.choices - The possible classification choices (defaults to correct/incorrect).
41
+ * @param args.promptTemplate - The prompt template to use.
42
+ * @param args.telemetry - The telemetry to use for the evaluator.
43
+ *
44
+ * @returns An evaluator function that takes a {@link ToolResponseHandlingEvaluationRecord}
45
+ * and returns a classification result indicating whether the tool response handling
46
+ * is correct or incorrect.
47
+ *
48
+ * @example
49
+ * ```ts
50
+ * const evaluator = createToolResponseHandlingEvaluator({ model: openai("gpt-4o-mini") });
51
+ *
52
+ * // Example: Correct extraction from tool result
53
+ * const result = await evaluator.evaluate({
54
+ * input: "What's the weather in Seattle?",
55
+ * toolCall: 'get_weather(location="Seattle")',
56
+ * toolResult: JSON.stringify({
57
+ * temperature: 58,
58
+ * unit: "fahrenheit",
59
+ * conditions: "partly cloudy"
60
+ * }),
61
+ * output: "The weather in Seattle is 58°F and partly cloudy."
62
+ * });
63
+ * console.log(result.label); // "correct"
64
+ *
65
+ * // Example: Hallucinated data (incorrect)
66
+ * const resultHallucinated = await evaluator.evaluate({
67
+ * input: "What restaurants are nearby?",
68
+ * toolCall: 'search_restaurants(location="downtown")',
69
+ * toolResult: JSON.stringify({
70
+ * results: [{ name: "Cafe Luna", rating: 4.2 }]
71
+ * }),
72
+ * output: "I found Cafe Luna (4.2 stars) and Mario's Italian (4.8 stars) nearby."
73
+ * });
74
+ * console.log(resultHallucinated.label); // "incorrect" - Mario's was hallucinated
75
+ * ```
76
+ */
77
+ export declare function createToolResponseHandlingEvaluator<RecordType extends Record<string, unknown> = ToolResponseHandlingEvaluationRecord>(args: ToolResponseHandlingEvaluatorArgs<RecordType>): ClassificationEvaluator<RecordType>;
78
+ //# sourceMappingURL=createToolResponseHandlingEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"createToolResponseHandlingEvaluator.d.ts","sourceRoot":"","sources":["../../../src/llm/createToolResponseHandlingEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iCAAiC,EAAE,MAAM,gBAAgB,CAAC;AAEnE,OAAO,EAAE,uBAAuB,EAAE,MAAM,2BAA2B,CAAC;AAGpE,MAAM,WAAW,iCAAiC,CAChD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,oCAAoC,CACtC,SAAQ,IAAI,CACZ,iCAAiC,CAAC,UAAU,CAAC,EAC7C,gBAAgB,GAAG,SAAS,GAAG,uBAAuB,GAAG,MAAM,CAChE;IACC,qBAAqB,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,uBAAuB,CAAC,CAAC;IAC/F,IAAI,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;IAC7D,OAAO,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,SAAS,CAAC,CAAC;IACnE,cAAc,CAAC,EAAE,iCAAiC,CAAC,UAAU,CAAC,CAAC,gBAAgB,CAAC,CAAC;CAClF;AAED;;GAEG;AACH,MAAM,MAAM,oCAAoC,GAAG;IACjD;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IACjB;;OAEG;IACH,UAAU,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC;AAEF;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6CG;AACH,wBAAgB,mCAAmC,CACjD,UAAU,SAAS,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GACxC,oCAAoC,EAEtC,IAAI,EAAE,iCAAiC,CAAC,UAAU,CAAC,GAClD,uBAAuB,CAAC,UAAU,CAAC,CAerC"}