@arizeai/phoenix-evals 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. package/README.md +23 -23
  2. package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  3. package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  4. package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js +58 -0
  5. package/dist/esm/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  6. package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  7. package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +9 -1
  8. package/dist/esm/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  9. package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  10. package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +14 -3
  11. package/dist/esm/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  12. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  13. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +16 -2
  14. package/dist/esm/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  15. package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  16. package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +15 -1
  17. package/dist/esm/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  18. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  19. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -11
  20. package/dist/esm/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  21. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  22. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  23. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +86 -0
  24. package/dist/esm/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  25. package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  26. package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +44 -8
  27. package/dist/esm/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  28. package/dist/esm/__generated__/default_templates/index.d.ts +2 -0
  29. package/dist/esm/__generated__/default_templates/index.d.ts.map +1 -1
  30. package/dist/esm/__generated__/default_templates/index.js +2 -0
  31. package/dist/esm/__generated__/default_templates/index.js.map +1 -1
  32. package/dist/esm/core/EvaluatorBase.d.ts +2 -2
  33. package/dist/esm/core/EvaluatorBase.d.ts.map +1 -1
  34. package/dist/esm/core/FunctionEvaluator.d.ts +1 -1
  35. package/dist/esm/core/FunctionEvaluator.d.ts.map +1 -1
  36. package/dist/esm/core/FunctionEvaluator.js.map +1 -1
  37. package/dist/esm/helpers/asEvaluatorFn.d.ts +1 -1
  38. package/dist/esm/helpers/asEvaluatorFn.d.ts.map +1 -1
  39. package/dist/esm/helpers/asEvaluatorFn.js.map +1 -1
  40. package/dist/esm/helpers/createEvaluator.d.ts +2 -2
  41. package/dist/esm/helpers/createEvaluator.d.ts.map +1 -1
  42. package/dist/esm/helpers/createEvaluator.js.map +1 -1
  43. package/dist/esm/helpers/toEvaluationResult.d.ts +1 -1
  44. package/dist/esm/helpers/toEvaluationResult.d.ts.map +1 -1
  45. package/dist/esm/llm/ClassificationEvaluator.d.ts +3 -3
  46. package/dist/esm/llm/ClassificationEvaluator.d.ts.map +1 -1
  47. package/dist/esm/llm/ClassificationEvaluator.js.map +1 -1
  48. package/dist/esm/llm/LLMEvaluator.d.ts +1 -1
  49. package/dist/esm/llm/LLMEvaluator.d.ts.map +1 -1
  50. package/dist/esm/llm/createClassificationEvaluator.d.ts +1 -1
  51. package/dist/esm/llm/createClassificationEvaluator.d.ts.map +1 -1
  52. package/dist/esm/llm/createClassificationEvaluator.js.map +1 -1
  53. package/dist/esm/llm/createClassifierFn.d.ts +1 -1
  54. package/dist/esm/llm/createClassifierFn.d.ts.map +1 -1
  55. package/dist/esm/llm/createClassifierFn.js.map +1 -1
  56. package/dist/esm/llm/createConcisenessEvaluator.d.ts +43 -0
  57. package/dist/esm/llm/createConcisenessEvaluator.d.ts.map +1 -0
  58. package/dist/esm/llm/createConcisenessEvaluator.js +39 -0
  59. package/dist/esm/llm/createConcisenessEvaluator.js.map +1 -0
  60. package/dist/esm/llm/createCorrectnessEvaluator.d.ts +2 -2
  61. package/dist/esm/llm/createCorrectnessEvaluator.d.ts.map +1 -1
  62. package/dist/esm/llm/createCorrectnessEvaluator.js.map +1 -1
  63. package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts +2 -2
  64. package/dist/esm/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
  65. package/dist/esm/llm/createDocumentRelevanceEvaluator.js.map +1 -1
  66. package/dist/esm/llm/createFaithfulnessEvaluator.d.ts +2 -2
  67. package/dist/esm/llm/createFaithfulnessEvaluator.d.ts.map +1 -1
  68. package/dist/esm/llm/createFaithfulnessEvaluator.js.map +1 -1
  69. package/dist/esm/llm/createHallucinationEvaluator.d.ts +2 -2
  70. package/dist/esm/llm/createHallucinationEvaluator.d.ts.map +1 -1
  71. package/dist/esm/llm/createHallucinationEvaluator.js.map +1 -1
  72. package/dist/esm/llm/createToolInvocationEvaluator.d.ts +2 -2
  73. package/dist/esm/llm/createToolInvocationEvaluator.d.ts.map +1 -1
  74. package/dist/esm/llm/createToolInvocationEvaluator.js.map +1 -1
  75. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  76. package/dist/esm/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  77. package/dist/esm/llm/createToolResponseHandlingEvaluator.js +59 -0
  78. package/dist/esm/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  79. package/dist/esm/llm/createToolSelectionEvaluator.d.ts +64 -0
  80. package/dist/esm/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  81. package/dist/esm/llm/createToolSelectionEvaluator.js +50 -0
  82. package/dist/esm/llm/createToolSelectionEvaluator.js.map +1 -0
  83. package/dist/esm/llm/generateClassification.d.ts +2 -2
  84. package/dist/esm/llm/generateClassification.d.ts.map +1 -1
  85. package/dist/esm/llm/generateClassification.js +1 -1
  86. package/dist/esm/llm/generateClassification.js.map +1 -1
  87. package/dist/esm/llm/index.d.ts +3 -0
  88. package/dist/esm/llm/index.d.ts.map +1 -1
  89. package/dist/esm/llm/index.js +3 -0
  90. package/dist/esm/llm/index.js.map +1 -1
  91. package/dist/esm/template/applyTemplate.d.ts +1 -1
  92. package/dist/esm/template/applyTemplate.d.ts.map +1 -1
  93. package/dist/esm/template/applyTemplate.js +1 -1
  94. package/dist/esm/template/applyTemplate.js.map +1 -1
  95. package/dist/esm/template/getTemplateVariables.d.ts +1 -1
  96. package/dist/esm/template/getTemplateVariables.d.ts.map +1 -1
  97. package/dist/esm/template/getTemplateVariables.js.map +1 -1
  98. package/dist/esm/tsconfig.esm.tsbuildinfo +1 -1
  99. package/dist/esm/types/evals.d.ts +4 -4
  100. package/dist/esm/types/evals.d.ts.map +1 -1
  101. package/dist/esm/types/otel.d.ts +1 -1
  102. package/dist/esm/types/otel.d.ts.map +1 -1
  103. package/dist/esm/utils/bindEvaluator.d.ts +2 -2
  104. package/dist/esm/utils/bindEvaluator.d.ts.map +1 -1
  105. package/dist/esm/utils/objectMappingUtils.d.ts +1 -1
  106. package/dist/esm/utils/objectMappingUtils.d.ts.map +1 -1
  107. package/dist/esm/utils/objectMappingUtils.js.map +1 -1
  108. package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  109. package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  110. package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -0
  111. package/dist/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  112. package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  113. package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +9 -1
  114. package/dist/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  115. package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  116. package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js +14 -3
  117. package/dist/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  118. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  119. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js +16 -2
  120. package/dist/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  121. package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  122. package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js +15 -1
  123. package/dist/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  124. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  125. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js +61 -11
  126. package/dist/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  127. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts +3 -0
  128. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -0
  129. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js +89 -0
  130. package/dist/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -0
  131. package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map +1 -1
  132. package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js +44 -8
  133. package/dist/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.js.map +1 -1
  134. package/dist/src/__generated__/default_templates/index.d.ts +2 -0
  135. package/dist/src/__generated__/default_templates/index.d.ts.map +1 -1
  136. package/dist/src/__generated__/default_templates/index.js +5 -1
  137. package/dist/src/__generated__/default_templates/index.js.map +1 -1
  138. package/dist/src/core/EvaluatorBase.d.ts +2 -2
  139. package/dist/src/core/EvaluatorBase.d.ts.map +1 -1
  140. package/dist/src/core/FunctionEvaluator.d.ts +1 -1
  141. package/dist/src/core/FunctionEvaluator.d.ts.map +1 -1
  142. package/dist/src/core/FunctionEvaluator.js.map +1 -1
  143. package/dist/src/helpers/asEvaluatorFn.d.ts +1 -1
  144. package/dist/src/helpers/asEvaluatorFn.d.ts.map +1 -1
  145. package/dist/src/helpers/asEvaluatorFn.js.map +1 -1
  146. package/dist/src/helpers/createEvaluator.d.ts +2 -2
  147. package/dist/src/helpers/createEvaluator.d.ts.map +1 -1
  148. package/dist/src/helpers/createEvaluator.js.map +1 -1
  149. package/dist/src/helpers/toEvaluationResult.d.ts +1 -1
  150. package/dist/src/helpers/toEvaluationResult.d.ts.map +1 -1
  151. package/dist/src/llm/ClassificationEvaluator.d.ts +3 -3
  152. package/dist/src/llm/ClassificationEvaluator.d.ts.map +1 -1
  153. package/dist/src/llm/ClassificationEvaluator.js.map +1 -1
  154. package/dist/src/llm/LLMEvaluator.d.ts +1 -1
  155. package/dist/src/llm/LLMEvaluator.d.ts.map +1 -1
  156. package/dist/src/llm/createClassificationEvaluator.d.ts +1 -1
  157. package/dist/src/llm/createClassificationEvaluator.d.ts.map +1 -1
  158. package/dist/src/llm/createClassificationEvaluator.js.map +1 -1
  159. package/dist/src/llm/createClassifierFn.d.ts +1 -1
  160. package/dist/src/llm/createClassifierFn.d.ts.map +1 -1
  161. package/dist/src/llm/createClassifierFn.js.map +1 -1
  162. package/dist/src/llm/createConcisenessEvaluator.d.ts +43 -0
  163. package/dist/src/llm/createConcisenessEvaluator.d.ts.map +1 -0
  164. package/dist/src/llm/createConcisenessEvaluator.js +50 -0
  165. package/dist/src/llm/createConcisenessEvaluator.js.map +1 -0
  166. package/dist/src/llm/createCorrectnessEvaluator.d.ts +2 -2
  167. package/dist/src/llm/createCorrectnessEvaluator.d.ts.map +1 -1
  168. package/dist/src/llm/createCorrectnessEvaluator.js.map +1 -1
  169. package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts +2 -2
  170. package/dist/src/llm/createDocumentRelevanceEvaluator.d.ts.map +1 -1
  171. package/dist/src/llm/createDocumentRelevanceEvaluator.js.map +1 -1
  172. package/dist/src/llm/createFaithfulnessEvaluator.d.ts +2 -2
  173. package/dist/src/llm/createFaithfulnessEvaluator.d.ts.map +1 -1
  174. package/dist/src/llm/createFaithfulnessEvaluator.js.map +1 -1
  175. package/dist/src/llm/createHallucinationEvaluator.d.ts +2 -2
  176. package/dist/src/llm/createHallucinationEvaluator.d.ts.map +1 -1
  177. package/dist/src/llm/createHallucinationEvaluator.js.map +1 -1
  178. package/dist/src/llm/createToolInvocationEvaluator.d.ts +2 -2
  179. package/dist/src/llm/createToolInvocationEvaluator.d.ts.map +1 -1
  180. package/dist/src/llm/createToolInvocationEvaluator.js.map +1 -1
  181. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts +78 -0
  182. package/dist/src/llm/createToolResponseHandlingEvaluator.d.ts.map +1 -0
  183. package/dist/src/llm/createToolResponseHandlingEvaluator.js +70 -0
  184. package/dist/src/llm/createToolResponseHandlingEvaluator.js.map +1 -0
  185. package/dist/src/llm/createToolSelectionEvaluator.d.ts +64 -0
  186. package/dist/src/llm/createToolSelectionEvaluator.d.ts.map +1 -0
  187. package/dist/src/llm/createToolSelectionEvaluator.js +61 -0
  188. package/dist/src/llm/createToolSelectionEvaluator.js.map +1 -0
  189. package/dist/src/llm/generateClassification.d.ts +2 -2
  190. package/dist/src/llm/generateClassification.d.ts.map +1 -1
  191. package/dist/src/llm/generateClassification.js +1 -1
  192. package/dist/src/llm/generateClassification.js.map +1 -1
  193. package/dist/src/llm/index.d.ts +3 -0
  194. package/dist/src/llm/index.d.ts.map +1 -1
  195. package/dist/src/llm/index.js +3 -0
  196. package/dist/src/llm/index.js.map +1 -1
  197. package/dist/src/template/applyTemplate.d.ts +1 -1
  198. package/dist/src/template/applyTemplate.d.ts.map +1 -1
  199. package/dist/src/template/applyTemplate.js +1 -1
  200. package/dist/src/template/applyTemplate.js.map +1 -1
  201. package/dist/src/template/getTemplateVariables.d.ts +1 -1
  202. package/dist/src/template/getTemplateVariables.d.ts.map +1 -1
  203. package/dist/src/template/getTemplateVariables.js.map +1 -1
  204. package/dist/src/types/evals.d.ts +4 -4
  205. package/dist/src/types/evals.d.ts.map +1 -1
  206. package/dist/src/types/otel.d.ts +1 -1
  207. package/dist/src/types/otel.d.ts.map +1 -1
  208. package/dist/src/utils/bindEvaluator.d.ts +2 -2
  209. package/dist/src/utils/bindEvaluator.d.ts.map +1 -1
  210. package/dist/src/utils/objectMappingUtils.d.ts +1 -1
  211. package/dist/src/utils/objectMappingUtils.d.ts.map +1 -1
  212. package/dist/src/utils/objectMappingUtils.js.map +1 -1
  213. package/dist/tsconfig.tsbuildinfo +1 -1
  214. package/package.json +37 -38
  215. package/src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +60 -0
  216. package/src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +9 -1
  217. package/src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts +14 -3
  218. package/src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts +16 -2
  219. package/src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +15 -1
  220. package/src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts +61 -11
  221. package/src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts +88 -0
  222. package/src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts +44 -8
  223. package/src/__generated__/default_templates/index.ts +2 -0
  224. package/src/core/EvaluatorBase.ts +2 -2
  225. package/src/core/FunctionEvaluator.ts +5 -2
  226. package/src/helpers/asEvaluatorFn.ts +1 -2
  227. package/src/helpers/createEvaluator.ts +2 -3
  228. package/src/helpers/toEvaluationResult.ts +1 -1
  229. package/src/llm/ClassificationEvaluator.ts +4 -5
  230. package/src/llm/LLMEvaluator.ts +1 -1
  231. package/src/llm/createClassificationEvaluator.ts +1 -2
  232. package/src/llm/createClassifierFn.ts +1 -2
  233. package/src/llm/createConcisenessEvaluator.ts +71 -0
  234. package/src/llm/createCorrectnessEvaluator.ts +2 -3
  235. package/src/llm/createDocumentRelevanceEvaluator.ts +2 -3
  236. package/src/llm/createFaithfulnessEvaluator.ts +2 -3
  237. package/src/llm/createHallucinationEvaluator.ts +2 -3
  238. package/src/llm/createToolInvocationEvaluator.ts +2 -3
  239. package/src/llm/createToolResponseHandlingEvaluator.ts +108 -0
  240. package/src/llm/createToolSelectionEvaluator.ts +92 -0
  241. package/src/llm/generateClassification.ts +5 -5
  242. package/src/llm/index.ts +3 -0
  243. package/src/template/applyTemplate.ts +2 -3
  244. package/src/template/getTemplateVariables.ts +2 -2
  245. package/src/types/evals.ts +4 -4
  246. package/src/types/otel.ts +1 -1
  247. package/src/utils/bindEvaluator.ts +2 -2
  248. package/src/utils/objectMappingUtils.ts +2 -2
package/README.md CHANGED
@@ -92,24 +92,24 @@ See the complete example in [`examples/classifier_example.ts`](examples/classifi
92
92
  The library includes several pre-built evaluators for common evaluation tasks. These evaluators come with optimized prompts and can be used directly with any AI SDK model.
93
93
 
94
94
  ```typescript
95
- import { createHallucinationEvaluator } from "@arizeai/phoenix-evals/llm";
95
+ import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals/llm";
96
96
  import { openai } from "@ai-sdk/openai";
97
97
  const model = openai("gpt-4o-mini");
98
98
 
99
- // Hallucination Detection
100
- const hallucinationEvaluator = createHallucinationEvaluator({
99
+ // Faithfulness Detection
100
+ const faithfulnessEvaluator = createFaithfulnessEvaluator({
101
101
  model,
102
102
  });
103
103
 
104
104
  // Use the evaluators
105
- const result = await hallucinationEvaluator({
105
+ const result = await faithfulnessEvaluator({
106
106
  input: "What is the capital of France?",
107
107
  context: "France is a country in Europe. Paris is its capital city.",
108
108
  output: "The capital of France is London.",
109
109
  });
110
110
 
111
111
  console.log(result);
112
- // Output: { label: "hallucinated", score: 0, explanation: "..." }
112
+ // Output: { label: "unfaithful", score: 0, explanation: "..." }
113
113
  ```
114
114
 
115
115
  ### Data Mapping
@@ -119,7 +119,7 @@ When your data structure doesn't match what an evaluator expects, use `bindEvalu
119
119
  ```typescript
120
120
  import {
121
121
  bindEvaluator,
122
- createHallucinationEvaluator,
122
+ createFaithfulnessEvaluator,
123
123
  } from "@arizeai/phoenix-evals";
124
124
  import { openai } from "@ai-sdk/openai";
125
125
 
@@ -132,11 +132,11 @@ type ExampleType = {
132
132
  };
133
133
 
134
134
  const evaluator = bindEvaluator<ExampleType>(
135
- createHallucinationEvaluator({ model }),
135
+ createFaithfulnessEvaluator({ model }),
136
136
  {
137
137
  inputMapping: {
138
138
  input: "question", // Map "input" from "question"
139
- reference: "context", // Map "reference" from "context"
139
+ context: "context", // Map "context" from "context"
140
140
  output: "answer", // Map "output" from "answer"
141
141
  },
142
142
  }
@@ -167,23 +167,23 @@ npm install @arizeai/phoenix-client
167
167
  ```
168
168
 
169
169
  ```typescript
170
- import { createHallucinationEvaluator } from "@arizeai/phoenix-evals/llm";
170
+ import { createFaithfulnessEvaluator } from "@arizeai/phoenix-evals/llm";
171
171
  import { openai } from "@ai-sdk/openai";
172
172
  import { createDataset } from "@arizeai/phoenix-client/datasets";
173
173
  import {
174
- asEvaluator,
174
+ asExperimentEvaluator,
175
175
  runExperiment,
176
176
  } from "@arizeai/phoenix-client/experiments";
177
177
 
178
178
  // Create your evaluator
179
- const hallucinationEvaluator = createHallucinationEvaluator({
179
+ const faithfulnessEvaluator = createFaithfulnessEvaluator({
180
180
  model: openai("gpt-4o-mini"),
181
181
  });
182
182
 
183
183
  // Create a dataset for your experiment
184
184
  const dataset = await createDataset({
185
- name: "hallucination-eval",
186
- description: "Evaluate the hallucination of the model",
185
+ name: "faithfulness-eval",
186
+ description: "Evaluate the faithfulness of the model",
187
187
  examples: [
188
188
  {
189
189
  input: {
@@ -202,14 +202,14 @@ const task = async (example) => {
202
202
  };
203
203
 
204
204
  // Create a custom evaluator to validate results
205
- const hallucinationCheck = asEvaluator({
206
- name: "hallucination",
205
+ const faithfulnessCheck = asExperimentEvaluator({
206
+ name: "faithfulness",
207
207
  kind: "LLM",
208
208
  evaluate: async ({ input, output }) => {
209
- // Use the hallucination evaluator from phoenix-evals
210
- const result = await hallucinationEvaluator({
209
+ // Use the faithfulness evaluator from phoenix-evals
210
+ const result = await faithfulnessEvaluator({
211
211
  input: input.question,
212
- context: input.context, // Note: uses 'context' not 'reference'
212
+ context: input.context,
213
213
  output: output,
214
214
  });
215
215
 
@@ -219,11 +219,11 @@ const hallucinationCheck = asEvaluator({
219
219
 
220
220
  // Run the experiment with automatic tracing
221
221
  runExperiment({
222
- experimentName: "hallucination-eval",
223
- experimentDescription: "Evaluate the hallucination of the model",
222
+ experimentName: "faithfulness-eval",
223
+ experimentDescription: "Evaluate the faithfulness of the model",
224
224
  dataset: dataset,
225
225
  task,
226
- evaluators: [hallucinationCheck],
226
+ evaluators: [faithfulnessCheck],
227
227
  });
228
228
  ```
229
229
 
@@ -233,7 +233,7 @@ To run examples, install dependencies using `pnpm` and run:
233
233
 
234
234
  ```bash
235
235
  pnpm install
236
- pnpx tsx examples/experiment_evaluation_example.ts
236
+ pnpx tsx examples/classifier_example.ts
237
237
  # change the file name to run other examples
238
238
  ```
239
239
 
@@ -241,7 +241,7 @@ pnpx tsx examples/experiment_evaluation_example.ts
241
241
 
242
242
  Join our community to connect with thousands of AI builders:
243
243
 
244
- - 🌍 Join our [Slack community](https://arize-ai.slack.com/join/shared_invite/zt-11t1vbu4x-xkBIHmOREQnYnYDH1GDfCg).
244
+ - 🌍 Join our [Slack community](https://join.slack.com/t/arize-ai/shared_invite/zt-3r07iavnk-ammtATWSlF0pSrd1DsMW7g).
245
245
  - 📚 Read the [Phoenix documentation](https://arize.com/docs/phoenix).
246
246
  - 💡 Ask questions and provide feedback in the _#phoenix-support_ channel.
247
247
  - 🌟 Leave a star on our [GitHub](https://github.com/Arize-ai/phoenix).
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,6BAuDzD,CAAC"}
@@ -0,0 +1,58 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "conciseness",
4
+ description: "Evaluate whether model outputs are concise and free of unnecessary content.",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ You are an expert evaluator assessing the conciseness of model outputs. Your task is to determine whether a response uses the minimum number of words necessary to fully answer the question.
11
+
12
+ <rubric>
13
+
14
+ CONCISE - The response:
15
+
16
+ - Contains only the exact information requested
17
+ - Uses the minimum number of words necessary to convey the complete answer
18
+ - Omits pleasantries, hedging language, and unnecessary context
19
+ - Excludes meta-commentary about the answer or the model's capabilities
20
+ - Avoids redundant information or restatements
21
+ - Does not include explanations unless explicitly requested
22
+
23
+
24
+ VERBOSE - The response contains any of:
25
+
26
+ - Unnecessary pleasantries, greetings, or filler phrases (e.g., "Great question!", "Sure!", "I'd be happy to help")
27
+ - Hedging language or excessive qualifiers (e.g., "It's worth noting that...", "It's important to understand that...")
28
+ - Meta-commentary about the response itself or the model's capabilities
29
+ - Redundant restatements of the same information
30
+ - Unsolicited explanations, context, or caveats beyond what was asked
31
+ - Unnecessary formatting, bullet points, or structure for simple answers
32
+
33
+ </rubric>
34
+
35
+ <data>
36
+
37
+ <input>
38
+ {{input}}
39
+ </input>
40
+
41
+ <output>
42
+ {{output}}
43
+ </output>
44
+
45
+ </data>
46
+
47
+ Evaluate only the conciseness of the response. Do not assess correctness, helpfulness, or quality of information. Focus solely on whether the response uses more words than necessary to answer the question.
48
+
49
+ Is the output concise or verbose?
50
+ `,
51
+ },
52
+ ],
53
+ choices: {
54
+ "concise": 1,
55
+ "verbose": 0
56
+ },
57
+ };
58
+ //# sourceMappingURL=CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CONCISENESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,6EAA6E;IAC1F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAyCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,6BA8CzD,CAAC"}
1
+ {"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,2CAA2C,EAAE,6BAsDzD,CAAC"}
@@ -1,7 +1,7 @@
1
1
  // This file is generated. Do not edit by hand.
2
2
  export const CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
3
3
  name: "correctness",
4
- description: "Assess factual accuracy and completeness of model outputs.",
4
+ description: "Assess general correctness and completeness of model outputs.",
5
5
  optimizationDirection: "MAXIMIZE",
6
6
  template: [
7
7
  {
@@ -10,29 +10,37 @@ export const CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
10
10
  You are an expert evaluator labeling model outputs for correctness. Your task is to assign a classification based on the following criteria:
11
11
 
12
12
  <rubric>
13
+
13
14
  CORRECT - The response:
15
+
14
16
  - Provides accurate and complete information with no factual errors
15
17
  - Addresses all parts of the question
16
18
  - Is logically consistent with no contradictions
17
19
  - Uses precise, domain-appropriate terminology
18
20
  - Avoids ambiguous or misleading language
19
21
 
22
+
20
23
  INCORRECT - The response contains any of:
24
+
21
25
  - Factual errors or inaccuracies
22
26
  - Incomplete or partial answers
23
27
  - Misleading or ambiguous statements
24
28
  - Incorrect terminology
25
29
  - Logical inconsistencies
26
30
  - Missing key information
31
+
27
32
  </rubric>
28
33
 
29
34
  <data>
35
+
30
36
  <input>
31
37
  {{input}}
32
38
  </input>
39
+
33
40
  <output>
34
41
  {{output}}
35
42
  </output>
43
+
36
44
  </data>
37
45
 
38
46
  Carefully read the input and output and check for factual accuracy and completeness. Focus on correctness of information rather than verboseness or style.
@@ -1 +1 @@
1
- {"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,4DAA4D;IACzE,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAgCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
1
+ {"version":3,"file":"CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/CORRECTNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,2CAA2C,GAAkC;IACxF,IAAI,EAAE,aAAa;IACnB,WAAW,EAAE,+DAA+D;IAC5E,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAwCd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,6BA2BhE,CAAC"}
1
+ {"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,kDAAkD,EAAE,6BAsChE,CAAC"}
@@ -1,24 +1,35 @@
1
1
  // This file is generated. Do not edit by hand.
2
2
  export const DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG = {
3
3
  name: "document_relevance",
4
- description: "A specialized evaluator for determining document relevance to a given question.",
4
+ description: "For determining if a document is relevant to a given question.",
5
5
  optimizationDirection: "MAXIMIZE",
6
6
  template: [
7
7
  {
8
8
  role: "user",
9
9
  content: `
10
- You are comparing a document to a question and trying to determine if the document text contains information relevant to answering the question. Here is the data:
10
+ You are comparing a document to a question and trying to determine
11
+ if the document text contains information relevant to answering the
12
+ question. Here is the data:
11
13
 
12
14
  <data>
15
+
13
16
  <question>
14
17
  {{input}}
15
18
  </question>
19
+
16
20
  <document_text>
17
21
  {{documentText}}
18
22
  </document_text>
23
+
19
24
  </data>
20
25
 
21
- Compare the question above to the document text. You must determine whether the document text contains information that can answer the question. Please focus on whether the very specific question can be answered by the information in the document text. Your response must be either "relevant" or "unrelated". "unrelated" means that the document text does not contain an answer to the question. "relevant" means the document text contains an answer to the question.
26
+ Compare the question above to the document text. You must determine
27
+ whether the document text contains information that can answer the
28
+ question. Please focus on whether the very specific question can be
29
+ answered by the information in the document text. Your response must be
30
+ either "relevant" or "unrelated". "unrelated" means that the document
31
+ text does not contain an answer to the question. "relevant" means the
32
+ document text contains an answer to the question.
22
33
  `,
23
34
  },
24
35
  ],
@@ -1 +1 @@
1
- {"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;CAad;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
1
+ {"version":3,"file":"DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/DOCUMENT_RELEVANCE_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,kDAAkD,GAAkC;IAC/F,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,gEAAgE;IAC7E,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;CAwBd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,4CAA4C,EAAE,6BA8B1D,CAAC"}
1
+ {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,4CAA4C,EAAE,6BA4C1D,CAAC"}
@@ -1,24 +1,38 @@
1
1
  // This file is generated. Do not edit by hand.
2
2
  export const FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG = {
3
3
  name: "faithfulness",
4
- description: "A specialized evaluator for detecting faithfulness in grounded LLM responses.",
4
+ description: "For determining if a response is faithful to the context.",
5
5
  optimizationDirection: "MAXIMIZE",
6
6
  template: [
7
7
  {
8
8
  role: "user",
9
9
  content: `
10
- In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is unfaithful to the facts. Your objective is to determine whether the response text contains factual information and is faithful to the context. An 'unfaithful' response refers to a response that is not based on the context or assumes information that is not available in the context. Your response should be a single word: either 'faithful' or 'unfaithful', and it should not include any other text or characters. 'unfaithful' indicates that the response provides factually inaccurate information to the query based on the context. 'faithful' indicates that the response to the question is correct relative to the context, and does not contain made up information. Please read the query and context carefully before determining your response.
10
+ In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is unfaithful to the facts.
11
+
12
+ Your objective is to determine whether the response text contains factual information and is faithful to the context. An 'unfaithful' response refers to a response that is not based on the context or assumes information that is not available in the context.
13
+
14
+ Your response should be a single word: either 'faithful' or 'unfaithful', and it should not include any other text or characters.
15
+
16
+ 'unfaithful' indicates that the response provides factually inaccurate information to the query based on the context.
17
+
18
+ 'faithful' indicates that the response to the question is correct relative to the context, and does not contain made up information.
19
+
20
+ Please read the query and context carefully before determining your response.
11
21
 
12
22
  <data>
23
+
13
24
  <query>
14
25
  {{input}}
15
26
  </query>
27
+
16
28
  <context>
17
29
  {{context}}
18
30
  </context>
31
+
19
32
  <response>
20
33
  {{output}}
21
34
  </response>
35
+
22
36
  </data>
23
37
 
24
38
  Is the response above faithful or unfaithful based on the query and context?
@@ -1 +1 @@
1
- {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,4CAA4C,GAAkC;IACzF,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,+EAA+E;IAC5F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;CAgBd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,YAAY,EAAE,CAAC;KAChB;CACA,CAAC"}
1
+ {"version":3,"file":"FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/FAITHFULNESS_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,4CAA4C,GAAkC;IACzF,IAAI,EAAE,cAAc;IACpB,WAAW,EAAE,2DAA2D;IACxE,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA8Bd;SACI;KACF;IACD,OAAO,EAAE;QACT,UAAU,EAAE,CAAC;QACb,YAAY,EAAE,CAAC;KAChB;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,6BA8B3D,CAAC"}
1
+ {"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,6CAA6C,EAAE,6BA4C3D,CAAC"}
@@ -7,18 +7,32 @@ export const HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG = {
7
7
  {
8
8
  role: "user",
9
9
  content: `
10
- In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is a hallucination of facts. Your objective is to determine whether the response text contains factual information and is not a hallucination. A 'hallucination' refers to a response that is not based on the context or assumes information that is not available in the context. Your response should be a single word: either 'factual' or 'hallucinated', and it should not include any other text or characters. 'hallucinated' indicates that the response provides factually inaccurate information to the query based on the context. 'factual' indicates that the response to the question is correct relative to the context, and does not contain made up information. Please read the query and context carefully before determining your response.
10
+ In this task, you will be presented with a query, some context and a response. The response is generated to the question based on the context. The response may contain false information. You must use the context to determine if the response to the question contains false information, if the response is hallucinated.
11
+
12
+ Your objective is to determine whether the response text contains factual information and is factual relative to the context. An 'hallucinated' response refers to a response that is not based on the context or assumes information that is not available in the context.
13
+
14
+ Your response should be a single word: either 'factual' or 'hallucinated', and it should not include any other text or characters.
15
+
16
+ 'hallucinated' indicates that the response provides factually inaccurate information to the query based on the context.
17
+
18
+ 'factual' indicates that the response to the question is correct relative to the context, and does not contain made up information.
19
+
20
+ Please read the query and context carefully before determining your response.
11
21
 
12
22
  <data>
23
+
13
24
  <query>
14
25
  {{input}}
15
26
  </query>
27
+
16
28
  <context>
17
29
  {{context}}
18
30
  </context>
31
+
19
32
  <response>
20
33
  {{output}}
21
34
  </response>
35
+
22
36
  </data>
23
37
 
24
38
  Is the response above factual or hallucinated based on the query and context?
@@ -1 +1 @@
1
- {"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;CAgBd;SACI;KACF;IACD,OAAO,EAAE;QACT,cAAc,EAAE,CAAC;QACjB,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
1
+ {"version":3,"file":"HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/HALLUCINATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,6CAA6C,GAAkC;IAC1F,IAAI,EAAE,eAAe;IACrB,WAAW,EAAE,iFAAiF;IAC9F,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA8Bd;SACI;KACF;IACD,OAAO,EAAE;QACT,cAAc,EAAE,CAAC;QACjB,SAAS,EAAE,CAAC;KACb;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,+CAA+C,EAAE,6BA2B7D,CAAC"}
1
+ {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,+CAA+C,EAAE,6BA6E7D,CAAC"}
@@ -1,23 +1,73 @@
1
1
  // This file is generated. Do not edit by hand.
2
2
  export const TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG = {
3
3
  name: "tool_invocation",
4
- description: "For determining if a tool was invoked correctly with proper arguments, formatting, and safe content. Requires conversation context, available tool schemas, and the LLM's tool invocation(s).",
4
+ description: "For determining if a tool was invoked correctly with proper arguments, formatting, and safe content.",
5
5
  optimizationDirection: "MAXIMIZE",
6
6
  template: [
7
7
  {
8
8
  role: "user",
9
9
  content: `
10
10
  You are an impartial judge evaluating an LLM's tool-calling behavior, specifically whether the LLM invoked a tool (or tools) correctly with valid arguments and proper formatting.
11
- Your task: Determine whether the LLM's tool invocation(s) were correct or incorrect based on: - The full conversation context (including all previous turns, not just the most recent message) - The available tool schemas - The LLM's tool invocation(s) with arguments
12
- IMPORTANT - Tool Invocation vs. Tool Selection: - You are ONLY evaluating the tool invocation, not the tool selection. - If the tool selection is incorrect or not relevant to the user's query, but the tool invocation is correct, return "correct". - If the tool selection is correct but the tool invocation is incorrect, return "incorrect".
13
- IMPORTANT - Multi-Tool Invocations: - The LLM may invoke MULTIPLE tools in a single response. This is valid and expected for complex requests. - When multiple tools are invoked, evaluate EACH tool invocation independently. - Return "correct" only if ALL tool invocations are correct. - Return "incorrect" if ANY tool invocation has an error.
14
- IMPORTANT - Conversation Context: - Read the entire conversation history carefully, not just the final user message. - Argument values may need to be extracted from EARLIER turns in the conversation (e.g., user mentions a location, date, or quantity in a previous message). - The LLM should use context from the full conversation to populate argument values correctly.
15
- Criteria Return "correct" only when ALL of the following are true for EVERY tool invocation: - JSON is properly structured (if applicable). - All required fields/parameters are present. - No hallucinated or nonexistent fields (all fields exist in the tool schema). - Argument values match the user's intent from the conversation context (correct types, realistic values). - No unsafe content (e.g., PII like SSNs, credit card numbers, passwords) in arguments.
16
- Return "incorrect" if ANY of the following are true for ANY tool invocation: - The invocation contains hallucinated or nonexistent fields not in the schema. - Required fields/parameters are missing. - JSON is improperly formatted or malformed. - Argument values are incorrect, hallucinated, or do not match user intent from the conversation. - Arguments contain unsafe content (e.g., PII, sensitive data that should not be passed).
17
- Before providing your final judgment, explain your reasoning and consider: - How many tools were invoked? Evaluate each one. - Does each tool invocation match the schema for that tool? - Are all required parameters provided with appropriate values for each invocation? - Are there any extra fields that don't exist in the schema? - Looking at the FULL conversation: do the argument values accurately reflect what the user requested across all messages? - Is there any unsafe or sensitive content in any of the arguments? - Check that you are not evaluating the tool selection, only the tool invocation.
18
- <data> <context> {{input}} </context>
19
- <available_tools> {{availableTools}} </available_tools>
20
- <tool_invocation> {{toolSelection}} </tool_invocation> </data>
11
+
12
+ Your task: Determine whether the LLM's tool invocation(s) were correct or incorrect based on:
13
+ - The full conversation context (including all previous turns, not just the most recent message)
14
+ - The available tool schemas
15
+ - The LLM's tool invocation(s) with arguments
16
+
17
+ IMPORTANT - Tool Invocation vs. Tool Selection:
18
+ - You are ONLY evaluating the tool invocation, not the tool selection.
19
+ - If the tool selection is incorrect or not relevant to the user's query, but the tool invocation is correct, return "correct".
20
+ - If the tool selection is correct but the tool invocation is incorrect, return "incorrect".
21
+
22
+ IMPORTANT - Multi-Tool Invocations:
23
+ - The LLM may invoke MULTIPLE tools in a single response. This is valid and expected for complex requests.
24
+ - When multiple tools are invoked, evaluate EACH tool invocation independently.
25
+ - Return "correct" only if ALL tool invocations are correct.
26
+ - Return "incorrect" if ANY tool invocation has an error.
27
+
28
+ IMPORTANT - Conversation Context (input):
29
+ - Read the entire conversation history carefully, not just the final user message.
30
+ - Argument values may need to be extracted from EARLIER turns in the conversation (e.g., user mentions a location, date, or quantity in a previous message).
31
+ - The LLM should use context from the full conversation to populate argument values correctly.
32
+
33
+ Criteria
34
+ Return "correct" only when ALL of the following are true for EVERY tool invocation:
35
+ - JSON is properly structured (if applicable).
36
+ - All required fields/parameters are present.
37
+ - No hallucinated or nonexistent fields (all fields exist in the tool schema).
38
+ - Argument values match the user's intent from the conversation context (correct types, realistic values).
39
+ - No unsafe content (e.g., PII like SSNs, credit card numbers, passwords) in arguments.
40
+
41
+ Return "incorrect" if ANY of the following are true for ANY tool invocation:
42
+ - The invocation contains hallucinated or nonexistent fields not in the schema.
43
+ - Required fields/parameters are missing.
44
+ - JSON is improperly formatted or malformed.
45
+ - Argument values are incorrect, hallucinated, or do not match user intent from the conversation.
46
+ - Arguments contain unsafe content (e.g., PII, sensitive data that should not be passed).
47
+
48
+ Before providing your final judgment, explain your reasoning and consider:
49
+ - How many tools were invoked? Evaluate each one.
50
+ - Does each tool invocation match the schema for that tool?
51
+ - Are all required parameters provided with appropriate values for each invocation?
52
+ - Are there any extra fields that don't exist in the schema?
53
+ - Looking at the FULL input: do the argument values accurately reflect what the user requested across all messages?
54
+ - Is there any unsafe or sensitive content in any of the arguments?
55
+ - Check that you are not evaluating the tool selection, only the tool invocation.
56
+
57
+ <data>
58
+ <input>
59
+ {{input}}
60
+ </input>
61
+
62
+ <available_tools>
63
+ {{availableTools}}
64
+ </available_tools>
65
+
66
+ <output>
67
+ {{toolSelection}}
68
+ </output>
69
+ </data>
70
+
21
71
  Given the above data, is the tool invocation correct or incorrect?
22
72
  `,
23
73
  },
@@ -1 +1 @@
1
- {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,+CAA+C,GAAkC;IAC5F,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE,+LAA+L;IAC5M,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;CAad;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
1
+ {"version":3,"file":"TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_INVOCATION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,+CAA+C,GAAkC;IAC5F,IAAI,EAAE,iBAAiB;IACvB,WAAW,EAAE,sGAAsG;IACnH,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+Dd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { ClassificationEvaluatorConfig } from "../types.js";
2
+ export declare const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG: ClassificationEvaluatorConfig;
3
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,sDAAsD,EAAE,6BAmFpE,CAAC"}
@@ -0,0 +1,86 @@
1
+ // This file is generated. Do not edit by hand.
2
+ export const TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG = {
3
+ name: "tool_response_handling",
4
+ description: "For determining if an AI agent properly handled a tool's response, including error handling, data extraction, transformation, and safe information disclosure. Requires conversation context, the tool call(s), the tool result(s), and the agent's output.",
5
+ optimizationDirection: "MAXIMIZE",
6
+ template: [
7
+ {
8
+ role: "user",
9
+ content: `
10
+ You are an impartial judge evaluating an AI agent's handling of a tool's response. Your task is to determine whether the agent correctly processed the tool result to produce an appropriate output.
11
+
12
+ IMPORTANT - Scope of Evaluation:
13
+ - You are ONLY evaluating how the agent handled the tool response, NOT whether the right tool was selected or whether the tool was invoked correctly.
14
+ - This evaluation focuses on what happens AFTER the tool returns a result.
15
+
16
+ IMPORTANT - Multi-Tool Handling:
17
+ - The agent may make MULTIPLE tool calls in a single interaction. This is valid and expected.
18
+ - When multiple tools are called, evaluate how the agent handled ALL tool results together.
19
+ - Return "correct" only if the agent properly handled ALL tool results.
20
+ - Return "incorrect" if the agent mishandled ANY tool result.
21
+
22
+ IMPORTANT - Error Response Handling:
23
+ - Tool results may contain errors (rate limits, timeouts, not found, invalid arguments, etc.).
24
+ - The agent's output may include retries, follow-up tool calls, or a final response to the user.
25
+ - Evaluate the ENTIRE handling sequence, not just the final message.
26
+ - Appropriate error handling includes:
27
+ - Retrying on transient errors (rate limits, timeouts)
28
+ - Correcting arguments after invalid argument errors
29
+ - Informing the user appropriately when errors are not recoverable
30
+ - NOT making repeated identical calls that continue to fail
31
+
32
+ Criteria for CORRECT handling:
33
+ - Data is extracted accurately from the tool result (no hallucination of data that wasn't returned)
34
+ - Dates, numbers, and structured fields are properly transformed and formatted
35
+ - Results are accurately summarized to address the user's original query
36
+ - Error responses are handled appropriately (retries for transient errors, corrections for invalid arguments)
37
+ - No repeated identical calls after non-retryable errors
38
+ - No disclosure of sensitive/internal information (database credentials, internal URLs, PII, API keys, etc.)
39
+ - The agent's response actually uses the tool result rather than ignoring it
40
+
41
+ Criteria for INCORRECT handling:
42
+ - Hallucinated data: The output includes information not present in the tool result
43
+ - Misinterpretation: The meaning of the tool result is misrepresented or reversed
44
+ - Improper transformation: Dates, numbers, or structured data are incorrectly converted
45
+ - Missing retry: Failed to retry on retryable errors (rate limits, timeouts)
46
+ - Missing correction: Failed to correct arguments after invalid argument errors
47
+ - Futile retries: Repeated identical calls that continue to fail
48
+ - Information disclosure: Leaked sensitive information (credentials, internal URLs, PII)
49
+ - Ignored results: The agent's response doesn't incorporate the tool result
50
+ - Incomplete handling: Only some tool results are used when multiple tools were called
51
+
52
+ Before providing your final judgment, explain your reasoning and consider:
53
+ - Does the output accurately reflect what the tool returned?
54
+ - Are there any fabricated details not in the tool result?
55
+ - Were errors handled appropriately?
56
+ - Is sensitive information properly protected?
57
+ - Does the output actually address the user's query using the tool data?
58
+
59
+ <data>
60
+ <input>
61
+ {{input}}
62
+ </input>
63
+
64
+ <tool_call>
65
+ {{toolCall}}
66
+ </tool_call>
67
+
68
+ <tool_result>
69
+ {{toolResult}}
70
+ </tool_result>
71
+
72
+ <output>
73
+ {{output}}
74
+ </output>
75
+ </data>
76
+
77
+ Given the above data, did the agent handle the tool response correctly or incorrectly?
78
+ `,
79
+ },
80
+ ],
81
+ choices: {
82
+ "correct": 1,
83
+ "incorrect": 0
84
+ },
85
+ };
86
+ //# sourceMappingURL=TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.js","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_RESPONSE_HANDLING_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAI/C,MAAM,CAAC,MAAM,sDAAsD,GAAkC;IACnG,IAAI,EAAE,wBAAwB;IAC9B,WAAW,EAAE,6PAA6P;IAC1Q,qBAAqB,EAAE,UAAU;IACjC,QAAQ,EAAE;QACR;YACE,IAAI,EAAE,MAAM;YACZ,OAAO,EAAE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAqEd;SACI;KACF;IACD,OAAO,EAAE;QACT,SAAS,EAAE,CAAC;QACZ,WAAW,EAAE,CAAC;KACf;CACA,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,8CAA8C,EAAE,6BAwB5D,CAAC"}
1
+ {"version":3,"file":"TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.d.ts","sourceRoot":"","sources":["../../../../src/__generated__/default_templates/TOOL_SELECTION_CLASSIFICATION_EVALUATOR_CONFIG.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,6BAA6B,EAAE,MAAM,UAAU,CAAC;AAE9D,eAAO,MAAM,8CAA8C,EAAE,6BA4D5D,CAAC"}