deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py
@@ -0,0 +1,627 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+import itertools
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    get_turns_in_sliding_window,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_faithfulness.template import (
+    TurnFaithfulnessTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_faithfulness.schema import (
+    FaithfulnessVerdict,
+    Verdicts,
+    FaithfulnessScoreReason,
+    Truths,
+    Claims,
+    InteractionFaithfulnessScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnFaithfulnessMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
+        evaluation_template: Type[
+            TurnFaithfulnessTemplate
+        ] = TurnFaithfulnessTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size
+
+        self.truths_extraction_limit = truths_extraction_limit
+        if self.truths_extraction_limit is not None:
+            self.truths_extraction_limit = max(self.truths_extraction_limit, 0)
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_faithfulness_scores(window, multimodal)
+                    )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_faithfulness_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    def _get_faithfulness_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    async def _a_generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = await self.model.a_generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    def _generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = self.model.generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    async def _a_generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = await self.model.a_generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    def _generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = self.model.generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    async def _a_generate_verdicts(
+        self, claims: Claims, truths: Truths, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self, claims: Claims, truths: Truths, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            # No claims were generated for this window; callers unpack a
+            # (score, reason) tuple, so return one here as well.
+            return 1, None
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = self._get_interaction_reason(score, verdicts, multimodal)
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    async def _a_get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            # No claims were generated for this window; callers unpack a
+            # (score, reason) tuple, so return one here as well.
+            return 1, None
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = await self._a_get_interaction_reason(
+            score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    async def _a_get_interaction_reason(
+        self, score, verdicts, multimodal: bool
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        contradictions = []
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                contradictions.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            contradictions=contradictions,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=FaithfulnessScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: FaithfulnessScoreReason = await self.model.a_generate(
+                    prompt, schema=FaithfulnessScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
+        if self.include_reason is False:
+            return None
+
+        contradictions = []
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                contradictions.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            contradictions=contradictions,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=FaithfulnessScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: FaithfulnessScoreReason = self.model.generate(
+                    prompt, schema=FaithfulnessScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionFaithfulnessScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Window {index + 1} \n",
+                f"Truths: {prettify_list(interaction_score.truths)} \n",
+                f"Claims: {prettify_list(interaction_score.claims)} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Faithfulness"
deepeval/metrics/turn_relevancy/template.py
@@ -2,9 +2,20 @@ from typing import List, Dict
 
 
 class TurnRelevancyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.
 
 **