deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
@@ -0,0 +1,576 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+import itertools
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    get_turns_in_sliding_window,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_contextual_relevancy.template import (
+    TurnContextualRelevancyTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_contextual_relevancy.schema import (
+    ContextualRelevancyVerdict,
+    ContextualRelevancyVerdicts,
+    ContextualRelevancyScoreReason,
+    InteractionContextualRelevancyScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnContextualRelevancyMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        window_size: int = 10,
+        evaluation_template: Type[
+            TurnContextualRelevancyTemplate
+        ] = TurnContextualRelevancyTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.window_size = window_size
+        self.evaluation_template = evaluation_template
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_relevancy_scores(
+                            window, multimodal
+                        )
+                    )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_relevancy_scores(
+                        window, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_contextual_relevancy_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = await self._a_generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    def _get_contextual_relevancy_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = self._generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    async def _a_generate_verdicts(
+        self, input: str, retrieval_context: List[str], multimodal: bool
+    ) -> List[ContextualRelevancyVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRelevancyVerdict] = []
+
+        # Generate verdicts for each context node
+        for context in retrieval_context:
+            prompt = self.evaluation_template.generate_verdicts(
+                input=input,
+                context=context,
+                multimodal=multimodal,
+            )
+
+            if self.using_native_model:
+                res, cost = await self.model.a_generate(
+                    prompt, schema=ContextualRelevancyVerdicts
+                )
+                self.evaluation_cost += cost
+                verdicts.extend([item for item in res.verdicts])
+            else:
+                try:
+                    res: ContextualRelevancyVerdicts = (
+                        await self.model.a_generate(
+                            prompt, schema=ContextualRelevancyVerdicts
+                        )
+                    )
+                    verdicts.extend([item for item in res.verdicts])
+                except TypeError:
+                    res = await self.model.a_generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    verdicts.extend(
+                        [
+                            ContextualRelevancyVerdict(**item)
+                            for item in data["verdicts"]
+                        ]
+                    )
+
+        return verdicts
+
+    def _generate_verdicts(
+        self, input: str, retrieval_context: List[str], multimodal: bool
+    ) -> List[ContextualRelevancyVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRelevancyVerdict] = []
+
+        # Generate verdicts for each context node
+        for context in retrieval_context:
+            prompt = self.evaluation_template.generate_verdicts(
+                input=input,
+                context=context,
+                multimodal=multimodal,
+            )
+
+            if self.using_native_model:
+                res, cost = self.model.generate(
+                    prompt, schema=ContextualRelevancyVerdicts
+                )
+                self.evaluation_cost += cost
+                verdicts.extend([item for item in res.verdicts])
+            else:
+                try:
+                    res: ContextualRelevancyVerdicts = self.model.generate(
+                        prompt, schema=ContextualRelevancyVerdicts
+                    )
+                    verdicts.extend([item for item in res.verdicts])
+                except TypeError:
+                    res = self.model.generate(prompt)
+                    data = trimAndLoadJson(res, self)
+                    verdicts.extend(
+                        [
+                            ContextualRelevancyVerdict(**item)
+                            for item in data["verdicts"]
+                        ]
+                    )
+
+        return verdicts
+
+    async def _a_get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualRelevancyVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = await self._a_get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualRelevancyVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = self._get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _calculate_interaction_score(
+        self, verdicts: List[ContextualRelevancyVerdict]
+    ) -> float:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        relevant_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "yes":
+                relevant_count += 1
+
+        score = relevant_count / number_of_verdicts
+        return score
+
+    async def _a_get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualRelevancyVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Separate relevant and irrelevant statements
+        irrelevant_statements = []
+        relevant_statements = []
+
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "yes":
+                relevant_statements.append(verdict.statement)
+            else:
+                irrelevant_statements.append(
+                    f"{verdict.statement}: {verdict.reason}"
+                )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            irrelevant_statements=irrelevant_statements,
+            relevant_statements=relevant_statements,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ContextualRelevancyScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRelevancyScoreReason = (
+                    await self.model.a_generate(
+                        prompt, schema=ContextualRelevancyScoreReason
+                    )
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualRelevancyVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Separate relevant and irrelevant statements
+        irrelevant_statements = []
+        relevant_statements = []
+
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "yes":
+                relevant_statements.append(verdict.statement)
+            else:
+                # Include the reason for irrelevance
+                irrelevant_statements.append(
+                    f"{verdict.statement}: {verdict.reason}"
+                )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            irrelevant_statements=irrelevant_statements,
+            relevant_statements=relevant_statements,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ContextualRelevancyScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRelevancyScoreReason = self.model.generate(
+                    prompt, schema=ContextualRelevancyScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, windows_scores: List[InteractionContextualRelevancyScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(windows_scores):
+            interaction_steps = [
+                f"Window {index + 1} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionContextualRelevancyScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionContextualRelevancyScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionContextualRelevancyScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Contextual Relevancy"
deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py
@@ -1,10 +1,10 @@
-from typing import List, Optional
+from typing import List, Optional, Literal
 from pydantic import BaseModel, Field


 class FaithfulnessVerdict(BaseModel):
-    verdict: str
     reason: Optional[str] = Field(default=None)
+    verdict: Literal["yes", "no", "idk"]


 class Verdicts(BaseModel):
@@ -19,5 +19,13 @@ class Claims(BaseModel):
     claims: List[str]


-class MultimodalFaithfulnessScoreReason(BaseModel):
+class FaithfulnessScoreReason(BaseModel):
     reason: str
+
+
+class InteractionFaithfulnessScore(BaseModel):
+    score: float
+    reason: Optional[str]
+    claims: List[str]
+    truths: List[str]
+    verdicts: List[FaithfulnessVerdict]
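
For orientation, below is a minimal usage sketch of the new TurnContextualRelevancyMetric added in 3.7.6. The import paths, constructor parameters (threshold, window_size), the measure() call, and the score/reason attributes come from the diff above; the example conversation, the retrieval context strings, and the assumption that a default evaluation model (for example, an OpenAI API key) is configured are illustrative only.

# Usage sketch (illustrative, not part of the package diff): scoring how relevant
# retrieved context is to the user's requests across conversation turns.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_contextual_relevancy.turn_contextual_relevancy import (
    TurnContextualRelevancyMetric,
)

# Hypothetical conversation; retrieval_context is attached to assistant turns,
# mirroring how the metric reads turn.retrieval_context in the code above.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is your refund window?"),
        Turn(
            role="assistant",
            content="Refunds are accepted within 30 days of purchase.",
            retrieval_context=["Policy: refunds are allowed within 30 days."],
        ),
    ]
)

# window_size controls how many unit interactions are grouped per sliding window.
metric = TurnContextualRelevancyMetric(threshold=0.5, window_size=10)
metric.measure(test_case)  # assumes a default evaluation model/API key is configured
print(metric.score, metric.reason)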