deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
@@ -0,0 +1,592 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+import itertools
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    get_turns_in_sliding_window,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_contextual_precision.template import (
+    TurnContextualPrecisionTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_contextual_precision.schema import (
+    ContextualPrecisionVerdict,
+    Verdicts,
+    ContextualPrecisionScoreReason,
+    InteractionContextualPrecisionScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnContextualPrecisionMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+        TurnParams.EXPECTED_OUTCOME,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        window_size: int = 10,
+        evaluation_template: Type[
+            TurnContextualPrecisionTemplate
+        ] = TurnContextualPrecisionTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.window_size = window_size
+        self.evaluation_template = evaluation_template
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_precision_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_precision_scores(
+                        window, test_case.expected_outcome, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_contextual_precision_scores(
+        self,
+        turns_window: List[Turn],
+        expected_outcome: str,
+        multimodal: bool,
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = await self._a_generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    def _get_contextual_precision_scores(
+        self,
+        turns_window: List[Turn],
+        expected_outcome: str,
+        multimodal: bool,
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = self._generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    async def _a_generate_verdicts(
+        self,
+        input: str,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualPrecisionVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualPrecisionVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            input=input,
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self,
+        input: str,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualPrecisionVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualPrecisionVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            input=input,
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    async def _a_get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = await self._a_get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = self._get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _calculate_interaction_score(
+        self, verdicts: List[ContextualPrecisionVerdict]
+    ) -> float:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 0
+
+        # Convert verdicts to binary list where 'yes' is 1 and others are 0
+        node_verdicts = [
+            1 if v.verdict.strip().lower() == "yes" else 0 for v in verdicts
+        ]
+
+        sum_weighted_precision_at_k = 0.0
+        relevant_nodes_count = 0
+
+        for k, is_relevant in enumerate(node_verdicts, start=1):
+            # If the item is relevant, update the counter and add weighted precision to sum
+            if is_relevant:
+                relevant_nodes_count += 1
+                precision_at_k = relevant_nodes_count / k
+                sum_weighted_precision_at_k += precision_at_k * is_relevant
+
+        if relevant_nodes_count == 0:
+            return 0
+
+        score = sum_weighted_precision_at_k / relevant_nodes_count
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        verdicts_with_nodes = []
+        for i, verdict in enumerate(verdicts):
+            verdicts_with_nodes.append(
+                {
+                    "verdict": verdict.verdict,
+                    "reason": verdict.reason,
+                    "node": f"Node {i + 1}",
+                }
+            )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            score=format(score, ".2f"),
+            verdicts=verdicts_with_nodes,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ContextualPrecisionScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualPrecisionScoreReason = (
+                    await self.model.a_generate(
+                        prompt, schema=ContextualPrecisionScoreReason
+                    )
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        verdicts_with_nodes = []
+        for i, verdict in enumerate(verdicts):
+            verdicts_with_nodes.append(
+                {
+                    "verdict": verdict.verdict,
+                    "reason": verdict.reason,
+                    "node": f"Node {i + 1}",
+                }
+            )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            score=format(score, ".2f"),
+            verdicts=verdicts_with_nodes,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ContextualPrecisionScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualPrecisionScoreReason = self.model.generate(
+                    prompt, schema=ContextualPrecisionScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionContextualPrecisionScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Window {index + 1} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Contextual Precision"
deepeval/metrics/turn_contextual_recall/schema.py +21 -0
@@ -0,0 +1,21 @@
+from typing import List, Optional
+from pydantic import BaseModel
+
+
+class ContextualRecallVerdict(BaseModel):
+    verdict: str
+    reason: str
+
+
+class Verdicts(BaseModel):
+    verdicts: List[ContextualRecallVerdict]
+
+
+class ContextualRecallScoreReason(BaseModel):
+    reason: str
+
+
+class InteractionContextualRecallScore(BaseModel):
+    score: float
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRecallVerdict]]
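
For orientation, the new turn-level metrics in this release are measured against a `ConversationalTestCase`, reading only the fields the code above accesses (`turns`, `expected_outcome`, per-turn `retrieval_context`, `multimodal`). The sketch below shows how `TurnContextualPrecisionMetric` might be invoked; it imports from the module path added in this diff, but the exact `Turn`/`ConversationalTestCase` constructor signatures and the default judge model (which requires an API key) are assumptions, not confirmed by this diff.

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_contextual_precision.turn_contextual_precision import (
    TurnContextualPrecisionMetric,
)

# Hypothetical conversation: the assistant turn carries the retrieval context
# that the metric ranks against the user's request and the expected outcome.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is your refund window?"),
        Turn(
            role="assistant",
            content="You can request a refund within 30 days of purchase.",
            retrieval_context=[
                "Refunds are accepted within 30 days of purchase.",
                "Shipping typically takes 3-5 business days.",
            ],
        ),
    ],
    expected_outcome="The customer learns the 30-day refund policy.",
)

metric = TurnContextualPrecisionMetric(threshold=0.5, async_mode=False)
metric.measure(test_case)
print(metric.score, metric.reason)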