deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
+++ /dev/null
@@ -1,301 +0,0 @@
-from typing import Optional, List, Union
-
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCase
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
-    MultiModalContextualPrecisionTemplate,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
-)
-from deepeval.test_case import LLMTestCaseParams
-from deepeval.models import DeepEvalBaseMLLM
-import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
-from deepeval.metrics.indicator import metric_progress_indicator
-
-
-class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
-
-    _required_params: List[LLMTestCaseParams] = [
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.RETRIEVAL_CONTEXT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                    )
-                )
-            else:
-                self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
-                    self._generate_verdicts(
-                        test_case.input,
-                        test_case.expected_output,
-                        test_case.retrieval_context,
-                    )
-                )
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Verdicts:\n{prettify_list(self.verdicts)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
-                await self._a_generate_verdicts(
-                    test_case.input,
-                    test_case.expected_output,
-                    test_case.retrieval_context,
-                )
-            )
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Verdicts:\n{prettify_list(self.verdicts)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(self, input: str) -> Optional[str]:
-        if self.include_reason is False:
-            return None
-
-        retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reason": verdict.reason}
-            for verdict in self.verdicts
-        ]
-        prompt = MultiModalContextualPrecisionTemplate.generate_reason(
-            input=input,
-            verdicts=retrieval_contexts_verdicts,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt,
-                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
-                    await self.model.a_generate(
-                        prompt,
-                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _generate_reason(self, input: str) -> Optional[str]:
-        if self.include_reason is False:
-            return None
-
-        retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reason": verdict.reason}
-            for verdict in self.verdicts
-        ]
-        prompt = MultiModalContextualPrecisionTemplate.generate_reason(
-            input=input,
-            verdicts=retrieval_contexts_verdicts,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt,
-                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
-                    self.model.generate(
-                        prompt,
-                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    async def _a_generate_verdicts(
-        self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[mcpschema.ContextualPrecisionVerdict]:
-        prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
-            input=input,
-            expected_output=expected_output,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=mcpschema.Verdicts
-            )
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: mcpschema.Verdicts = await self.model.a_generate(
-                    prompt, schema=mcpschema.Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    mcpschema.ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(
-        self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[mcpschema.ContextualPrecisionVerdict]:
-        prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
-            input=input,
-            expected_output=expected_output,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: mcpschema.Verdicts = self.model.generate(
-                    prompt, schema=mcpschema.Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    mcpschema.ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _calculate_score(self):
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 0
-
-        # Convert verdicts to a binary list where 'yes' is 1 and others are 0
-        node_verdicts = [
-            1 if v.verdict.strip().lower() == "yes" else 0
-            for v in self.verdicts
-        ]
-
-        sum_weighted_precision_at_k = 0.0
-        relevant_nodes_count = 0
-        for k, is_relevant in enumerate(node_verdicts, start=1):
-            # If the item is relevant, update the counter and add the weighted precision at k to the sum
-            if is_relevant:
-                relevant_nodes_count += 1
-                precision_at_k = relevant_nodes_count / k
-                sum_weighted_precision_at_k += precision_at_k * is_relevant
-
-        if relevant_nodes_count == 0:
-            return 0
-        # Calculate weighted cumulative precision
-        score = sum_weighted_precision_at_k / relevant_nodes_count
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except TypeError:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multimodal Contextual Precision"
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from typing import List, Optional
-from pydantic import BaseModel, Field
-
-
-class ContextualPrecisionVerdict(BaseModel):
-    verdict: str
-    reason: str
-
-
-class Verdicts(BaseModel):
-    verdicts: List[ContextualPrecisionVerdict]
-
-
-class MultimodelContextualPrecisionScoreReason(BaseModel):
-    reason: str
--- a/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py
+++ /dev/null
@@ -1,132 +0,0 @@
-from typing import Union, List
-import textwrap
-
-from deepeval.test_case import MLLMImage
-
-
-class MultiModalContextualPrecisionTemplate:
-    @staticmethod
-    def generate_verdicts(
-        input: List[Union[str, MLLMImage]],
-        expected_output: List[Union[str, MLLMImage]],
-        retrieval_context: List[Union[str, MLLMImage]],
-    ) -> List[Union[str, MLLMImage]]:
-        document_count_str = f" ({len(retrieval_context)} document{'s' if len(retrieval_context) > 1 else ''})"
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the input, expected output, and retrieval context, please generate a list of JSON objects to determine whether each node in the retrieval context was remotely useful in arriving at the expected output.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON. These JSON only contain the `verdict` key that outputs only 'yes' or 'no', and a `reason` key to justify the verdict. In your reason, you should aim to quote parts of the context (which can be text or an image).
-Example Retrieval Context: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect", "He won the Nobel Prize in 1968.", "There was a cat."]
-Example Input: "Who won the Nobel Prize in 1968 and for what?"
-Example Expected Output: "Einstein won the Nobel Prize in 1968 for his discovery of the photoelectric effect."
-
-Example:
-{{
-"verdicts": [
-{{
-"reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
-"verdict": "yes"
-}},
-{{
-"reason": "The text verifies that the prize was indeed won in 1968.",
-"verdict": "yes"
-}},
-{{
-"reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
-"verdict": "no"
-}}
-]
-}}
-Since you are going to generate a verdict for each context, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to that of the contexts.
-**
-
-Input:
-"""
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    """
-Expected output:
-"""
-                )
-            ]
-            + expected_output
-            + [
-                textwrap.dedent(
-                    f"""
-Retrieval Context{document_count_str}:
-"""
-                )
-            ]
-            + MultiModalContextualPrecisionTemplate.id_retrieval_context(
-                retrieval_context
-            )
-            + [
-                textwrap.dedent(
-                    """
-JSON:
-"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_reason(input, verdicts, score) -> List[Union[str, MLLMImage]]:
-        # given the input and retrieval context for this input, where the verdict is whether ... and the node is the ..., give a reason for the score
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the input, retrieval contexts, and contextual precision score, provide a CONCISE summarize for the score. Explain why it is not higher, but also why it is at its current score.
-The retrieval contexts is a list of JSON with three keys: `verdict`, `reason` (reason for the verdict) and `node`. `verdict` will be either 'yes' or 'no', which represents whether the corresponding 'node' in the retrieval context is relevant to the input.
-Contextual precision represents if the relevant nodes are ranked higher than irrelevant nodes. Also note that retrieval contexts is given IN THE ORDER OF THEIR RANKINGS.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-"reason": "The score is <contextual_precision_score> because <your_reason>."
-}}
-
-
-DO NOT mention 'verdict' in your reason, but instead phrase it as irrelevant nodes. The term 'verdict' are just here for you to understand the broader scope of things.
-Also DO NOT mention there are `reason` fields in the retrieval contexts you are presented with, instead just use the information in the `reason` field.
-In your reason, you MUST USE the `reason`, QUOTES in the 'reason', and the node RANK (starting from 1, eg. first node) to explain why the 'no' verdicts should be ranked lower than the 'yes' verdicts.
-When addressing nodes, make it explicit that it is nodes in retrieval context.
-If the score is 1, keep it short and say something positive with an upbeat tone (but don't overdo it otherwise it gets annoying).
-**
-
-Contextual Precision Score:
-{score}
-
-Input:
-"""
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    f"""
-Retrieval Contexts:
-{verdicts}
-
-JSON:
-"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def id_retrieval_context(retrieval_context) -> List[Union[str, MLLMImage]]:
-        annotated_retrieval_context = []
-        for i, context in enumerate(retrieval_context):
-            if isinstance(context, str):
-                annotated_retrieval_context.append(f"Node {i + 1}: {context}")
-            elif isinstance(context, MLLMImage):
-                annotated_retrieval_context.append(f"Node {i + 1}:")
-                annotated_retrieval_context.append(context)
-        return annotated_retrieval_context