deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
@@ -1,285 +0,0 @@
- from typing import Optional, List, Union
-
- from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
- from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.template import (
-     MultimodalContextualRecallTemplate,
- )
- from deepeval.utils import get_or_create_event_loop, prettify_list
- from deepeval.metrics.utils import (
-     construct_verbose_logs,
-     trimAndLoadJson,
-     check_mllm_test_case_params,
-     initialize_multimodal_model,
- )
- from deepeval.models import DeepEvalBaseMLLM
- from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.schema import *
- from deepeval.metrics.indicator import metric_progress_indicator
-
-
- class MultimodalContextualRecallMetric(BaseMultimodalMetric):
-
-     _required_params: List[MLLMTestCaseParams] = [
-         MLLMTestCaseParams.INPUT,
-         MLLMTestCaseParams.ACTUAL_OUTPUT,
-         MLLMTestCaseParams.RETRIEVAL_CONTEXT,
-         MLLMTestCaseParams.EXPECTED_OUTPUT,
-     ]
-
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         self.threshold = 1 if strict_mode else threshold
-         self.model, self.using_native_model = initialize_multimodal_model(model)
-         self.evaluation_model = self.model.get_model_name()
-         self.include_reason = include_reason
-         self.async_mode = async_mode
-         self.strict_mode = strict_mode
-         self.verbose_mode = verbose_mode
-
-     def measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         check_mllm_test_case_params(
-             test_case, self._required_params, None, None, self
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self, _show_indicator=_show_indicator, _in_component=_in_component
-         ):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_measure(
-                         test_case,
-                         _show_indicator=False,
-                         _in_component=_in_component,
-                         _log_metric_to_confident=_log_metric_to_confident,
-                     )
-                 )
-             else:
-                 self.verdicts: List[ContextualRecallVerdict] = (
-                     self._generate_verdicts(
-                         test_case.expected_output, test_case.retrieval_context
-                     )
-                 )
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason(test_case.expected_output)
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = construct_verbose_logs(
-                     self,
-                     steps=[
-                         f"Verdicts:\n{prettify_list(self.verdicts)}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         check_mllm_test_case_params(
-             test_case, self._required_params, None, None, self
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self,
-             async_mode=True,
-             _show_indicator=_show_indicator,
-             _in_component=_in_component,
-         ):
-             self.verdicts: List[ContextualRecallVerdict] = (
-                 await self._a_generate_verdicts(
-                     test_case.expected_output, test_case.retrieval_context
-                 )
-             )
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason(
-                 test_case.expected_output
-             )
-             self.success = self.score >= self.threshold
-             self.verbose_logs = construct_verbose_logs(
-                 self,
-                 steps=[
-                     f"Verdicts:\n{prettify_list(self.verdicts)}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-
-             return self.score
-
-     async def _a_generate_reason(
-         self, expected_output: List[Union[str, MLLMImage]]
-     ):
-         if self.include_reason is False:
-             return None
-
-         supportive_reasons = []
-         unsupportive_reasons = []
-         for verdict in self.verdicts:
-             if verdict.verdict.lower() == "yes":
-                 supportive_reasons.append(verdict.reason)
-             else:
-                 unsupportive_reasons.append(verdict.reason)
-
-         prompt = MultimodalContextualRecallTemplate.generate_reason(
-             expected_output=expected_output,
-             supportive_reasons=supportive_reasons,
-             unsupportive_reasons=unsupportive_reasons,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res, cost = await self.model.a_generate(
-                 prompt, schema=MultimodalContextualRecallScoreReason
-             )
-             self.evaluation_cost += cost
-             return res.reason
-         else:
-             try:
-                 res: MultimodalContextualRecallScoreReason = (
-                     await self.model.a_generate(
-                         prompt, schema=MultimodalContextualRecallScoreReason
-                     )
-                 )
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self, expected_output: List[Union[str, MLLMImage]]):
-         if self.include_reason is False:
-             return None
-
-         supportive_reasons = []
-         unsupportive_reasons = []
-         for verdict in self.verdicts:
-             if verdict.verdict.lower() == "yes":
-                 supportive_reasons.append(verdict.reason)
-             else:
-                 unsupportive_reasons.append(verdict.reason)
-
-         prompt = MultimodalContextualRecallTemplate.generate_reason(
-             expected_output=expected_output,
-             supportive_reasons=supportive_reasons,
-             unsupportive_reasons=unsupportive_reasons,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res, cost = self.model.generate(
-                 prompt, schema=MultimodalContextualRecallScoreReason
-             )
-             self.evaluation_cost += cost
-             return res.reason
-         else:
-             try:
-                 res: MultimodalContextualRecallScoreReason = (
-                     self.model.generate(
-                         prompt, schema=MultimodalContextualRecallScoreReason
-                     )
-                 )
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["reason"]
-
-     def _calculate_score(self):
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 0
-
-         justified_sentences = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.lower() == "yes":
-                 justified_sentences += 1
-
-         score = justified_sentences / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     async def _a_generate_verdicts(
-         self,
-         expected_output: List[Union[str, MLLMImage]],
-         retrieval_context: List[Union[str, MLLMImage]],
-     ) -> List[ContextualRecallVerdict]:
-         prompt = MultimodalContextualRecallTemplate.generate_verdicts(
-             expected_output=expected_output, retrieval_context=retrieval_context
-         )
-         if self.using_native_model:
-             res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-             self.evaluation_cost += cost
-             verdicts: Verdicts = [item for item in res.verdicts]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts: Verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 verdicts = [
-                     ContextualRecallVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_verdicts(
-         self,
-         expected_output: List[Union[str, MLLMImage]],
-         retrieval_context: List[Union[str, MLLMImage]],
-     ) -> List[ContextualRecallVerdict]:
-         prompt = MultimodalContextualRecallTemplate.generate_verdicts(
-             expected_output=expected_output, retrieval_context=retrieval_context
-         )
-         if self.using_native_model:
-             res, cost = self.model.generate(prompt, schema=Verdicts)
-             self.evaluation_cost += cost
-             verdicts: Verdicts = [item for item in res.verdicts]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts: Verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 verdicts = [
-                     ContextualRecallVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def is_successful(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Multimodal Contextual Recall"
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
@@ -1,15 +0,0 @@
- from typing import List, Optional
- from pydantic import BaseModel, Field
-
-
- class ContextualRecallVerdict(BaseModel):
-     verdict: str
-     reason: str
-
-
- class Verdicts(BaseModel):
-     verdicts: List[ContextualRecallVerdict]
-
-
- class MultimodalContextualRecallScoreReason(BaseModel):
-     reason: str
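These removed schema models exist purely to validate the JSON the judge model returns. As a standalone illustration (the two verdict models are copied from the removed file above, so no deepeval import is needed), this is how the metric's trimAndLoadJson fallback path maps a raw verdicts payload onto them and how the recall score falls out:

from typing import List
from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):
    verdict: str
    reason: str


class Verdicts(BaseModel):
    verdicts: List[ContextualRecallVerdict]


# A payload shaped like the JSON the prompt template asks the judge model for.
data = {
    "verdicts": [
        {"verdict": "yes", "reason": "Sentence 1 is supported by the 1st node in retrieval context."},
        {"verdict": "no", "reason": "Sentence 2 appears nowhere in the retrieval context."},
    ]
}
parsed = Verdicts(**data)
supported = sum(1 for v in parsed.verdicts if v.verdict.lower() == "yes")
print(supported / len(parsed.verdicts))  # 0.5, the contextual recall score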
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
@@ -1,112 +0,0 @@
- from typing import Union, List
- import textwrap
-
- from deepeval.test_case import MLLMImage
-
-
- class MultimodalContextualRecallTemplate:
-     @staticmethod
-     def generate_reason(
-         expected_output, supportive_reasons, unsupportive_reasons, score
-     ) -> List[Union[str, MLLMImage]]:
-         return (
-             [
-                 textwrap.dedent(
-                     f"""Given the original expected output, a list of supportive reasons, and a list of unsupportive reasons (which is deduced directly from the 'expected output'), and a contextual recall score (closer to 1 the better), summarize a CONCISE reason for the score.
- A supportive reason is the reason why a certain sentence or image in the original expected output can be attributed to the node in the retrieval context.
- An unsupportive reason is the reason why a certain sentence or image in the original expected output cannot be attributed to anything in the retrieval context.
- In your reason, you should related supportive/unsupportive reasons to the sentence or image number in expected output, and info regarding the node number in retrieval context to support your final reason. The first mention of "node(s)" should specify "node(s) in retrieval context)".
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <contextual_recall_score> because <your_reason>."
- }}
-
- DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
- If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
- **
-
- Contextual Recall Score:
- {score}
-
- Expected Output:
- """
-                 )
-             ]
-             + expected_output
-             + [
-                 textwrap.dedent(
-                     f"""Supportive Reasons:
- {supportive_reasons}
-
- Unsupportive Reasons:
- {unsupportive_reasons}
-
- JSON:"""
-                 )
-             ]
-         )
-
-     @staticmethod
-     def generate_verdicts(
-         expected_output, retrieval_context
-     ) -> List[Union[str, MLLMImage]]:
-         return (
-             [
-                 textwrap.dedent(
-                     f"""For EACH sentence and image in the given expected output below, determine whether the sentence or image can be attributed to the nodes of retrieval contexts. Please generate a list of JSON with two keys: `verdict` and `reason`.
- The `verdict` key should STRICTLY be either a 'yes' or 'no'. Answer 'yes' if the sentence or image can be attributed to any parts of the retrieval context, else answer 'no'.
- The `reason` key should provide a reason why to the verdict. In the reason, you should aim to include the node(s) count in the retrieval context (eg., 1st node, and 2nd node in the retrieval context) that is attributed to said sentence or image. A node is either a string or image, but not both (so do not group images and texts in the same nodes). You should also aim to quote the specific part of the retrieval context to justify your verdict, but keep it extremely concise and cut short the quote with an ellipsis if possible.
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects, each with two keys: `verdict` and `reason`.
-
- {{
- "verdicts": [
- {{
- "reason": "...",
- "verdict": "yes"
- }},
- ...
- ]
- }}
-
- Since you are going to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of sentences and images in the `expected output`.
- **
-
- Expected Output:
- """
-                 )
-             ]
-             + expected_output
-             + [
-                 textwrap.dedent(
-                     """
- Retrieval Context:
- """
-                 )
-             ]
-             + MultimodalContextualRecallTemplate.id_retrieval_context(
-                 retrieval_context
-             )
-             + [
-                 textwrap.dedent(
-                     """
- JSON:
- """
-                 )
-             ]
-         )
-
-     @staticmethod
-     def id_retrieval_context(retrieval_context) -> List[Union[str, MLLMImage]]:
-         annotated_retrieval_context = []
-         for i, context in enumerate(retrieval_context):
-             if isinstance(context, str):
-                 annotated_retrieval_context.append(f"Node {i + 1}: {context}")
-             elif isinstance(context, MLLMImage):
-                 annotated_retrieval_context.append(f"Node {i + 1}:")
-                 annotated_retrieval_context.append(context)
-         return annotated_retrieval_context
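The deleted template's id_retrieval_context helper is what turns a mixed list of strings and images into numbered "nodes" for the prompt: strings become "Node N: <text>", while an image gets a "Node N:" label followed by the image object itself. A standalone sketch of that behavior (ExampleImage below is a stand-in for deepeval's MLLMImage, used only so the snippet runs without the removed module):

from dataclasses import dataclass


@dataclass
class ExampleImage:
    url: str  # stand-in for MLLMImage, illustration only


def id_retrieval_context(retrieval_context):
    annotated = []
    for i, context in enumerate(retrieval_context):
        if isinstance(context, str):
            annotated.append(f"Node {i + 1}: {context}")
        else:  # image node: emit a label line, then pass the image through unchanged
            annotated.append(f"Node {i + 1}:")
            annotated.append(context)
    return annotated


print(id_retrieval_context(["Q3 revenue grew 20%.", ExampleImage(url="https://example.com/chart.png")]))
# ['Node 1: Q3 revenue grew 20%.', 'Node 2:', ExampleImage(url='https://example.com/chart.png')]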
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
@@ -1,282 +0,0 @@
- from typing import Optional, List, Union
- import asyncio
-
- from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
- from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.template import (
-     MultimodalContextualRelevancyTemplate,
- )
- from deepeval.utils import get_or_create_event_loop, prettify_list
- from deepeval.metrics.utils import (
-     construct_verbose_logs,
-     trimAndLoadJson,
-     check_mllm_test_case_params,
-     initialize_multimodal_model,
- )
-
- from deepeval.models import DeepEvalBaseMLLM
- from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.schema import *
- from deepeval.metrics.indicator import metric_progress_indicator
-
-
- class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
-
-     _required_params: List[MLLMTestCaseParams] = [
-         MLLMTestCaseParams.INPUT,
-         MLLMTestCaseParams.ACTUAL_OUTPUT,
-         MLLMTestCaseParams.RETRIEVAL_CONTEXT,
-     ]
-
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         self.threshold = 1 if strict_mode else threshold
-         self.model, self.using_native_model = initialize_multimodal_model(model)
-         self.evaluation_model = self.model.get_model_name()
-         self.include_reason = include_reason
-         self.async_mode = async_mode
-         self.strict_mode = strict_mode
-         self.verbose_mode = verbose_mode
-
-     def measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         check_mllm_test_case_params(
-             test_case, self._required_params, None, None, self
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self, _show_indicator=_show_indicator, _in_component=_in_component
-         ):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_measure(
-                         test_case,
-                         _show_indicator=False,
-                         _in_component=_in_component,
-                         _log_metric_to_confident=_log_metric_to_confident,
-                     )
-                 )
-             else:
-                 self.verdicts_list: List[ContextualRelevancyVerdicts] = [
-                     (self._generate_verdicts(test_case.input, context))
-                     for context in test_case.retrieval_context
-                 ]
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason(test_case.input)
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = construct_verbose_logs(
-                     self,
-                     steps=[
-                         f"Verdicts:\n{prettify_list(self.verdicts_list)}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_measure(
-         self,
-         test_case: MLLMTestCase,
-         _show_indicator: bool = True,
-         _in_component: bool = False,
-         _log_metric_to_confident: bool = True,
-     ) -> float:
-         check_mllm_test_case_params(
-             test_case, self._required_params, None, None, self
-         )
-
-         self.evaluation_cost = 0 if self.using_native_model else None
-         with metric_progress_indicator(
-             self,
-             async_mode=True,
-             _show_indicator=_show_indicator,
-             _in_component=_in_component,
-         ):
-             self.verdicts_list: List[ContextualRelevancyVerdicts] = (
-                 await asyncio.gather(
-                     *[
-                         self._a_generate_verdicts(test_case.input, context)
-                         for context in test_case.retrieval_context
-                     ]
-                 )
-             )
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason(test_case.input)
-             self.success = self.score >= self.threshold
-             self.verbose_logs = construct_verbose_logs(
-                 self,
-                 steps=[
-                     f"Verdicts:\n{prettify_list(self.verdicts_list)}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-
-             return self.score
-
-     async def _a_generate_reason(self, input: List[Union[str, MLLMImage]]):
-         if self.include_reason is False:
-             return None
-
-         irrelevancies = []
-         relevant_statements = []
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 if verdict.verdict.lower() == "no":
-                     irrelevancies.append(verdict.reason)
-                 else:
-                     relevant_statements.append(verdict.statement)
-
-         prompt: dict = MultimodalContextualRelevancyTemplate.generate_reason(
-             input=input,
-             irrelevancies=irrelevancies,
-             relevant_statements=relevant_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res, cost = await self.model.a_generate(
-                 prompt, schema=MultimodelContextualRelevancyScoreReason
-             )
-             self.evaluation_cost += cost
-             return res.reason
-         else:
-             try:
-                 res: MultimodelContextualRelevancyScoreReason = (
-                     await self.model.a_generate(
-                         prompt, schema=MultimodelContextualRelevancyScoreReason
-                     )
-                 )
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self, input: List[Union[str, MLLMImage]]):
-         if self.include_reason is False:
-             return None
-
-         irrelevancies = []
-         relevant_statements = []
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 if verdict.verdict.lower() == "no":
-                     irrelevancies.append(verdict.reason)
-                 else:
-                     relevant_statements.append(verdict.statement)
-
-         prompt: dict = MultimodalContextualRelevancyTemplate.generate_reason(
-             input=input,
-             irrelevancies=irrelevancies,
-             relevant_statements=relevant_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res, cost = self.model.generate(
-                 prompt, schema=MultimodelContextualRelevancyScoreReason
-             )
-             self.evaluation_cost += cost
-             return res.reason
-         else:
-             try:
-                 res: MultimodelContextualRelevancyScoreReason = (
-                     self.model.generate(
-                         prompt, schema=MultimodelContextualRelevancyScoreReason
-                     )
-                 )
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return data["reason"]
-
-     def _calculate_score(self):
-         total_verdicts = 0
-         relevant_statements = 0
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 total_verdicts += 1
-                 if verdict.verdict.lower() == "yes":
-                     relevant_statements += 1
-
-         if total_verdicts == 0:
-             return 0
-
-         score = relevant_statements / total_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     async def _a_generate_verdicts(
-         self,
-         input: List[Union[str, MLLMImage]],
-         context: List[Union[str, MLLMImage]],
-     ) -> ContextualRelevancyVerdicts:
-         prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
-             input=input, context=context
-         )
-         if self.using_native_model:
-             res, cost = await self.model.a_generate(
-                 prompt, schema=ContextualRelevancyVerdicts
-             )
-             self.evaluation_cost += cost
-             return res
-         else:
-             try:
-                 res = await self.model.a_generate(
-                     prompt, schema=ContextualRelevancyVerdicts
-                 )
-                 return res
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return ContextualRelevancyVerdicts(**data)
-
-     def _generate_verdicts(
-         self,
-         input: List[Union[str, MLLMImage]],
-         context: List[Union[str, MLLMImage]],
-     ) -> ContextualRelevancyVerdicts:
-         prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
-             input=input, context=context
-         )
-         if self.using_native_model:
-             res, cost = self.model.generate(
-                 prompt, schema=ContextualRelevancyVerdicts
-             )
-             self.evaluation_cost += cost
-             return res
-         else:
-             try:
-                 res = self.model.generate(
-                     prompt, schema=ContextualRelevancyVerdicts
-                 )
-                 return res
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = trimAndLoadJson(res, self)
-                 return ContextualRelevancyVerdicts(**data)
-
-     def is_successful(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Multimodal Contextual Relevancy"