deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
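
The pattern in this file list: the multimodal_* metric implementations, their templates, the deepeval/models/mlllms/ judge wrappers, and mllm_test_case.py are removed, while new turn_* conversational metrics, a deepeval/optimizer package, and a Portkey model wrapper are added. As a rough illustration (module paths taken from the entries above; the replacement class name is only a guess from the new folder names, since exported names are not visible in this diff), code that imported the deleted modules directly stops resolving in 3.7.5:

# Worked in deepeval 3.7.3, removed in 3.7.5 (path from entry 140 above):
from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.multimodal_faithfulness import (
    MultimodalFaithfulnessMetric,
)

# Added in 3.7.5 (entry 62); the class name here is hypothetical, inferred from the folder name:
from deepeval.metrics.turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric  # hypothetical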
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py (deleted)
@@ -1,102 +0,0 @@
- from typing import Union, List
- import textwrap
-
- from deepeval.test_case import MLLMImage
-
-
- class MultimodalContextualRelevancyTemplate:
- @staticmethod
- def generate_reason(
- input: List[Union[str, MLLMImage]],
- irrelevancies: List[str],
- relevant_statements: List[str],
- score: float,
- ):
- return (
- [
- textwrap.dedent(
- f"""Based on the given input, reasons for why the retrieval context is irrelevant to the input, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
- In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <contextual_relevancy_score> because <your_reason>."
- }}
-
- If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
- **
-
-
- Contextual Relevancy Score:
- {score}
-
- Input:
- """
- )
- ]
- + input
- + [
- textwrap.dedent(
- f"""
- Reasons for why the retrieval context is irrelevant to the input:
- {irrelevancies}
-
- Statement in the retrieval context that is relevant to the input:
- {relevant_statements}
-
- JSON:
- """
- )
- ]
- )
-
- @staticmethod
- def generate_verdicts(
- input: List[Union[str, MLLMImage]], context: List[Union[str, MLLMImage]]
- ) -> List[Union[str, MLLMImage]]:
- return (
- [
- textwrap.dedent(
- f"""Based on the input and context (image or string), please generate a JSON object to indicate whether the context is relevant to the provided input. The JSON will be a list of 'verdicts', with 2 mandatory fields: 'verdict' and 'statement', and 1 optional field: 'reason'.
- If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
- If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
- The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the statement or image is relevant to the input.
- Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the statement or image to back up your reason.
-
- **
- IMPORTANT: Please make sure to only return in JSON format.
- Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
- Example Input: "What were some of Einstein's achievements?"
-
- Example:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
- }},
- {{
- "verdict": "no",
- "statement": "There was a cat.",
- "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
- }}
- ]
- }}
- **
-
- Input:
- """
- )
- ]
- + input
- + [
- textwrap.dedent(
- """
- Context:
- """
- )
- ]
- + [context]
- )
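
For reference, a minimal sketch of how this deleted template was called in 3.7.3, based only on the signatures shown above; the URL, strings, and MLLMImage usage are illustrative assumptions, not taken from the package:

from deepeval.test_case import MLLMImage
from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.template import (
    MultimodalContextualRelevancyTemplate,
)

# generate_verdicts returns a list interleaving instruction strings, the input items,
# and the context, which the multimodal judge model consumed as one prompt.
prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
    input=["What does the chart show?", MLLMImage(url="https://example.com/chart.png")],  # example image assumed
    context=["Quarterly revenue grew 12%."],
)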
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py (deleted)
@@ -1,356 +0,0 @@
- from typing import List, Optional, Union
- import asyncio
-
- from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
- from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.template import (
- MultimodalFaithfulnessTemplate,
- )
- from deepeval.utils import get_or_create_event_loop, prettify_list
- from deepeval.metrics.utils import (
- construct_verbose_logs,
- trimAndLoadJson,
- check_mllm_test_case_params,
- initialize_multimodal_model,
- )
- from deepeval.models import DeepEvalBaseLLM
- from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.schema import *
- from deepeval.metrics.indicator import metric_progress_indicator
-
-
- class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
-
- _required_params: List[MLLMTestCaseParams] = [
- MLLMTestCaseParams.INPUT,
- MLLMTestCaseParams.ACTUAL_OUTPUT,
- MLLMTestCaseParams.RETRIEVAL_CONTEXT,
- ]
-
- def __init__(
- self,
- threshold: float = 0.5,
- model: Optional[Union[str, DeepEvalBaseLLM]] = None,
- include_reason: bool = True,
- async_mode: bool = True,
- strict_mode: bool = False,
- verbose_mode: bool = False,
- truths_extraction_limit: Optional[int] = None,
- ):
- self.threshold = 1 if strict_mode else threshold
- self.model, self.using_native_model = initialize_multimodal_model(model)
- self.evaluation_model = self.model.get_model_name()
- self.include_reason = include_reason
- self.async_mode = async_mode
- self.strict_mode = strict_mode
- self.verbose_mode = verbose_mode
-
- self.truths_extraction_limit = truths_extraction_limit
- if self.truths_extraction_limit is not None:
- self.truths_extraction_limit = max(self.truths_extraction_limit, 0)
-
- def measure(
- self,
- test_case: MLLMTestCase,
- _show_indicator: bool = True,
- _in_component: bool = False,
- _log_metric_to_confident: bool = True,
- ) -> float:
- check_mllm_test_case_params(
- test_case, self._required_params, None, None, self
- )
-
- self.evaluation_cost = 0 if self.using_native_model else None
- with metric_progress_indicator(
- self,
- _show_indicator=_show_indicator,
- _in_component=_in_component,
- ):
- if self.async_mode:
- loop = get_or_create_event_loop()
- loop.run_until_complete(
- self.a_measure(
- test_case,
- _show_indicator=False,
- _in_component=_in_component,
- _log_metric_to_confident=_log_metric_to_confident,
- )
- )
- else:
- self.truths = self._generate_truths(test_case.retrieval_context)
- self.claims = self._generate_claims(test_case.actual_output)
- self.verdicts = self._generate_verdicts()
- self.score = self._calculate_score()
- self.reason = self._generate_reason()
- self.success = self.score >= self.threshold
- self.verbose_logs = construct_verbose_logs(
- self,
- steps=[
- f"Truths (limit={self.truths_extraction_limit}):\n{prettify_list(self.truths)}",
- f"Claims:\n{prettify_list(self.claims)}",
- f"Verdicts:\n{prettify_list(self.verdicts)}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
-
- return self.score
-
- async def a_measure(
- self,
- test_case: MLLMTestCase,
- _show_indicator: bool = True,
- _in_component: bool = False,
- _log_metric_to_confident: bool = True,
- ) -> float:
- check_mllm_test_case_params(
- test_case, self._required_params, None, None, self
- )
-
- self.evaluation_cost = 0 if self.using_native_model else None
- with metric_progress_indicator(
- self,
- async_mode=True,
- _show_indicator=_show_indicator,
- _in_component=_in_component,
- ):
- self.truths, self.claims = await asyncio.gather(
- self._a_generate_truths(test_case.retrieval_context),
- self._a_generate_claims(test_case.actual_output),
- )
- self.verdicts = await self._a_generate_verdicts()
- self.score = self._calculate_score()
- self.reason = await self._a_generate_reason()
- self.success = self.score >= self.threshold
- self.verbose_logs = construct_verbose_logs(
- self,
- steps=[
- f"Truths (limit={self.truths_extraction_limit}):\n{prettify_list(self.truths)}",
- f"Claims:\n{prettify_list(self.claims)}",
- f"Verdicts:\n{prettify_list(self.verdicts)}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
-
- return self.score
-
- async def _a_generate_reason(self) -> str:
- if self.include_reason is False:
- return None
-
- contradictions = []
- for verdict in self.verdicts:
- if verdict.verdict.strip().lower() == "no":
- contradictions.append(verdict.reason)
-
- prompt: dict = MultimodalFaithfulnessTemplate.generate_reason(
- contradictions=contradictions,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res, cost = await self.model.a_generate(
- prompt, schema=MultimodalFaithfulnessScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: MultimodalFaithfulnessScoreReason = (
- await self.model.a_generate(
- prompt, schema=MultimodalFaithfulnessScoreReason
- )
- )
- return res.reason
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
-
- def _generate_reason(self) -> str:
- if self.include_reason is False:
- return None
-
- contradictions = []
- for verdict in self.verdicts:
- if verdict.verdict.strip().lower() == "no":
- contradictions.append(verdict.reason)
-
- prompt: dict = MultimodalFaithfulnessTemplate.generate_reason(
- contradictions=contradictions,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res, cost = self.model.generate(
- prompt, schema=MultimodalFaithfulnessScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: MultimodalFaithfulnessScoreReason = self.model.generate(
- prompt, schema=MultimodalFaithfulnessScoreReason
- )
- return res.reason
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
-
- async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]:
- if len(self.claims) == 0:
- return []
-
- verdicts: List[FaithfulnessVerdict] = []
- prompt = MultimodalFaithfulnessTemplate.generate_verdicts(
- claims=self.claims, retrieval_context="\n\n".join(self.truths)
- )
- if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Verdicts)
- self.evaluation_cost += cost
- verdicts = [item for item in res.verdicts]
- return verdicts
- else:
- try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
- )
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- verdicts = [
- FaithfulnessVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
-
- def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
- if len(self.claims) == 0:
- return []
-
- verdicts: List[FaithfulnessVerdict] = []
- prompt = MultimodalFaithfulnessTemplate.generate_verdicts(
- claims=self.claims, retrieval_context="\n\n".join(self.truths)
- )
- if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Verdicts)
- self.evaluation_cost += cost
- verdicts = [item for item in res.verdicts]
- return verdicts
- else:
- try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- verdicts = [
- FaithfulnessVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
-
- async def _a_generate_truths(
- self, retrieval_context: List[Union[str, MLLMImage]]
- ) -> List[str]:
- prompt = MultimodalFaithfulnessTemplate.generate_truths(
- excerpt=retrieval_context,
- extraction_limit=self.truths_extraction_limit,
- )
- if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Truths)
- self.evaluation_cost += cost
- return res.truths
- else:
- try:
- res: Truths = await self.model.a_generate(prompt, schema=Truths)
- return res.truths
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["truths"]
-
- def _generate_truths(
- self, retrieval_context: List[Union[str, MLLMImage]]
- ) -> List[str]:
- prompt = MultimodalFaithfulnessTemplate.generate_truths(
- excerpt=retrieval_context,
- extraction_limit=self.truths_extraction_limit,
- )
- if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Truths)
- self.evaluation_cost += cost
- return res.truths
- else:
- try:
- res: Truths = self.model.generate(prompt, schema=Truths)
- return res.truths
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["truths"]
-
- async def _a_generate_claims(
- self, actual_output: List[Union[str, MLLMImage]]
- ) -> List[str]:
- prompt = MultimodalFaithfulnessTemplate.generate_claims(
- excerpt=actual_output
- )
- if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Claims)
- self.evaluation_cost += cost
- return res.claims
- else:
- try:
- res: Claims = await self.model.a_generate(prompt, schema=Claims)
- return res.claims
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["claims"]
-
- def _generate_claims(
- self, actual_output: List[Union[str, MLLMImage]]
- ) -> List[str]:
- prompt = MultimodalFaithfulnessTemplate.generate_claims(
- excerpt=actual_output
- )
- if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Claims)
- self.evaluation_cost += cost
- return res.claims
- else:
- try:
- res: Claims = self.model.generate(prompt, schema=Claims)
- return res.claims
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["claims"]
-
- def _calculate_score(self) -> float:
- number_of_verdicts = len(self.verdicts)
- if number_of_verdicts == 0:
- return 1
-
- faithfulness_count = 0
- for verdict in self.verdicts:
- if verdict.verdict.strip().lower() != "no":
- faithfulness_count += 1
-
- score = faithfulness_count / number_of_verdicts
- return 0 if self.strict_mode and score < self.threshold else score
-
- def is_successful(self) -> bool:
- if self.error is not None:
- self.success = False
- else:
- try:
- self.success = self.score >= self.threshold
- except:
- self.success = False
- return self.success
-
- @property
- def __name__(self):
- return "Multimodal Faithfulness"
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py (deleted)
@@ -1,175 +0,0 @@
- from typing import Union, List, Optional
- import textwrap
-
- from deepeval.test_case import MLLMImage
-
-
- class MultimodalFaithfulnessTemplate:
- @staticmethod
- def generate_claims(excerpt):
- return (
- [
- textwrap.dedent(
- f"""Based on the given excerpt, which contains text and possibly images, please generate a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided text and images.
-
- Example:
- Example Excerpt:
- "Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
-
- Example JSON:
- {{
- "claims": [
- "Einstein won the noble prize for his discovery of the photoelectric effect.",
- "Einstein won the noble prize in 1968."
- ]
- }}
- ===== END OF EXAMPLE ======
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
- Only include claims that are factual, and the claims you extract should include the full context it was presented in, NOT cherry picked facts.
- You should NOT include any prior knowledge, and take the text at face value when extracting claims.
- **
-
- Text:
- """
- )
- ]
- + excerpt
- + [
- textwrap.dedent(
- f"""
- JSON:
- """
- )
- ]
- )
-
- @staticmethod
- def generate_truths(excerpt, extraction_limit: Optional[int] = None):
- if extraction_limit is None:
- limit = " FACTUAL, undisputed truths"
- elif extraction_limit == 1:
- limit = " the single most important FACTUAL, undisputed truth"
- else:
- limit = f" the {extraction_limit} most important FACTUAL, undisputed truths per document"
- return (
- [
- textwrap.dedent(
- f"""Based on the given excerpt (text and images), please generate a comprehensive list of{limit}, that can inferred from the provided excerpt.
-
- Example:
- Example Excerpt:
- "Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
-
- Example JSON:
- {{
- "truths": [
- "Einstein won the noble prize for his discovery of the photoelectric effect.",
- "Einstein won the noble prize in 1968."
- ]
- }}
- ===== END OF EXAMPLE ======
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
- Only include truths that are factual.
- **
-
- Excerpt:
- """
- )
- ]
- + excerpt
- + [
- textwrap.dedent(
- f"""
- JSON:
- """
- )
- ]
- )
-
- @staticmethod
- def generate_verdicts(claims, retrieval_context):
- return textwrap.dedent(
- f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
- The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
- The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
- Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
- Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
-
- Example:
- {{
- "verdicts": [
- {{
- "verdict": "idk",
- "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction."
- }},
- {{
- "verdict": "idk",
- "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context."
- }},
- {{
- "verdict": "yes"
- }},
- {{
- "verdict": "no",
- "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
- }},
- {{
- "verdict": "no",
- "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead."
- }},
- ]
- }}
- ===== END OF EXAMPLE ======
-
- The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
- You DON'T have to provide a reason if the answer is 'yes'.
- ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
- Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
- Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
- **
-
- Retrieval Contexts:
- {retrieval_context}
-
- Claims:
- {claims}
-
- JSON:
- """
- )
-
- @staticmethod
- def generate_reason(score, contradictions):
- return textwrap.dedent(
- f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
- Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <faithfulness_score> because <your_reason>."
- }}
-
- If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
- Your reason MUST use information in `contradiction` in your reason.
- Be sure in your reason, as if you know what the actual output is from the contradictions.
- **
-
- Faithfulness Score:
- {score}
-
- Contradictions:
- {contradictions}
-
- JSON:
- """
- )
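
And a matching sketch of how the metric above fed this deleted template (signatures from the removed code; the claim and context strings are illustrative):

from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.template import (
    MultimodalFaithfulnessTemplate,
)

# Returns a single dedented prompt string; the judge model replies with a
# {"verdicts": [...]} JSON object that the metric parsed into FaithfulnessVerdict objects.
prompt = MultimodalFaithfulnessTemplate.generate_verdicts(
    claims=["Einstein won the Nobel Prize in 1969."],
    retrieval_context="Einstein won the Nobel Prize in 1968.",
)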