deepeval 3.7.3-py3-none-any.whl → 3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -1,41 +1,50 @@
  from typing import Optional, List
+ import textwrap


  class FaithfulnessTemplate:
  @staticmethod
- def generate_claims(actual_output: str):
- return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
- These truths, MUST BE COHERENT, and CANNOT be taken out of context.
-
- Example:
- Example Text:
- "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
-
- Example JSON:
- {{
- "claims": [
- "Einstein won the noble prize for his discovery of the photoelectric effect in 1968."
- "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
- ]
- }}
- ===== END OF EXAMPLE ======
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
- Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
- You should NOT include any prior knowledge, and take the text at face value when extracting claims.
- You should be aware that it is an AI that is outputting these claims.
- **
-
- AI Output:
- {actual_output}
-
- JSON:
- """
+ def generate_claims(actual_output: str, multimodal: bool = False):
+ multimodal_instruction = ""
+ if multimodal:
+ multimodal_instruction = " The excerpt may contain both text and images, so extract claims from all provided content."
+
+ return textwrap.dedent(
+ f"""Based on the given {'excerpt' if multimodal else 'text'}, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output. {multimodal_instruction}
+ These truths, MUST BE COHERENT, and CANNOT be taken out of context.
+
+ Example:
+ Example Text:
+ "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
+
+ Example JSON:
+ {{
+ "claims": [
+ "Einstein won the noble prize for his discovery of the photoelectric effect in 1968.",
+ "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
+ ]
+ }}
+ ===== END OF EXAMPLE ======
+
+ **
+ IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
+ Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
+ You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+ You should be aware that it is an AI that is outputting these claims.
+ **
+
+ {'Excerpt' if multimodal else 'AI Output'}:
+ {actual_output}
+
+ JSON:
+ """
+ )

  @staticmethod
  def generate_truths(
- retrieval_context: str, extraction_limit: Optional[int] = None
+ retrieval_context: str,
+ extraction_limit: Optional[int] = None,
+ multimodal: bool = False,
  ):
  if extraction_limit is None:
  limit = " FACTUAL, undisputed truths"
@@ -43,98 +52,174 @@ JSON:
  limit = " the single most important FACTUAL, undisputed truth"
  else:
  limit = f" the {extraction_limit} most important FACTUAL, undisputed truths per document"
- return f"""Based on the given text, please generate a comprehensive list of{limit}, that can inferred from the provided text.
- These truths, MUST BE COHERENT. They must NOT be taken out of context.
-
- Example:
- Example Text:
- "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
-
- Example JSON:
- {{
- "truths": [
- "Einstein won the noble prize for his discovery of the photoelectric effect in 1968."
- "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
- ]
- }}
- ===== END OF EXAMPLE ======
- **
- IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
- Only include truths that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT.
- **
-
- Text:
- {retrieval_context}
-
- JSON:
- """
+
+ multimodal_instruction = ""
+ if multimodal:
+ multimodal_instruction = (
+ " The excerpt may contain both text and images."
+ )
+
+ return textwrap.dedent(
+ f"""Based on the given {'excerpt (text and images)' if multimodal else 'text'}, please generate a comprehensive list of{limit}, that can inferred from the provided {'excerpt' if multimodal else 'text'}.{multimodal_instruction}
+ These truths, MUST BE COHERENT. They must NOT be taken out of context.
+
+ Example:
+ Example Text:
+ "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
+
+ Example JSON:
+ {{
+ "truths": [
+ "Einstein won the noble prize for his discovery of the photoelectric effect in 1968.",
+ "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
+ ]
+ }}
+ ===== END OF EXAMPLE ======
+ **
+ IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
+ Only include truths that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT.
+ **
+
+ {'Excerpt' if multimodal else 'Text'}:
+ {retrieval_context}
+
+ JSON:
+ """
+ )

  @staticmethod
- def generate_verdicts(claims: List[str], retrieval_context: str):
- return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
- The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
- Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
- The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
-
- Expected JSON format:
- {{
- "verdicts": [
- {{
- "verdict": "yes"
- }},
- {{
- "verdict": "no",
- "reason": <explanation_for_contradiction>
- }},
- {{
- "verdict": "idk",
- "reason": <explanation_for_uncertainty>
- }}
- ]
- }}
-
- Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
- No 'reason' needed for 'yes' verdicts.
- Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
- Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
- Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
-
- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
- **
-
- Retrieval Contexts:
- {retrieval_context}
-
- Claims:
- {claims}
-
- JSON:
- """
+ def generate_verdicts(
+ claims: List[str], retrieval_context: str, multimodal: bool = False
+ ):
+ example_section = ""
+ if multimodal:
+ example_section = textwrap.dedent(
+ """
+ Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
+ Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
+
+ Example:
+ {{
+ "verdicts": [
+ {{
+ "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction.",
+ "verdict": "idk"
+ }},
+ {{
+ "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context.",
+ "verdict": "idk"
+ }},
+ {{
+ "verdict": "yes"
+ }},
+ {{
+ "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead.",
+ "verdict": "no"
+ }},
+ {{
+ "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead.",
+ "verdict": "no"
+ }}
+ ]
+ }}
+ ===== END OF EXAMPLE ======
+ """
+ )
+
+ format_instruction = textwrap.dedent(
+ """
+ Expected JSON format:
+ {{
+ "verdicts": [
+ {{
+ "verdict": "yes"
+ }},
+ {{
+ "reason": <explanation_for_contradiction>,
+ "verdict": "no"
+ }},
+ {{
+ "reason": <explanation_for_uncertainty>,
+ "verdict": "idk"
+ }}
+ ]
+ }}
+ """
+ )
+
+ guidelines = ""
+ if multimodal:
+ guidelines = textwrap.dedent(
+ """
+ The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
+ You DON'T have to provide a reason if the answer is 'yes'.
+ ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
+ Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
+ Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
+ If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'.
+ """
+ )
+ else:
+ guidelines = textwrap.dedent(
+ """
+ Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
+ No 'reason' needed for 'yes' verdicts.
+ Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
+ Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
+ Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
+ """
+ )
+
+ return textwrap.dedent(
+ f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
+ The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
+ Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
+ The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
+
+ {format_instruction}
+ {example_section}
+ **
+ IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
+ {guidelines}
+ **
+
+ Retrieval Contexts:
+ {retrieval_context}
+
+ Claims:
+ {claims}
+
+ JSON:
+ """
+ )

  @staticmethod
- def generate_reason(score: float, contradictions: List[str]):
- return f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
- Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
+ def generate_reason(
+ score: float, contradictions: List[str], multimodal: bool = False
+ ):
+ return textwrap.dedent(
+ f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
+ Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.

- Expected JSON format:
- {{
- "reason": "The score is <faithfulness_score> because <your_reason>."
- }}
+ Expected JSON format:
+ {{
+ "reason": "The score is <faithfulness_score> because <your_reason>."
+ }}

- **
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+ **
+ IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.

- If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
- Your reason MUST use information in `contradiction` in your reason.
- Be sure in your reason, as if you know what the actual output is from the contradictions.
- **
+ If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+ Your reason MUST use information in `contradiction` in your reason.
+ Be sure in your reason, as if you know what the actual output is from the contradictions.
+ **

- Faithfulness Score:
- {score}
+ Faithfulness Score:
+ {score}

- Contradictions:
- {contradictions}
+ Contradictions:
+ {contradictions}

- JSON:
- """
+ JSON:
+ """
+ )
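
The hunks above thread a single `multimodal` flag through every prompt builder on `FaithfulnessTemplate` and wrap the returned prompts in `textwrap.dedent`. A minimal sketch of the observable effect, assuming only the signatures shown in this diff (the import path follows the file list entry for deepeval/metrics/faithfulness/template.py, and the sample text is illustrative):

    # Minimal sketch: the same prompt builder, with and without the new flag.
    from deepeval.metrics.faithfulness.template import FaithfulnessTemplate

    text_prompt = FaithfulnessTemplate.generate_claims("The sky is blue.")
    mm_prompt = FaithfulnessTemplate.generate_claims(
        "The sky is blue.", multimodal=True
    )

    # Per the diff, multimodal=True relabels the "AI Output" section as
    # "Excerpt" and appends an instruction to also extract claims from images.
    assert "AI Output:" in text_prompt and "Excerpt:" in mm_prompt
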
@@ -118,12 +118,12 @@ def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
  return True
  elif (
  isinstance(model, GPTModel)
- and model.model_name in unsupported_log_probs_gpt_models
+ and model.get_model_name() in unsupported_log_probs_gpt_models
  ):
  return True
  elif (
  isinstance(model, AzureOpenAIModel)
- and model.model_name in unsupported_log_probs_gpt_models
+ and model.get_model_name() in unsupported_log_probs_gpt_models
  ):
  return True
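
This hunk replaces direct attribute reads (`model.model_name`) with the `get_model_name()` accessor on both `GPTModel` and `AzureOpenAIModel`. A hedged sketch of the call the check now relies on; the constructor keyword is an assumption, not something this diff shows:

    # Hedged sketch: the accessor no_log_prob_support now calls.
    from deepeval.models import GPTModel

    model = GPTModel(model="gpt-4o")  # `model=` kwarg assumed, not shown in this diff
    print(model.get_model_name())     # replaces the removed `model.model_name` read
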
@@ -17,12 +17,12 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
+ "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
+ "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
+ "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969.",
+ "verdict": "no"
  }}
  ]
  }}
@@ -13,7 +13,7 @@ from deepeval.metrics import (
  BaseMultimodalMetric,
  BaseArenaMetric,
  )
- from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
+ from deepeval.test_case import LLMTestCase, ConversationalTestCase
  from deepeval.test_run.cache import CachedTestCase, Cache
  from deepeval.telemetry import capture_metric_type
  from deepeval.utils import update_pbar
@@ -75,7 +75,7 @@ async def measure_metric_task(
  task_id,
  progress,
  metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
- test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+ test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  cached_test_case: Union[CachedTestCase, None],
  ignore_errors: bool,
  skip_on_missing_params: bool,
@@ -159,7 +159,7 @@ async def measure_metrics_with_indicator(
  metrics: List[
  Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
  ],
- test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+ test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  cached_test_case: Union[CachedTestCase, None],
  ignore_errors: bool,
  skip_on_missing_params: bool,
@@ -239,7 +239,7 @@ async def measure_metrics_with_indicator(

  async def safe_a_measure(
  metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
- tc: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+ tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  ignore_errors: bool,
  skip_on_missing_params: bool,
  progress: Optional[Progress] = None,
@@ -40,8 +40,8 @@ Example JSON:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "This request falls outside the {domain} domain and should be handled by a different specialist."
+ "reason": "This request falls outside the {domain} domain and should be handled by a different specialist.",
+ "verdict": "yes"
  }},
  {{
  "verdict": "no"
@@ -3,22 +3,4 @@ from .image_editing.image_editing import ImageEditingMetric
  from .image_coherence.image_coherence import ImageCoherenceMetric
  from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
  from .image_reference.image_reference import ImageReferenceMetric
- from .multimodal_contextual_recall.multimodal_contextual_recall import (
- MultimodalContextualRecallMetric,
- )
- from .multimodal_contextual_relevancy.multimodal_contextual_relevancy import (
- MultimodalContextualRelevancyMetric,
- )
- from .multimodal_contextual_precision.multimodal_contextual_precision import (
- MultimodalContextualPrecisionMetric,
- )
- from .multimodal_answer_relevancy.multimodal_answer_relevancy import (
- MultimodalAnswerRelevancyMetric,
- )
- from .multimodal_faithfulness.multimodal_faithfulness import (
- MultimodalFaithfulnessMetric,
- )
- from .multimodal_tool_correctness.multimodal_tool_correctness import (
- MultimodalToolCorrectnessMetric,
- )
  from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
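
After this hunk, the multimodal answer-relevancy, contextual, faithfulness, and tool-correctness re-exports are gone (their modules are deleted elsewhere in this diff), so only the imports still visible above remain valid:

    # Everything multimodal_metrics still re-exports after this release,
    # taken directly from the surviving lines of the hunk above.
    from deepeval.metrics.multimodal_metrics import (
        ImageCoherenceMetric,
        ImageEditingMetric,
        ImageHelpfulnessMetric,
        ImageReferenceMetric,
        MultimodalGEval,
    )
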
@@ -2,7 +2,7 @@ import asyncio
  from typing import Optional, List, Tuple, Union

  from deepeval.metrics import BaseMultimodalMetric
- from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+ from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
  from deepeval.metrics.multimodal_metrics.image_coherence.template import (
  ImageCoherenceTemplate,
  )
@@ -10,32 +10,35 @@ from deepeval.metrics.utils import (
  construct_verbose_logs,
  trimAndLoadJson,
  check_mllm_test_case_params,
- initialize_multimodal_model,
+ initialize_model,
  )
- from deepeval.models import DeepEvalBaseMLLM
+ from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
  ReasonScore,
  )
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.utils import get_or_create_event_loop
+ from deepeval.utils import (
+ get_or_create_event_loop,
+ convert_to_multi_modal_array,
+ )


  class ImageCoherenceMetric(BaseMultimodalMetric):
- _required_params: List[MLLMTestCaseParams] = [
- MLLMTestCaseParams.INPUT,
- MLLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[LLMTestCaseParams] = [
+ LLMTestCaseParams.INPUT,
+ LLMTestCaseParams.ACTUAL_OUTPUT,
  ]

  def __init__(
  self,
- model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+ model: Optional[Union[str, DeepEvalBaseLLM]] = None,
  threshold: float = 0.5,
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
  max_context_size: Optional[int] = None,
  ):
- self.model, self.using_native_model = initialize_multimodal_model(model)
+ self.model, self.using_native_model = initialize_model(model)
  self.evaluation_model = self.model.get_model_name()
  self.threshold = 1 if strict_mode else threshold
  self.strict_mode = strict_mode
@@ -45,13 +48,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

  def measure(
  self,
- test_case: MLLMTestCase,
+ test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
  _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
- test_case, self._required_params, None, None, self
+ test_case, self._required_params, None, None, self, self.model
  )
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -68,7 +71,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  )
  )
  else:
- actual_output = test_case.actual_output
+ actual_output = convert_to_multi_modal_array(
+ test_case.actual_output
+ )
  self.contexts_above = []
  self.contexts_below = []
  self.scores = []
@@ -145,13 +150,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

  async def a_measure(
  self,
- test_case: MLLMTestCase,
+ test_case: LLMTestCase,
  _show_indicator: bool = True,
  _in_component: bool = False,
  _log_metric_to_confident: bool = True,
  ) -> float:
  check_mllm_test_case_params(
- test_case, self._required_params, None, None, self
+ test_case, self._required_params, None, None, self, self.model
  )
  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -160,7 +165,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  _show_indicator=_show_indicator,
  _in_component=_in_component,
  ):
- actual_output = test_case.actual_output
+ actual_output = convert_to_multi_modal_array(
+ test_case.actual_output
+ )
  self.contexts_above = []
  self.contexts_below = []
  self.scores = []
@@ -253,7 +260,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  instructions = ImageCoherenceTemplate.evaluate_image_coherence(
  context_above, context_below
  )
- prompt = [instructions] + [image]
+ prompt = f"{instructions} \nImages: {image}"
  if self.using_native_model:
  res, cost = self.model.generate(prompt, ReasonScore)
  self.evaluation_cost += cost
@@ -278,7 +285,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
  instructions = ImageCoherenceTemplate.evaluate_image_coherence(
  context_above, context_below
  )
- prompt = [instructions] + [image]
+ prompt = f"{instructions} \nImages: {image}"
  if self.using_native_model:
  res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
  self.evaluation_cost += cost
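
Taken together, the ImageCoherenceMetric hunks drop MLLMTestCase in favor of the plain LLMTestCase and normalize actual_output through convert_to_multi_modal_array. A sketch of the resulting call shape; passing a mixed text-and-image list and the MLLMImage keyword are assumptions based on that conversion helper, not APIs this diff spells out:

    # Hedged sketch: ImageCoherenceMetric driven by a plain LLMTestCase in 3.7.5.
    from deepeval.metrics.multimodal_metrics import ImageCoherenceMetric
    from deepeval.test_case import LLMTestCase, MLLMImage

    test_case = LLMTestCase(
        input="Summarize the attached chart.",
        actual_output=[
            "Revenue grew every quarter.",                   # text segment
            MLLMImage(url="https://example.com/chart.png"),  # image segment; URL illustrative
        ],
    )

    metric = ImageCoherenceMetric(threshold=0.5)  # kwargs match the __init__ hunk above
    score = metric.measure(test_case)             # a_measure(...) is the async variant
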