deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
@@ -1,110 +1,206 @@
1
1
  from typing import List
2
+ import textwrap
2
3
 
3
4
 
4
5
  class AnswerRelevancyTemplate:
5
6
  @staticmethod
6
- def generate_statements(actual_output: str):
7
- return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
8
-
9
- Example:
10
- Example text:
11
- Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we’ve added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support.
12
-
13
- {{
14
- "statements": [
15
- "The new laptop model has a high-resolution Retina display.",
16
- "It includes a fast-charging battery with up to 12 hours of usage.",
17
- "Security features include fingerprint authentication and an encrypted SSD.",
18
- "Every purchase comes with a one-year warranty.",
19
- "24/7 customer support is included."
20
- ]
21
- }}
22
- ===== END OF EXAMPLE ======
23
-
24
- **
25
- IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
26
- **
27
-
28
- Text:
29
- {actual_output}
30
-
31
- JSON:
32
- """
7
+ def generate_statements(actual_output: str, multimodal: bool = False):
8
+ multimodal_instruction = ""
9
+ example_text = ""
10
+ example_json = ""
11
+
12
+ if multimodal:
13
+ multimodal_instruction = " The text may contain images as well."
14
+ example_text = "Shoes. The shoes can be refunded at no extra cost. Thanks for asking the question!"
15
+ example_json = textwrap.dedent(
16
+ """
17
+ {{
18
+ "statements": ["Shoes.", "Shoes can be refunded at no extra cost", "Thanks for asking the question!"]
19
+ }}
20
+ """
21
+ )
22
+ else:
23
+ example_text = "Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we've added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support."
24
+ example_json = textwrap.dedent(
25
+ """
26
+ {{
27
+ "statements": [
28
+ "The new laptop model has a high-resolution Retina display.",
29
+ "It includes a fast-charging battery with up to 12 hours of usage.",
30
+ "Security features include fingerprint authentication and an encrypted SSD.",
31
+ "Every purchase comes with a one-year warranty.",
32
+ "24/7 customer support is included."
33
+ ]
34
+ }}
35
+ """
36
+ )
37
+
38
+ coherence_note = (
39
+ ""
40
+ if multimodal
41
+ else " Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement."
42
+ )
43
+
44
+ return textwrap.dedent(
45
+ f"""Given the text, breakdown and generate a list of statements presented.{coherence_note}{multimodal_instruction}
46
+
47
+ Example:
48
+ Example text:
49
+ {example_text}
50
+
51
+ {example_json}
52
+ ===== END OF EXAMPLE ======
53
+
54
+ **
55
+ IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
56
+ **
57
+
58
+ Text:
59
+ {actual_output}
60
+
61
+ JSON:
62
+ """
63
+ )
33
64
 
34
65
  @staticmethod
35
- def generate_verdicts(input: str, statements: str):
36
- return f"""For the provided list of statements, determine whether each statement is relevant to address the input.
37
- Generate JSON objects with 'verdict' and 'reason' fields.
38
- The 'verdict' should be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information).
39
- Provide 'reason' ONLY for 'no' or 'idk' verdicts.
40
- The statements are from an AI's actual output.
41
-
42
- **
43
- IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
44
-
45
- Expected JSON format:
46
- {{
47
- "verdicts": [
48
- {{
49
- "verdict": "yes"
50
- }},
51
- {{
52
- "verdict": "no",
53
- "reason": <explanation_for_irrelevance>
54
- }},
55
- {{
56
- "verdict": "idk",
57
- "reason": <explanation_for_ambiguity>
58
- }}
59
- ]
60
- }}
61
-
62
- Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
63
- 'verdict' must be STRICTLY 'yes', 'no', or 'idk':
64
- - 'yes': statement is relevant to addressing the input
65
- - 'no': statement is irrelevant to the input
66
- - 'idk': statement is ambiguous (not directly relevant but could be supporting information)
67
- Provide 'reason' ONLY for 'no' or 'idk' verdicts.
68
- **
69
-
70
- Input:
71
- {input}
72
-
73
- Statements:
74
- {statements}
75
-
76
- JSON:
77
- """
66
+ def generate_verdicts(
67
+ input: str, statements: str, multimodal: bool = False
68
+ ):
69
+ content_type = (
70
+ "statements (which can contain images)"
71
+ if multimodal
72
+ else "list of statements"
73
+ )
74
+ statement_or_image = "statement or image" if multimodal else "statement"
75
+
76
+ format_instruction = textwrap.dedent(
77
+ """
78
+ Expected JSON format:
79
+ {{
80
+ "verdicts": [
81
+ {{
82
+ "verdict": "yes"
83
+ }},
84
+ {{
85
+ "reason": <explanation_for_irrelevance>,
86
+ "verdict": "no"
87
+ }},
88
+ {{
89
+ "reason": <explanation_for_ambiguity>,
90
+ "verdict": "idk"
91
+ }}
92
+ ]
93
+ }}
94
+ """
95
+ )
96
+
97
+ example_section = ""
98
+ if multimodal:
99
+ example_section = textwrap.dedent(
100
+ """
101
+ Example input: What should I do if there is an earthquake?
102
+ Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"]
103
+ Example JSON:
104
+ {{
105
+ "verdicts": [
106
+ {{
107
+ "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
108
+ "verdict": "no"
109
+ }},
110
+ {{
111
+ "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
112
+ "verdict": "idk"
113
+ }},
114
+ {{
115
+ "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
116
+ "verdict": "idk"
117
+ }},
118
+ {{
119
+ "verdict": "yes"
120
+ }}
121
+ ]
122
+ }}
123
+ """
124
+ )
125
+
126
+ guidelines = ""
127
+ if multimodal:
128
+ guidelines = textwrap.dedent(
129
+ f"""
130
+ Since you are going to generate a verdict for each statement and image, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
131
+ """
132
+ )
133
+ else:
134
+ guidelines = textwrap.dedent(
135
+ f"""
136
+ Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
137
+ 'verdict' must be STRICTLY 'yes', 'no', or 'idk':
138
+ - 'yes': statement is relevant to addressing the input
139
+ - 'no': statement is irrelevant to the input
140
+ - 'idk': statement is ambiguous (not directly relevant but could be supporting information)
141
+ Provide 'reason' ONLY for 'no' or 'idk' verdicts.
142
+ """
143
+ )
144
+
145
+ return textwrap.dedent(
146
+ f"""For the provided {content_type}, determine whether each {statement_or_image} is relevant to address the input.
147
+ {"Please generate a list of JSON with two keys: `verdict` and `reason`." if multimodal else "Generate JSON objects with 'verdict' and 'reason' fields."}
148
+ The 'verdict' {"key " if multimodal else ''}should {"STRICTLY be either a 'yes', 'idk' or 'no'" if multimodal else "be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information)"}. {"Answer 'yes' if the " + statement_or_image + ' is relevant to addressing the original input, no if the ' + statement_or_image + ' is irrelevant, and "idk" if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).' if multimodal else ""}
149
+ {"The 'reason' is the reason for the verdict.' if multimodal else '"}
150
+ Provide 'reason' ONLY for 'no' or 'idk' verdicts.
151
+ The {"provided statements are statements and images' if multimodal else 'statements are from an AI's actual output"} generated in the actual output.
152
+
153
+ **
154
+ IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
155
+
156
+ {format_instruction if not multimodal else ''}
157
+ {example_section}
158
+ {guidelines}
159
+ **
160
+
161
+ Input:
162
+ {input}
163
+
164
+ Statements:
165
+ {statements}
166
+
167
+ JSON:
168
+ """
169
+ )
78
170
 
79
171
  @staticmethod
80
172
  def generate_reason(
81
- irrelevant_statements: List[str], input: str, score: float
173
+ irrelevant_statements: List[str],
174
+ input: str,
175
+ score: float,
176
+ multimodal: bool = False,
82
177
  ):
83
- return f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
84
- The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
85
- If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
86
-
178
+ return textwrap.dedent(
179
+ f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
180
+ The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
181
+ If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
87
182
 
88
- **
89
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
90
183
 
91
- Example:
92
- Example JSON:
93
- {{
94
- "reason": "The score is <answer_relevancy_score> because <your_reason>."
95
- }}
96
- ===== END OF EXAMPLE ======
97
- **
184
+ **
185
+ IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
98
186
 
187
+ {"Example:' if not multimodal else '"}
188
+ Example JSON:
189
+ {{
190
+ "reason": "The score is <answer_relevancy_score> because <your_reason>."
191
+ }}
192
+ {"===== END OF EXAMPLE ======' if not multimodal else '"}
193
+ **
99
194
 
100
- Answer Relevancy Score:
101
- {score}
195
+ Answer Relevancy Score:
196
+ {score}
102
197
 
103
- Reasons why the score can't be higher based on irrelevant statements in the actual output:
104
- {irrelevant_statements}
198
+ Reasons why the score can't be higher based on irrelevant statements in the actual output:
199
+ {irrelevant_statements}
105
200
 
106
- Input:
107
- {input}
201
+ Input:
202
+ {input}
108
203
 
109
- JSON:
110
- """
204
+ JSON:
205
+ """
206
+ )
@@ -70,8 +70,8 @@ class ArgumentCorrectnessTemplate:
70
70
  "verdict": "yes"
71
71
  }},
72
72
  {{
73
- "verdict": "no",
74
- "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023."
73
+ "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023.",
74
+ "verdict": "no"
75
75
  }}
76
76
  ]
77
77
  }}
@@ -4,7 +4,6 @@ from typing import Optional, Dict, List
4
4
  from deepeval.test_case import (
5
5
  LLMTestCase,
6
6
  ConversationalTestCase,
7
- MLLMTestCase,
8
7
  LLMTestCaseParams,
9
8
  ArenaTestCase,
10
9
  )
@@ -113,13 +112,11 @@ class BaseMultimodalMetric:
113
112
  self._threshold = value
114
113
 
115
114
  @abstractmethod
116
- def measure(self, test_case: MLLMTestCase, *args, **kwargs) -> float:
115
+ def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
117
116
  raise NotImplementedError
118
117
 
119
118
  @abstractmethod
120
- async def a_measure(
121
- self, test_case: MLLMTestCase, *args, **kwargs
122
- ) -> float:
119
+ async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
123
120
  raise NotImplementedError(
124
121
  f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'."
125
122
  )
@@ -64,15 +64,15 @@ Example JSON:
64
64
  {{
65
65
  "verdicts": [
66
66
  {{
67
- "verdict": "yes",
68
- "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement."
67
+ "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement.",
68
+ "verdict": "yes"
69
69
  }},
70
70
  {{
71
71
  "verdict": "no"
72
72
  }},
73
73
  {{
74
74
  "verdict": "no"
75
- }},
75
+ }}
76
76
  ]
77
77
  }}
78
78
 
@@ -1,10 +1,14 @@
1
1
  from typing import Optional, List, Type, Union
2
2
 
3
- from deepeval.utils import get_or_create_event_loop, prettify_list
3
+ from deepeval.utils import (
4
+ get_or_create_event_loop,
5
+ prettify_list,
6
+ )
4
7
  from deepeval.metrics.utils import (
5
8
  construct_verbose_logs,
6
9
  trimAndLoadJson,
7
10
  check_llm_test_case_params,
11
+ check_mllm_test_case_params,
8
12
  initialize_model,
9
13
  )
10
14
  from deepeval.test_case import (
@@ -56,7 +60,15 @@ class ContextualPrecisionMetric(BaseMetric):
56
60
  _in_component: bool = False,
57
61
  _log_metric_to_confident: bool = True,
58
62
  ) -> float:
59
- check_llm_test_case_params(test_case, self._required_params, self)
63
+
64
+ multimodal = test_case.multimodal
65
+
66
+ if multimodal:
67
+ check_mllm_test_case_params(
68
+ test_case, self._required_params, None, None, self, self.model
69
+ )
70
+ else:
71
+ check_llm_test_case_params(test_case, self._required_params, self)
60
72
 
61
73
  self.evaluation_cost = 0 if self.using_native_model else None
62
74
  with metric_progress_indicator(
@@ -73,15 +85,20 @@ class ContextualPrecisionMetric(BaseMetric):
73
85
  )
74
86
  )
75
87
  else:
88
+ input = test_case.input
89
+ expected_output = test_case.expected_output
90
+ retrieval_context = test_case.retrieval_context
91
+
76
92
  self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
77
93
  self._generate_verdicts(
78
- test_case.input,
79
- test_case.expected_output,
80
- test_case.retrieval_context,
94
+ input,
95
+ expected_output,
96
+ retrieval_context,
97
+ multimodal,
81
98
  )
82
99
  )
83
100
  self.score = self._calculate_score()
84
- self.reason = self._generate_reason(test_case.input)
101
+ self.reason = self._generate_reason(input, multimodal)
85
102
  self.success = self.score >= self.threshold
86
103
  self.verbose_logs = construct_verbose_logs(
87
104
  self,
@@ -104,7 +121,14 @@ class ContextualPrecisionMetric(BaseMetric):
104
121
  _log_metric_to_confident: bool = True,
105
122
  ) -> float:
106
123
 
107
- check_llm_test_case_params(test_case, self._required_params, self)
124
+ multimodal = test_case.multimodal
125
+
126
+ if multimodal:
127
+ check_mllm_test_case_params(
128
+ test_case, self._required_params, None, None, self, self.model
129
+ )
130
+ else:
131
+ check_llm_test_case_params(test_case, self._required_params, self)
108
132
 
109
133
  self.evaluation_cost = 0 if self.using_native_model else None
110
134
  with metric_progress_indicator(
@@ -113,15 +137,17 @@ class ContextualPrecisionMetric(BaseMetric):
113
137
  _show_indicator=_show_indicator,
114
138
  _in_component=_in_component,
115
139
  ):
140
+ input = test_case.input
141
+ expected_output = test_case.expected_output
142
+ retrieval_context = test_case.retrieval_context
143
+
116
144
  self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
117
145
  await self._a_generate_verdicts(
118
- test_case.input,
119
- test_case.expected_output,
120
- test_case.retrieval_context,
146
+ input, expected_output, retrieval_context, multimodal
121
147
  )
122
148
  )
123
149
  self.score = self._calculate_score()
124
- self.reason = await self._a_generate_reason(test_case.input)
150
+ self.reason = await self._a_generate_reason(input, multimodal)
125
151
  self.success = self.score >= self.threshold
126
152
  self.verbose_logs = construct_verbose_logs(
127
153
  self,
@@ -136,7 +162,7 @@ class ContextualPrecisionMetric(BaseMetric):
136
162
  )
137
163
  return self.score
138
164
 
139
- async def _a_generate_reason(self, input: str):
165
+ async def _a_generate_reason(self, input: str, multimodal: bool):
140
166
  if self.include_reason is False:
141
167
  return None
142
168
 
@@ -148,6 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
148
174
  input=input,
149
175
  verdicts=retrieval_contexts_verdicts,
150
176
  score=format(self.score, ".2f"),
177
+ multimodal=multimodal,
151
178
  )
152
179
 
153
180
  if self.using_native_model:
@@ -169,7 +196,7 @@ class ContextualPrecisionMetric(BaseMetric):
169
196
  data = trimAndLoadJson(res, self)
170
197
  return data["reason"]
171
198
 
172
- def _generate_reason(self, input: str):
199
+ def _generate_reason(self, input: str, multimodal: bool):
173
200
  if self.include_reason is False:
174
201
  return None
175
202
 
@@ -181,6 +208,7 @@ class ContextualPrecisionMetric(BaseMetric):
181
208
  input=input,
182
209
  verdicts=retrieval_contexts_verdicts,
183
210
  score=format(self.score, ".2f"),
211
+ multimodal=multimodal,
184
212
  )
185
213
 
186
214
  if self.using_native_model:
@@ -203,12 +231,17 @@ class ContextualPrecisionMetric(BaseMetric):
203
231
  return data["reason"]
204
232
 
205
233
  async def _a_generate_verdicts(
206
- self, input: str, expected_output: str, retrieval_context: List[str]
234
+ self,
235
+ input: str,
236
+ expected_output: str,
237
+ retrieval_context: List[str],
238
+ multimodal: bool,
207
239
  ) -> List[cpschema.ContextualPrecisionVerdict]:
208
240
  prompt = self.evaluation_template.generate_verdicts(
209
241
  input=input,
210
242
  expected_output=expected_output,
211
243
  retrieval_context=retrieval_context,
244
+ multimodal=multimodal,
212
245
  )
213
246
  if self.using_native_model:
214
247
  res, cost = await self.model.a_generate(
@@ -234,12 +267,17 @@ class ContextualPrecisionMetric(BaseMetric):
234
267
  return verdicts
235
268
 
236
269
  def _generate_verdicts(
237
- self, input: str, expected_output: str, retrieval_context: List[str]
270
+ self,
271
+ input: str,
272
+ expected_output: str,
273
+ retrieval_context: List[str],
274
+ multimodal: bool,
238
275
  ) -> List[cpschema.ContextualPrecisionVerdict]:
239
276
  prompt = self.evaluation_template.generate_verdicts(
240
277
  input=input,
241
278
  expected_output=expected_output,
242
279
  retrieval_context=retrieval_context,
280
+ multimodal=multimodal,
243
281
  )
244
282
  if self.using_native_model:
245
283
  res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)