deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
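
The bulk of this diff removes the standalone multimodal stack (MLLMTestCase, the Multimodal* metrics, the models/mlllms wrappers) in favor of multimodal-aware core metrics, adds four turn_* RAG metrics, and renames deepeval/optimization to deepeval/optimizer. As a hedged sketch of what the faithfulness.py hunks below imply (inferred from this diff, not from official docs), multimodal evaluation now appears to flow through LLMTestCase, which exposes a multimodal property that metrics branch on:

# Hypothetical usage sketch; the exact multimodal contract of LLMTestCase
# (mixed text/image content lists) is an assumption based on this diff.
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics import FaithfulnessMetric

test_case = LLMTestCase(
    input="Summarize the attached chart.",
    # Assumption: including MLLMImage content marks the case as multimodal.
    actual_output=["Q2 revenue grew 20%.", MLLMImage(url="https://example.com/chart.png")],
    retrieval_context=["Q2 revenue increased 20% year over year."],
)

metric = FaithfulnessMetric()
metric.measure(test_case)  # branches on test_case.multimodal internally
print(metric.score, metric.reason)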
deepeval/metrics/faithfulness/faithfulness.py
@@ -1,16 +1,17 @@
 from typing import List, Optional, Union, Type
 import asyncio
 
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -67,7 +68,13 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -84,11 +91,16 @@ class FaithfulnessMetric(BaseMetric):
                     )
                 )
             else:
-                self.truths = self._generate_truths(test_case.retrieval_context)
-                self.claims = self._generate_claims(test_case.actual_output)
-                self.verdicts = self._generate_verdicts()
+                retrieval_context = test_case.retrieval_context
+                actual_output = test_case.actual_output
+
+                self.truths = self._generate_truths(
+                    retrieval_context, multimodal
+                )
+                self.claims = self._generate_claims(actual_output, multimodal)
+                self.verdicts = self._generate_verdicts(multimodal)
             self.score = self._calculate_score()
-            self.reason = self._generate_reason()
+            self.reason = self._generate_reason(multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -114,7 +126,13 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -123,13 +141,16 @@ class FaithfulnessMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            retrieval_context = test_case.retrieval_context
+            actual_output = test_case.actual_output
+
             self.truths, self.claims = await asyncio.gather(
-                self._a_generate_truths(test_case.retrieval_context),
-                self._a_generate_claims(test_case.actual_output),
+                self._a_generate_truths(retrieval_context, multimodal),
+                self._a_generate_claims(actual_output, multimodal),
             )
-            self.verdicts = await self._a_generate_verdicts()
+            self.verdicts = await self._a_generate_verdicts(multimodal)
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
+            self.reason = await self._a_generate_reason(multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -146,7 +167,7 @@ class FaithfulnessMetric(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -158,6 +179,7 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
 
         if self.using_native_model:
@@ -177,7 +199,7 @@ class FaithfulnessMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -189,6 +211,7 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
        )
 
         if self.using_native_model:
@@ -208,14 +231,20 @@ class FaithfulnessMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]
 
-    async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]:
+    async def _a_generate_verdicts(
+        self, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
         verdicts: List[FaithfulnessVerdict] = []
+
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -236,14 +265,18 @@ class FaithfulnessMetric(BaseMetric):
                 ]
                 return verdicts
 
-    def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
+    def _generate_verdicts(self, multimodal: bool) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
         verdicts: List[FaithfulnessVerdict] = []
+
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -262,10 +295,13 @@ class FaithfulnessMetric(BaseMetric):
                 ]
                 return verdicts
 
-    async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
+    async def _a_generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Truths)
@@ -280,10 +316,13 @@ class FaithfulnessMetric(BaseMetric):
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    def _generate_truths(self, retrieval_context: str) -> List[str]:
+    def _generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Truths)
@@ -298,9 +337,11 @@ class FaithfulnessMetric(BaseMetric):
                 data = trimAndLoadJson(res, self)
                 return data["truths"]
 
-    async def _a_generate_claims(self, actual_output: str) -> List[str]:
+    async def _a_generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Claims)
@@ -315,9 +356,11 @@ class FaithfulnessMetric(BaseMetric):
                 data = trimAndLoadJson(res, self)
                 return data["claims"]
 
-    def _generate_claims(self, actual_output: str) -> List[str]:
+    def _generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Claims)

deepeval/metrics/faithfulness/schema.py
@@ -3,7 +3,7 @@ from pydantic import BaseModel, Field
 
 
 class FaithfulnessVerdict(BaseModel):
-    verdict: Literal["yes", "idk", "no"]
+    verdict: Literal["yes", "no", "idk"]
     reason: Optional[str] = Field(default=None)
 
 

deepeval/metrics/faithfulness/template.py
@@ -1,41 +1,50 @@
 from typing import Optional, List
+import textwrap
 
 
 class FaithfulnessTemplate:
     @staticmethod
-    def generate_claims(actual_output: str):
-        return f"""Based on the given text, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output.
-These truths, MUST BE COHERENT, and CANNOT be taken out of context.
-
-Example:
-Example Text:
-"Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
-
-Example JSON:
-{{
-    "claims": [
-        "Einstein won the noble prize for his discovery of the photoelectric effect in 1968."
-        "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
-    ]
-}}
-===== END OF EXAMPLE ======
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
-Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
-You should NOT include any prior knowledge, and take the text at face value when extracting claims.
-You should be aware that it is an AI that is outputting these claims.
-**
-
-AI Output:
-{actual_output}
-
-JSON:
-"""
+    def generate_claims(actual_output: str, multimodal: bool = False):
+        multimodal_instruction = ""
+        if multimodal:
+            multimodal_instruction = " The excerpt may contain both text and images, so extract claims from all provided content."
+
+        return textwrap.dedent(
+            f"""Based on the given {'excerpt' if multimodal else 'text'}, please extract a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided actual AI output. {multimodal_instruction}
+            These truths, MUST BE COHERENT, and CANNOT be taken out of context.
+
+            Example:
+            Example Text:
+            "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
+
+            Example JSON:
+            {{
+                "claims": [
+                    "Einstein won the noble prize for his discovery of the photoelectric effect in 1968.",
+                    "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
+                ]
+            }}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
+            Only include claims that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT. The claims you extract should include the full context it was presented in, NOT cherry picked facts.
+            You should NOT include any prior knowledge, and take the text at face value when extracting claims.
+            You should be aware that it is an AI that is outputting these claims.
+            **
+
+            {'Excerpt' if multimodal else 'AI Output'}:
+            {actual_output}
+
+            JSON:
+            """
+        )
 
     @staticmethod
     def generate_truths(
-        retrieval_context: str, extraction_limit: Optional[int] = None
+        retrieval_context: str,
+        extraction_limit: Optional[int] = None,
+        multimodal: bool = False,
     ):
         if extraction_limit is None:
             limit = " FACTUAL, undisputed truths"
@@ -43,98 +52,174 @@ JSON:
             limit = " the single most important FACTUAL, undisputed truth"
         else:
             limit = f" the {extraction_limit} most important FACTUAL, undisputed truths per document"
-        return f"""Based on the given text, please generate a comprehensive list of{limit}, that can inferred from the provided text.
-These truths, MUST BE COHERENT. They must NOT be taken out of context.
-
-Example:
-Example Text:
-"Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
-
-Example JSON:
-{{
-    "truths": [
-        "Einstein won the noble prize for his discovery of the photoelectric effect in 1968."
-        "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
-    ]
-}}
-===== END OF EXAMPLE ======
-**
-IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
-Only include truths that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT.
-**
-
-Text:
-{retrieval_context}
-
-JSON:
-"""
+
+        multimodal_instruction = ""
+        if multimodal:
+            multimodal_instruction = (
+                " The excerpt may contain both text and images."
+            )
+
+        return textwrap.dedent(
+            f"""Based on the given {'excerpt (text and images)' if multimodal else 'text'}, please generate a comprehensive list of{limit}, that can inferred from the provided {'excerpt' if multimodal else 'text'}.{multimodal_instruction}
+            These truths, MUST BE COHERENT. They must NOT be taken out of context.
+
+            Example:
+            Example Text:
+            "Albert Einstein, the genius often associated with wild hair and mind-bending theories, famously won the Nobel Prize in Physics—though not for his groundbreaking work on relativity, as many assume. Instead, in 1968, he was honored for his discovery of the photoelectric effect, a phenomenon that laid the foundation for quantum mechanics."
+
+            Example JSON:
+            {{
+                "truths": [
+                    "Einstein won the noble prize for his discovery of the photoelectric effect in 1968.",
+                    "The photoelectric effect is a phenomenon that laid the foundation for quantum mechanics."
+                ]
+            }}
+            ===== END OF EXAMPLE ======
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
+            Only include truths that are factual, BUT IT DOESN'T MATTER IF THEY ARE FACTUALLY CORRECT.
+            **
+
+            {'Excerpt' if multimodal else 'Text'}:
+            {retrieval_context}
+
+            JSON:
+            """
+        )
 
     @staticmethod
-    def generate_verdicts(claims: List[str], retrieval_context: str):
-        return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
-The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
-The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
-
-Expected JSON format:
-{{
-    "verdicts": [
-        {{
-            "verdict": "yes"
-        }},
-        {{
-            "reason": <explanation_for_contradiction>,
-            "verdict": "no"
-        }},
-        {{
-            "reason": <explanation_for_uncertainty>,
-            "verdict": "idk"
-        }}
-    ]
-}}
-
-Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
-No 'reason' needed for 'yes' verdicts.
-Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
-Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
-Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
-**
-
-Retrieval Contexts:
-{retrieval_context}
-
-Claims:
-{claims}
-
-JSON:
-"""
+    def generate_verdicts(
+        claims: List[str], retrieval_context: str, multimodal: bool = False
+    ):
+        example_section = ""
+        if multimodal:
+            example_section = textwrap.dedent(
+                """
+                Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
+                Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
+
+                Example:
+                {{
+                    "verdicts": [
+                        {{
+                            "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "verdict": "yes"
+                        }},
+                        {{
+                            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead.",
+                            "verdict": "no"
+                        }},
+                        {{
+                            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead.",
+                            "verdict": "no"
+                        }}
+                    ]
+                }}
+                ===== END OF EXAMPLE ======
+                """
+            )
+
+        format_instruction = textwrap.dedent(
+            """
+            Expected JSON format:
+            {{
+                "verdicts": [
+                    {{
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "reason": <explanation_for_contradiction>,
+                        "verdict": "no"
+                    }},
+                    {{
+                        "reason": <explanation_for_uncertainty>,
+                        "verdict": "idk"
+                    }}
+                ]
+            }}
+            """
+        )
+
+        guidelines = ""
+        if multimodal:
+            guidelines = textwrap.dedent(
+                """
+                The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
+                You DON'T have to provide a reason if the answer is 'yes'.
+                ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
+                Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
+                Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
+                If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'.
+                """
+            )
+        else:
+            guidelines = textwrap.dedent(
+                """
+                Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
+                No 'reason' needed for 'yes' verdicts.
+                Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
+                Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
+                Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
+                """
+            )
+
+        return textwrap.dedent(
+            f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
+            The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
+            Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
+            The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
+
+            {format_instruction}
+            {example_section}
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
+            {guidelines}
+            **
+
+            Retrieval Contexts:
+            {retrieval_context}
+
+            Claims:
+            {claims}
+
+            JSON:
+            """
+        )
 
     @staticmethod
-    def generate_reason(score: float, contradictions: List[str]):
-        return f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
-Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
+    def generate_reason(
+        score: float, contradictions: List[str], multimodal: bool = False
+    ):
+        return textwrap.dedent(
+            f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
+            Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
 
-Expected JSON format:
-{{
-    "reason": "The score is <faithfulness_score> because <your_reason>."
-}}
+            Expected JSON format:
+            {{
+                "reason": "The score is <faithfulness_score> because <your_reason>."
+            }}
 
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 
-If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-Your reason MUST use information in `contradiction` in your reason.
-Be sure in your reason, as if you know what the actual output is from the contradictions.
-**
+            If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+            Your reason MUST use information in `contradiction` in your reason.
+            Be sure in your reason, as if you know what the actual output is from the contradictions.
+            **
 
-Faithfulness Score:
-{score}
+            Faithfulness Score:
+            {score}
 
-Contradictions:
-{contradictions}
+            Contradictions:
+            {contradictions}
 
-JSON:
-"""
+            JSON:
+            """
+        )
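
Because the template methods above only gain a multimodal keyword, the rendered prompts can be inspected directly. A minimal sketch (module path taken from the file list; signature per the hunk above):

from deepeval.metrics.faithfulness.template import FaithfulnessTemplate

prompt = FaithfulnessTemplate.generate_verdicts(
    claims=["Einstein won the Nobel Prize in 1969."],
    retrieval_context="Einstein won the Nobel Prize in 1968.",
    multimodal=True,  # False renders the old text-only guidelines
)
print(prompt)  # multimodal=True swaps in the worked example and stricter guidelines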

deepeval/metrics/g_eval/utils.py
@@ -118,12 +118,12 @@ def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
         return True
     elif (
         isinstance(model, GPTModel)
-        and model.model_name in unsupported_log_probs_gpt_models
+        and model.get_model_name() in unsupported_log_probs_gpt_models
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.model_name in unsupported_log_probs_gpt_models
+        and model.get_model_name() in unsupported_log_probs_gpt_models
    ):
         return True
 

deepeval/metrics/indicator.py
@@ -13,7 +13,7 @@ from deepeval.metrics import (
     BaseMultimodalMetric,
     BaseArenaMetric,
 )
-from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
@@ -75,7 +75,7 @@ async def measure_metric_task(
     task_id,
     progress,
     metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
-    test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -159,7 +159,7 @@ async def measure_metrics_with_indicator(
     metrics: List[
         Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
     ],
-    test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -239,7 +239,7 @@ async def measure_metrics_with_indicator(
 
     async def safe_a_measure(
         metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
-        tc: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+        tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
         ignore_errors: bool,
         skip_on_missing_params: bool,
         progress: Optional[Progress] = None,

deepeval/metrics/multimodal_metrics/__init__.py
@@ -3,22 +3,4 @@ from .image_editing.image_editing import ImageEditingMetric
 from .image_coherence.image_coherence import ImageCoherenceMetric
 from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
 from .image_reference.image_reference import ImageReferenceMetric
-from .multimodal_contextual_recall.multimodal_contextual_recall import (
-    MultimodalContextualRecallMetric,
-)
-from .multimodal_contextual_relevancy.multimodal_contextual_relevancy import (
-    MultimodalContextualRelevancyMetric,
-)
-from .multimodal_contextual_precision.multimodal_contextual_precision import (
-    MultimodalContextualPrecisionMetric,
-)
-from .multimodal_answer_relevancy.multimodal_answer_relevancy import (
-    MultimodalAnswerRelevancyMetric,
-)
-from .multimodal_faithfulness.multimodal_faithfulness import (
-    MultimodalFaithfulnessMetric,
-)
-from .multimodal_tool_correctness.multimodal_tool_correctness import (
-    MultimodalToolCorrectnessMetric,
-)
 from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
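
For downstream code, the practical effect of the removals above is that the Multimodal* RAG metric imports no longer resolve. A hedged migration sketch, assuming (per the faithfulness.py hunks earlier in this diff) that the core metrics absorb the multimodal path:

# 3.7.4:
#   from deepeval.metrics import MultimodalFaithfulnessMetric
# 3.7.5, assumed replacement based on this diff:
from deepeval.metrics import FaithfulnessMetric  # now handles multimodal test cases too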