deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
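
Beyond the metric changes, the file list shows the whole `deepeval/optimization` package replaced by `deepeval/optimizer`, with algorithm loops moved under `optimizer/algorithms/` (for example `optimization/gepa/loop.py` → `optimizer/algorithms/gepa/gepa.py`). A minimal import-migration sketch, assuming `PromptOptimizer` is still the class defined in `prompt_optimizer.py`; the exported names are an assumption, not confirmed by this diff:

    # Hypothetical migration for the optimization -> optimizer rename (3.7.4 -> 3.7.5).
    # Old (3.7.4) module path, per the deleted files above:
    # from deepeval.optimization.prompt_optimizer import PromptOptimizer

    # New (3.7.5) module path, per the added files above (class name assumed):
    from deepeval.optimizer.prompt_optimizer import PromptOptimizer
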
deepeval/metrics/__init__.py
@@ -42,6 +42,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,12 +65,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
     MultimodalGEval,
 )
 
@@ -119,17 +123,15 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
     "MultimodalGEval",
 ]
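
Per the `__init__.py` hunks above, four new turn-level metrics are exported from the package root in 3.7.5, while the six `Multimodal*` metric exports are removed. A minimal sketch of the import change:

    from deepeval.metrics import (
        TurnFaithfulnessMetric,
        TurnContextualPrecisionMetric,
        TurnContextualRecallMetric,
        TurnContextualRelevancyMetric,
    )

    # These 3.7.4 imports now fail, since the names were dropped from __all__
    # and their modules were deleted:
    # from deepeval.metrics import MultimodalFaithfulnessMetric  # ImportError in 3.7.5

The new names are grouped under "# Conversational metrics" in `__all__`, which suggests they operate on conversational test cases rather than being drop-in replacements for the removed multimodal metrics.
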
deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -1,16 +1,17 @@
 from typing import Optional, List, Type, Union
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
@@ -53,7 +54,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -70,14 +78,17 @@ class AnswerRelevancyMetric(BaseMetric):
                     )
                 )
             else:
+                input = test_case.input
+                actual_output = test_case.actual_output
+
                 self.statements: List[str] = self._generate_statements(
-                    test_case.actual_output
+                    actual_output, multimodal
                 )
                 self.verdicts: List[AnswerRelevancyVerdict] = (
-                    self._generate_verdicts(test_case.input)
+                    self._generate_verdicts(input, multimodal)
                 )
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -101,7 +112,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -110,14 +128,17 @@ class AnswerRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            actual_output = test_case.actual_output
+
             self.statements: List[str] = await self._a_generate_statements(
-                test_case.actual_output
+                actual_output, multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(test_case.input)
+                await self._a_generate_verdicts(input, multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -133,7 +154,7 @@ class AnswerRelevancyMetric(BaseMetric):
         )
         return self.score
 
-    async def _a_generate_reason(self, input: str) -> str:
+    async def _a_generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -146,7 +167,9 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = await self.model.a_generate(
                 prompt, schema=AnswerRelevancyScoreReason
@@ -164,7 +187,7 @@ class AnswerRelevancyMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]
 
-    def _generate_reason(self, input: str) -> str:
+    def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -177,6 +200,7 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
 
         if self.using_native_model:
@@ -197,14 +221,13 @@ class AnswerRelevancyMetric(BaseMetric):
            return data["reason"]
 
     async def _a_generate_verdicts(
-        self, input: str
+        self, input: str, multimodal: bool
     ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []
 
         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
 
         if self.using_native_model:
@@ -224,14 +247,16 @@ class AnswerRelevancyMetric(BaseMetric):
                 AnswerRelevancyVerdict(**item) for item in data["verdicts"]
             ]
 
-    def _generate_verdicts(self, input: str) -> List[AnswerRelevancyVerdict]:
+    def _generate_verdicts(
+        self, input: str, multimodal: bool
+    ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []
 
         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
+
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -250,44 +275,64 @@ class AnswerRelevancyMetric(BaseMetric):
     async def _a_generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
        )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements: List[str] = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = await self.model.a_generate(
                     prompt, schema=Statements
                 )
-                return res.statements
+                statements: List[str] = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
 
     def _generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-            return res.statements
+            statements = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = self.model.generate(prompt, schema=Statements)
-                return res.statements
+                statements = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
-                return data["statements"]
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
 
     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
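
The `measure`/`a_measure` rewrites above branch on `test_case.multimodal`, validate with `check_mllm_test_case_params` instead of `check_llm_test_case_params`, and append any `MLLMImage` elements of `actual_output` to the generated statements. A sketch of the new flow, assuming `LLMTestCase` now accepts a mixed text-and-image `actual_output` and exposes a `multimodal` property (`llm_test_case.py` grew by 152 lines in this release; the exact constructor shape and the URL below are illustrative assumptions):

    from deepeval.metrics import AnswerRelevancyMetric
    from deepeval.test_case import LLMTestCase, MLLMImage

    test_case = LLMTestCase(
        input="What does the product look like?",
        actual_output=[
            "Here is a photo of the product.",
            MLLMImage(url="https://example.com/product.jpg"),  # illustrative URL
        ],
    )

    metric = AnswerRelevancyMetric()
    metric.measure(test_case)  # multimodal path: check_mllm_test_case_params(...)
    print(metric.score, metric.reason)
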
deepeval/metrics/answer_relevancy/template.py
@@ -1,110 +1,206 @@
 from typing import List
+import textwrap
 
 
 class AnswerRelevancyTemplate:
     @staticmethod
-    def generate_statements(actual_output: str):
-        return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
-
-Example:
-Example text:
-Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we’ve added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support.
-
-{{
-    "statements": [
-        "The new laptop model has a high-resolution Retina display.",
-        "It includes a fast-charging battery with up to 12 hours of usage.",
-        "Security features include fingerprint authentication and an encrypted SSD.",
-        "Every purchase comes with a one-year warranty.",
-        "24/7 customer support is included."
-    ]
-}}
-===== END OF EXAMPLE ======
-
-**
-IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
-**
-
-Text:
-{actual_output}
-
-JSON:
-"""
+    def generate_statements(actual_output: str, multimodal: bool = False):
+        multimodal_instruction = ""
+        example_text = ""
+        example_json = ""
+
+        if multimodal:
+            multimodal_instruction = " The text may contain images as well."
+            example_text = "Shoes. The shoes can be refunded at no extra cost. Thanks for asking the question!"
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": ["Shoes.", "Shoes can be refunded at no extra cost", "Thanks for asking the question!"]
+                }}
+                """
+            )
+        else:
+            example_text = "Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we've added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support."
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": [
+                        "The new laptop model has a high-resolution Retina display.",
+                        "It includes a fast-charging battery with up to 12 hours of usage.",
+                        "Security features include fingerprint authentication and an encrypted SSD.",
+                        "Every purchase comes with a one-year warranty.",
+                        "24/7 customer support is included."
+                    ]
+                }}
+                """
+            )
+
+        coherence_note = (
+            ""
+            if multimodal
+            else " Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement."
+        )
+
+        return textwrap.dedent(
+            f"""Given the text, breakdown and generate a list of statements presented.{coherence_note}{multimodal_instruction}
+
+            Example:
+            Example text:
+            {example_text}
+
+            {example_json}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+            **
+
+            Text:
+            {actual_output}
+
+            JSON:
+            """
+        )
 
     @staticmethod
-    def generate_verdicts(input: str, statements: str):
-        return f"""For the provided list of statements, determine whether each statement is relevant to address the input.
-Generate JSON objects with 'verdict' and 'reason' fields.
-The 'verdict' should be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information).
-Provide 'reason' ONLY for 'no' or 'idk' verdicts.
-The statements are from an AI's actual output.
-
-**
-IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
-
-Expected JSON format:
-{{
-    "verdicts": [
-        {{
-            "verdict": "yes"
-        }},
-        {{
-            "reason": <explanation_for_irrelevance>,
-            "verdict": "no"
-        }},
-        {{
-            "reason": <explanation_for_ambiguity>,
-            "verdict": "idk"
-        }}
-    ]
-}}
-
-Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
-'verdict' must be STRICTLY 'yes', 'no', or 'idk':
-- 'yes': statement is relevant to addressing the input
-- 'no': statement is irrelevant to the input
-- 'idk': statement is ambiguous (not directly relevant but could be supporting information)
-Provide 'reason' ONLY for 'no' or 'idk' verdicts.
-**
-
-Input:
-{input}
-
-Statements:
-{statements}
-
-JSON:
-"""
+    def generate_verdicts(
+        input: str, statements: str, multimodal: bool = False
+    ):
+        content_type = (
+            "statements (which can contain images)"
+            if multimodal
+            else "list of statements"
+        )
+        statement_or_image = "statement or image" if multimodal else "statement"
+
+        format_instruction = textwrap.dedent(
+            """
+            Expected JSON format:
+            {{
+                "verdicts": [
+                    {{
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "reason": <explanation_for_irrelevance>,
+                        "verdict": "no"
+                    }},
+                    {{
+                        "reason": <explanation_for_ambiguity>,
+                        "verdict": "idk"
+                    }}
+                ]
+            }}
+            """
+        )
+
+        example_section = ""
+        if multimodal:
+            example_section = textwrap.dedent(
+                """
+                Example input: What should I do if there is an earthquake?
+                Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"]
+                Example JSON:
+                {{
+                    "verdicts": [
+                        {{
+                            "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+                            "verdict": "no"
+                        }},
+                        {{
+                            "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "verdict": "yes"
+                        }}
+                    ]
+                }}
+                """
+            )
+
+        guidelines = ""
+        if multimodal:
+            guidelines = textwrap.dedent(
+                f"""
+                Since you are going to generate a verdict for each statement and image, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
+                """
+            )
+        else:
+            guidelines = textwrap.dedent(
+                f"""
+                Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
+                'verdict' must be STRICTLY 'yes', 'no', or 'idk':
+                - 'yes': statement is relevant to addressing the input
+                - 'no': statement is irrelevant to the input
+                - 'idk': statement is ambiguous (not directly relevant but could be supporting information)
+                Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+                """
+            )
+
+        return textwrap.dedent(
+            f"""For the provided {content_type}, determine whether each {statement_or_image} is relevant to address the input.
+            {"Please generate a list of JSON with two keys: `verdict` and `reason`." if multimodal else "Generate JSON objects with 'verdict' and 'reason' fields."}
+            The 'verdict' {"key " if multimodal else ''}should {"STRICTLY be either a 'yes', 'idk' or 'no'" if multimodal else "be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information)"}. {"Answer 'yes' if the " + statement_or_image + ' is relevant to addressing the original input, no if the ' + statement_or_image + ' is irrelevant, and "idk" if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).' if multimodal else ""}
+            {"The 'reason' is the reason for the verdict." if multimodal else ""}
+            Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+            The {"provided statements are statements and images" if multimodal else "statements are from an AI's actual output"} generated in the actual output.
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+
+            {format_instruction if not multimodal else ''}
+            {example_section}
+            {guidelines}
+            **
+
+            Input:
+            {input}
+
+            Statements:
+            {statements}
+
+            JSON:
+            """
+        )
 
     @staticmethod
     def generate_reason(
-        irrelevant_statements: List[str], input: str, score: float
+        irrelevant_statements: List[str],
+        input: str,
+        score: float,
+        multimodal: bool = False,
     ):
-        return f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
-The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
-If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-
+        return textwrap.dedent(
+            f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
+            The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
+            If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
 
-Example:
-Example JSON:
-{{
-    "reason": "The score is <answer_relevancy_score> because <your_reason>."
-}}
-===== END OF EXAMPLE ======
-**
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
 
+            {"Example:" if not multimodal else ""}
+            Example JSON:
+            {{
+                "reason": "The score is <answer_relevancy_score> because <your_reason>."
+            }}
+            {"===== END OF EXAMPLE ======" if not multimodal else ""}
+            **
 
-Answer Relevancy Score:
-{score}
+            Answer Relevancy Score:
+            {score}
 
-Reasons why the score can't be higher based on irrelevant statements in the actual output:
-{irrelevant_statements}
+            Reasons why the score can't be higher based on irrelevant statements in the actual output:
+            {irrelevant_statements}
 
-Input:
-{input}
+            Input:
+            {input}
 
-JSON:
-"""
+            JSON:
+            """
+        )
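
All three template methods above now accept a `multimodal` keyword that defaults to `False`, so existing text-only callers get the previous prompt wording. A quick sketch of direct usage, with both branches (the `actual_output` strings are just examples):

    from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate

    # Default path: the original text-only statements prompt.
    prompt = AnswerRelevancyTemplate.generate_statements(
        actual_output="Our new laptop model features a Retina display."
    )

    # Multimodal path: image-aware instructions and a different example.
    mm_prompt = AnswerRelevancyTemplate.generate_statements(
        actual_output="Shoes. The shoes can be refunded at no extra cost.",
        multimodal=True,
    )
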
deepeval/metrics/base_metric.py
@@ -4,7 +4,6 @@ from typing import Optional, Dict, List
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     LLMTestCaseParams,
     ArenaTestCase,
 )
@@ -113,13 +112,11 @@ class BaseMultimodalMetric:
         self._threshold = value
 
     @abstractmethod
-    def measure(self, test_case: MLLMTestCase, *args, **kwargs) -> float:
+    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError
 
     @abstractmethod
-    async def a_measure(
-        self, test_case: MLLMTestCase, *args, **kwargs
-    ) -> float:
+    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError(
             f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'."
         )
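
With `MLLMTestCase` gone (its module `deepeval/test_case/mllm_test_case.py` is deleted above), `BaseMultimodalMetric` subclasses now type `measure`/`a_measure` against plain `LLMTestCase`. A minimal conforming subclass might look like this sketch; the scoring logic is a stand-in, and the base class may require further members not shown in this hunk:

    from deepeval.metrics.base_metric import BaseMultimodalMetric
    from deepeval.test_case import LLMTestCase

    class ToyMultimodalMetric(BaseMultimodalMetric):
        # Demonstrates the 3.7.5 abstract signatures only; not a real metric.
        def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
            self.score = 1.0  # placeholder score, not deepeval's logic
            self.success = True
            return self.score

        async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
            return self.measure(test_case, *args, **kwargs)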