deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversation_completeness/conversation_completeness.py

@@ -8,9 +8,10 @@ from deepeval.metrics.conversation_completeness.template import (
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -18,7 +19,11 @@ from deepeval.test_case import ConversationalTestCase
 from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.conversation_completeness.schema import *
+from deepeval.metrics.conversation_completeness.schema import (
+    UserIntentions,
+    ConversationCompletenessVerdict,
+    ConversationCompletenessScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -51,8 +56,15 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -71,17 +83,19 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                 )
             else:
                 self.user_intentions = self._extract_user_intentions(
-                    test_case.turns
+                    test_case.turns, multimodal=multimodal
                 )
                 self.verdicts = [
                     self._generate_verdict(
-                        turns=test_case.turns, intention=user_intention
+                        turns=test_case.turns,
+                        intention=user_intention,
+                        multimodal=multimodal,
                     )
                     for user_intention in self.user_intentions
                 ]
 
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason()
+                self.reason = self._generate_reason(multimodal=multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -105,28 +119,40 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self, async_mode=True, _show_indicator=_show_indicator
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             self.user_intentions = await self._a_extract_user_intentions(
-                test_case.turns
+                test_case.turns, multimodal=multimodal
             )
             self.verdicts = await asyncio.gather(
                 *[
                     self._a_generate_verdict(
-                        turns=test_case.turns, intention=user_intention
+                        turns=test_case.turns,
+                        intention=user_intention,
+                        multimodal=multimodal,
                    )
                     for user_intention in self.user_intentions
                 ]
             )
 
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
+            self.reason = await self._a_generate_reason(multimodal=multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -143,7 +169,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self, multimodal: bool) -> str:
         incompletenesses: List[str] = []
         for verdict in self.verdicts:
             if verdict.verdict.strip().lower() == "no":
@@ -153,27 +179,17 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             score=self.score,
             incompletenesses=incompletenesses,
             intentions=self.user_intentions,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ConversationCompletenessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ConversationCompletenessScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ConversationCompletenessScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -186,113 +202,79 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             score=self.score,
             incompletenesses=incompletenesses,
             intentions=self.user_intentions,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ConversationCompletenessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ConversationCompletenessScoreReason = self.model.generate(
-                    prompt, schema=ConversationCompletenessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
     async def _a_generate_verdict(
-        self, turns: List[Turn], intention: str
+        self, turns: List[Turn], intention: str, multimodal: bool
     ) -> ConversationCompletenessVerdict:
         prompt = ConversationCompletenessTemplate.generate_verdicts(
             turns=[convert_turn_to_dict(turn) for turn in turns],
             intention=intention,
+            multimodal=multimodal,
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessVerdict,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ConversationCompletenessVerdict(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ConversationCompletenessVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ConversationCompletenessVerdict = (
-                    await self.model.a_generate(
-                        prompt, schema=ConversationCompletenessVerdict
-                    )
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ConversationCompletenessVerdict(**data)
 
     def _generate_verdict(
-        self, turns: List[Turn], intention: str
+        self, turns: List[Turn], intention: str, multimodal: bool
     ) -> ConversationCompletenessVerdict:
         prompt = ConversationCompletenessTemplate.generate_verdicts(
             turns=[convert_turn_to_dict(turn) for turn in turns],
             intention=intention,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ConversationCompletenessVerdict,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ConversationCompletenessVerdict(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ConversationCompletenessVerdict
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ConversationCompletenessVerdict = self.model.generate(
-                    prompt, schema=ConversationCompletenessVerdict
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ConversationCompletenessVerdict(**data)
 
-    async def _a_extract_user_intentions(self, turns: List[Turn]) -> List[str]:
+    async def _a_extract_user_intentions(
+        self, turns: List[Turn], multimodal: bool
+    ) -> List[str]:
         prompt = ConversationCompletenessTemplate.extract_user_intentions(
-            turns=[convert_turn_to_dict(turn) for turn in turns]
+            turns=[convert_turn_to_dict(turn) for turn in turns],
+            multimodal=multimodal,
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=UserIntentions,
+            extract_schema=lambda r: r.intentions,
+            extract_json=lambda data: UserIntentions(**data).intentions,
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=UserIntentions
-            )
-            self.evaluation_cost += cost
-            return res.intentions
-        else:
-            try:
-                res: UserIntentions = await self.model.a_generate(
-                    prompt, schema=UserIntentions
-                )
-                return res.intentions
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return UserIntentions(**data).intentions
 
-    def _extract_user_intentions(self, turns: List[Turn]) -> List[str]:
+    def _extract_user_intentions(
+        self, turns: List[Turn], multimodal: bool
+    ) -> List[str]:
         prompt = ConversationCompletenessTemplate.extract_user_intentions(
-            turns=[convert_turn_to_dict(turn) for turn in turns]
+            turns=[convert_turn_to_dict(turn) for turn in turns],
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=UserIntentions,
+            extract_schema=lambda r: r.intentions,
+            extract_json=lambda data: UserIntentions(**data).intentions,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=UserIntentions)
-            self.evaluation_cost += cost
-            return res.intentions
-        else:
-            try:
-                res: UserIntentions = self.model.generate(
-                    prompt, schema=UserIntentions
-                )
-                return res.intentions
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return UserIntentions(**data).intentions
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -312,8 +294,8 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
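The bulk of this file's churn is one refactor: every hand-rolled `if self.using_native_model: ... else: try ... except TypeError: ... trimAndLoadJson(...)` block is collapsed into the new `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers imported from `deepeval.metrics.utils`. The diff only shows the call sites, so the sketch below is a plausible reconstruction of the sync helper from the removed branches; the body, and anything beyond the keyword names visible above, is an assumption rather than deepeval's actual source.

```python
from typing import Any, Callable, Type

from pydantic import BaseModel

# Assumes trimAndLoadJson still lives in deepeval.metrics.utils; only this
# module's import of it was removed in the diff above.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    """Hypothetical consolidation of the removed per-metric branches."""
    if metric.using_native_model:
        # Native models return (result, cost) and honor the schema kwarg.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that support structured output.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without a schema kwarg: parse raw JSON instead.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

The async twin would presumably mirror this with `await metric.model.a_generate(...)`. Centralizing the branching also puts the `evaluation_cost` bookkeeping in one place instead of repeating it in every metric method.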
deepeval/metrics/conversation_completeness/template.py

@@ -2,11 +2,21 @@ from typing import List, Dict
 
 
 class ConversationCompletenessTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def extract_user_intentions(turns: List[Dict]):
+    def extract_user_intentions(turns: List[Dict], multimodal: bool = False):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to extract all user intentions in the conversation. The JSON will have 1 field: 'intentions'.
 You should ONLY consider the overall intention, and not dwell too much on the specifics, as we are more concerned about the overall objective of the conversation.
 
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 **
 IMPORTANT: Please make sure to only return in JSON format.
 Example Turns:
@@ -49,8 +59,13 @@ JSON:
 """
 
     @staticmethod
-    def generate_verdicts(turns: List[Dict], intention: str):
+    def generate_verdicts(
+        turns: List[Dict], intention: str, multimodal: bool = False
+    ):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether given user intention was satisfied from the conversation messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the user intention was satisfied or not.
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on satisfaction.
@@ -106,8 +121,13 @@ JSON:
 """
 
     @staticmethod
-    def generate_reason(score, incompletenesses, intentions):
+    def generate_reason(
+        score, incompletenesses, intentions, multimodal: bool = False
+    ):
         return f"""Below is a list of incompletenesses drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why an LLM 'actual_output' is incomplete to satisfy the user `input` for a particular message.
+
+{ConversationCompletenessTemplate.multimodal_rules if multimodal else ""}
+
 Given the completeness score, which is a 0-1 score indicating how incomplete the OVERALL `actual_output`s are to the user intentions found in the `input`s of a conversation (higher the better), CONCISELY summarize the incompletenesses to justify the score.
 
 **
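Every template method here gains the same optional `multimodal` flag, which splices the class-level `multimodal_rules` block into the prompt f-string only when the flag is set. A standalone toy version of the pattern (hypothetical `ExampleTemplate`, not deepeval code) shows the mechanics:

```python
class ExampleTemplate:
    multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
"""

    @staticmethod
    def build_prompt(task: str, multimodal: bool = False) -> str:
        # The conditional expression yields the rules block for multimodal
        # test cases and an empty string otherwise.
        return f"""{task}
{ExampleTemplate.multimodal_rules if multimodal else ""}
Return your answer in JSON."""


print(ExampleTemplate.build_prompt("Summarize the conversation."))
print(ExampleTemplate.build_prompt("Summarize the conversation.", multimodal=True))
```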
deepeval/metrics/conversational_dag/conversational_dag.py

@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import List, Optional, Union
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -11,7 +11,6 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.g_eval.schema import *
 from deepeval.metrics import DeepAcyclicGraph
 from deepeval.metrics.dag.utils import (
     is_valid_dag_from_roots,
@@ -35,11 +34,8 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         _include_dag_suffix: bool = True,
     ):
-        if (
-            is_valid_dag_from_roots(
-                root_nodes=dag.root_nodes, multiturn=dag.multiturn
-            )
-            == False
+        if not is_valid_dag_from_roots(
+            root_nodes=dag.root_nodes, multiturn=dag.multiturn
         ):
             raise ValueError("Cycle detected in DAG graph.")
 
@@ -62,10 +58,14 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, multiturn=True),
             self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -105,10 +105,14 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, multiturn=True),
             self,
+            False,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -139,7 +143,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
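Both files also share the `is_successful` fix: `ConversationCompletenessMetric` previously evaluated `self.score >= self.threshold` without assigning the result (a no-op that left `self.success` stale), and both metrics narrow a bare `except:` to `except TypeError:`, so only the expected failure mode (comparing a `None` score) is swallowed rather than every exception. A minimal illustration of the corrected pattern:

```python
from typing import Optional


def is_successful(score: Optional[float], threshold: float) -> bool:
    # score is None when the metric errored out before producing a score.
    try:
        return score >= threshold  # raises TypeError if score is None
    except TypeError:
        return False


assert is_successful(0.8, 0.5) is True
assert is_successful(None, 0.5) is False
```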