deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0

--- a/deepeval/metrics/g_eval/g_eval.py
+++ b/deepeval/metrics/g_eval/g_eval.py
@@ -15,6 +15,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     check_llm_test_case_params,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -82,7 +84,19 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-        check_llm_test_case_params(test_case, self.evaluation_params, self)
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
+
         self.evaluation_cost = 0 if self.using_native_model else None
 
         with metric_progress_indicator(
@@ -96,18 +110,25 @@ class GEval(BaseMetric):
                     _in_component=_in_component,
                     _additional_context=_additional_context,
                 )
+                settings = get_settings()
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                        timeout=(
+                            None
+                            if settings.DEEPEVAL_DISABLE_TIMEOUTS
+                            else settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+                        ),
                     )
                 )
             else:
                 self.evaluation_steps: List[str] = (
-                    self._generate_evaluation_steps()
+                    self._generate_evaluation_steps(multimodal)
                 )
                 g_score, reason = self._evaluate(
-                    test_case, _additional_context=_additional_context
+                    test_case,
+                    _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
                 self.score = (
                     (float(g_score) - self.score_range[0])
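
The hunk above also makes the per-task timeout optional. A minimal sketch of the resulting behaviour, assuming only what the diff shows (a settings object exposing DEEPEVAL_DISABLE_TIMEOUTS and DEEPEVAL_PER_TASK_TIMEOUT_SECONDS); run_with_task_timeout is a hypothetical name used for illustration only. The remaining g_eval.py hunks continue after the sketch.

```python
import asyncio

# Hypothetical illustration of the timeout selection added above: when
# DEEPEVAL_DISABLE_TIMEOUTS is truthy, asyncio.wait_for receives timeout=None,
# which disables the per-task deadline entirely.
async def run_with_task_timeout(coro, settings):
    timeout = (
        None
        if settings.DEEPEVAL_DISABLE_TIMEOUTS
        else settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
    )
    return await asyncio.wait_for(coro, timeout=timeout)
```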
@@ -143,7 +164,18 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-        check_llm_test_case_params(test_case, self.evaluation_params, self)
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -153,10 +185,12 @@ class GEval(BaseMetric):
             _in_component=_in_component,
         ):
             self.evaluation_steps: List[str] = (
-                await self._a_generate_evaluation_steps()
+                await self._a_generate_evaluation_steps(multimodal)
             )
             g_score, reason = await self._a_evaluate(
-                test_case, _additional_context=_additional_context
+                test_case,
+                _additional_context=_additional_context,
+                multimodal=multimodal,
             )
             self.score = (
                 (float(g_score) - self.score_range[0]) / self.score_range_span
@@ -182,7 +216,7 @@ class GEval(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_evaluation_steps(self) -> List[str]:
+    async def _a_generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps
 
@@ -190,25 +224,19 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = await self.model.a_generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
 
-    def _generate_evaluation_steps(self) -> List[str]:
+    def _generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps
 
@@ -216,26 +244,23 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = self.model.generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
 
     async def _a_evaluate(
-        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -252,6 +277,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (
@@ -262,6 +288,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
         try:
@@ -275,8 +302,7 @@ class GEval(BaseMetric):
                 prompt, top_logprobs=self.top_logprobs
             )
 
-            if self.evaluation_cost is not None:
-                self.evaluation_cost += cost
+            self._accrue_cost(cost)
 
             data = trimAndLoadJson(res.choices[0].message.content, self)
 
@@ -292,27 +318,21 @@ class GEval(BaseMetric):
                 return weighted_summed_score, reason
             except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
-        except (
-            AttributeError
-        ):  # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: gschema.ReasonScore = await self.model.a_generate(
-                        prompt, schema=gschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+        except AttributeError:
+            # This catches the case where a_generate_raw_response doesn't exist.
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )
 
     def _evaluate(
-        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -330,6 +350,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (
@@ -340,6 +361,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
 
@@ -351,7 +373,7 @@ class GEval(BaseMetric):
             res, cost = self.model.generate_raw_response(
                 prompt, top_logprobs=self.top_logprobs
             )
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             data = trimAndLoadJson(res.choices[0].message.content, self)
 
             reason = data["reason"]
@@ -368,21 +390,13 @@ class GEval(BaseMetric):
                 return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = self.model.generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: gschema.ReasonScore = self.model.generate(
-                        prompt, schema=gschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )
 
     def is_successful(self) -> bool:
         if self.error is not None:
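
The bulk of the deletions in this file are the old per-metric branching (native model: raw generation plus trimAndLoadJson; custom model: schema generation with a TypeError fallback), now funneled through the new generate_with_schema_and_extract and a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. A rough sketch of what the synchronous helper plausibly does, inferred only from the call sites and the deleted code here, not from the actual implementation:

```python
from deepeval.metrics.utils import trimAndLoadJson

# Sketch only: inferred from the call sites and the removed fallback logic;
# the real generate_with_schema_and_extract lives in deepeval/metrics/utils.py
# and may differ in detail.
def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    if metric.using_native_model:
        # Native models: plain generation, cost accounting, JSON parsing.
        res, cost = metric.model.generate(prompt)
        metric.evaluation_cost += cost
        return extract_json(trimAndLoadJson(res, metric))
    try:
        # Custom models that accept a schema return a validated object.
        return extract_schema(metric.model.generate(prompt, schema=schema_cls))
    except TypeError:
        # Custom models without schema support: fall back to raw JSON output.
        res = metric.model.generate(prompt)
        return extract_json(trimAndLoadJson(res, metric))
```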

--- a/deepeval/metrics/g_eval/template.py
+++ b/deepeval/metrics/g_eval/template.py
@@ -3,11 +3,23 @@ import textwrap
 
 
 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(parameters: str, criteria: str):
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}
 
@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (
@@ -62,6 +75,7 @@ class GEvalTemplate:
             - {reasoning_expectation}
             - Mention key details from the test case parameters.
             - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
 
             Only return valid JSON. Do **not** include any extra commentary or text.
 
@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"
@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Steps:
             {evaluation_steps}
 
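
The template changes are purely additive: each prompt builder gains a multimodal flag (default False) that interpolates the shared multimodal_rules block into the prompt. A quick illustrative call against the new signature (the parameters and criteria strings below are made up):

```python
from deepeval.metrics.g_eval.template import GEvalTemplate

# Illustrative values only; any real criteria/parameters work the same way.
prompt = GEvalTemplate.generate_evaluation_steps(
    parameters="input and actual output",
    criteria="Determine whether the actual output correctly answers the input.",
    multimodal=True,  # injects the MULTIMODAL INPUT RULES block into the prompt
)
print(prompt)
```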

--- a/deepeval/metrics/g_eval/utils.py
+++ b/deepeval/metrics/g_eval/utils.py
@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA
 
 from deepeval.test_case.conversational_test_case import ConversationalTestCase
 
@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
 
 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
 
-    if isinstance(model, str) and model in unsupported_log_probs_gpt_models:
-        return True
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        and not model.model_data.supports_log_probs
    ):
        return True
 
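
no_log_prob_support now reads per-model capability data from the new deepeval/models/llms/constants.py (added as roughly 2,000 lines in this release) instead of a hard-coded set of model names. The sketch below shows the data shape the updated check appears to assume, namely a mapping from model name to a record exposing a supports_log_probs attribute; the _ModelData class, the entries, and the checker function are illustrative stand-ins, not the real constants or API:

```python
from dataclasses import dataclass

@dataclass
class _ModelData:
    # Hypothetical stand-in for the per-model record in constants.py.
    supports_log_probs: bool

# Illustrative entries only; the real OPENAI_MODELS_DATA covers far more models.
OPENAI_MODELS_DATA = {
    "gpt-4o": _ModelData(supports_log_probs=True),
    "o1-mini": _ModelData(supports_log_probs=False),
}

def model_supports_log_probs(model_name: str) -> bool:
    # Unknown names fall back to False rather than raising on a None lookup.
    model_data = OPENAI_MODELS_DATA.get(model_name)
    return bool(model_data and model_data.supports_log_probs)
```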

--- a/deepeval/metrics/goal_accuracy/goal_accuracy.py
+++ b/deepeval/metrics/goal_accuracy/goal_accuracy.py
@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
             goal_scores = [
                 self._get_goal_accuracy_score(
-                    task.user_goal, task.steps_taken
+                    task.user_goal, task.steps_taken, multimodal
                 )
                 for task in goal_and_steps_taken
             ]
             plan_scores = [
-                self._get_plan_scores(task.user_goal, task.steps_taken)
+                self._get_plan_scores(
+                    task.user_goal, task.steps_taken, multimodal
+                )
                 for task in goal_and_steps_taken
             ]
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
-            self.reason = self._generate_reason(goal_scores, plan_scores)
+            self.reason = self._generate_reason(
+                goal_scores, plan_scores, multimodal
+            )
 
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_scores = await asyncio.gather(
                 *[
                     self._a_get_goal_accuracy_score(
-                        task.user_goal, task.steps_taken
+                        task.user_goal, task.steps_taken, multimodal
                     )
                     for task in goal_and_steps_taken
                 ]
             )
             plan_scores = await asyncio.gather(
                 *[
-                    self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                    self._a_get_plan_scores(
+                        task.user_goal, task.steps_taken, multimodal
+                    )
                     for task in goal_and_steps_taken
                 ]
             )
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(
-                goal_scores, plan_scores
+                goal_scores, plan_scores, multimodal
             )
 
             self.verbose_logs = construct_verbose_logs(
@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken
 
-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
-    async def _a_get_plan_scores(self, user_goal, steps_taken):
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
             res = self.model.generate(prompt)
             return res
 
     async def _a_generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res
 
-    def _get_goal_accuracy_score(self, user_goal, steps_taken):
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
-    async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""
@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
             return self.success
 
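
Both files also swap the direct self.evaluation_cost += cost increments for self._accrue_cost(cost). Judging from the guarded increment this call replaces in g_eval.py (the "if self.evaluation_cost is not None" check), the base-class method presumably looks something like the sketch below; base_metric.py changed in this release too, so the real definition may differ:

```python
from typing import Optional

class _MetricCostMixin:
    """Sketch of the cost accounting the diffs above appear to rely on; not deepeval's code."""

    # evaluation_cost stays None for custom models, so accrual becomes a no-op.
    evaluation_cost: Optional[float] = None

    def _accrue_cost(self, cost: float) -> None:
        if self.evaluation_cost is not None:
            self.evaluation_cost += cost
```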