deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/g_eval/g_eval.py

@@ -15,6 +15,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     check_llm_test_case_params,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
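The two new imports are shared helpers in deepeval/metrics/utils.py that replace the native-model / schema / raw-JSON branching previously copy-pasted into each metric (visible in the removed blocks below). Their implementation is not shown in this diff; the following is a minimal sketch of the synchronous variant inferred from its call sites, with the exact behavior treated as an assumption (the async variant mirrors it with await):

    from typing import Any, Callable, Type, TypeVar

    from pydantic import BaseModel

    from deepeval.metrics.utils import trimAndLoadJson  # existing JSON-repair helper

    T = TypeVar("T", bound=BaseModel)


    def generate_with_schema_and_extract(
        metric,  # the calling metric (has .model, .using_native_model, .evaluation_cost)
        prompt: str,
        schema_cls: Type[T],
        extract_schema: Callable[[T], Any],
        extract_json: Callable[[dict], Any],
    ) -> Any:
        if metric.using_native_model:
            # Native models return (response, cost) and support structured output.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            if metric.evaluation_cost is not None:
                metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Custom models whose generate() has no `schema` parameter: parse raw JSON.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)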
@@ -82,7 +84,19 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-        check_llm_test_case_params(test_case, self.evaluation_params, self)
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
+
         self.evaluation_cost = 0 if self.using_native_model else None
 
         with metric_progress_indicator(

@@ -104,10 +118,12 @@ class GEval(BaseMetric):
                 )
             else:
                 self.evaluation_steps: List[str] = (
-                    self._generate_evaluation_steps()
+                    self._generate_evaluation_steps(multimodal)
                 )
                 g_score, reason = self._evaluate(
-                    test_case, _additional_context=_additional_context
+                    test_case,
+                    _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
                 self.score = (
                     (float(g_score) - self.score_range[0])
@@ -143,7 +159,18 @@ class GEval(BaseMetric):
         _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
-        check_llm_test_case_params(test_case, self.evaluation_params, self)
+
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self.evaluation_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -153,10 +180,12 @@ class GEval(BaseMetric):
             _in_component=_in_component,
         ):
             self.evaluation_steps: List[str] = (
-                await self._a_generate_evaluation_steps()
+                await self._a_generate_evaluation_steps(multimodal)
             )
             g_score, reason = await self._a_evaluate(
-                test_case, _additional_context=_additional_context
+                test_case,
+                _additional_context=_additional_context,
+                multimodal=multimodal,
             )
             self.score = (
                 (float(g_score) - self.score_range[0]) / self.score_range_span
@@ -182,7 +211,7 @@ class GEval(BaseMetric):
             )
             return self.score
 
-    async def _a_generate_evaluation_steps(self) -> List[str]:
+    async def _a_generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps
 

@@ -190,25 +219,19 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = await self.model.a_generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
 
-    def _generate_evaluation_steps(self) -> List[str]:
+    def _generate_evaluation_steps(self, multimodal: bool) -> List[str]:
         if self.evaluation_steps:
             return self.evaluation_steps
 
@@ -216,26 +239,23 @@ class GEval(BaseMetric):
             self.evaluation_params
         )
         prompt = self.evaluation_template.generate_evaluation_steps(
-            criteria=self.criteria, parameters=g_eval_params_str
+            criteria=self.criteria,
+            parameters=g_eval_params_str,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=gschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda d: d["steps"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            data = trimAndLoadJson(res, self)
-            return data["steps"]
-        else:
-            try:
-                res: gschema.Steps = self.model.generate(
-                    prompt, schema=gschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
 
     async def _a_evaluate(
-        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -252,6 +272,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (

@@ -262,6 +283,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
         try:

@@ -275,8 +297,7 @@ class GEval(BaseMetric):
                     prompt, top_logprobs=self.top_logprobs
                 )
 
-                if self.evaluation_cost is not None:
-                    self.evaluation_cost += cost
+                self._accrue_cost(cost)
 
                 data = trimAndLoadJson(res.choices[0].message.content, self)
 
@@ -292,27 +313,21 @@ class GEval(BaseMetric):
                     return weighted_summed_score, reason
                 except (KeyError, AttributeError, TypeError, ValueError):
                     return score, reason
-        except (
-            AttributeError
-        ):  # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: gschema.ReasonScore = await self.model.a_generate(
-                        prompt, schema=gschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+        except AttributeError:
+            # This catches the case where a_generate_raw_response doesn't exist.
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )
 
     def _evaluate(
-        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
+        self,
+        test_case: LLMTestCase,
+        multimodal: bool,
+        _additional_context: Optional[str] = None,
     ) -> Tuple[Union[int, float], str]:
         test_case_content = construct_test_case_string(
             self.evaluation_params, test_case
@@ -330,6 +345,7 @@ class GEval(BaseMetric):
                 rubric=rubric_str,
                 score_range=self.score_range,
                 _additional_context=_additional_context,
+                multimodal=multimodal,
             )
         else:
             prompt = (

@@ -340,6 +356,7 @@ class GEval(BaseMetric):
                     test_case_content=test_case_content,
                     parameters=g_eval_params_str,
                     _additional_context=_additional_context,
+                    multimodal=multimodal,
                 )
             )
 

@@ -351,7 +368,7 @@ class GEval(BaseMetric):
                 res, cost = self.model.generate_raw_response(
                     prompt, top_logprobs=self.top_logprobs
                 )
-                self.evaluation_cost += cost
+                self._accrue_cost(cost)
                 data = trimAndLoadJson(res.choices[0].message.content, self)
 
                 reason = data["reason"]
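The repeated `self.evaluation_cost += cost` bookkeeping is likewise folded into a new `_accrue_cost` helper on the metric base class (base_metric.py is part of this release, but its body is not shown here). A plausible sketch, inferred from the None-guard it replaces in the hunks above:

    from typing import Optional


    class BaseMetric:  # sketch of only the relevant helper on the existing base class
        evaluation_cost: Optional[float] = None

        def _accrue_cost(self, cost: float) -> None:
            # evaluation_cost is only tracked (initialized to 0) for native models;
            # when it is None, cost accumulation is skipped entirely.
            if self.evaluation_cost is not None:
                self.evaluation_cost += cost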
@@ -368,21 +385,13 @@ class GEval(BaseMetric):
                     return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = self.model.generate(prompt)
-                self.evaluation_cost += cost
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reason"]
-            else:
-                try:
-                    res: gschema.ReasonScore = self.model.generate(
-                        prompt, schema=gschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=gschema.ReasonScore,
+                extract_schema=lambda s: (s.score, s.reason),
+                extract_json=lambda d: (d["score"], d["reason"]),
+            )
 
     def is_successful(self) -> bool:
         if self.error is not None:
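None of this changes how GEval is invoked; the refactor only affects what happens inside `measure`. A minimal usage example for reference (the criteria text is illustrative); per this diff, when `test_case.multimodal` is true the multimodal template rules shown in the template.py changes below are injected automatically:

    from deepeval.metrics import GEval
    from deepeval.test_case import LLMTestCase, LLMTestCaseParams

    correctness = GEval(
        name="Correctness",
        criteria="Determine whether the actual output answers the input accurately.",
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
        ],
    )

    test_case = LLMTestCase(
        input="What is the boiling point of water at sea level?",
        actual_output="Water boils at 100 °C (212 °F) at sea level.",
    )

    correctness.measure(test_case)
    print(correctness.score, correctness.reason)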
deepeval/metrics/g_eval/template.py

@@ -3,11 +3,23 @@ import textwrap
 
 
 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(parameters: str, criteria: str):
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}
 
@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (

@@ -62,6 +75,7 @@ class GEvalTemplate:
             - {reasoning_expectation}
             - Mention key details from the test case parameters.
             - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
 
             Only return valid JSON. Do **not** include any extra commentary or text.
 

@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"

@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Steps:
             {evaluation_steps}
 
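A quick way to see the effect of the new flag is to render a prompt directly: with `multimodal=True` the MULTIMODAL INPUT RULES block above is interpolated into the template, while the default `False` inserts an empty string in its place.

    from deepeval.metrics.g_eval.template import GEvalTemplate

    prompt = GEvalTemplate.generate_evaluation_steps(
        parameters="Input and Actual Output",
        criteria="Judge whether the actual output describes the attached image accurately.",
        multimodal=True,
    )
    print(prompt)  # includes the "--- MULTIMODAL INPUT RULES ---" section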
deepeval/metrics/g_eval/utils.py

@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA
 
 from deepeval.test_case.conversational_test_case import ConversationalTestCase
 
@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
 
 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
 
-    if isinstance(model, str) and model in unsupported_log_probs_gpt_models:
-        return True
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
     ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.get_model_name() in unsupported_log_probs_gpt_models
+        and not model.model_data.supports_log_probs
     ):
         return True
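The hard-coded `unsupported_log_probs_gpt_models` list is replaced by a per-model capability table in the new deepeval/models/llms/constants.py (+2032 lines, not shown here). Only the `supports_log_probs` attribute is visible from this hunk, so the entry shape below is an assumption; note also that `dict.get` returns `None` for names missing from the table, so string inputs are expected to be known OpenAI model names:

    from dataclasses import dataclass
    from typing import Dict


    # Hypothetical shape of an OPENAI_MODELS_DATA entry, inferred only from the
    # attribute access above; the real table likely carries more fields
    # (context window, pricing, structured-output support, and so on).
    @dataclass(frozen=True)
    class ModelData:
        supports_log_probs: bool


    OPENAI_MODELS_DATA: Dict[str, ModelData] = {
        "gpt-4o": ModelData(supports_log_probs=True),
        "o1-mini": ModelData(supports_log_probs=False),
    }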
deepeval/metrics/goal_accuracy/goal_accuracy.py

@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None

@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
             goal_scores = [
                 self._get_goal_accuracy_score(
-                    task.user_goal, task.steps_taken
+                    task.user_goal, task.steps_taken, multimodal
                 )
                 for task in goal_and_steps_taken
             ]
             plan_scores = [
-                self._get_plan_scores(task.user_goal, task.steps_taken)
+                self._get_plan_scores(
+                    task.user_goal, task.steps_taken, multimodal
+                )
                 for task in goal_and_steps_taken
             ]
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
-            self.reason = self._generate_reason(goal_scores, plan_scores)
+            self.reason = self._generate_reason(
+                goal_scores, plan_scores, multimodal
+            )
 
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None

@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_scores = await asyncio.gather(
                 *[
                     self._a_get_goal_accuracy_score(
-                        task.user_goal, task.steps_taken
+                        task.user_goal, task.steps_taken, multimodal
                     )
                     for task in goal_and_steps_taken
                 ]
             )
             plan_scores = await asyncio.gather(
                 *[
-                    self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                    self._a_get_plan_scores(
+                        task.user_goal, task.steps_taken, multimodal
+                    )
                     for task in goal_and_steps_taken
                 ]
             )
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(
-                goal_scores, plan_scores
+                goal_scores, plan_scores, multimodal
             )
 
             self.verbose_logs = construct_verbose_logs(
@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken
 
-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
-    async def _a_get_plan_scores(self, user_goal, steps_taken):
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:

@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
            res = self.model.generate(prompt)
            return res
 
     async def _a_generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res
 
-    def _get_goal_accuracy_score(self, user_goal, steps_taken):
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
-    async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""
@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
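Across these metrics, the `except TypeError` fallback preserved by the new helpers exists for custom judge models whose `generate()` does not accept a `schema` argument; such models return raw text that is then parsed with `trimAndLoadJson`. A minimal custom model along those lines (the `client` and its `complete()` call are placeholders, not part of deepeval):

    from deepeval.models import DeepEvalBaseLLM


    class MyJudge(DeepEvalBaseLLM):
        def __init__(self, client):
            self.client = client  # placeholder for any chat-completion client

        def load_model(self):
            return self.client

        def generate(self, prompt: str) -> str:
            # No `schema` parameter, so metrics fall back to raw-JSON parsing.
            return self.client.complete(prompt)

        async def a_generate(self, prompt: str) -> str:
            return self.generate(prompt)

        def get_model_name(self) -> str:
            return "my-judge"

Custom models whose `generate()` does accept `schema`, as well as deepeval's native wrappers, take the structured-output branch of the helpers instead.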