deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -8,7 +8,11 @@ from deepeval.metrics.conversational_g_eval.conversational_g_eval import (
     ConversationalGEval,
 )
 from deepeval.metrics.g_eval.utils import CONVERSATIONAL_G_EVAL_PARAMS
-from deepeval.metrics.utils import copy_metrics, trimAndLoadJson
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import (
     ConversationalTestCase,
     TurnParams,
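
Throughout this diff, the old per-call-site pattern (branch on metric.using_native_model, then fall back through a TypeError to trimAndLoadJson) is collapsed into the two helpers imported here: generate_with_schema_and_extract and its async twin a_generate_with_schema_and_extract. Their bodies are not part of the hunks shown; the sketch below is only an assumption reconstructed from the call sites that follow (keyword arguments metric, prompt, schema_cls, extract_schema, extract_json), not the actual deepeval.metrics.utils implementation.

    # Hypothetical sketch inferred from the call sites in this diff; the real
    # helper in deepeval.metrics.utils may differ in details.
    def generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        if metric.using_native_model:
            # Native models return (parsed schema instance, cost); the cost is
            # accrued on the metric, mirroring the removed per-metric branches.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that accept a schema return the parsed object directly.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Fallback: free-form generation, then JSON parsing via trimAndLoadJson.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The async variant presumably mirrors this with await metric.model.a_generate(...).
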
@@ -263,20 +267,14 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = metric.model.generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)

-        return res.reason
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(self, metric: BaseConversationalMetric):
         prompt = ConversationalVerdictNodeTemplate.generate_reason(
@@ -284,22 +282,14 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = await metric.model.a_generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)

-        return res.reason
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )


 @dataclass
@@ -372,20 +362,14 @@ class ConversationalTaskNode(ConversationalBaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = metric.model.generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -439,23 +423,14 @@ class ConversationalTaskNode(ConversationalBaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = await metric.model.a_generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output

+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )
@@ -559,22 +534,14 @@ class ConversationalBinaryJudgementNode(ConversationalBaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = metric.model.generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -623,22 +590,14 @@ class ConversationalBinaryJudgementNode(ConversationalBaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = await metric.model.a_generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -753,22 +712,14 @@ class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = metric.model.generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -816,22 +767,14 @@ class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
         prompt = ConversationalNonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = await metric.model.a_generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )

         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -2,7 +2,17 @@ from typing import List
 from textwrap import dedent


+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
+
 class ConversationalVerdictNodeTemplate:
+
     @staticmethod
     def generate_reason(verbose_steps: List[str], score: float, name: str):
         return dedent(
@@ -40,6 +50,8 @@ class ConversationalTaskNodeTemplate:
         return dedent(
             f"""You are given a set of task instructions and a full conversation between a user and an assistant.

+            {multimodal_rules}
+
             Instructions:
             {instructions}

@@ -67,6 +79,8 @@ class ConversationalBinaryJudgementTemplate:

             Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.

+            {multimodal_rules}
+
             Full Conversation:
             {text}

@@ -96,6 +110,8 @@ class ConversationalNonBinaryJudgementTemplate:

             You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.

+            {multimodal_rules}
+
             Options: {options}

             Full Conversation:
@@ -26,6 +26,8 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -89,8 +91,14 @@ class ConversationalGEval(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self.evaluation_params, self
+            test_case,
+            self.evaluation_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -143,8 +151,14 @@ class ConversationalGEval(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self.evaluation_params, self
+            test_case,
+            self.evaluation_params,
+            self,
+            False,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -192,22 +206,13 @@ class ConversationalGEval(BaseConversationalMetric):
         prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=cgschema.Steps
-            )
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: cgschema.Steps = await self.model.a_generate(
-                    prompt, schema=cgschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=cgschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda data: data["steps"],
+        )

     def _generate_evaluation_steps(self) -> List[str]:
         if self.evaluation_steps:
@@ -219,20 +224,13 @@ class ConversationalGEval(BaseConversationalMetric):
         prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=cgschema.Steps)
-            self.evaluation_cost += cost
-            return res.steps
-        else:
-            try:
-                res: cgschema.Steps = self.model.generate(
-                    prompt, schema=cgschema.Steps
-                )
-                return res.steps
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["steps"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=cgschema.Steps,
+            extract_schema=lambda s: s.steps,
+            extract_json=lambda data: data["steps"],
+        )

     async def _a_evaluate(
         self, test_case: ConversationalTestCase
@@ -269,7 +267,8 @@ class ConversationalGEval(BaseConversationalMetric):
             res, cost = await self.model.a_generate_raw_response(
                 prompt, top_logprobs=20
             )
-            self.evaluation_cost += cost
+
+            self._accrue_cost(cost)
             data = trimAndLoadJson(res.choices[0].message.content, self)

             reason = data["reason"]
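
In this hunk and the one at line 339 below, the direct self.evaluation_cost += cost is replaced by self._accrue_cost(cost). The helper's body is not shown in these hunks (it is presumably part of the base_metric.py changes listed above). Given that evaluation_cost is set to None when a non-native model is used, a plausible reading is a None-safe accumulator along these lines; this is an assumption, not the actual implementation:

    # Hypothetical sketch of a None-safe cost accumulator on the base metric;
    # the real _accrue_cost in deepeval may differ.
    def _accrue_cost(self, cost):
        # evaluation_cost is None when cost tracking is off (custom models),
        # so only add when both the accumulator and the new cost exist.
        if self.evaluation_cost is not None and cost is not None:
            self.evaluation_cost += cost
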
@@ -287,22 +286,13 @@ class ConversationalGEval(BaseConversationalMetric):
         except (
             AttributeError
         ): # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(
-                    prompt, schema=cgschema.ReasonScore
-                )
-                self.evaluation_cost += cost
-                return res.score, res.reason
-            else:
-                try:
-                    res: cgschema.ReasonScore = await self.model.a_generate(
-                        prompt, schema=cgschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=cgschema.ReasonScore,
+                extract_schema=lambda r: (r.score, r.reason),
+                extract_json=lambda data: (data["score"], data["reason"]),
+            )

     def evaluate(
         self, test_case: ConversationalTestCase
@@ -339,7 +329,7 @@ class ConversationalGEval(BaseConversationalMetric):
             res, cost = self.model.generate_raw_response(
                 prompt, top_logprobs=20
             )
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             data = trimAndLoadJson(res.choices[0].message.content, self)

             reason = data["reason"]
@@ -356,22 +346,13 @@ class ConversationalGEval(BaseConversationalMetric):
             return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
-            if self.using_native_model:
-                res, cost = self.model.generate(
-                    prompt, schema=cgschema.ReasonScore
-                )
-                self.evaluation_cost += cost
-                return res.score, res.reason
-            else:
-                try:
-                    res: cgschema.ReasonScore = self.model.generate(
-                        prompt, schema=cgschema.ReasonScore
-                    )
-                    return res.score, res.reason
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return data["score"], data["reason"]
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=cgschema.ReasonScore,
+                extract_schema=lambda r: (r.score, r.reason),
+                extract_json=lambda data: (data["score"], data["reason"]),
+            )

     def generate_weighted_summed_score(
         self, raw_score: int, raw_response: ChatCompletion
@@ -62,10 +62,15 @@ class DAGMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_llm_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            None,
+            None,
             self,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -105,10 +110,15 @@
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
+        multimodal = test_case.multimodal
         check_llm_test_case_params(
             test_case,
             extract_required_params(self.dag.root_nodes, self.dag.multiturn),
+            None,
+            None,
             self,
+            self.model,
+            multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None