deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0

deepeval/metrics/dag/nodes.py

@@ -18,7 +18,11 @@ from deepeval.metrics.dag.templates import (
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.metrics.g_eval.g_eval import GEval
 from deepeval.metrics.g_eval.utils import G_EVAL_PARAMS
-from deepeval.metrics.utils import copy_metrics, trimAndLoadJson
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
 from deepeval.utils import prettify_list
 
@@ -222,20 +226,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = metric.model.generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, metric: BaseMetric):
         prompt = VerdictNodeTemplate.generate_reason(
@@ -243,22 +240,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = await metric.model.a_generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
 
 @dataclass
@@ -317,20 +305,13 @@ class TaskNode(BaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = metric.model.generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -371,22 +352,13 @@ class TaskNode(BaseNode):
             text=text,
         )
 
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = await metric.model.a_generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -470,23 +442,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = metric.model.generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
-
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )
@@ -520,22 +482,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = await metric.model.a_generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -629,22 +582,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = metric.model.generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -678,22 +623,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
        )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = await metric.model.a_generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
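
Every node above previously repeated the same three-way branch: native models support structured output and return a (result, cost) tuple; custom models may accept a schema keyword; anything else falls back to raw generation plus trimAndLoadJson. The two helpers now imported from deepeval.metrics.utils centralize that branch. Their bodies are not part of this diff, so the sketch below is only a plausible reconstruction from the deleted inline code; the keyword names (metric, prompt, schema_cls, extract_schema, extract_json) are confirmed by the call sites, everything else is inferred.

from typing import Any, Callable, Dict, Type

from deepeval.metrics.base_metric import BaseMetric
from deepeval.metrics.utils import trimAndLoadJson  # sibling function in the real module


def generate_with_schema_and_extract(
    metric: BaseMetric,
    prompt: str,
    schema_cls: Type[Any],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[Dict[str, Any]], Any],
) -> Any:
    # Native models support structured output and return (result, cost).
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    # Custom models may or may not accept a `schema` keyword.
    try:
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback: raw generation, then manual JSON parsing. The second
        # argument to trimAndLoadJson (the caller, previously `self`) is a guess.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

a_generate_with_schema_and_extract would presumably have the same shape, with await metric.model.a_generate(...) replacing the synchronous calls.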

deepeval/metrics/dag/templates.py

@@ -1,5 +1,13 @@
 from typing import List
 
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
 
 class VerdictNodeTemplate:
     @staticmethod
@@ -34,6 +42,8 @@ class TaskNodeTemplate:
     def generate_task_output(instructions: str, text: str):
         return f"""Given the following instructions, generate an output.
 
+{multimodal_rules}
+
 {instructions}
 
 {text}
@@ -57,14 +67,16 @@ class BinaryJudgementTemplate:
     def generate_binary_verdict(criteria: str, text: str):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
-IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
+IMPORTANT: Please make sure to only return a json with two keys: `verdict` (true or false), and the 'reason' key providing the reason. The verdict must be a boolean only, either true or false.
 Example JSON:
 {{
     "reason": "...",
-    "verdict": True
+    "verdict": true
 }}
 **
 
@@ -79,6 +91,8 @@ class NonBinaryJudgementTemplate:
     ):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
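
A side effect worth noting: multimodal_rules is a module-level constant interpolated unconditionally, so every prompt these templates render now carries the rules block, image inputs or not. A quick illustration (the criteria and text values here are made up):

from deepeval.metrics.dag.templates import BinaryJudgementTemplate

prompt = BinaryJudgementTemplate.generate_binary_verdict(
    criteria="Does the reply answer the user's question?",
    text="Q: What is 2 + 2?  A: 4",
)
# The rules block appears even for this text-only input.
assert "--- MULTIMODAL INPUT RULES ---" in prompt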

deepeval/metrics/exact_match/exact_match.py

@@ -32,7 +32,15 @@ class ExactMatchMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            None,
+            test_case.multimodal,
+        )
 
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
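
The new trailing argument lines up with the multimodal support threaded through this release (see llm_test_case.py, +206 -1 above, which evidently adds a multimodal property to LLMTestCase). A hypothetical end-to-end call, assuming ExactMatchMetric compares actual_output against expected_output and is exported from deepeval.metrics:

from deepeval.metrics import ExactMatchMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)
metric = ExactMatchMetric()
# measure() is declared `-> float` in the hunk above; an exact match
# should presumably score 1.0.
print(metric.measure(test_case))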