deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/answer_relevancy/answer_relevancy.py

@@ -1,21 +1,27 @@
 from typing import Optional, List, Type, Union
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.answer_relevancy.schema import (
+    Statements,
+    AnswerRelevancyVerdict,
+    Verdicts,
+    AnswerRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -53,7 +59,16 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -70,14 +85,17 @@ class AnswerRelevancyMetric(BaseMetric):
                     )
                 )
             else:
+                input = test_case.input
+                actual_output = test_case.actual_output
+
                 self.statements: List[str] = self._generate_statements(
-                    test_case.actual_output
+                    actual_output, test_case.multimodal
                 )
                 self.verdicts: List[AnswerRelevancyVerdict] = (
-                    self._generate_verdicts(test_case.input)
+                    self._generate_verdicts(input, test_case.multimodal)
                 )
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, test_case.multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -101,7 +119,16 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -110,14 +137,19 @@ class AnswerRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            actual_output = test_case.actual_output
+
            self.statements: List[str] = await self._a_generate_statements(
-                test_case.actual_output
+                actual_output, test_case.multimodal
            )
            self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(test_case.input)
+                await self._a_generate_verdicts(input, test_case.multimodal)
            )
            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(
+                input, test_case.multimodal
+            )
            self.success = self.score >= self.threshold
            self.verbose_logs = construct_verbose_logs(
                self,
@@ -133,7 +165,7 @@ class AnswerRelevancyMetric(BaseMetric):
             )
             return self.score
 
-    async def _a_generate_reason(self, input: str) -> str:
+    async def _a_generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -146,25 +178,18 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = await self.model.a_generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self, input: str) -> str:
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -177,117 +202,94 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = self.model.generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
-        self, input: str
+        self, input: str, multimodal: bool
     ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []
 
         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
-    def _generate_verdicts(self, input: str) -> List[AnswerRelevancyVerdict]:
+    def _generate_verdicts(
+        self, input: str, multimodal: bool
+    ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []
 
         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
 
-    async def _a_generate_statements(
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )
+
+    def _generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            return res.statements
-        else:
-            try:
-                res: Statements = await self.model.a_generate(
-                    prompt, schema=Statements
-                )
-                return res.statements
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["statements"]
 
-    def _generate_statements(
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )
+
+    async def _a_generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
    ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            return res.statements
-        else:
-            try:
-                res: Statements = self.model.generate(prompt, schema=Statements)
-                return res.statements
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["statements"]
 
     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
@@ -308,7 +310,7 @@ class AnswerRelevancyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
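Across the answer_relevancy.py hunks above, every generation step drops the same hand-rolled block: branch on using_native_model, call the model with a Pydantic schema, fall back to plain generation plus trimAndLoadJson on TypeError, and accumulate evaluation_cost. That logic now lives in the shared generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils (changed +158 -122 in this release). Below is only a minimal sketch of what the synchronous helper presumably does, reconstructed from the inline code it replaces; the real implementation may handle costs, retries, and errors differently.

    from deepeval.metrics.utils import trimAndLoadJson  # existing JSON-repair helper in deepeval


    def generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        # Sketch only: mirrors the removed per-method boilerplate above.
        if metric.using_native_model:
            # Native models return (parsed schema instance, API cost).
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that accept a schema return the parsed object directly.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Models without schema support: generate raw text, then repair and parse JSON.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The async variant is presumably the same shape with await metric.model.a_generate(...); each call site now supplies only the prompt, the schema class, and two extractors, one for the parsed schema object and one for the repaired-JSON fallback, as the new calls above show.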
deepeval/metrics/answer_relevancy/template.py

@@ -1,15 +1,26 @@
 from typing import List
+import textwrap
 
 
 class AnswerRelevancyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_statements(actual_output: str):
+    def generate_statements(actual_output: str, multimodal: bool = False):
         return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
 
 Example:
 Example text:
 Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we’ve added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support.
 
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
+
 {{
     "statements": [
         "The new laptop model has a high-resolution Retina display.",
@@ -32,13 +43,17 @@ JSON:
 """
 
     @staticmethod
-    def generate_verdicts(input: str, statements: str):
+    def generate_verdicts(
+        input: str, statements: str, multimodal: bool = False
+    ):
         return f"""For the provided list of statements, determine whether each statement is relevant to address the input.
 Generate JSON objects with 'verdict' and 'reason' fields.
 The 'verdict' should be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information).
 Provide 'reason' ONLY for 'no' or 'idk' verdicts.
 The statements are from an AI's actual output.
 
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
+
 **
 IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
 
@@ -78,12 +93,16 @@ JSON:
 
     @staticmethod
     def generate_reason(
-        irrelevant_statements: List[str], input: str, score: float
+        irrelevant_statements: List[str],
+        input: str,
+        score: float,
+        multimodal: bool = False,
     ):
         return f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
 The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
 If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
 
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
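Together with the multimodal flag threaded through answer_relevancy.py and the removal of the separate multimodal_metrics and mlllms packages in the file list, these template changes fold image-aware evaluation into the standard single-turn metric: when multimodal is true, the MULTIMODAL INPUT RULES block is interpolated into every prompt, and MLLMImage elements from the actual output are carried through the extracted statements. The template-level behavior can be exercised directly; the metric-level lines at the end are a hypothetical sketch, since this diff does not show how LLMTestCase carries image content or the multimodal flag in 3.7.6.

    from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate

    # Directly visible in the hunks above: multimodal=True injects the rules block.
    prompt = AnswerRelevancyTemplate.generate_statements(
        actual_output="The photo shows our new laptop with a Retina display.",
        multimodal=True,
    )
    assert "MULTIMODAL INPUT RULES" in prompt

    # Hypothetical metric-level usage; field names and image handling below are
    # illustrative assumptions, not confirmed by this diff.
    # from deepeval.test_case import LLMTestCase, MLLMImage
    # from deepeval.metrics import AnswerRelevancyMetric
    #
    # test_case = LLMTestCase(
    #     input=["What does the product photo show?", MLLMImage(url="laptop.png")],
    #     actual_output=["A laptop with a Retina display.", MLLMImage(url="laptop.png")],
    # )
    # AnswerRelevancyMetric().measure(test_case)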