deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/prompt_alignment/prompt_alignment.py CHANGED
@@ -5,9 +5,10 @@ from typing import Optional, List, Union
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -60,7 +61,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -72,6 +81,7 @@ class PromptAlignmentMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
                 loop.run_until_complete(
                     asyncio.wait_for(
@@ -80,8 +90,10 @@ class PromptAlignmentMetric(BaseMetric):
                     )
                 )
             else:
-                self.verdicts: paschema.Verdicts = self._generate_verdicts(
-                    test_case.input, test_case.actual_output
+                self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                    self._generate_verdicts(
+                        test_case.input, test_case.actual_output
+                    )
                 )
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason(
@@ -111,7 +123,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,8 +140,10 @@ class PromptAlignmentMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
-                test_case.input, test_case.actual_output
+            self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                await self._a_generate_verdicts(
+                    test_case.input, test_case.actual_output
+                )
             )
             self.score = self._calculate_score()
             self.reason = await self._a_generate_reason(
@@ -142,7 +164,9 @@ class PromptAlignmentMetric(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self, input: str, actual_output: str) -> str:
+    async def _a_generate_reason(
+        self, input: str, actual_output: str
+    ) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -157,27 +181,16 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = (
-                    await self.model.a_generate(
-                        prompt=prompt,
-                        schema=paschema.PromptAlignmentScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self, input: str, actual_output: str) -> str:
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self, input: str, actual_output: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -192,78 +205,54 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = self.model.generate(
-                    prompt=prompt, schema=paschema.PromptAlignmentScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.Verdicts:
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.Verdicts
-            )
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: paschema.Verdicts = await self.model.a_generate(
-                    prompt, schema=paschema.Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.Verdicts:
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: paschema.Verdicts = self.model.generate(
-                    prompt, schema=paschema.Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
         if number_of_verdicts == 0:
             return 1
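Note: the hunks above (and the role adherence hunks further down) replace the repeated `if self.using_native_model:` / `try ... except TypeError` branching with two shared helpers imported from `deepeval.metrics.utils`: `generate_with_schema_and_extract` and `a_generate_with_schema_and_extract`. Their implementation lives in `deepeval/metrics/utils.py` (changed in this release but not shown in this excerpt). The sketch below is only an inference from the new call sites and the deleted code, not the actual deepeval source; the keyword-argument names are taken verbatim from the call sites, everything else is an assumption.

    # Illustrative sketch only -- inferred from the call sites above, not
    # copied from deepeval/metrics/utils.py.
    from typing import Any, Callable, Type

    from pydantic import BaseModel

    from deepeval.metrics.utils import trimAndLoadJson  # same JSON fallback the old code used


    def generate_with_schema_and_extract(
        metric: Any,
        prompt: str,
        schema_cls: Type[BaseModel],
        extract_schema: Callable[[Any], Any],
        extract_json: Callable[[dict], Any],
    ) -> Any:
        # Native models return (result, cost) and support structured output directly.
        if metric.using_native_model:
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        # Custom models may not accept `schema=`; fall back to parsing raw JSON.
        try:
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The async variant would presumably mirror this with `await metric.model.a_generate(...)`.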
deepeval/metrics/prompt_alignment/template.py CHANGED
@@ -2,6 +2,14 @@ from typing import List
 
 
 class PromptAlignmentTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(
         prompt_instructions: List[str], input: str, actual_output: str
@@ -14,6 +22,8 @@ The 'reason' is the reason for the verdict.
 Provide a 'reason' ONLY if the answer is 'no'.
 The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
 Example input: What number is the stars of the sky?
@@ -63,6 +73,8 @@ The unalignments represent prompt instructions that are not followed by the LLM
 If there no unaligments, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 Don't have to talk about whether the actual output is a good fit for the input, access ENTIRELY based on the unalignment reasons.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
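Both template files touched in this diff gain the same `multimodal_rules` class attribute and splice it into their prompt-building f-strings via `{PromptAlignmentTemplate.multimodal_rules}` (and, further down, `{RoleAdherenceTemplate.multimodal_rules}`). A small standalone illustration of that interpolation pattern follows; the class and method names here are placeholders, not deepeval code.

    # Placeholder names -- this only illustrates how a class attribute is
    # interpolated into a template f-string at call time.
    class ExampleTemplate:
        multimodal_rules = """
        --- MULTIMODAL INPUT RULES ---
        - Treat image content as factual evidence.
        """

        @staticmethod
        def generate_prompt(question: str) -> str:
            # The shared rules are embedded in every prompt the template produces.
            return f"""Answer the question below.

    {ExampleTemplate.multimodal_rules}

    Question: {question}"""


    print(ExampleTemplate.generate_prompt("What colour is the sky?"))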
deepeval/metrics/ragas.py CHANGED
@@ -10,7 +10,7 @@ from deepeval.telemetry import capture_metric_type
 
 # check langchain availability
 try:
-    import langchain_core
+    import langchain_core  # noqa: F401
     from langchain_core.language_models import BaseChatModel
     from langchain_core.embeddings import Embeddings
 
@@ -501,7 +501,7 @@ class RagasMetric(BaseMetric):
     def measure(self, test_case: LLMTestCase):
         # sends to server
         try:
-            from ragas import evaluate
+            from ragas import evaluate  # noqa: F401
         except ModuleNotFoundError:
             raise ModuleNotFoundError(
                 "Please install ragas to use this metric. `pip install ragas`."
@@ -509,7 +509,7 @@
 
         try:
             # How do i make sure this isn't just huggingface dataset
-            from datasets import Dataset
+            from datasets import Dataset  # noqa: F401
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")
 
deepeval/metrics/role_adherence/role_adherence.py CHANGED
@@ -4,20 +4,21 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
+    RoleAdherenceScoreReason,
 )
 from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     convert_turn_to_dict,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.role_adherence.schema import *
 
 
 class RoleAdherenceMetric(BaseConversationalMetric):
@@ -51,7 +52,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-            require_chatbot_role=True,
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -102,7 +105,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-            require_chatbot_role=True,
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -138,7 +143,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self, role: str) -> str:
+    async def _a_generate_reason(self, role: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -150,24 +155,17 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleAdherenceScoreReason = await self.model.a_generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self, role: str) -> str:
+    def _generate_reason(self, role: str) -> Optional[str]:
+        if self.include_reason is False:
+            return None
         prompt = RoleAdherenceTemplate.generate_reason(
             score=self.score,
             role=role,
@@ -176,22 +174,13 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleAdherenceScoreReason = self.model.generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_extract_out_of_character_verdicts(
         self, turns: List[Turn], role: str
@@ -202,28 +191,23 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
+        res: OutOfCharacterResponseVerdicts = (
+            await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=OutOfCharacterResponseVerdicts,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: OutOfCharacterResponseVerdicts(
+                    **data
+                ),
             )
-            self.evaluation_cost += cost
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = (
-                    await self.model.a_generate(
-                        prompt, schema=OutOfCharacterResponseVerdicts
-                    )
-                )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        )
 
         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -236,26 +220,19 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
-            )
-            self.evaluation_cost += cost
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = self.model.generate(
-                    prompt, schema=OutOfCharacterResponseVerdicts
-                )
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        res: OutOfCharacterResponseVerdicts = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=OutOfCharacterResponseVerdicts,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: OutOfCharacterResponseVerdicts(**data),
+        )
 
         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -278,8 +255,8 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/role_adherence/template.py CHANGED
@@ -2,11 +2,22 @@ from typing import List, Dict
 
 
 class RoleAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def extract_out_of_character_response_verdicts(
         turns: List[Dict], role: str
     ):
         return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `ai_message` did not adhere to the specified chatbot role.
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 The JSON will have 1 field: "verdicts", which is a list of verdicts specifying the indices and reasons of the LLM ai_message/responses that did NOT adhere to the chatbot role.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on role adherence.
 
@@ -72,6 +83,9 @@ JSON:
         return f"""Below is a list of LLM chatbot responses (ai_message) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
 Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses has adhered to the given role through a conversation, with 1 being the best and 0 being worst, provide a reason by quoting the out of character responses to justify the score.
 
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON: