deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/prompt_alignment/prompt_alignment.py

@@ -2,12 +2,17 @@ import asyncio
 
 from typing import Optional, List, Union
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+    get_per_task_timeout,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -18,7 +23,6 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
-from deepeval.config.settings import get_settings
 
 from deepeval.metrics.api import metric_data_manager
 
@@ -60,7 +64,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -72,16 +84,19 @@ class PromptAlignmentMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                        timeout=get_per_task_timeout(),
                     )
                 )
             else:
-                self.verdicts: paschema.Verdicts = self._generate_verdicts(
-                    test_case.input, test_case.actual_output
+                self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                    self._generate_verdicts(
+                        test_case.input, test_case.actual_output
+                    )
                 )
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason(
@@ -111,7 +126,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,8 +143,10 @@ class PromptAlignmentMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
-                test_case.input, test_case.actual_output
+            self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                await self._a_generate_verdicts(
+                    test_case.input, test_case.actual_output
+                )
             )
             self.score = self._calculate_score()
             self.reason = await self._a_generate_reason(
@@ -142,7 +167,9 @@ class PromptAlignmentMetric(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self, input: str, actual_output: str) -> str:
+    async def _a_generate_reason(
+        self, input: str, actual_output: str
+    ) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -157,27 +184,16 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = (
-                    await self.model.a_generate(
-                        prompt=prompt,
-                        schema=paschema.PromptAlignmentScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self, input: str, actual_output: str) -> str:
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self, input: str, actual_output: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -192,78 +208,54 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = self.model.generate(
-                    prompt=prompt, schema=paschema.PromptAlignmentScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.Verdicts:
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.Verdicts
-            )
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: paschema.Verdicts = await self.model.a_generate(
-                    prompt, schema=paschema.Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.Verdicts:
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: paschema.Verdicts = self.model.generate(
-                    prompt, schema=paschema.Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
         if number_of_verdicts == 0:
             return 1
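All three of the repeated native-model / schema / JSON-fallback branches that 3.7.5 duplicated per method are folded into two shared helpers imported from deepeval/metrics/utils.py: generate_with_schema_and_extract and its async twin a_generate_with_schema_and_extract. Their implementation is not part of this excerpt; the sketch below is inferred solely from the call sites above and the branches they replace (the argument names come from those calls, everything else is an assumption, and the real helper in utils.py, +161 -91 in this release, may well differ).

# Hypothetical sketch -- NOT the actual deepeval implementation.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # pre-existing JSON-fallback helper


def generate_with_schema_and_extract(
    metric,  # a metric exposing .model, .using_native_model, .evaluation_cost
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (parsed schema, cost); accumulate evaluation cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept `schema=` return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: fall back to raw text plus JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async variant would presumably mirror this with await metric.model.a_generate(...); each metric then only supplies the prompt, the schema class, and the two extraction lambdas.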
deepeval/metrics/prompt_alignment/template.py

@@ -2,6 +2,14 @@ from typing import List
 
 
 class PromptAlignmentTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(
         prompt_instructions: List[str], input: str, actual_output: str
@@ -14,6 +22,8 @@ The 'reason' is the reason for the verdict.
 Provide a 'reason' ONLY if the answer is 'no'.
 The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
 Example input: What number is the stars of the sky?
@@ -63,6 +73,8 @@ The unalignments represent prompt instructions that are not followed by the LLM
 If there no unaligments, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 Don't have to talk about whether the actual output is a good fit for the input, access ENTIRELY based on the unalignment reasons.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
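Because the new multimodal_rules block is a class attribute referenced from inside the f-string prompts as {PromptAlignmentTemplate.multimodal_rules}, both the verdict prompt and the reason prompt now carry the same multimodal guidance. A quick illustrative check against the generate_verdicts signature shown above, assuming deepeval 3.7.7 is installed (the instructions, input, and output values here are made up):

from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate

# Sample values are purely illustrative.
prompt = PromptAlignmentTemplate.generate_verdicts(
    prompt_instructions=["Reply in formal English.", "Reference the attached image."],
    input="Summarize the chart in the image.",
    actual_output="yo, line go up",
)

# The shared rules block is interpolated into the prompt body.
assert "MULTIMODAL INPUT RULES" in prompt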
deepeval/metrics/role_adherence/role_adherence.py

@@ -4,20 +4,21 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
+    RoleAdherenceScoreReason,
 )
 from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     convert_turn_to_dict,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.role_adherence.schema import *
 
 
 class RoleAdherenceMetric(BaseConversationalMetric):
@@ -51,7 +52,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-            require_chatbot_role=True,
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -102,7 +105,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-            require_chatbot_role=True,
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -138,7 +143,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self, role: str) -> str:
+    async def _a_generate_reason(self, role: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -150,24 +155,17 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleAdherenceScoreReason = await self.model.a_generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self, role: str) -> str:
+    def _generate_reason(self, role: str) -> Optional[str]:
+        if self.include_reason is False:
+            return None
         prompt = RoleAdherenceTemplate.generate_reason(
             score=self.score,
             role=role,
@@ -176,22 +174,13 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=RoleAdherenceScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleAdherenceScoreReason = self.model.generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_extract_out_of_character_verdicts(
         self, turns: List[Turn], role: str
@@ -202,28 +191,23 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
+        res: OutOfCharacterResponseVerdicts = (
+            await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=OutOfCharacterResponseVerdicts,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: OutOfCharacterResponseVerdicts(
+                    **data
+                ),
             )
-            self.evaluation_cost += cost
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = (
-                    await self.model.a_generate(
-                        prompt, schema=OutOfCharacterResponseVerdicts
-                    )
-                )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        )
 
         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -236,26 +220,19 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
-            )
-            self.evaluation_cost += cost
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = self.model.generate(
-                    prompt, schema=OutOfCharacterResponseVerdicts
-                )
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        res: OutOfCharacterResponseVerdicts = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=OutOfCharacterResponseVerdicts,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: OutOfCharacterResponseVerdicts(**data),
+        )
 
         for verdict in res.verdicts:
            try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -278,8 +255,8 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
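The last role_adherence.py hunk is a genuine bug fix: 3.7.5 evaluated self.score >= self.threshold without assigning the result, so is_successful() returned whatever self.success already held, and the bare except also swallowed unrelated errors. 3.7.7 stores the comparison and only catches TypeError (e.g. when self.score is still None). A minimal stand-in to show the corrected behaviour; the class below is purely illustrative, and the enclosing error guard is inferred from the surrounding context lines rather than shown in this excerpt:

from typing import Optional


class SketchMetric:
    """Illustrative stand-in for RoleAdherenceMetric's success logic."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold
        self.score: Optional[float] = None
        self.error: Optional[str] = None
        self.success: Optional[bool] = None

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                # 3.7.7: the comparison is assigned instead of being discarded.
                self.success = self.score >= self.threshold
            except TypeError:
                # e.g. score is still None because measure() never ran.
                self.success = False
        return self.success


metric = SketchMetric(threshold=0.5)
print(metric.is_successful())  # False: score is None, so the TypeError path runs
metric.score = 0.8
print(metric.is_successful())  # True: 0.8 >= 0.5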
deepeval/metrics/role_adherence/template.py

@@ -2,11 +2,22 @@ from typing import List, Dict
 
 
 class RoleAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def extract_out_of_character_response_verdicts(
         turns: List[Dict], role: str
     ):
         return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `ai_message` did not adhere to the specified chatbot role.
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 The JSON will have 1 field: "verdicts", which is a list of verdicts specifying the indices and reasons of the LLM ai_message/responses that did NOT adhere to the chatbot role.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on role adherence.
 
@@ -72,6 +83,9 @@ JSON:
         return f"""Below is a list of LLM chatbot responses (ai_message) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
 Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses has adhered to the given role through a conversation, with 1 being the best and 0 being worst, provide a reason by quoting the out of character responses to justify the score.
 
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON: