deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -6,17 +6,22 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
-    check_mllm_test_case_params,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.answer_relevancy.schema import (
+    Statements,
+    AnswerRelevancyVerdict,
+    Verdicts,
+    AnswerRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager

@@ -55,13 +60,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -82,13 +89,13 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output
 
             self.statements: List[str] = self._generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                self._generate_verdicts(input, multimodal)
+                self._generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = self._generate_reason(input, multimodal)
+            self.reason = self._generate_reason(input, test_case.multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -113,13 +120,15 @@ class AnswerRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -132,13 +141,15 @@ class AnswerRelevancyMetric(BaseMetric):
             actual_output = test_case.actual_output
 
             self.statements: List[str] = await self._a_generate_statements(
-                actual_output, multimodal
+                actual_output, test_case.multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(input, multimodal)
+                await self._a_generate_verdicts(input, test_case.multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(input, multimodal)
+            self.reason = await self._a_generate_reason(
+                input, test_case.multimodal
+            )
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -170,22 +181,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = await self.model.a_generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -203,22 +205,13 @@ class AnswerRelevancyMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=AnswerRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: AnswerRelevancyScoreReason = self.model.generate(
-                    prompt=prompt, schema=AnswerRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AnswerRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, input: str, multimodal: bool
@@ -230,22 +223,15 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, input: str, multimodal: bool
@@ -257,22 +243,17 @@ class AnswerRelevancyMetric(BaseMetric):
             input=input, statements=self.statements, multimodal=multimodal
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            return [item for item in res.verdicts]
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    AnswerRelevancyVerdict(**item) for item in data["verdicts"]
-                ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda r: list(r.verdicts),
+            extract_json=lambda data: [
+                AnswerRelevancyVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
-    async def _a_generate_statements(
+    def _generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -280,31 +261,18 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            statements: List[str] = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        else:
-            try:
-                res: Statements = await self.model.a_generate(
-                    prompt, schema=Statements
-                )
-                statements: List[str] = res.statements + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                statements = data["statements"] + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
 
-    def _generate_statements(
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )
+
+    async def _a_generate_statements(
         self,
         actual_output: str,
         multimodal: bool,
@@ -312,27 +280,16 @@ class AnswerRelevancyMetric(BaseMetric):
         prompt = self.evaluation_template.generate_statements(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Statements)
-            self.evaluation_cost += cost
-            statements = res.statements + [
-                ele for ele in actual_output if isinstance(ele, MLLMImage)
-            ]
-            return statements
-        else:
-            try:
-                res: Statements = self.model.generate(prompt, schema=Statements)
-                statements = res.statements + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                statements = data["statements"] + [
-                    ele for ele in actual_output if isinstance(ele, MLLMImage)
-                ]
-                return statements
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Statements,
+            extract_schema=lambda s: s.statements
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+            extract_json=lambda d: d["statements"]
+            + [ele for ele in actual_output if isinstance(ele, MLLMImage)],
+        )
 
     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
@@ -353,7 +310,7 @@ class AnswerRelevancyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

@@ -3,170 +3,93 @@ import textwrap
 
 
 class AnswerRelevancyTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_statements(actual_output: str, multimodal: bool = False):
-        multimodal_instruction = ""
-        example_text = ""
-        example_json = ""
-
-        if multimodal:
-            multimodal_instruction = " The text may contain images as well."
-            example_text = "Shoes. The shoes can be refunded at no extra cost. Thanks for asking the question!"
-            example_json = textwrap.dedent(
-                """
-{{
-"statements": ["Shoes.", "Shoes can be refunded at no extra cost", "Thanks for asking the question!"]
-}}
-"""
-            )
-        else:
-            example_text = "Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we've added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support."
-            example_json = textwrap.dedent(
-                """
-{{
-"statements": [
-"The new laptop model has a high-resolution Retina display.",
-"It includes a fast-charging battery with up to 12 hours of usage.",
-"Security features include fingerprint authentication and an encrypted SSD.",
-"Every purchase comes with a one-year warranty.",
-"24/7 customer support is included."
-]
-}}
-"""
-            )
-
-        coherence_note = (
-            ""
-            if multimodal
-            else " Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement."
-        )
-
-        return textwrap.dedent(
-            f"""Given the text, breakdown and generate a list of statements presented.{coherence_note}{multimodal_instruction}
-
-Example:
-Example text:
-{example_text}
-
-{example_json}
-===== END OF EXAMPLE ======
-
-**
-IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
-**
-
-Text:
-{actual_output}
-
-JSON:
-"""
-        )
+        return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
+
+Example:
+Example text:
+Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we’ve added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support.
+
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
+
+{{
+"statements": [
+"The new laptop model has a high-resolution Retina display.",
+"It includes a fast-charging battery with up to 12 hours of usage.",
+"Security features include fingerprint authentication and an encrypted SSD.",
+"Every purchase comes with a one-year warranty.",
+"24/7 customer support is included."
+]
+}}
+===== END OF EXAMPLE ======
+
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+**
+
+Text:
+{actual_output}
+
+JSON:
+"""

     @staticmethod
     def generate_verdicts(
         input: str, statements: str, multimodal: bool = False
     ):
-        content_type = (
-            "statements (which can contain images)"
-            if multimodal
-            else "list of statements"
-        )
-        statement_or_image = "statement or image" if multimodal else "statement"
-
-        format_instruction = textwrap.dedent(
-            """
-Expected JSON format:
-{{
-"verdicts": [
-{{
-"verdict": "yes"
-}},
-{{
-"reason": <explanation_for_irrelevance>,
-"verdict": "no"
-}},
-{{
-"reason": <explanation_for_ambiguity>,
-"verdict": "idk"
-}}
-]
-}}
-"""
-        )
-
-        example_section = ""
-        if multimodal:
-            example_section = textwrap.dedent(
-                """
-Example input: What should I do if there is an earthquake?
-Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"]
-Example JSON:
-{{
-"verdicts": [
-{{
-"reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
-"verdict": "no"
-}},
-{{
-"reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
-"verdict": "idk"
-}},
-{{
-"reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
-"verdict": "idk"
-}},
-{{
-"verdict": "yes"
-}}
-]
-}}
-"""
-            )
-
-        guidelines = ""
-        if multimodal:
-            guidelines = textwrap.dedent(
-                f"""
-Since you are going to generate a verdict for each statement and image, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
-"""
-            )
-        else:
-            guidelines = textwrap.dedent(
-                f"""
-Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
-'verdict' must be STRICTLY 'yes', 'no', or 'idk':
-- 'yes': statement is relevant to addressing the input
-- 'no': statement is irrelevant to the input
-- 'idk': statement is ambiguous (not directly relevant but could be supporting information)
-Provide 'reason' ONLY for 'no' or 'idk' verdicts.
-"""
-            )
-
-        return textwrap.dedent(
-            f"""For the provided {content_type}, determine whether each {statement_or_image} is relevant to address the input.
-{"Please generate a list of JSON with two keys: `verdict` and `reason`." if multimodal else "Generate JSON objects with 'verdict' and 'reason' fields."}
-The 'verdict' {"key " if multimodal else ''}should {"STRICTLY be either a 'yes', 'idk' or 'no'" if multimodal else "be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information)"}. {"Answer 'yes' if the " + statement_or_image + ' is relevant to addressing the original input, no if the ' + statement_or_image + ' is irrelevant, and "idk" if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).' if multimodal else ""}
-{"The 'reason' is the reason for the verdict.' if multimodal else '"}
-Provide 'reason' ONLY for 'no' or 'idk' verdicts.
-The {"provided statements are statements and images' if multimodal else 'statements are from an AI's actual output"} generated in the actual output.
-
-**
-IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
-
-{format_instruction if not multimodal else ''}
-{example_section}
-{guidelines}
-**
-
-Input:
-{input}
-
-Statements:
-{statements}
-
-JSON:
-"""
-        )
+        return f"""For the provided list of statements, determine whether each statement is relevant to address the input.
+Generate JSON objects with 'verdict' and 'reason' fields.
+The 'verdict' should be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information).
+Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+The statements are from an AI's actual output.
+
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
+
+**
+IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+
+Expected JSON format:
+{{
+"verdicts": [
+{{
+"verdict": "yes"
+}},
+{{
+"reason": <explanation_for_irrelevance>,
+"verdict": "no"
+}},
+{{
+"reason": <explanation_for_ambiguity>,
+"verdict": "idk"
+}}
+]
+}}
+
+Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
+'verdict' must be STRICTLY 'yes', 'no', or 'idk':
+- 'yes': statement is relevant to addressing the input
+- 'no': statement is irrelevant to the input
+- 'idk': statement is ambiguous (not directly relevant but could be supporting information)
+Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+**
+
+Input:
+{input}
+
+Statements:
+{statements}
+
+JSON:
+"""
 
     @staticmethod
     def generate_reason(
@@ -175,32 +98,32 @@ class AnswerRelevancyTemplate:
         score: float,
         multimodal: bool = False,
     ):
-        return textwrap.dedent(
-            f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
-The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
-If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+        return f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
+The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
+If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+
+{AnswerRelevancyTemplate.multimodal_rules if multimodal else ""}
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
 
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+Example:
+Example JSON:
+{{
+"reason": "The score is <answer_relevancy_score> because <your_reason>."
+}}
+===== END OF EXAMPLE ======
+**
 
-{"Example:' if not multimodal else '"}
-Example JSON:
-{{
-"reason": "The score is <answer_relevancy_score> because <your_reason>."
-}}
-{"===== END OF EXAMPLE ======' if not multimodal else '"}
-**
 
-Answer Relevancy Score:
-{score}
+Answer Relevancy Score:
+{score}
 
-Reasons why the score can't be higher based on irrelevant statements in the actual output:
-{irrelevant_statements}
+Reasons why the score can't be higher based on irrelevant statements in the actual output:
+{irrelevant_statements}
 
-Input:
-{input}
+Input:
+{input}
 
-JSON:
-"""
-        )
+JSON:
+"""