deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py (added)
@@ -0,0 +1,596 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_faithfulness.template import (
+    TurnFaithfulnessTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_faithfulness.schema import (
+    FaithfulnessVerdict,
+    Verdicts,
+    FaithfulnessScoreReason,
+    Truths,
+    Claims,
+    InteractionFaithfulnessScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnFaithfulnessMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
+        evaluation_template: Type[
+            TurnFaithfulnessTemplate
+        ] = TurnFaithfulnessTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
+
+        self.truths_extraction_limit = truths_extraction_limit
+        if self.truths_extraction_limit is not None:
+            self.truths_extraction_limit = max(self.truths_extraction_limit, 0)
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                scores = self._get_faithfulness_scores(
+                    unit_interactions, multimodal
+                )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            scores = await self._a_get_faithfulness_scores(
+                unit_interactions, multimodal
+            )
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_faithfulness_scores(
+        self, unit_interactions: List[List[Turn]], multimodal: bool
+    ):
+
+        async def get_interaction_score(unit_interaction: List[Turn]):
+            user_content = "User Message: "
+            retrieval_context = []
+            assistant_content = "Assistant Message: "
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_content += f"\n{turn.content} "
+                else:
+                    assistant_content += f"\n{turn.content} "
+                    retrieval_context.extend(turn.retrieval_context)
+            truths = await self._a_generate_truths(
+                retrieval_context, multimodal
+            )
+            claims = await self._a_generate_claims(
+                user_content, assistant_content, multimodal
+            )
+            verdicts = await self._a_generate_verdicts(
+                claims, truths, multimodal
+            )
+            score, reason = await self._a_get_interaction_score_and_reason(
+                verdicts, multimodal
+            )
+            interaction_score = InteractionFaithfulnessScore(
+                score=score,
+                reason=reason,
+                claims=claims,
+                truths=truths,
+                verdicts=verdicts,
+            )
+            return interaction_score
+
+        final_scores = await asyncio.gather(
+            *[
+                get_interaction_score(unit_interaction)
+                for unit_interaction in unit_interactions
+            ]
+        )
+
+        return final_scores
+
+    def _get_faithfulness_scores(
+        self, unit_interactions: List[List[Turn]], multimodal: bool
+    ):
+        interaction_scores = []
+
+        for unit_interaction in unit_interactions:
+            user_content = "User Message: "
+            retrieval_context = []
+            assistant_content = "Assistant Message: "
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_content += f"\n{turn.content} "
+                else:
+                    assistant_content += f"\n{turn.content} "
+                    retrieval_context.extend(turn.retrieval_context)
+            truths = self._generate_truths(retrieval_context, multimodal)
+            claims = self._generate_claims(
+                user_content, assistant_content, multimodal
+            )
+            verdicts = self._generate_verdicts(claims, truths, multimodal)
+            score, reason = self._get_interaction_score_and_reason(
+                verdicts, multimodal
+            )
+            interaction_score = InteractionFaithfulnessScore(
+                score=score,
+                reason=reason,
+                claims=claims,
+                truths=truths,
+                verdicts=verdicts,
+            )
+            interaction_scores.append(interaction_score)
+
+        return interaction_scores
+
+    async def _a_generate_truths(
+        self, retrieval_context: List[str], multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = await self.model.a_generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    def _generate_truths(
+        self, retrieval_context: List[str], multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = self.model.generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    async def _a_generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = await self.model.a_generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    def _generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = self.model.generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    async def _a_generate_verdicts(
+        self, claims: List[str], truths: List[str], multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self, claims: List[str], truths: List[str], multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1, None
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = self._get_interaction_reason(score, verdicts, multimodal)
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    async def _a_get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1, None
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = await self._a_get_interaction_reason(
+            score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
452
+
453
+ async def _a_get_interaction_reason(
454
+ self, score, verdicts, multimodal: bool
455
+ ) -> str:
456
+ if self.include_reason is False:
457
+ return None
458
+
459
+ contradictions = []
460
+ for verdict in verdicts:
461
+ if verdict.verdict.strip().lower() == "no":
462
+ contradictions.append(verdict.reason)
463
+
464
+ prompt = self.evaluation_template.generate_reason(
465
+ contradictions=contradictions,
466
+ score=format(score, ".2f"),
467
+ multimodal=multimodal,
468
+ )
469
+
470
+ if self.using_native_model:
471
+ res, cost = await self.model.a_generate(
472
+ prompt, schema=FaithfulnessScoreReason
473
+ )
474
+ self.evaluation_cost += cost
475
+ return res.reason
476
+ else:
477
+ try:
478
+ res: FaithfulnessScoreReason = await self.model.a_generate(
479
+ prompt, schema=FaithfulnessScoreReason
480
+ )
481
+ return res.reason
482
+ except TypeError:
483
+ res = await self.model.a_generate(prompt)
484
+ data = trimAndLoadJson(res, self)
485
+ return data["reason"]
486
+
487
+ def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
488
+ if self.include_reason is False:
489
+ return None
490
+
491
+ contradictions = []
492
+ for verdict in verdicts:
493
+ if verdict.verdict.strip().lower() == "no":
494
+ contradictions.append(verdict.reason)
495
+
496
+ prompt = self.evaluation_template.generate_reason(
497
+ contradictions=contradictions,
498
+ score=format(score, ".2f"),
499
+ multimodal=multimodal,
500
+ )
501
+
502
+ if self.using_native_model:
503
+ res, cost = self.model.generate(
504
+ prompt, schema=FaithfulnessScoreReason
505
+ )
506
+ self.evaluation_cost += cost
507
+ return res.reason
508
+ else:
509
+ try:
510
+ res: FaithfulnessScoreReason = self.model.generate(
511
+ prompt, schema=FaithfulnessScoreReason
512
+ )
513
+ return res.reason
514
+ except TypeError:
515
+ res = self.model.generate(prompt)
516
+ data = trimAndLoadJson(res, self)
517
+ return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionFaithfulnessScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Interaction {index + 1} \n",
+                f"Truths: {prettify_list(interaction_score.truths)} \n",
+                f"Claims: {prettify_list(interaction_score.claims)} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except TypeError:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Faithfulness"
deepeval/metrics/turn_relevancy/template.py
@@ -33,8 +33,8 @@ Example Messages:
 
 Example JSON:
 {{
-"verdict": "no",
-"reason": "The LLM responded 'isn't it a nice day today' to a message that asked about how to treat a sore throat, which is completely irrelevant."
+"reason": "The LLM responded 'isn't it a nice day today' to a message that asked about how to treat a sore throat, which is completely irrelevant.",
+"verdict": "no"
 }}
 ===== END OF EXAMPLE ======
 You MUST ONLY provide a verdict for the LAST message on the list but MUST USE context from the previous messages.
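
The only substantive change in this template hunk is key order in the few-shot JSON example: "reason" now precedes "verdict". Judge models tend to emit JSON keys in the order the example demonstrates, so this nudges the model to write its justification before committing to a verdict. A hedged sketch of a matching response schema (the actual schema definition for this template is not part of this diff, and the class name below is hypothetical):

    from pydantic import BaseModel

    class TurnRelevancyVerdict(BaseModel):  # hypothetical name for illustration
        reason: str   # generated first: justify before judging
        verdict: str  # "yes" / "no", emitted after the reason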