deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py (new file)
@@ -0,0 +1,535 @@
+ from typing import List, Optional, Union, Type, Tuple
+ import asyncio
+
+ from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.utils import (
+     get_or_create_event_loop,
+     prettify_list,
+ )
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     check_conversational_test_case_params,
+     get_unit_interactions,
+     initialize_model,
+ )
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.turn_contextual_relevancy.template import (
+     TurnContextualRelevancyTemplate,
+ )
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.turn_contextual_relevancy.schema import (
+     ContextualRelevancyVerdict,
+     ContextualRelevancyVerdicts,
+     ContextualRelevancyScoreReason,
+     InteractionContextualRelevancyScore,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class TurnContextualRelevancyMetric(BaseConversationalMetric):
+     _required_test_case_params: List[TurnParams] = [
+         TurnParams.CONTENT,
+         TurnParams.RETRIEVAL_CONTEXT,
+     ]
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+         evaluation_template: Type[
+             TurnContextualRelevancyTemplate
+         ] = TurnContextualRelevancyTemplate,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+         self.evaluation_template = evaluation_template
+
+     def measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case,
+             self._required_test_case_params,
+             self,
+             False,
+             self.model,
+             test_case.multimodal,
+         )
+
+         multimodal = test_case.multimodal
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 unit_interactions = get_unit_interactions(test_case.turns)
+                 scores = self._get_contextual_relevancy_scores(
+                     unit_interactions, multimodal
+                 )
+                 self.score = self._calculate_score(scores)
+                 self.success = self.score >= self.threshold
+                 self.reason = self._generate_reason(scores)
+                 verbose_steps = self._get_verbose_steps(scores)
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         *verbose_steps,
+                         f"Final Score: {self.score}\n",
+                         f"Final Reason: {self.reason}\n",
+                     ],
+                 )
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ) -> float:
+         check_conversational_test_case_params(
+             test_case,
+             self._required_test_case_params,
+             self,
+             False,
+             self.model,
+             test_case.multimodal,
+         )
+
+         multimodal = test_case.multimodal
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             unit_interactions = get_unit_interactions(test_case.turns)
+             scores = await self._a_get_contextual_relevancy_scores(
+                 unit_interactions, multimodal
+             )
+             self.score = self._calculate_score(scores)
+             self.success = self.score >= self.threshold
+             self.reason = await self._a_generate_reason(scores)
+             verbose_steps = self._get_verbose_steps(scores)
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     *verbose_steps,
+                     f"Final Score: {self.score}\n",
+                     f"Final Reason: {self.reason}\n",
+                 ],
+             )
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     async def _a_get_contextual_relevancy_scores(
+         self, unit_interactions: List[List[Turn]], multimodal: bool
+     ):
+         async def get_interaction_score(unit_interaction: List[Turn]):
+             user_content = "User Message: "
+             retrieval_context = []
+             for turn in unit_interaction:
+                 if turn.role == "user":
+                     user_content += f"\n{turn.content} "
+                 else:
+                     retrieval_context.extend(turn.retrieval_context)
+
+             # Generate verdicts for each retrieval context
+             verdicts = await self._a_generate_verdicts(
+                 user_content, retrieval_context, multimodal
+             )
+             score, reason = await self._a_get_interaction_score_and_reason(
+                 user_content, verdicts, multimodal
+             )
+             interaction_score = InteractionContextualRelevancyScore(
+                 score=score,
+                 reason=reason,
+                 verdicts=verdicts,
+             )
+             return interaction_score
+
+         final_scores = await asyncio.gather(
+             *[
+                 get_interaction_score(unit_interaction)
+                 for unit_interaction in unit_interactions
+             ]
+         )
+
+         return final_scores
+
+     def _get_contextual_relevancy_scores(
+         self, unit_interactions: List[List[Turn]], multimodal: bool
+     ):
+         interaction_scores = []
+
+         for unit_interaction in unit_interactions:
+             user_content = "User Message: "
+             retrieval_context = []
+             for turn in unit_interaction:
+                 if turn.role == "user":
+                     user_content += f"\n{turn.content} "
+                 else:
+                     retrieval_context.extend(turn.retrieval_context)
+
+             # Generate verdicts for each retrieval context
+             verdicts = self._generate_verdicts(
+                 user_content, retrieval_context, multimodal
+             )
+             score, reason = self._get_interaction_score_and_reason(
+                 user_content, verdicts, multimodal
+             )
+             interaction_score = InteractionContextualRelevancyScore(
+                 score=score,
+                 reason=reason,
+                 verdicts=verdicts,
+             )
+             interaction_scores.append(interaction_score)
+
+         return interaction_scores
+
+     async def _a_generate_verdicts(
+         self, input: str, retrieval_context: List[str], multimodal: bool
+     ) -> List[ContextualRelevancyVerdict]:
+         if len(retrieval_context) == 0:
+             return []
+
+         verdicts: List[ContextualRelevancyVerdict] = []
+
+         # Generate verdicts for each context node
+         for context in retrieval_context:
+             prompt = self.evaluation_template.generate_verdicts(
+                 input=input,
+                 context=context,
+                 multimodal=multimodal,
+             )
+
+             if self.using_native_model:
+                 res, cost = await self.model.a_generate(
+                     prompt, schema=ContextualRelevancyVerdicts
+                 )
+                 self.evaluation_cost += cost
+                 verdicts.extend([item for item in res.verdicts])
+             else:
+                 try:
+                     res: ContextualRelevancyVerdicts = (
+                         await self.model.a_generate(
+                             prompt, schema=ContextualRelevancyVerdicts
+                         )
+                     )
+                     verdicts.extend([item for item in res.verdicts])
+                 except TypeError:
+                     res = await self.model.a_generate(prompt)
+                     data = trimAndLoadJson(res, self)
+                     verdicts.extend(
+                         [
+                             ContextualRelevancyVerdict(**item)
+                             for item in data["verdicts"]
+                         ]
+                     )
+
+         return verdicts
+
+     def _generate_verdicts(
+         self, input: str, retrieval_context: List[str], multimodal: bool
+     ) -> List[ContextualRelevancyVerdict]:
+         if len(retrieval_context) == 0:
+             return []
+
+         verdicts: List[ContextualRelevancyVerdict] = []
+
+         # Generate verdicts for each context node
+         for context in retrieval_context:
+             prompt = self.evaluation_template.generate_verdicts(
+                 input=input,
+                 context=context,
+                 multimodal=multimodal,
+             )
+
+             if self.using_native_model:
+                 res, cost = self.model.generate(
+                     prompt, schema=ContextualRelevancyVerdicts
+                 )
+                 self.evaluation_cost += cost
+                 verdicts.extend([item for item in res.verdicts])
+             else:
+                 try:
+                     res: ContextualRelevancyVerdicts = self.model.generate(
+                         prompt, schema=ContextualRelevancyVerdicts
+                     )
+                     verdicts.extend([item for item in res.verdicts])
+                 except TypeError:
+                     res = self.model.generate(prompt)
+                     data = trimAndLoadJson(res, self)
+                     verdicts.extend(
+                         [
+                             ContextualRelevancyVerdict(**item)
+                             for item in data["verdicts"]
+                         ]
+                     )
+
+         return verdicts
+
+     async def _a_get_interaction_score_and_reason(
+         self,
+         input: str,
+         verdicts: List[ContextualRelevancyVerdict],
+         multimodal: bool,
+     ) -> Tuple[float, str]:
+         if len(verdicts) == 0:
+             return 1, None
+
+         score = self._calculate_interaction_score(verdicts)
+         reason = await self._a_get_interaction_reason(
+             input, score, verdicts, multimodal
+         )
+         return (
+             (0, reason)
+             if self.strict_mode and score < self.threshold
+             else (score, reason)
+         )
+
+     def _get_interaction_score_and_reason(
+         self,
+         input: str,
+         verdicts: List[ContextualRelevancyVerdict],
+         multimodal: bool,
+     ) -> Tuple[float, str]:
+         if len(verdicts) == 0:
+             return 1, None
+
+         score = self._calculate_interaction_score(verdicts)
+         reason = self._get_interaction_reason(
+             input, score, verdicts, multimodal
+         )
+         return (
+             (0, reason)
+             if self.strict_mode and score < self.threshold
+             else (score, reason)
+         )
+
+     def _calculate_interaction_score(
+         self, verdicts: List[ContextualRelevancyVerdict]
+     ) -> float:
+         number_of_verdicts = len(verdicts)
+         if number_of_verdicts == 0:
+             return 1
+
+         relevant_count = 0
+         for verdict in verdicts:
+             if verdict.verdict.strip().lower() == "yes":
+                 relevant_count += 1
+
+         score = relevant_count / number_of_verdicts
+         return score
+
+     async def _a_get_interaction_reason(
+         self,
+         input: str,
+         score: float,
+         verdicts: List[ContextualRelevancyVerdict],
+         multimodal: bool,
+     ) -> str:
+         if self.include_reason is False:
+             return None
+
+         # Separate relevant and irrelevant statements
+         irrelevant_statements = []
+         relevant_statements = []
+
+         for verdict in verdicts:
+             if verdict.verdict.strip().lower() == "yes":
+                 relevant_statements.append(verdict.statement)
+             else:
+                 # Include the reason for irrelevance
+                 irrelevant_statements.append(
+                     f"{verdict.statement}: {verdict.reason}"
+                 )
+
+         prompt = self.evaluation_template.generate_reason(
+             input=input,
+             irrelevant_statements=irrelevant_statements,
+             relevant_statements=relevant_statements,
+             score=format(score, ".2f"),
+             multimodal=multimodal,
+         )
+
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=ContextualRelevancyScoreReason
+             )
+             self.evaluation_cost += cost
+             return res.reason
+         else:
+             try:
+                 res: ContextualRelevancyScoreReason = (
+                     await self.model.a_generate(
+                         prompt, schema=ContextualRelevancyScoreReason
+                     )
+                 )
+                 return res.reason
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["reason"]
+
+     def _get_interaction_reason(
+         self,
+         input: str,
+         score: float,
+         verdicts: List[ContextualRelevancyVerdict],
+         multimodal: bool,
+     ) -> str:
+         if self.include_reason is False:
+             return None
+
+         # Separate relevant and irrelevant statements
+         irrelevant_statements = []
+         relevant_statements = []
+
+         for verdict in verdicts:
+             if verdict.verdict.strip().lower() == "yes":
+                 relevant_statements.append(verdict.statement)
+             else:
+                 # Include the reason for irrelevance
+                 irrelevant_statements.append(
+                     f"{verdict.statement}: {verdict.reason}"
+                 )
+
+         prompt = self.evaluation_template.generate_reason(
+             input=input,
+             irrelevant_statements=irrelevant_statements,
+             relevant_statements=relevant_statements,
+             score=format(score, ".2f"),
+             multimodal=multimodal,
+         )
+
+         if self.using_native_model:
+             res, cost = self.model.generate(
+                 prompt, schema=ContextualRelevancyScoreReason
+             )
+             self.evaluation_cost += cost
+             return res.reason
+         else:
+             try:
+                 res: ContextualRelevancyScoreReason = self.model.generate(
+                     prompt, schema=ContextualRelevancyScoreReason
+                 )
+                 return res.reason
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["reason"]
+
+     def _get_verbose_steps(
+         self, interaction_scores: List[InteractionContextualRelevancyScore]
+     ):
+         steps = []
+         for index, interaction_score in enumerate(interaction_scores):
+             interaction_steps = [
+                 f"Interaction {index + 1} \n",
+                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                 f"Score: {interaction_score.score} \n",
+                 f"Reason: {interaction_score.reason} \n",
+             ]
+             steps.extend(interaction_steps)
+         return steps
+
+     def _generate_reason(
+         self, scores: List[InteractionContextualRelevancyScore]
+     ) -> str:
+         reasons = []
+         for score in scores:
+             reasons.append(score.reason)
+
+         prompt = self.evaluation_template.generate_final_reason(
+             self.score, self.success, reasons
+         )
+
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = self.model.generate(prompt)
+             return res
+
+     async def _a_generate_reason(
+         self, scores: List[InteractionContextualRelevancyScore]
+     ) -> str:
+         reasons = []
+         for score in scores:
+             reasons.append(score.reason)
+
+         prompt = self.evaluation_template.generate_final_reason(
+             self.score, self.success, reasons
+         )
+
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = await self.model.a_generate(prompt)
+             return res
+
+     def _calculate_score(
+         self, scores: List[InteractionContextualRelevancyScore]
+     ) -> float:
+         number_of_scores = len(scores)
+         if number_of_scores == 0:
+             return 1
+         total_score = 0
+         for score in scores:
+             total_score += score.score
+         return total_score / number_of_scores
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Turn Contextual Relevancy"
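
The new TurnContextualRelevancyMetric above splits a conversation into user/assistant "unit interactions", scores the retrieval context of each interaction, and averages the per-interaction scores. The following usage sketch is not part of the diff: the ConversationalTestCase/Turn fields are inferred from the attributes the metric reads (turn.role, turn.content, turn.retrieval_context), and the conversation content is purely illustrative.

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_contextual_relevancy.turn_contextual_relevancy import (
    TurnContextualRelevancyMetric,
)

# One "unit interaction": a user message followed by an assistant reply that
# carries the retrieval context used to answer it.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is your refund policy?"),
        Turn(
            role="assistant",
            content="You can get a refund within 30 days of purchase.",
            retrieval_context=[
                "All purchases are refundable within 30 days of the order date."
            ],
        ),
    ]
)

metric = TurnContextualRelevancyMetric(threshold=0.5, include_reason=True)
metric.measure(test_case)  # score = mean of per-interaction relevancy scores
print(metric.score, metric.reason)
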
deepeval/metrics/turn_faithfulness/schema.py
@@ -1,10 +1,10 @@
- from typing import List, Optional
+ from typing import List, Optional, Literal
  from pydantic import BaseModel, Field


  class FaithfulnessVerdict(BaseModel):
-     verdict: str
      reason: Optional[str] = Field(default=None)
+     verdict: Literal["yes", "no", "idk"]


  class Verdicts(BaseModel):
@@ -19,5 +19,13 @@ class Claims(BaseModel):
      claims: List[str]


- class MultimodalFaithfulnessScoreReason(BaseModel):
+ class FaithfulnessScoreReason(BaseModel):
      reason: str
+
+
+ class InteractionFaithfulnessScore(BaseModel):
+     score: float
+     reason: str
+     claims: List[str]
+     truths: List[str]
+     verdicts: List[FaithfulnessVerdict]
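
With verdict typed as a Literal instead of a free-form str, pydantic now rejects verdict values outside "yes"/"no"/"idk" at validation time. A small illustration (not part of the diff), mirroring the model definition above:

from typing import Literal, Optional
from pydantic import BaseModel, Field, ValidationError

class FaithfulnessVerdict(BaseModel):
    reason: Optional[str] = Field(default=None)
    verdict: Literal["yes", "no", "idk"]

FaithfulnessVerdict(verdict="idk", reason="No supporting context found.")  # accepted
try:
    FaithfulnessVerdict(verdict="maybe")  # was silently accepted when verdict was str
except ValidationError as err:
    print(err)  # prints the validation error listing the permitted values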