deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
@@ -0,0 +1,550 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_contextual_precision.template import (
+    TurnContextualPrecisionTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_contextual_precision.schema import (
+    ContextualPrecisionVerdict,
+    Verdicts,
+    ContextualPrecisionScoreReason,
+    InteractionContextualPrecisionScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnContextualPrecisionMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+        TurnParams.EXPECTED_OUTCOME,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        evaluation_template: Type[
+            TurnContextualPrecisionTemplate
+        ] = TurnContextualPrecisionTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                scores = self._get_contextual_precision_scores(
+                    unit_interactions, test_case.expected_outcome, multimodal
+                )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            scores = await self._a_get_contextual_precision_scores(
+                unit_interactions, test_case.expected_outcome, multimodal
+            )
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_contextual_precision_scores(
+        self,
+        unit_interactions: List[List[Turn]],
+        _expected_outcome: str,
+        multimodal: bool,
+    ):
+        async def get_interaction_score(unit_interaction: List[Turn]):
+            user_content = "User Message: "
+            retrieval_context = []
+            expected_outcome = (
+                f"Expected Assistant Message: \n{_expected_outcome}"
+            )
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_content += f"\n{turn.content} "
+                else:
+                    retrieval_context.extend(turn.retrieval_context)
+
+            verdicts = await self._a_generate_verdicts(
+                user_content, expected_outcome, retrieval_context, multimodal
+            )
+            score, reason = await self._a_get_interaction_score_and_reason(
+                user_content, verdicts, multimodal
+            )
+            interaction_score = InteractionContextualPrecisionScore(
+                score=score,
+                reason=reason,
+                verdicts=verdicts,
+            )
+            return interaction_score
+
+        final_scores = await asyncio.gather(
+            *[
+                get_interaction_score(unit_interaction)
+                for unit_interaction in unit_interactions
+            ]
+        )
+
+        return final_scores
+
+    def _get_contextual_precision_scores(
+        self,
+        unit_interactions: List[List[Turn]],
+        _expected_outcome: str,
+        multimodal: bool,
+    ):
+        interaction_scores = []
+
+        for unit_interaction in unit_interactions:
+            user_content = "User Message: "
+            retrieval_context = []
+            expected_outcome = (
+                f"Expected Assistant Message: \n{_expected_outcome}"
+            )
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_content += f"\n{turn.content} "
+                else:
+                    retrieval_context.extend(turn.retrieval_context)
+
+            verdicts = self._generate_verdicts(
+                user_content, expected_outcome, retrieval_context, multimodal
+            )
+            score, reason = self._get_interaction_score_and_reason(
+                user_content, verdicts, multimodal
+            )
+            interaction_score = InteractionContextualPrecisionScore(
+                score=score,
+                reason=reason,
+                verdicts=verdicts,
+            )
+            interaction_scores.append(interaction_score)
+
+        return interaction_scores
+
+    async def _a_generate_verdicts(
+        self,
+        input: str,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualPrecisionVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualPrecisionVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            input=input,
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self,
+        input: str,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualPrecisionVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualPrecisionVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            input=input,
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    async def _a_get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return 1, None
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = await self._a_get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _get_interaction_score_and_reason(
+        self,
+        input: str,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return 1, None
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = self._get_interaction_reason(
+            input, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _calculate_interaction_score(
+        self, verdicts: List[ContextualPrecisionVerdict]
+    ) -> float:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 0
+
+        # Convert verdicts to binary list where 'yes' is 1 and others are 0
+        node_verdicts = [
+            1 if v.verdict.strip().lower() == "yes" else 0 for v in verdicts
+        ]
+
+        sum_weighted_precision_at_k = 0.0
+        relevant_nodes_count = 0
+
+        for k, is_relevant in enumerate(node_verdicts, start=1):
+            # If the item is relevant, update the counter and add weighted precision to sum
+            if is_relevant:
+                relevant_nodes_count += 1
+                precision_at_k = relevant_nodes_count / k
+                sum_weighted_precision_at_k += precision_at_k * is_relevant
+
+        if relevant_nodes_count == 0:
+            return 0
+
+        # Calculate Average Precision
+        score = sum_weighted_precision_at_k / relevant_nodes_count
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        verdicts_with_nodes = []
+        for i, verdict in enumerate(verdicts):
+            verdicts_with_nodes.append(
+                {
+                    "verdict": verdict.verdict,
+                    "reason": verdict.reason,
+                    "node": f"Node {i + 1}",
+                }
+            )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            score=format(score, ".2f"),
+            verdicts=verdicts_with_nodes,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ContextualPrecisionScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualPrecisionScoreReason = (
+                    await self.model.a_generate(
+                        prompt, schema=ContextualPrecisionScoreReason
+                    )
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(
+        self,
+        input: str,
+        score: float,
+        verdicts: List[ContextualPrecisionVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        verdicts_with_nodes = []
+        for i, verdict in enumerate(verdicts):
+            verdicts_with_nodes.append(
+                {
+                    "verdict": verdict.verdict,
+                    "reason": verdict.reason,
+                    "node": f"Node {i + 1}",
+                }
+            )
+
+        prompt = self.evaluation_template.generate_reason(
+            input=input,
+            score=format(score, ".2f"),
+            verdicts=verdicts_with_nodes,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ContextualPrecisionScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualPrecisionScoreReason = self.model.generate(
+                    prompt, schema=ContextualPrecisionScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionContextualPrecisionScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Interaction {index + 1} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionContextualPrecisionScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Contextual Precision"
deepeval/metrics/turn_contextual_recall/schema.py
@@ -0,0 +1,21 @@
+from typing import List
+from pydantic import BaseModel
+
+
+class ContextualRecallVerdict(BaseModel):
+    verdict: str
+    reason: str
+
+
+class Verdicts(BaseModel):
+    verdicts: List[ContextualRecallVerdict]
+
+
+class ContextualRecallScoreReason(BaseModel):
+    reason: str
+
+
+class InteractionContextualRecallScore(BaseModel):
+    score: float
+    reason: str
+    verdicts: List[ContextualRecallVerdict]
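These Pydantic models define the structured output the judge model is asked to return for the new turn contextual recall metric, mirroring the precision schema used above. Below is a small sketch of the fallback path in which raw JSON from a model without structured-output support is validated against the schema; it assumes Pydantic v2 (model_validate_json), and the JSON payload is made up for illustration.

from deepeval.metrics.turn_contextual_recall.schema import Verdicts

# Hypothetical raw JSON returned by a judge model that lacks structured-output support.
raw = '{"verdicts": [{"verdict": "yes", "reason": "The first context node supports the sentence."}]}'

# Assumes Pydantic v2; on Pydantic v1 this would be Verdicts.parse_raw(raw).
parsed = Verdicts.model_validate_json(raw)
print(parsed.verdicts[0].verdict)  # -> "yes"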