deepeval 3.7.3__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_recall/template.py (new file)
@@ -0,0 +1,178 @@
+from typing import List, Union
+import textwrap
+from deepeval.test_case import MLLMImage
+
+
+class TurnContextualRecallTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    - When evaluating claims, compare them to BOTH textual and visual evidence.
+    - If the claim references something not clearly visible, respond with 'idk'.
+    """
+
+    @staticmethod
+    def generate_reason(
+        expected_outcome: str,
+        supportive_reasons: str,
+        unsupportive_reasons: str,
+        score: float,
+        multimodal: bool = False,
+    ):
+        content_type = "sentence or image" if multimodal else "sentence"
+
+        return textwrap.dedent(
+            f"""Given the original assistant output, a list of supportive reasons, and a list of unsupportive reasons ({'which is' if multimodal else 'which are'} deduced directly from the {'"assistant output"' if multimodal else 'original assistant output'}), and a contextual recall score (closer to 1 the better), summarize a CONCISE reason for the score.
+            A supportive reason is the reason why a certain {content_type} in the original assistant output can be attributed to the node in the retrieval context.
+            An unsupportive reason is the reason why a certain {content_type} in the original assistant output cannot be attributed to anything in the retrieval context.
+            In your reason, you should {'related' if multimodal else 'relate'} supportive/unsupportive reasons to the {content_type} number in assistant output, and {'info' if multimodal else 'include info'} regarding the node number in retrieval context to support your final reason. The first mention of "node(s)" should specify "node(s) in retrieval context{')' if multimodal else ''}.
+
+            {TurnContextualRecallTemplate.multimodal_rules if multimodal else ""}
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_recall_score> because <your_reason>."
+            }}
+
+            DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
+            If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it{',' if multimodal else ''} otherwise it gets annoying).
+            **
+
+            Contextual Recall Score:
+            {score}
+
+            Assistant Output:
+            {expected_outcome}
+
+            Supportive Reasons:
+            {supportive_reasons}
+
+            Unsupportive Reasons:
+            {unsupportive_reasons}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def generate_verdicts(
+        expected_outcome: str,
+        retrieval_context: List[Union[str, MLLMImage]],
+        multimodal: bool = False,
+    ):
+        content_type = "sentence and image" if multimodal else "sentence"
+        content_type_plural = (
+            "sentences and images" if multimodal else "sentences"
+        )
+        content_or = "sentence or image" if multimodal else "sentence"
+
+        # For multimodal, we need to annotate the retrieval context with node IDs
+        context_to_display = (
+            TurnContextualRecallTemplate.id_retrieval_context(retrieval_context)
+            if multimodal
+            else retrieval_context
+        )
+
+        node_instruction = ""
+        if multimodal:
+            node_instruction = " A node is either a string or image, but not both (so do not group images and texts in the same nodes)."
+
+        return textwrap.dedent(
+            f"""For EACH {content_type} in the given assistant output below, determine whether the {content_or} can be attributed to the nodes of retrieval contexts. Please generate a list of JSON with two keys: `verdict` and `reason`.
+            The `verdict` key should STRICTLY be either a 'yes' or 'no'. Answer 'yes' if the {content_or} can be attributed to any parts of the retrieval context, else answer 'no'.
+            The `reason` key should provide a reason why to the verdict. In the reason, you should aim to include the node(s) count in the retrieval context (eg., 1st node, and 2nd node in the retrieval context) that is attributed to said {content_or}.{node_instruction} You should also aim to quote the specific part of the retrieval context to justify your verdict, but keep it extremely concise and cut short the quote with an ellipsis if possible.
+
+            {TurnContextualRecallTemplate.multimodal_rules if multimodal else ""}
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects, each with two keys: `verdict` and `reason`.
+
+            {{
+                "verdicts": [
+                    {{
+                        "reason": "...",
+                        "verdict": "yes"
+                    }},
+                    ...
+                ]
+            }}
+
+            Since you are going to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of {content_type_plural} in {'the' if multimodal else '`assistant output`'}{' `assistant output`' if multimodal else ''}.
+            **
+
+            Assistant Output:
+            {expected_outcome}
+
+            Retrieval Context:
+            {context_to_display}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def generate_final_reason(
+        final_score: float, success: bool, reasons: List[str]
+    ):
+        return textwrap.dedent(
+            f"""You are an AI evaluator producing a single final explanation for the TurnContextualRecallMetric result.
+
+            Context:
+            This metric evaluates conversational contextual recall by determining whether sentences in the assistant output can be attributed to the retrieval context for each interaction. Each interaction yields a reason indicating which sentences were supported or unsupported. You are given all those reasons.
+
+            Inputs:
+            - final_score: the averaged score across all interactions.
+            - success: whether the metric passed or failed
+            - reasons: a list of textual reasons generated from individual interactions.
+
+            Instructions:
+            1. Read all reasons and synthesize them into one unified explanation.
+            2. Describe patterns of unsupported sentences, missing context coverage, or well-attributed outputs if present.
+            3. Do not repeat every reason; merge them into a concise, coherent narrative.
+            4. If the metric failed, state the dominant failure modes. If it passed, state why the assistant output was well-supported by retrieval context.
+            5. Output a single paragraph with no lists, no bullets, no markup.
+
+            Output:
+            A single paragraph explaining the final outcome.
+
+            Here's the inputs:
+
+            Final Score: {final_score}
+
+            Reasons:
+            {reasons}
+
+            Success: {success}
+
+            Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
+
+            The final reason:
+            """
+        )
+
+    @staticmethod
+    def id_retrieval_context(
+        retrieval_context: List[Union[str, MLLMImage]],
+    ) -> List[Union[str, MLLMImage]]:
+        """
+        Annotates retrieval context with node IDs for multimodal processing.
+
+        Args:
+            retrieval_context: List of contexts (can be strings or MLLMImages)
+
+        Returns:
+            Annotated list with "Node X:" prefixes
+        """
+        annotated_retrieval_context = []
+        for i, context in enumerate(retrieval_context):
+            if isinstance(context, str):
+                annotated_retrieval_context.append(f"Node {i + 1}: {context}")
+            elif isinstance(context, MLLMImage):
+                annotated_retrieval_context.append(f"Node {i + 1}:")
+                annotated_retrieval_context.append(context)
+        return annotated_retrieval_context
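
For reference, here is a minimal sketch (not part of the diff) of how the node annotation above behaves. The retrieval context values and image URL are made-up placeholders, and it assumes MLLMImage accepts a url argument as in prior deepeval releases:

from deepeval.test_case import MLLMImage
from deepeval.metrics.turn_contextual_recall.template import (
    TurnContextualRecallTemplate,
)

# Mixed text/image retrieval context; the URL is a placeholder.
retrieval_context = [
    "The store closes at 9pm on weekdays.",
    MLLMImage(url="https://example.com/store-hours.png"),
]

# id_retrieval_context prefixes each node so the evaluator LLM can cite
# "1st node", "2nd node", etc. in its verdict reasons; images get a bare
# "Node N:" label followed by the image object itself.
annotated = TurnContextualRecallTemplate.id_retrieval_context(retrieval_context)
# ["Node 1: The store closes at 9pm on weekdays.", "Node 2:", <MLLMImage>]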
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py (new file)
@@ -0,0 +1,520 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_contextual_recall.template import (
+    TurnContextualRecallTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_contextual_recall.schema import (
+    ContextualRecallVerdict,
+    Verdicts,
+    ContextualRecallScoreReason,
+    InteractionContextualRecallScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnContextualRecallMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+        TurnParams.EXPECTED_OUTCOME,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        evaluation_template: Type[
+            TurnContextualRecallTemplate
+        ] = TurnContextualRecallTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                scores = self._get_contextual_recall_scores(
+                    unit_interactions, test_case.expected_outcome, multimodal
+                )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            scores = await self._a_get_contextual_recall_scores(
+                unit_interactions, test_case.expected_outcome, multimodal
+            )
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_contextual_recall_scores(
+        self,
+        unit_interactions: List[List[Turn]],
+        _expected_outcome: str,
+        multimodal: bool,
+    ):
+        async def get_interaction_score(unit_interaction: List[Turn]):
+            retrieval_context = []
+            expected_outcome = (
+                f"Expected Assistant Message: \n{_expected_outcome}"
+            )
+            for turn in unit_interaction:
+                if turn.role == "assistant":
+                    retrieval_context.extend(turn.retrieval_context)
+
+            verdicts = await self._a_generate_verdicts(
+                expected_outcome, retrieval_context, multimodal
+            )
+            score, reason = await self._a_get_interaction_score_and_reason(
+                expected_outcome, verdicts, multimodal
+            )
+            interaction_score = InteractionContextualRecallScore(
+                score=score,
+                reason=reason,
+                verdicts=verdicts,
+            )
+            return interaction_score
+
+        final_scores = await asyncio.gather(
+            *[
+                get_interaction_score(unit_interaction)
+                for unit_interaction in unit_interactions
+            ]
+        )
+
+        return final_scores
+
+    def _get_contextual_recall_scores(
+        self,
+        unit_interactions: List[List[Turn]],
+        _expected_outcome: str,
+        multimodal: bool,
+    ):
+        interaction_scores = []
+
+        for unit_interaction in unit_interactions:
+            retrieval_context = []
+            expected_outcome = (
+                f"Expected Assistant Message: \n{_expected_outcome}"
+            )
+            for turn in unit_interaction:
+                if turn.role == "assistant":
+                    retrieval_context.extend(turn.retrieval_context)
+
+            verdicts = self._generate_verdicts(
+                expected_outcome, retrieval_context, multimodal
+            )
+            score, reason = self._get_interaction_score_and_reason(
+                expected_outcome, verdicts, multimodal
+            )
+            interaction_score = InteractionContextualRecallScore(
+                score=score,
+                reason=reason,
+                verdicts=verdicts,
+            )
+            interaction_scores.append(interaction_score)
+
+        return interaction_scores
+
+    async def _a_generate_verdicts(
+        self,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualRecallVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRecallVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualRecallVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRecallVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    async def _a_get_interaction_score_and_reason(
+        self,
+        expected_outcome: str,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return 1, None
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = await self._a_get_interaction_reason(
+            expected_outcome, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _get_interaction_score_and_reason(
+        self,
+        expected_outcome: str,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return 1, None
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = self._get_interaction_reason(
+            expected_outcome, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _calculate_interaction_score(
+        self, verdicts: List[ContextualRecallVerdict]
+    ) -> float:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        attributable_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "yes":
+                attributable_count += 1
+
+        score = attributable_count / number_of_verdicts
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_get_interaction_reason(
+        self,
+        expected_outcome: str,
+        score: float,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for verdict in verdicts:
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(verdict.reason)
+            else:
+                unsupportive_reasons.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            expected_outcome=expected_outcome,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ContextualRecallScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRecallScoreReason = await self.model.a_generate(
+                    prompt, schema=ContextualRecallScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(
+        self,
+        expected_outcome: str,
+        score: float,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for verdict in verdicts:
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(verdict.reason)
+            else:
+                unsupportive_reasons.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            expected_outcome=expected_outcome,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ContextualRecallScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRecallScoreReason = self.model.generate(
+                    prompt, schema=ContextualRecallScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionContextualRecallScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Interaction {index + 1} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> str:
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Contextual Recall"