deepeval-3.7.4-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py (new file)
@@ -0,0 +1,563 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+import itertools
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    get_turns_in_sliding_window,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_contextual_recall.template import (
+    TurnContextualRecallTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_contextual_recall.schema import (
+    ContextualRecallVerdict,
+    Verdicts,
+    ContextualRecallScoreReason,
+    InteractionContextualRecallScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnContextualRecallMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+        TurnParams.EXPECTED_OUTCOME,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        window_size: int = 10,
+        evaluation_template: Type[
+            TurnContextualRecallTemplate
+        ] = TurnContextualRecallTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.window_size = window_size
+        self.evaluation_template = evaluation_template
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_recall_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_recall_scores(
+                        window, test_case.multimodal, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_contextual_recall_scores(
+        self,
+        turns_window: List[Turn],
+        expected_outcome: str,
+        multimodal: bool,
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = await self._a_generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    def _get_contextual_recall_scores(
+        self,
+        turns_window: List[Turn],
+        expected_outcome: str,
+        multimodal: bool,
+    ):
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        verdicts = self._generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    async def _a_generate_verdicts(
+        self,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualRecallVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRecallVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self,
+        expected_outcome: str,
+        retrieval_context: List[str],
+        multimodal: bool,
+    ) -> List[ContextualRecallVerdict]:
+        if len(retrieval_context) == 0:
+            return []
+
+        verdicts: List[ContextualRecallVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            expected_outcome=expected_outcome,
+            retrieval_context=retrieval_context,
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    async def _a_get_interaction_score_and_reason(
+        self,
+        expected_outcome: str,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = await self._a_get_interaction_reason(
+            expected_outcome, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _get_interaction_score_and_reason(
+        self,
+        expected_outcome: str,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> Tuple[float, str]:
+        if len(verdicts) == 0:
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
+
+        score = self._calculate_interaction_score(verdicts)
+        reason = self._get_interaction_reason(
+            expected_outcome, score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    def _calculate_interaction_score(
+        self, verdicts: List[ContextualRecallVerdict]
+    ) -> float:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        attributable_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "yes":
+                attributable_count += 1
+
+        score = attributable_count / number_of_verdicts
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_get_interaction_reason(
+        self,
+        expected_outcome: str,
+        score: float,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for verdict in verdicts:
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(verdict.reason)
+            else:
+                unsupportive_reasons.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            expected_outcome=expected_outcome,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ContextualRecallScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRecallScoreReason = await self.model.a_generate(
+                    prompt, schema=ContextualRecallScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(
+        self,
+        expected_outcome: str,
+        score: float,
+        verdicts: List[ContextualRecallVerdict],
+        multimodal: bool,
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        # Prepare verdicts with node information for reasoning
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for verdict in verdicts:
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(verdict.reason)
+            else:
+                unsupportive_reasons.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            expected_outcome=expected_outcome,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=ContextualRecallScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: ContextualRecallScoreReason = self.model.generate(
+                    prompt, schema=ContextualRecallScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionContextualRecallScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Window {index + 1} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionContextualRecallScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Contextual Recall"
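For orientation, here is a minimal, hypothetical usage sketch of the new metric. The constructor arguments and measure() call follow the code above; the ConversationalTestCase and Turn keyword arguments are assumptions inferred from the attributes the metric reads (turns, expected_outcome, role, content, retrieval_context), not taken from this diff.

# Hypothetical usage sketch (not part of the diff).
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_contextual_recall.turn_contextual_recall import (
    TurnContextualRecallMetric,
)

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="When was my order shipped?"),
        Turn(
            role="assistant",
            content="It shipped on March 3rd.",
            retrieval_context=["Order #1234 shipped on March 3rd."],
        ),
    ],
    expected_outcome="The user learns the shipping date of their order.",
)

# threshold and window_size mirror the defaults in __init__ above
metric = TurnContextualRecallMetric(threshold=0.5, window_size=10)
score = metric.measure(test_case)  # mean of per-window recall scores
print(score, metric.reason)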
deepeval/metrics/turn_contextual_relevancy/schema.py (renamed from deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py)
@@ -12,5 +12,11 @@ class ContextualRelevancyVerdicts(BaseModel):
     verdicts: List[ContextualRelevancyVerdict]
 
 
-class
+class ContextualRelevancyScoreReason(BaseModel):
     reason: str
+
+
+class InteractionContextualRelevancyScore(BaseModel):
+    score: float
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRelevancyVerdict]]
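The renamed schema now also carries a per-interaction score container. A minimal sketch of constructing it, assuming the import path follows the rename shown in the file list above:

# Hypothetical example (not part of the diff); the import path is assumed
# from the multimodal_contextual_relevancy -> turn_contextual_relevancy rename.
from deepeval.metrics.turn_contextual_relevancy.schema import (
    InteractionContextualRelevancyScore,
)

interaction = InteractionContextualRelevancyScore(
    score=0.75,
    reason="Most statements in the retrieval context address the user's input.",
    verdicts=None,  # reason and verdicts are Optional in the schema above
)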