deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py (new file)

@@ -0,0 +1,550 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio

from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
    get_or_create_event_loop,
    prettify_list,
)
from deepeval.metrics.utils import (
    construct_verbose_logs,
    trimAndLoadJson,
    check_conversational_test_case_params,
    get_unit_interactions,
    initialize_model,
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.turn_contextual_precision.template import (
    TurnContextualPrecisionTemplate,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.turn_contextual_precision.schema import (
    ContextualPrecisionVerdict,
    Verdicts,
    ContextualPrecisionScoreReason,
    InteractionContextualPrecisionScore,
)
from deepeval.metrics.api import metric_data_manager


class TurnContextualPrecisionMetric(BaseConversationalMetric):
    _required_test_case_params: List[TurnParams] = [
        TurnParams.CONTENT,
        TurnParams.RETRIEVAL_CONTEXT,
        TurnParams.EXPECTED_OUTCOME,
    ]

    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        evaluation_template: Type[
            TurnContextualPrecisionTemplate
        ] = TurnContextualPrecisionTemplate,
    ):
        self.threshold = 1 if strict_mode else threshold
        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.include_reason = include_reason
        self.async_mode = async_mode
        self.strict_mode = strict_mode
        self.verbose_mode = verbose_mode
        self.evaluation_template = evaluation_template

    def measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case,
            self._required_test_case_params,
            self,
            False,
            self.model,
            test_case.multimodal,
        )

        multimodal = test_case.multimodal

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_measure(
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
                        _log_metric_to_confident=_log_metric_to_confident,
                    )
                )
            else:
                unit_interactions = get_unit_interactions(test_case.turns)
                scores = self._get_contextual_precision_scores(
                    unit_interactions, test_case.expected_outcome, multimodal
                )
                self.score = self._calculate_score(scores)
                self.success = self.score >= self.threshold
                self.reason = self._generate_reason(scores)
                verbose_steps = self._get_verbose_steps(scores)
                self.verbose_logs = construct_verbose_logs(
                    self,
                    steps=[
                        *verbose_steps,
                        f"Final Score: {self.score}\n",
                        f"Final Reason: {self.reason}\n",
                    ],
                )
                if _log_metric_to_confident:
                    metric_data_manager.post_metric_if_enabled(
                        self, test_case=test_case
                    )

            return self.score

    async def a_measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        check_conversational_test_case_params(
            test_case,
            self._required_test_case_params,
            self,
            False,
            self.model,
            test_case.multimodal,
        )

        multimodal = test_case.multimodal

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        ):
            unit_interactions = get_unit_interactions(test_case.turns)
            scores = await self._a_get_contextual_precision_scores(
                unit_interactions, test_case.expected_outcome, multimodal
            )
            self.score = self._calculate_score(scores)
            self.success = self.score >= self.threshold
            self.reason = await self._a_generate_reason(scores)
            verbose_steps = self._get_verbose_steps(scores)
            self.verbose_logs = construct_verbose_logs(
                self,
                steps=[
                    *verbose_steps,
                    f"Final Score: {self.score}\n",
                    f"Final Reason: {self.reason}\n",
                ],
            )
            if _log_metric_to_confident:
                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    async def _a_get_contextual_precision_scores(
        self,
        unit_interactions: List[List[Turn]],
        _expected_outcome: str,
        multimodal: bool,
    ):
        async def get_interaction_score(unit_interaction: List[Turn]):
            user_content = "User Message: "
            retrieval_context = []
            expected_outcome = (
                f"Expected Assistant Message: \n{_expected_outcome}"
            )
            for turn in unit_interaction:
                if turn.role == "user":
                    user_content += f"\n{turn.content} "
                else:
                    retrieval_context.extend(turn.retrieval_context)

            verdicts = await self._a_generate_verdicts(
                user_content, expected_outcome, retrieval_context, multimodal
            )
            score, reason = await self._a_get_interaction_score_and_reason(
                user_content, verdicts, multimodal
            )
            interaction_score = InteractionContextualPrecisionScore(
                score=score,
                reason=reason,
                verdicts=verdicts,
            )
            return interaction_score

        final_scores = await asyncio.gather(
            *[
                get_interaction_score(unit_interaction)
                for unit_interaction in unit_interactions
            ]
        )

        return final_scores

    def _get_contextual_precision_scores(
        self,
        unit_interactions: List[List[Turn]],
        _expected_outcome: str,
        multimodal: bool,
    ):
        interaction_scores = []

        for unit_interaction in unit_interactions:
            user_content = "User Message: "
            retrieval_context = []
            expected_outcome = (
                f"Expected Assistant Message: \n{_expected_outcome}"
            )
            for turn in unit_interaction:
                if turn.role == "user":
                    user_content += f"\n{turn.content} "
                else:
                    retrieval_context.extend(turn.retrieval_context)

            verdicts = self._generate_verdicts(
                user_content, expected_outcome, retrieval_context, multimodal
            )
            score, reason = self._get_interaction_score_and_reason(
                user_content, verdicts, multimodal
            )
            interaction_score = InteractionContextualPrecisionScore(
                score=score,
                reason=reason,
                verdicts=verdicts,
            )
            interaction_scores.append(interaction_score)

        return interaction_scores

    async def _a_generate_verdicts(
        self,
        input: str,
        expected_outcome: str,
        retrieval_context: List[str],
        multimodal: bool,
    ) -> List[ContextualPrecisionVerdict]:
        if len(retrieval_context) == 0:
            return []

        verdicts: List[ContextualPrecisionVerdict] = []

        prompt = self.evaluation_template.generate_verdicts(
            input=input,
            expected_outcome=expected_outcome,
            retrieval_context=retrieval_context,
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
            self.evaluation_cost += cost
            verdicts = [item for item in res.verdicts]
            return verdicts
        else:
            try:
                res: Verdicts = await self.model.a_generate(
                    prompt, schema=Verdicts
                )
                verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                verdicts = [
                    ContextualPrecisionVerdict(**item)
                    for item in data["verdicts"]
                ]
                return verdicts

    def _generate_verdicts(
        self,
        input: str,
        expected_outcome: str,
        retrieval_context: List[str],
        multimodal: bool,
    ) -> List[ContextualPrecisionVerdict]:
        if len(retrieval_context) == 0:
            return []

        verdicts: List[ContextualPrecisionVerdict] = []

        prompt = self.evaluation_template.generate_verdicts(
            input=input,
            expected_outcome=expected_outcome,
            retrieval_context=retrieval_context,
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=Verdicts)
            self.evaluation_cost += cost
            verdicts = [item for item in res.verdicts]
            return verdicts
        else:
            try:
                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
                verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                verdicts = [
                    ContextualPrecisionVerdict(**item)
                    for item in data["verdicts"]
                ]
                return verdicts

    async def _a_get_interaction_score_and_reason(
        self,
        input: str,
        verdicts: List[ContextualPrecisionVerdict],
        multimodal: bool,
    ) -> Tuple[float, str]:
        if len(verdicts) == 0:
            return 1, None

        score = self._calculate_interaction_score(verdicts)
        reason = await self._a_get_interaction_reason(
            input, score, verdicts, multimodal
        )
        return (
            (0, reason)
            if self.strict_mode and score < self.threshold
            else (score, reason)
        )

    def _get_interaction_score_and_reason(
        self,
        input: str,
        verdicts: List[ContextualPrecisionVerdict],
        multimodal: bool,
    ) -> Tuple[float, str]:
        if len(verdicts) == 0:
            return 1, None

        score = self._calculate_interaction_score(verdicts)
        reason = self._get_interaction_reason(
            input, score, verdicts, multimodal
        )
        return (
            (0, reason)
            if self.strict_mode and score < self.threshold
            else (score, reason)
        )

    def _calculate_interaction_score(
        self, verdicts: List[ContextualPrecisionVerdict]
    ) -> float:
        number_of_verdicts = len(verdicts)
        if number_of_verdicts == 0:
            return 0

        # Convert verdicts to binary list where 'yes' is 1 and others are 0
        node_verdicts = [
            1 if v.verdict.strip().lower() == "yes" else 0 for v in verdicts
        ]

        sum_weighted_precision_at_k = 0.0
        relevant_nodes_count = 0

        for k, is_relevant in enumerate(node_verdicts, start=1):
            # If the item is relevant, update the counter and add weighted precision to sum
            if is_relevant:
                relevant_nodes_count += 1
                precision_at_k = relevant_nodes_count / k
                sum_weighted_precision_at_k += precision_at_k * is_relevant

        if relevant_nodes_count == 0:
            return 0

        # Calculate Average Precision
        score = sum_weighted_precision_at_k / relevant_nodes_count
        return 0 if self.strict_mode and score < self.threshold else score

    async def _a_get_interaction_reason(
        self,
        input: str,
        score: float,
        verdicts: List[ContextualPrecisionVerdict],
        multimodal: bool,
    ) -> str:
        if self.include_reason is False:
            return None

        # Prepare verdicts with node information for reasoning
        verdicts_with_nodes = []
        for i, verdict in enumerate(verdicts):
            verdicts_with_nodes.append(
                {
                    "verdict": verdict.verdict,
                    "reason": verdict.reason,
                    "node": f"Node {i + 1}",
                }
            )

        prompt = self.evaluation_template.generate_reason(
            input=input,
            score=format(score, ".2f"),
            verdicts=verdicts_with_nodes,
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(
                prompt, schema=ContextualPrecisionScoreReason
            )
            self.evaluation_cost += cost
            return res.reason
        else:
            try:
                res: ContextualPrecisionScoreReason = (
                    await self.model.a_generate(
                        prompt, schema=ContextualPrecisionScoreReason
                    )
                )
                return res.reason
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    def _get_interaction_reason(
        self,
        input: str,
        score: float,
        verdicts: List[ContextualPrecisionVerdict],
        multimodal: bool,
    ) -> str:
        if self.include_reason is False:
            return None

        # Prepare verdicts with node information for reasoning
        verdicts_with_nodes = []
        for i, verdict in enumerate(verdicts):
            verdicts_with_nodes.append(
                {
                    "verdict": verdict.verdict,
                    "reason": verdict.reason,
                    "node": f"Node {i + 1}",
                }
            )

        prompt = self.evaluation_template.generate_reason(
            input=input,
            score=format(score, ".2f"),
            verdicts=verdicts_with_nodes,
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = self.model.generate(
                prompt, schema=ContextualPrecisionScoreReason
            )
            self.evaluation_cost += cost
            return res.reason
        else:
            try:
                res: ContextualPrecisionScoreReason = self.model.generate(
                    prompt, schema=ContextualPrecisionScoreReason
                )
                return res.reason
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    def _get_verbose_steps(
        self, interaction_scores: List[InteractionContextualPrecisionScore]
    ):
        steps = []
        for index, interaction_score in enumerate(interaction_scores):
            interaction_steps = [
                f"Interaction {index + 1} \n",
                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                f"Score: {interaction_score.score} \n",
                f"Reason: {interaction_score.reason} \n",
            ]
            steps.extend(interaction_steps)
        return steps

    def _generate_reason(
        self, scores: List[InteractionContextualPrecisionScore]
    ) -> str:
        reasons = []
        for score in scores:
            reasons.append(score.reason)

        prompt = self.evaluation_template.generate_final_reason(
            self.score, self.success, reasons
        )

        if self.using_native_model:
            res, cost = self.model.generate(prompt)
            self.evaluation_cost += cost
            return res
        else:
            res = self.model.generate(prompt)
            return res

    async def _a_generate_reason(
        self, scores: List[InteractionContextualPrecisionScore]
    ) -> str:
        reasons = []
        for score in scores:
            reasons.append(score.reason)

        prompt = self.evaluation_template.generate_final_reason(
            self.score, self.success, reasons
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt)
            self.evaluation_cost += cost
            return res
        else:
            res = await self.model.a_generate(prompt)
            return res

    def _calculate_score(
        self, scores: List[InteractionContextualPrecisionScore]
    ) -> float:
        number_of_scores = len(scores)
        if number_of_scores == 0:
            return 1
        total_score = 0
        for score in scores:
            total_score += score.score
        return total_score / number_of_scores

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Turn Contextual Precision"
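A minimal usage sketch (not part of the diff) follows, assuming the Turn and ConversationalTestCase constructor keywords mirror the attributes the metric reads above (role, content, retrieval_context, turns, expected_outcome); the conversation content and threshold are illustrative:

    from deepeval.test_case import ConversationalTestCase, Turn
    from deepeval.metrics.turn_contextual_precision.turn_contextual_precision import (
        TurnContextualPrecisionMetric,
    )

    # Hypothetical conversation; field names follow the attributes referenced in the diff.
    test_case = ConversationalTestCase(
        turns=[
            Turn(role="user", content="What is your refund policy?"),
            Turn(
                role="assistant",
                content="You can request a refund within 30 days of purchase.",
                # Nodes are judged for relevance in retrieval order.
                retrieval_context=[
                    "Refunds are accepted within 30 days of purchase.",
                    "Our offices are closed on public holidays.",
                ],
            ),
        ],
        expected_outcome="Explain the 30-day refund window.",
    )

    metric = TurnContextualPrecisionMetric(threshold=0.5)
    score = metric.measure(test_case)
    print(score, metric.reason)

Each interaction is scored by average precision over its retrieval nodes: verdicts of ["yes", "no"] give 1.0, ["no", "yes"] give (1/2)/1 = 0.5, and the metric's final value is the mean across interactions.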
deepeval/metrics/turn_contextual_recall/schema.py (new file)

@@ -0,0 +1,21 @@
from typing import List
from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):
    verdict: str
    reason: str


class Verdicts(BaseModel):
    verdicts: List[ContextualRecallVerdict]


class ContextualRecallScoreReason(BaseModel):
    reason: str


class InteractionContextualRecallScore(BaseModel):
    score: float
    reason: str
    verdicts: List[ContextualRecallVerdict]