deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
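Beyond the new turn-level metrics and the move from the optimization package to the optimizer package, the change repeated across almost every metric file above is that each metric's hand-rolled native/non-native model branching is replaced by two shared helpers imported from deepeval.metrics.utils: generate_with_schema_and_extract and a_generate_with_schema_and_extract. The contextual relevancy diff below shows the pattern. The helpers' bodies are not part of this diff; the sketch below is only an illustration of the behavior they centralize, reconstructed from the per-metric branching they replace, and it uses plain json.loads for self-containment where deepeval itself most likely keeps its trimAndLoadJson parsing.

import json
from typing import Any, Callable, Type


# Illustrative sketch only, not the deepeval implementation. The signature mirrors
# the call sites visible in the contextual_relevancy diff below.
def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[Any],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost) and support structured output directly.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return an already-parsed object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: fall back to raw text plus JSON parsing
        # (deepeval's own fallback parses with trimAndLoadJson rather than json.loads).
        data = json.loads(metric.model.generate(prompt))
        return extract_json(data)

The a_-prefixed variant has the same shape, awaiting metric.model.a_generate(...) instead.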
deepeval/metrics/contextual_relevancy/contextual_relevancy.py

@@ -1,12 +1,16 @@
 from typing import Optional, List, Type, Union
 import asyncio
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -18,7 +22,10 @@ from deepeval.metrics.contextual_relevancy.template import (
     ContextualRelevancyTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.contextual_relevancy.schema import ContextualRelevancyVerdicts, ContextualRelevancyScoreReason
+from deepeval.metrics.contextual_relevancy.schema import (
+    ContextualRelevancyVerdicts,
+    ContextualRelevancyScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -57,7 +64,17 @@ class ContextualRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -74,12 +91,16 @@ class ContextualRelevancyMetric(BaseMetric):
                     )
                 )
             else:
+
+                input = test_case.input
+                retrieval_context = test_case.retrieval_context
+
                 self.verdicts_list: List[ContextualRelevancyVerdicts] = [
-                    (self._generate_verdicts(test_case.input, context))
-                    for context in test_case.retrieval_context
+                    (self._generate_verdicts(input, context, multimodal))
+                    for context in retrieval_context
                 ]
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
+                self.reason = self._generate_reason(input, multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -103,7 +124,17 @@ class ContextualRelevancyMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
    ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -112,16 +143,19 @@ class ContextualRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            retrieval_context = test_case.retrieval_context
+
             self.verdicts_list: List[ContextualRelevancyVerdicts] = (
                 await asyncio.gather(
                     *[
-                        self._a_generate_verdicts(test_case.input, context)
-                        for context in test_case.retrieval_context
+                        self._a_generate_verdicts(input, context, multimodal)
+                        for context in retrieval_context
                     ]
                 )
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -136,7 +170,7 @@ class ContextualRelevancyMetric(BaseMetric):
             )
             return self.score
 
-    async def _a_generate_reason(self, input: str):
+    async def _a_generate_reason(self, input: str, multimodal: bool):
         if self.include_reason is False:
             return None
 
@@ -154,27 +188,18 @@ class ContextualRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             relevant_statements=relevant_statements,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualRelevancyScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-    def _generate_reason(self, input: str):
+    def _generate_reason(self, input: str, multimodal: bool):
         if self.include_reason is False:
             return None
 
@@ -192,23 +217,16 @@ class ContextualRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             relevant_statements=relevant_statements,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda score_reason: score_reason.reason,
+            extract_json=lambda data: data["reason"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = self.model.generate(
-                    prompt, schema=ContextualRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
     def _calculate_score(self):
         total_verdicts = 0
@@ -226,50 +244,34 @@ class ContextualRelevancyMetric(BaseMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     async def _a_generate_verdicts(
-        self, input: str, context: List[str]
+        self, input: str, context: List[str], multimodal: bool
     ) -> ContextualRelevancyVerdicts:
         prompt = self.evaluation_template.generate_verdicts(
-            input=input, context=context
+            input=input, context=context, multimodal=multimodal
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ContextualRelevancyVerdicts(**data),
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ContextualRelevancyVerdicts(**data)
 
     def _generate_verdicts(
-        self, input: str, context: str
+        self, input: str, context: str, multimodal: bool
     ) -> ContextualRelevancyVerdicts:
         prompt = self.evaluation_template.generate_verdicts(
-            input=input, context=context
+            input=input, context=context, multimodal=multimodal
+        )
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda r: r,
+            extract_json=lambda data: ContextualRelevancyVerdicts(**data),
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = self.model.generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ContextualRelevancyVerdicts(**data)
 
     def is_successful(self) -> bool:
         if self.error is not None:
@@ -277,7 +279,7 @@ class ContextualRelevancyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/contextual_relevancy/template.py

@@ -1,4 +1,5 @@
-from typing import List
+from typing import List, Union
+import textwrap
 
 
 class ContextualRelevancyTemplate:
@@ -8,70 +9,98 @@ class ContextualRelevancyTemplate:
         irrelevant_statements: List[str],
         relevant_statements: List[str],
         score: float,
+        multimodal: bool = False,
     ):
-        return f"""Based on the given input, reasons for why the retrieval context is irrelevant to the input, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
-In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
+        # Note: irrelevancies parameter name in multimodal version is kept as irrelevant_statements for consistency
+        return textwrap.dedent(
+            f"""Based on the given input, reasons for why the retrieval context is irrelevant to the input, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
+            In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
 
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
-}}
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+            }}
 
-If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-**
+            If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+            **
 
 
-Contextual Relevancy Score:
-{score}
+            Contextual Relevancy Score:
+            {score}
 
-Input:
-{input}
+            Input:
+            {input}
+
+            Reasons for why the retrieval context is irrelevant to the input:
+            {irrelevant_statements}
 
-Reasons for why the retrieval context is irrelevant to the input:
-{irrelevant_statements}
+            Statement in the retrieval context that is relevant to the input:
+            {relevant_statements}
 
-Statement in the retrieval context that is relevant to the input:
-{relevant_statements}
-
-JSON:
-"""
+            JSON:
+            """
+        )
 
     @staticmethod
-    def generate_verdicts(
-        [... 35 further lines of the previous generate_verdicts implementation were not preserved in this diff rendering ...]
+    def generate_verdicts(
+        input: str,
+        context: str,
+        multimodal: bool = False,
+    ):
+        context_type = "context (image or string)" if multimodal else "context"
+        statement_or_image = "statement or image" if multimodal else "statement"
+
+        # Conditional instructions based on mode
+        extraction_instructions = ""
+        if multimodal:
+            extraction_instructions = textwrap.dedent(
+                """
+                If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
+                If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
+                """
+            ).strip()
+        else:
+            extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."
+
+        # Additional instruction for empty context (only in non-multimodal)
+        empty_context_instruction = ""
+        if not multimodal:
+            empty_context_instruction = '\nIf provided context contains no actual content or statements then: give "no" as a "verdict",\nput context into "statement", and "No statements found in provided context." into "reason".'
+
+        return textwrap.dedent(
+            f"""Based on the input and {context_type}, please generate a JSON object to indicate whether {'the context' if multimodal else 'each statement found in the context'} is relevant to the provided input. The JSON will be a list of 'verdicts', with 2 mandatory fields: 'verdict' and 'statement', and 1 optional field: 'reason'.
+            {extraction_instructions}
+            The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the {statement_or_image} is relevant to the input.
+            Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the {statement_or_image} to back up your reason.{empty_context_instruction}
+            **
+            IMPORTANT: Please make sure to only return in JSON format.
+            Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
+            Example Input: "What were some of Einstein's achievements?"
+
+            Example:
+            {{
+                "verdicts": [
+                    {{
+                        "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "statement": "There was a cat.",
+                        "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
+                        "verdict": "no"
+                    }}
+                ]
+            }}
+            **
+
+            Input:
+            {input}
+
+            Context:
+            {context}
+
+            JSON:
+            """
+        )