deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/indicator.py
CHANGED
@@ -10,10 +10,9 @@ from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
-from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
@@ -74,8 +73,8 @@ def metric_progress_indicator(
 async def measure_metric_task(
     task_id,
     progress,
-    metric: Union[BaseMetric,
-    test_case: Union[LLMTestCase,
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -156,10 +155,8 @@ async def measure_metric_task(


 async def measure_metrics_with_indicator(
-    metrics: List[
-    ],
-    test_case: Union[LLMTestCase, MLLMTestCase, ConversationalTestCase],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
+    test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -238,8 +235,8 @@ async def measure_metrics_with_indicator(


 async def safe_a_measure(
-    metric: Union[BaseMetric,
-    tc: Union[LLMTestCase,
+    metric: Union[BaseMetric, BaseConversationalMetric],
+    tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     ignore_errors: bool,
     skip_on_missing_params: bool,
     progress: Optional[Progress] = None,
deepeval/metrics/json_correctness/json_correctness.py
CHANGED
@@ -11,7 +11,8 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
     initialize_model,
-
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -46,6 +47,7 @@ class JsonCorrectnessMetric(BaseMetric):
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema
+        self.evaluation_model = self.model.get_model_name()

     def measure(
         self,
@@ -55,7 +57,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -77,7 +88,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError
+            except ValidationError:
                 valid_json = False

             self.score = 1 if valid_json else 0
@@ -106,7 +117,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,7 +140,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError
+            except ValidationError:
                 valid_json = False

             self.score = 1 if valid_json else 0
@@ -156,22 +176,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )
-        try:
-            res: JsonCorrectnessScoreReason = await self.model.a_generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def generate_reason(self, actual_output: str) -> str:
         if self.include_reason is False:
@@ -189,22 +200,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )
-        try:
-            res: JsonCorrectnessScoreReason = self.model.generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
@@ -212,7 +214,7 @@ class JsonCorrectnessMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
             return self.success
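The refactor shown above for JsonCorrectnessMetric repeats across most metric files in this release: the per-metric try/except TypeError blocks (schema-constrained generation with a trimAndLoadJson fallback) are replaced by shared generate_with_schema_and_extract and a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The helpers' real implementation is not part of this diff; the sketch below only illustrates the pattern the call sites imply, with a hypothetical body and without the native-model cost bookkeeping the real helpers presumably centralize.

import json
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Illustrative body only; mirrors the try/except TypeError pattern this diff removes.
    try:
        # Schema-aware judge models return a validated schema_cls instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without structured-output support return plain text; parse it as JSON.
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))

The async twin would do the same with await metric.model.a_generate(...), which matches how a_generate_with_schema_and_extract is awaited at the call sites above.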
deepeval/metrics/json_correctness/template.py
CHANGED
@@ -2,12 +2,22 @@ from typing import Optional


 class JsonCorrectnessTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(
         actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.

+{JsonCorrectnessTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
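The same multimodal_rules block is added to KnowledgeRetentionTemplate further down and, per the file list, to most other template modules in this release. The pattern is a class-level constant interpolated into each judge prompt; the snippet below is a stripped-down illustration of that wiring, not the library's own template.

class ExampleTemplate:
    # Shared rules block prepended to every judge prompt that may receive images.
    multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
- Only reference visual details that are explicitly and clearly visible.
"""

    @staticmethod
    def generate_reason(actual_output: str) -> str:
        # The f-string pulls in the class attribute, so every prompt carries the rules.
        return f"""Give a reason for the score.

{ExampleTemplate.multimodal_rules}

Generated output:
{actual_output}
"""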
deepeval/metrics/knowledge_retention/knowledge_retention.py
CHANGED
@@ -5,9 +5,10 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.knowledge_retention.template import (
@@ -51,7 +52,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -101,7 +107,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -147,23 +158,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-        res: KnowledgeRetentionScoreReason = (
-            await self.model.a_generate(
-                prompt, schema=KnowledgeRetentionScoreReason
-            )
-        )
-        return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self) -> str:
         if self.include_reason is False:
@@ -178,21 +179,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-        res: KnowledgeRetentionScoreReason = self.model.generate(
-            prompt, schema=KnowledgeRetentionScoreReason
-        )
-        return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, turns: List[Turn]
@@ -205,7 +198,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -214,22 +207,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 llm_message=turns[i].content,
                 accumulated_knowledge=accumulated_knowledge,
             )
-            verdict: KnowledgeRetentionVerdict = (
-                await self.model.a_generate(
-                    prompt, schema=KnowledgeRetentionVerdict
-                )
-            )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
+            verdict = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -244,7 +228,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -254,20 +238,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 accumulated_knowledge=accumulated_knowledge,
             )

-            verdict: KnowledgeRetentionVerdict = self.model.generate(
-                prompt, schema=KnowledgeRetentionVerdict
-            )
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
+            verdict = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -289,20 +266,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in previous_turns
             ],
         )
-            knowledges[i] = await self.model.a_generate(
-                prompt, schema=Knowledge
-            )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                knowledges[i] = Knowledge(data=data)
+            knowledges[i] = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -325,20 +295,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             ],
         )

-            knowledges[i] = self.model.generate(
-                prompt, schema=Knowledge
-            )
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                knowledges[i] = Knowledge(data=data)
+            knowledges[i] = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -361,8 +324,8 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
             return self.success
deepeval/metrics/knowledge_retention/schema.py
CHANGED
@@ -1,15 +1,21 @@
-from typing import Dict, Optional,
-from pydantic import BaseModel
+from typing import Dict, Optional, Union, List
+from pydantic import BaseModel, ConfigDict


 class Knowledge(BaseModel):
-
+    # Each fact’s value is either a string or a list of strings
+    # data: Dict[str, Union[str, List[str]]]
+    data: Dict[str, Union[str, List[str]]] | None = None
+    # Forbid extra top-level fields to satisfy OpenAI’s schema requirements
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionVerdict(BaseModel):
     verdict: str
     reason: Optional[str] = None
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionScoreReason(BaseModel):
     reason: str
+    model_config = ConfigDict(extra="forbid")
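The extra="forbid" config added to these models is what the inline comment refers to: in Pydantic v2 it surfaces in the generated JSON schema as "additionalProperties": false, which OpenAI-style strict structured outputs require on every object. A standalone check of that behaviour (not deepeval code):

from typing import Dict, List, Optional, Union

from pydantic import BaseModel, ConfigDict


class Knowledge(BaseModel):
    data: Optional[Dict[str, Union[str, List[str]]]] = None
    model_config = ConfigDict(extra="forbid")


# extra="forbid" emits "additionalProperties": false in the JSON schema,
# so schema-enforcing APIs reject keys the model invents.
assert Knowledge.model_json_schema()["additionalProperties"] is False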
deepeval/metrics/knowledge_retention/template.py
CHANGED
@@ -2,10 +2,20 @@ from typing import List, Dict, Any


 class KnowledgeRetentionTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(attritions, score):
         return f"""Given a list of attritions, which highlights forgetfulness in the LLM response and knowledge established previously in the conversation, use it to CONCISELY provide a reason for the knowledge retention score. Note that The knowledge retention score ranges from 0 - 1, and the higher the better.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -33,6 +43,8 @@ JSON:

 Your task is to determine whether the LLM message **contradicts** or **forgets** any of the known facts.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 ---
 **Output format:**
deepeval/metrics/mcp/mcp_task_completion.py
CHANGED
@@ -7,8 +7,9 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
@@ -50,7 +51,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -107,7 +113,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -149,48 +160,67 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

         return self.score

-    def _generate_reason(self, task_scores: List[TaskScore]) -> str:
+    def _generate_reason(self, task_scores: List[TaskScore]) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in task_scores:
-            reason += "]"
-        return reason
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    def _get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = self.model.generate(prompt
+            res, cost = self.model.generate(prompt)
             self.evaluation_cost += cost
             return res
         else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, task_scores: List[TaskScore]
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in task_scores:
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    async def _a_get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt
+            res, cost = await self.model.a_generate(prompt)
             self.evaluation_cost += cost
             return res
         else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )
+
+    async def _a_get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )

     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -244,9 +274,9 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks

     def _calculate_score(self, scores: List[TaskScore]) -> float:
-
+        score_divisor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-        score = total_score /
+        score = total_score / score_divisor
         return 0 if self.strict_mode and score < self.threshold else score

     def is_successful(self) -> bool:
@@ -254,8 +284,8 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
             return self.success