deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/knowledge_retention/knowledge_retention.py

@@ -5,9 +5,10 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.knowledge_retention.template import (
@@ -51,7 +52,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -101,7 +107,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -147,23 +158,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-
-
-
-
-
-
-
-            res: KnowledgeRetentionScoreReason = (
-                await self.model.a_generate(
-                    prompt, schema=KnowledgeRetentionScoreReason
-                )
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self) -> str:
         if self.include_reason is False:
@@ -178,21 +179,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
-
-
-
-
-
-
-
-            res: KnowledgeRetentionScoreReason = self.model.generate(
-                prompt, schema=KnowledgeRetentionScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, turns: List[Turn]
@@ -205,7 +198,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -214,22 +207,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 llm_message=turns[i].content,
                 accumulated_knowledge=accumulated_knowledge,
             )
-
-
-
-
-
-
-
-                verdict: KnowledgeRetentionVerdict = (
-                    await self.model.a_generate(
-                        prompt, schema=KnowledgeRetentionVerdict
-                    )
-                )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
+            verdict = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -244,7 +228,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -254,20 +238,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 accumulated_knowledge=accumulated_knowledge,
             )

-
-
-
-
-
-
-
-                verdict: KnowledgeRetentionVerdict = self.model.generate(
-                    prompt, schema=KnowledgeRetentionVerdict
-                )
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdict = KnowledgeRetentionVerdict(**data)
+            verdict = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -289,20 +266,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 convert_turn_to_dict(turn) for turn in previous_turns
             ],
         )
-
-
-
-
-
-
-
-            knowledges[i] = await self.model.a_generate(
-                prompt, schema=Knowledge
-            )
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            knowledges[i] = Knowledge(data=data)
+            knowledges[i] = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -325,20 +295,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             ],
         )

-
-
-
-
-
-
-
-            knowledges[i] = self.model.generate(
-                prompt, schema=Knowledge
-            )
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            knowledges[i] = Knowledge(data=data)
+            knowledges[i] = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -361,8 +324,8 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
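The repeated try/except `TypeError` blocks removed above are replaced by two shared helpers imported from `deepeval.metrics.utils`: `generate_with_schema_and_extract` and its async counterpart `a_generate_with_schema_and_extract`. Their implementation is not part of this diff, so the sketch below is only an illustration inferred from the call sites (keyword arguments `metric`, `prompt`, `schema_cls`, `extract_schema`, `extract_json`) and from the fallback logic of the code they replace; it is not the actual deepeval source.

```python
# Illustrative sketch only - inferred from the call sites above, not copied from deepeval.
import json
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract_sketch(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        # Models that accept a `schema` kwarg return a validated pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support return raw text: parse it as JSON
        # and let the caller build the result (the role trimAndLoadJson played in
        # the removed code).
        raw = metric.model.generate(prompt)
        return extract_json(json.loads(raw))
```

At the call sites above, `extract_schema` receives the parsed pydantic object and `extract_json` the fallback dict, which keeps each metric's extraction logic down to a one-line lambda instead of a duplicated try/except block.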
deepeval/metrics/knowledge_retention/schema.py

@@ -1,15 +1,21 @@
-from typing import Dict, Optional,
-from pydantic import BaseModel
+from typing import Dict, Optional, Union, List
+from pydantic import BaseModel, ConfigDict


 class Knowledge(BaseModel):
-
+    # Each fact’s value is either a string or a list of strings
+    # data: Dict[str, Union[str, List[str]]]
+    data: Dict[str, Union[str, List[str]]] | None = None
+    # Forbid extra top-level fields to satisfy OpenAI’s schema requirements
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionVerdict(BaseModel):
     verdict: str
     reason: Optional[str] = None
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionScoreReason(BaseModel):
     reason: str
+    model_config = ConfigDict(extra="forbid")
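The schema change is small but load-bearing: `data` becomes optional, and every model now forbids extra fields. With pydantic v2, `ConfigDict(extra="forbid")` both rejects unexpected keys at validation time and emits `"additionalProperties": false` in the generated JSON schema, which is what strict structured-output modes (such as OpenAI's, as the new inline comment notes) require. A quick self-contained check of that behavior, using the same field definition as the diff:

```python
from typing import Dict, List, Union

from pydantic import BaseModel, ConfigDict, ValidationError


class Knowledge(BaseModel):
    # Mirrors the diffed model: optional mapping of facts, no extra keys allowed.
    data: Dict[str, Union[str, List[str]]] | None = None
    model_config = ConfigDict(extra="forbid")


# The generated JSON schema now disallows unknown keys...
print(Knowledge.model_json_schema()["additionalProperties"])  # False

# ...and validation rejects them outright.
try:
    Knowledge(data={"topic": "retention"}, unexpected="field")
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # extra_forbidden
```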
deepeval/metrics/knowledge_retention/template.py

@@ -2,10 +2,20 @@ from typing import List, Dict, Any


 class KnowledgeRetentionTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(attritions, score):
         return f"""Given a list of attritions, which highlights forgetfulness in the LLM response and knowledge established previously in the conversation, use it to CONCISELY provide a reason for the knowledge retention score. Note that The knowledge retention score ranges from 0 - 1, and the higher the better.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -33,6 +43,8 @@ JSON:

 Your task is to determine whether the LLM message **contradicts** or **forgets** any of the known facts.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 ---
 **Output format:**
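The template change is plain string composition: a class-level `multimodal_rules` block is written once and interpolated into each prompt builder through its f-string, so every prompt for this metric carries the same instructions for image inputs. Stripped down to just that pattern (illustrative names, not the full deepeval template):

```python
class ExampleTemplate:
    # Shared rule block, defined once at class level.
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Only reference visual details that are explicitly and clearly visible.
    """

    @staticmethod
    def generate_reason(score: float) -> str:
        # Interpolated into each prompt builder via an f-string.
        return f"""Provide a concise reason for the score {score:.2f}.

{ExampleTemplate.multimodal_rules}

Return JSON with a 'reason' key."""


print(ExampleTemplate.generate_reason(0.8))
```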
deepeval/metrics/mcp/mcp_task_completion.py

@@ -7,13 +7,14 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.mcp.schema import Task, TaskScore
+from deepeval.metrics.mcp.schema import Task, TaskScore, Reason
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics.api import metric_data_manager
@@ -50,7 +51,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -107,7 +113,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -149,48 +160,66 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

         return self.score

-    def _generate_reason(self, task_scores: List[TaskScore]) -> str:
-
+    def _generate_reason(self, task_scores: List[TaskScore]) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in task_scores:
-
-
-
-
-
-
-
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    async def _a_generate_reason(
+        self, task_scores: List[TaskScore]
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in task_scores:
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_task_score(self, task: Task) -> TaskScore:
         prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
-
-
-
-
-
-
-
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return TaskScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )

     async def _a_get_task_score(self, task: Task) -> TaskScore:
         prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
-
-
-
-
-
-
-
-            prompt, schema=TaskScore
-        )
-        return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return TaskScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )

     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -244,9 +273,9 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks

     def _calculate_score(self, scores: List[TaskScore]) -> float:
-
+        score_divisor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-        score = total_score /
+        score = total_score / score_divisor
         return 0 if self.strict_mode and score < self.threshold else score

     def is_successful(self) -> bool:
@@ -254,8 +283,8 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
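Beyond the helper migration, the last two hunks fix two small logic bugs: `_calculate_score` now guards the division when the task-score list is empty, and `is_successful` actually assigns the threshold comparison instead of evaluating and discarding it (the bare `except:` is also narrowed to `TypeError`). The same corrections, reduced to standalone functions for illustration (simplified signatures, not the metric class itself):

```python
from typing import List, Optional


def calculate_score(scores: List[float], threshold: float, strict_mode: bool) -> float:
    # Guard against an empty score list instead of dividing by zero.
    score_divisor = len(scores) if len(scores) > 0 else 1
    score = sum(scores) / score_divisor
    return 0 if strict_mode and score < threshold else score


def is_successful(score: Optional[float], threshold: float, error: Optional[str]) -> bool:
    if error is not None:
        return False
    try:
        # Previously the comparison result was computed but never assigned.
        return score >= threshold
    except TypeError:
        # score is None when no evaluation result exists.
        return False


assert calculate_score([], threshold=0.5, strict_mode=False) == 0
assert is_successful(None, threshold=0.5, error=None) is False
assert is_successful(0.9, threshold=0.5, error=None) is True
```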