deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
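Most of the metric-level churn in this release follows a single pattern, visible in the hunks below: each metric's hand-rolled structured-output handling (a try: self.model.generate(prompt, schema=...) call with an except TypeError: fallback to trimAndLoadJson) is replaced by two shared helpers imported from deepeval.metrics.utils, generate_with_schema_and_extract and its async twin a_generate_with_schema_and_extract. The call sites pass the metric instance, the prompt, a Pydantic schema class, and two extractors: one applied to a parsed schema object, the other to a raw JSON dict. The sketch below shows what such a helper plausibly looks like; it is inferred from the call sites and the inline code being removed in this diff, not copied from deepeval's actual implementation.

# Sketch only: the signature matches the call sites in this diff (metric=,
# prompt=, schema_cls=, extract_schema=, extract_json=); the body is an
# assumption reconstructed from the inline code being removed, not the real
# deepeval.metrics.utils implementation.
import json
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); the metric accumulates cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept `schema=` return a parsed Pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support return raw text; fall back to JSON
        # parsing (the removed code used deepeval's trimAndLoadJson here).
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))

The async variant presumably mirrors this with await metric.model.a_generate(...).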
--- a/deepeval/metrics/knowledge_retention/knowledge_retention.py
+++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py
@@ -5,9 +5,10 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
-    trimAndLoadJson,
     initialize_model,
     convert_turn_to_dict,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.knowledge_retention.template import (
@@ -51,7 +52,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -101,7 +107,12 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -147,23 +158,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
- […]
-                res: KnowledgeRetentionScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=KnowledgeRetentionScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _generate_reason(self) -> str:
         if self.include_reason is False:
@@ -178,21 +179,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             attritions=attritions,
             score=format(self.score, ".2f"),
         )
- […]
-                res: KnowledgeRetentionScoreReason = self.model.generate(
-                    prompt, schema=KnowledgeRetentionScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=KnowledgeRetentionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(
         self, turns: List[Turn]
@@ -205,7 +198,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -214,22 +207,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 llm_message=turns[i].content,
                 accumulated_knowledge=accumulated_knowledge,
             )
- […]
-                    verdict: KnowledgeRetentionVerdict = (
-                        await self.model.a_generate(
-                            prompt, schema=KnowledgeRetentionVerdict
-                        )
-                    )
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdict = KnowledgeRetentionVerdict(**data)
+            verdict = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -244,7 +228,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             accumulated_knowledge = [
                 knowledge.data
                 for knowledge in self.knowledges[:i]
-                if knowledge is not None
+                if knowledge is not None and knowledge.data
             ]
             if len(accumulated_knowledge) == 0:
                 continue
@@ -254,20 +238,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 accumulated_knowledge=accumulated_knowledge,
             )

- […]
-                    verdict: KnowledgeRetentionVerdict = self.model.generate(
-                        prompt, schema=KnowledgeRetentionVerdict
-                    )
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdict = KnowledgeRetentionVerdict(**data)
+            verdict = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=KnowledgeRetentionVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: KnowledgeRetentionVerdict(**data),
+            )
             verdicts.append(verdict)
         return verdicts

@@ -289,20 +266,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                     convert_turn_to_dict(turn) for turn in previous_turns
                 ],
             )
- […]
-                    knowledges[i] = await self.model.a_generate(
-                        prompt, schema=Knowledge
-                    )
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    knowledges[i] = Knowledge(data=data)
+            knowledges[i] = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -325,20 +295,13 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                 ],
             )

- […]
-                    knowledges[i] = self.model.generate(
-                        prompt, schema=Knowledge
-                    )
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    knowledges[i] = Knowledge(data=data)
+            knowledges[i] = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=Knowledge,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: Knowledge(data=data),
+            )

         return knowledges

@@ -361,8 +324,8 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

--- a/deepeval/metrics/knowledge_retention/schema.py
+++ b/deepeval/metrics/knowledge_retention/schema.py
@@ -1,15 +1,21 @@
-from typing import Dict, Optional,
-from pydantic import BaseModel
+from typing import Dict, Optional, Union, List
+from pydantic import BaseModel, ConfigDict


 class Knowledge(BaseModel):
- […]
+    # Each fact’s value is either a string or a list of strings
+    # data: Dict[str, Union[str, List[str]]]
+    data: Dict[str, Union[str, List[str]]] | None = None
+    # Forbid extra top-level fields to satisfy OpenAI’s schema requirements
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionVerdict(BaseModel):
     verdict: str
     reason: Optional[str] = None
+    model_config = ConfigDict(extra="forbid")


 class KnowledgeRetentionScoreReason(BaseModel):
     reason: str
+    model_config = ConfigDict(extra="forbid")
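The ConfigDict(extra="forbid") additions above do what the in-diff comment says: with Pydantic v2, forbidding extra fields makes each model's generated JSON schema include "additionalProperties": false, which OpenAI's strict structured-output mode requires on every object. A minimal check, reusing the model defined above:

from pydantic import BaseModel, ConfigDict


class KnowledgeRetentionScoreReason(BaseModel):
    reason: str
    model_config = ConfigDict(extra="forbid")


# Pydantic v2 translates extra="forbid" into additionalProperties: false.
assert KnowledgeRetentionScoreReason.model_json_schema()["additionalProperties"] is False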
--- a/deepeval/metrics/knowledge_retention/template.py
+++ b/deepeval/metrics/knowledge_retention/template.py
@@ -2,10 +2,20 @@ from typing import List, Dict, Any


 class KnowledgeRetentionTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(attritions, score):
         return f"""Given a list of attritions, which highlights forgetfulness in the LLM response and knowledge established previously in the conversation, use it to CONCISELY provide a reason for the knowledge retention score. Note that The knowledge retention score ranges from 0 - 1, and the higher the better.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -33,6 +43,8 @@ JSON:

 Your task is to determine whether the LLM message **contradicts** or **forgets** any of the known facts.

+{KnowledgeRetentionTemplate.multimodal_rules}
+
 ---
 **Output format:**

--- a/deepeval/metrics/mcp/mcp_task_completion.py
+++ b/deepeval/metrics/mcp/mcp_task_completion.py
@@ -7,8 +7,9 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
@@ -50,7 +51,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -107,7 +113,12 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -149,48 +160,67 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):

         return self.score

-    def _generate_reason(self, task_scores: List[TaskScore]) -> str:
- […]
+    def _generate_reason(self, task_scores: List[TaskScore]) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in task_scores:
- […]
-            reason += "]"
-        return reason
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    def _get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = self.model.generate(prompt
+            res, cost = self.model.generate(prompt)
             self.evaluation_cost += cost
             return res
         else:
- […]
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, task_scores: List[TaskScore]
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in task_scores:
+            reasons.append(task_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )

-    async def _a_get_task_score(self, task: Task) -> TaskScore:
-        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt
+            res, cost = await self.model.a_generate(prompt)
             self.evaluation_cost += cost
             return res
         else:
- […]
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )
+
+    async def _a_get_task_score(self, task: Task) -> TaskScore:
+        prompt = MCPTaskCompletionTemplate.get_task_completion_score(task)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: TaskScore(**data),
+        )

     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -244,9 +274,9 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         return tasks

     def _calculate_score(self, scores: List[TaskScore]) -> float:
- […]
+        score_divisor = len(scores) if len(scores) > 0 else 1
         total_score = sum(score.score for score in scores)
-        score = total_score /
+        score = total_score / score_divisor
         return 0 if self.strict_mode and score < self.threshold else score

     def is_successful(self) -> bool:
@@ -254,8 +284,8 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

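The other change repeated across the metric files listed at the top is the widened check_conversational_test_case_params(...) call: every measure/a_measure now passes the test case, the metric's _required_test_case_params, the metric itself, a boolean flag, self.model, and test_case.multimodal. The new definition is not part of this diff, so the signature below is only a placeholder reconstruction from those call sites; the real parameter names in deepeval.metrics.utils may differ.

# Placeholder reconstruction: parameter names are guesses based on the
# positional arguments passed at the call sites shown above; the actual
# signature in deepeval.metrics.utils may differ.
from typing import List, Optional


def check_conversational_test_case_params(
    test_case,                   # the ConversationalTestCase under evaluation
    required_params: List,       # self._required_test_case_params
    metric,                      # the metric instance, e.g. for error messages
    require_chatbot_role: bool,  # hypothetical name for the literal False argument
    model: Optional[object] = None,  # self.model
    multimodal: bool = False,        # test_case.multimodal
) -> None:
    ...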