deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/g_eval/g_eval.py CHANGED
@@ -1,5 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
 
+import asyncio
+
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.g_eval
+from deepeval.metrics.g_eval import schema as gschema
 from deepeval.metrics.g_eval.utils import (
     Rubric,
     construct_g_eval_params_string,
@@ -29,6 +31,8 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.metrics.api import metric_data_manager
+from deepeval.config.settings import get_settings
 
 
 class GEval(BaseMetric):
@@ -71,6 +75,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -81,12 +86,16 @@ class GEval(BaseMetric):
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
+                coro = self.a_measure(
+                    test_case,
+                    _show_indicator=False,
+                    _in_component=_in_component,
+                    _additional_context=_additional_context,
+                )
                 loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _additional_context=_additional_context,
+                    asyncio.wait_for(
+                        coro,
+                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                     )
                 )
             else:
@@ -115,6 +124,10 @@ class GEval(BaseMetric):
                     f"Reason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -123,6 +136,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -158,6 +172,10 @@ class GEval(BaseMetric):
                     f"Reason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_evaluation_steps(self) -> List[str]:
@@ -177,7 +195,9 @@ class GEval(BaseMetric):
             return data["steps"]
         else:
             try:
-                res: Steps = await self.model.a_generate(
+                res: gschema.Steps = await self.model.a_generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)
@@ -201,7 +221,9 @@ class GEval(BaseMetric):
             return data["steps"]
         else:
             try:
-                res: Steps = self.model.generate(
+                res: gschema.Steps = self.model.generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -264,7 +286,7 @@ class GEval(BaseMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except (
             AttributeError
@@ -276,8 +298,8 @@ class GEval(BaseMetric):
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = await self.model.a_generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -338,7 +360,7 @@ class GEval(BaseMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +371,8 @@ class GEval(BaseMetric):
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = self.model.generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -364,7 +386,7 @@ class GEval(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
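
For orientation, here is a minimal usage sketch of how the g_eval.py changes above surface to callers: measure() gains a private _log_metric_to_confident flag that gates the new metric_data_manager.post_metric_if_enabled(...) call, and the async path is now bounded by asyncio.wait_for with a timeout read from get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS. The criteria and test case values below are illustrative, not taken from the diff.

# Illustrative sketch only; the metric criteria and test case values are made up.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Determine whether the actual output answers the input correctly.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)
test_case = LLMTestCase(input="What is 2 + 2?", actual_output="4")

# In 3.6.7 the async path runs under get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
# and the result is posted via metric_data_manager.post_metric_if_enabled(...)
# unless the flag below is set to False.
correctness.measure(test_case, _log_metric_to_confident=False)
print(correctness.score, correctness.reason)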

deepeval/metrics/hallucination/hallucination.py CHANGED
@@ -16,6 +16,7 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 required_params: List[LLMTestCaseParams] = [
     LLMTestCaseParams.INPUT,
@@ -51,6 +52,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, required_params, self)
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -92,6 +99,7 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, required_params, self)
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self):

deepeval/metrics/indicator.py CHANGED
@@ -100,6 +100,7 @@ async def measure_metric_task(
                 test_case,
                 _show_indicator=False,
                 _in_component=_in_component,
+                _log_metric_to_confident=False,
             )
             finish_text = "Done"
         except MissingTestCaseParamsError as e:
@@ -116,7 +117,9 @@ async def measure_metric_task(
         except TypeError:
             try:
                 await metric.a_measure(
-                    test_case,
+                    test_case,
+                    _in_component=_in_component,
+                    _log_metric_to_confident=False,
                 )
                 finish_text = "Done"
             except MissingTestCaseParamsError as e:
@@ -241,7 +244,10 @@ async def safe_a_measure(
     ):
         try:
             await metric.a_measure(
-                tc,
+                tc,
+                _show_indicator=False,
+                _in_component=_in_component,
+                _log_metric_to_confident=False,
            )
            update_pbar(progress, pbar_eval_id)
        except MissingTestCaseParamsError as e:
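
The indicator helpers above now pass _log_metric_to_confident=False, so metrics run through the batched pipeline are not posted one by one; only direct measure()/a_measure() calls keep the default of True. A hedged sketch of the difference follows (the test case and threshold are placeholders, and the rationale of avoiding duplicate posting is an assumption, not stated in the diff).

# Sketch assuming batched runs report results as part of the test run upload,
# which is presumably why the executor disables per-metric posting.
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

tc = LLMTestCase(input="Where is the Eiffel Tower?", actual_output="In Paris.")
metric = AnswerRelevancyMetric(threshold=0.7)

# Standalone call: _log_metric_to_confident defaults to True.
metric.measure(tc)

# Batched call: measure_metric_task / safe_a_measure pass
# _log_metric_to_confident=False internally.
evaluate(test_cases=[tc], metrics=[metric])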

deepeval/metrics/json_correctness/json_correctness.py CHANGED
@@ -18,6 +18,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.json_correctness.template import JsonCorrectnessTemplate
 from deepeval.metrics.json_correctness.schema import JsonCorrectnessScoreReason
 from deepeval.utils import get_or_create_event_loop
+from deepeval.metrics.api import metric_data_manager
 
 DEFAULT_CORRECT_REASON = "The generated Json matches and is syntactically correct to the expected schema."
 
@@ -51,6 +52,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -66,6 +68,7 @@ class JsonCorrectnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -88,6 +91,10 @@ class JsonCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -96,6 +103,7 @@ class JsonCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -126,7 +134,10 @@ class JsonCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_generate_reason(self, actual_output: str) -> str:

deepeval/metrics/knowledge_retention/knowledge_retention.py CHANGED
@@ -20,6 +20,7 @@ from deepeval.metrics.knowledge_retention.schema import (
     KnowledgeRetentionScoreReason,
 )
 from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.api import metric_data_manager
 
 
 class KnowledgeRetentionMetric(BaseConversationalMetric):
@@ -47,6 +48,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -63,6 +65,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -91,6 +98,7 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -120,6 +128,10 @@ class KnowledgeRetentionMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:

deepeval/metrics/mcp/mcp_task_completion.py CHANGED
@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, TaskScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager
 
 
 class MCPTaskCompletionMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +93,10 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                     f"Score: {self.score}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -97,6 +104,7 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -131,6 +139,11 @@ class MCPTaskCompletionMetric(BaseConversationalMetric):
                     f"Score: {self.score}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score
 
     def _generate_reason(self, task_scores: List[TaskScore]) -> str:

deepeval/metrics/mcp/multi_turn_mcp_use_metric.py CHANGED
@@ -16,6 +16,7 @@ from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore
 from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
 from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.api import metric_data_manager
 
 
 class MultiTurnMCPUseMetric(BaseConversationalMetric):
@@ -46,6 +47,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -62,6 +64,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -102,6 +105,11 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                     f"Score: {self.score}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score
 
     async def a_measure(
@@ -109,6 +117,7 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -161,6 +170,10 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
                     f"Score: {self.score}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     def _get_tool_accuracy_score(

deepeval/metrics/mcp_use_metric/mcp_use_metric.py CHANGED
@@ -20,6 +20,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from .template import MCPUseMetricTemplate
 from .schema import MCPPrimitivesScore, MCPArgsScore
+from deepeval.metrics.api import metric_data_manager
 
 
 class MCPUseMetric(BaseMetric):
@@ -51,6 +52,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)
 
@@ -65,6 +67,7 @@ class MCPUseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -104,6 +107,10 @@ class MCPUseMetric(BaseMetric):
                 self,
                 steps=steps,
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -112,6 +119,7 @@ class MCPUseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(test_case, self._required_params, self)
 
@@ -154,7 +162,10 @@ class MCPUseMetric(BaseMetric):
                 self,
                 steps=steps,
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     def _get_primitives_used_score(

deepeval/metrics/misuse/misuse.py CHANGED
@@ -16,6 +16,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
 from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class MisuseMetric(BaseMetric):
@@ -53,6 +54,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class MisuseMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -86,6 +89,10 @@ class MisuseMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -94,6 +101,7 @@ class MisuseMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class MisuseMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:

deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py CHANGED
@@ -48,6 +48,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -63,6 +64,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -146,6 +148,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_editing/image_editing.py CHANGED
@@ -47,6 +47,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self
@@ -63,6 +64,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -108,6 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, 1, 1, self

deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py CHANGED
@@ -49,6 +49,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self

deepeval/metrics/multimodal_metrics/image_reference/image_reference.py CHANGED
@@ -49,6 +49,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -147,6 +149,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self