deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py
CHANGED

```diff
@@ -46,13 +46,16 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -61,6 +64,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +93,7 @@ class MultimodalAnswerRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```

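The same change repeats across the multimodal metrics below: `measure` and `a_measure` gain a `_log_metric_to_confident` keyword (default `True`) that is threaded through to the async path. A minimal sketch of how a caller could opt out; the test-case contents and image URL are hypothetical, and the underscore prefix marks the flag as internal API, so this is illustration rather than a documented interface:

```python
# Sketch only: inputs and URL are made up; running this also requires a
# configured judge model (e.g. an OpenAI key) like any deepeval metric.
from deepeval.test_case import MLLMTestCase, MLLMImage
from deepeval.metrics import MultimodalAnswerRelevancyMetric

test_case = MLLMTestCase(
    input=["Describe this chart.", MLLMImage(url="https://example.com/chart.png")],
    actual_output=["Monthly revenue trends upward from January to June."],
)

metric = MultimodalAnswerRelevancyMetric()
# With _log_metric_to_confident=False, the posting hook added in this release
# (metric_data_manager.post_metric_if_enabled, shown in the single-turn
# metrics further down) is presumably skipped; the default True keeps the
# old scoring behavior plus the new logging.
score = metric.measure(test_case, _log_metric_to_confident=False)
```
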
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
CHANGED

```diff
@@ -49,6 +49,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -56,7 +57,9 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
@@ -65,6 +68,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +97,7 @@ class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```

deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
CHANGED

```diff
@@ -48,6 +48,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -64,6 +65,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRecallMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```

deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
CHANGED

```diff
@@ -49,6 +49,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -65,6 +66,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -90,6 +92,7 @@ class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```

deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py
CHANGED

```diff
@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```

deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py
CHANGED

```diff
@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
 
@@ -96,6 +97,7 @@ class MultimodalGEval(BaseMultimodalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                         _additional_context=_additional_context,
                     )
                 )
@@ -132,6 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_mllm_test_case_params(
```

deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
CHANGED

```diff
@@ -3,7 +3,7 @@ from typing import List, Dict
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
+    check_mllm_test_case_params,
 )
 from deepeval.test_case import (
     MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
     ToolCallParams,
     ToolCall,
 )
-from deepeval.metrics import BaseMetric
+from deepeval.metrics import BaseMultimodalMetric
 
 
-class MultimodalToolCorrectnessMetric(BaseMetric):
+class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):
 
     _required_params: List[MLLMTestCaseParams] = [
         MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
+        check_mllm_test_case_params(
+            test_case, self._required_params, None, None, self
+        )
         self.test_case = test_case
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
             _show_indicator=_show_indicator,
             _in_component=_in_component,
+            _log_metric_to_confident=_log_metric_to_confident,
         )
 
     ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
 
     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Multi Modal Tool Correctness"
 
     def indent_multiline_string(self, s, indent_level=4):
         indent = " " * indent_level
```

deepeval/metrics/non_advice/non_advice.py
CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.non_advice.template import NonAdviceTemplate
 from deepeval.metrics.non_advice.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class NonAdviceMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -93,6 +96,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -101,6 +108,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
```

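The hook added to NonAdviceMetric above is the same shape stamped into each single-turn metric in this release. Abstracted to its skeleton (a sketch, not code from the wheel; the per-metric scoring logic is elided and varies):

```python
# Skeleton of the recurring 3.6.7 change; SomeMetric is a placeholder name.
from deepeval.metrics import BaseMetric
from deepeval.metrics.api import metric_data_manager


class SomeMetric(BaseMetric):
    def measure(
        self,
        test_case,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,  # new in 3.6.7
    ) -> float:
        ...  # existing scoring logic, unchanged by this release
        if _log_metric_to_confident:
            # Per the method name, this posts the metric result only when
            # sending data to Confident AI is enabled/configured.
            metric_data_manager.post_metric_if_enabled(self, test_case=test_case)
        return self.score
```
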
deepeval/metrics/pii_leakage/pii_leakage.py
CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
 from deepeval.metrics.pii_leakage.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@ class PIILeakageMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -92,6 +99,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
```

deepeval/metrics/prompt_alignment/prompt_alignment.py
CHANGED

```diff
@@ -1,3 +1,5 @@
+import asyncio
+
 from typing import Optional, List, Union
 
 from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -15,7 +17,10 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.prompt_alignment.schema import *
+from deepeval.metrics.prompt_alignment import schema as paschema
+from deepeval.config.settings import get_settings
+
+from deepeval.metrics.api import metric_data_manager
 
 
 class PromptAlignmentMetric(BaseMetric):
@@ -52,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -62,15 +68,19 @@ class PromptAlignmentMetric(BaseMetric):
         ):
             if self.async_mode:
                 loop = get_or_create_event_loop()
+                coro = self.a_measure(
+                    test_case,
+                    _show_indicator=False,
+                    _in_component=_in_component,
+                )
                 loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
+                    asyncio.wait_for(
+                        coro,
+                        timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
                     )
                 )
             else:
-                self.verdicts: Verdicts = self._generate_verdicts(
+                self.verdicts: paschema.Verdicts = self._generate_verdicts(
                     test_case.input, test_case.actual_output
                 )
                 self.score = self._calculate_score()
@@ -86,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -94,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -105,7 +120,7 @@ class PromptAlignmentMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: Verdicts = await self._a_generate_verdicts(
+            self.verdicts: paschema.Verdicts = await self._a_generate_verdicts(
                 test_case.input, test_case.actual_output
             )
             self.score = self._calculate_score()
@@ -121,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str, actual_output: str) -> str:
@@ -141,14 +159,17 @@ class PromptAlignmentMetric(BaseMetric):
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=PromptAlignmentScoreReason
+                prompt, schema=paschema.PromptAlignmentScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: PromptAlignmentScoreReason = await self.model.a_generate(
-                    prompt=prompt, schema=PromptAlignmentScoreReason
+                res: paschema.PromptAlignmentScoreReason = (
+                    await self.model.a_generate(
+                        prompt=prompt,
+                        schema=paschema.PromptAlignmentScoreReason,
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -173,14 +194,14 @@ class PromptAlignmentMetric(BaseMetric):
         )
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=PromptAlignmentScoreReason
+                prompt, schema=paschema.PromptAlignmentScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: PromptAlignmentScoreReason = self.model.generate(
-                    prompt=prompt, schema=PromptAlignmentScoreReason
+                res: paschema.PromptAlignmentScoreReason = self.model.generate(
+                    prompt=prompt, schema=paschema.PromptAlignmentScoreReason
                 )
                 return res.reason
             except TypeError:
@@ -190,48 +211,56 @@ class PromptAlignmentMetric(BaseMetric):
 
     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> Verdicts:
+    ) -> paschema.Verdicts:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=paschema.Verdicts
+            )
             self.evaluation_cost += cost
             return [item for item in res.verdicts]
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: paschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=paschema.Verdicts
                 )
                 return [item for item in res.verdicts]
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return [
-                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                    paschema.PromptAlignmentVerdict(**item)
+                    for item in data["verdicts"]
                 ]
 
-    def _generate_verdicts(self, input: str, actual_output: str) -> Verdicts:
+    def _generate_verdicts(
+        self, input: str, actual_output: str
+    ) -> paschema.Verdicts:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=paschema.Verdicts)
             self.evaluation_cost += cost
             return [item for item in res.verdicts]
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: paschema.Verdicts = self.model.generate(
+                    prompt, schema=paschema.Verdicts
+                )
                 return [item for item in res.verdicts]
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return [
-                    PromptAlignmentVerdict(**item) for item in data["verdicts"]
+                    paschema.PromptAlignmentVerdict(**item)
+                    for item in data["verdicts"]
                 ]
 
     def _calculate_score(self):
@@ -253,7 +282,7 @@ class PromptAlignmentMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
```

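Beyond the schema namespacing, the notable change in PromptAlignmentMetric is that the synchronous path now builds the coroutine first and bounds it with `asyncio.wait_for`, using the `DEEPEVAL_PER_TASK_TIMEOUT_SECONDS` setting. A self-contained illustration of that pattern; every name other than asyncio's is invented for the example:

```python
import asyncio


async def fake_a_measure() -> float:
    # Stand-in for the real a_measure(...) coroutine.
    await asyncio.sleep(0.1)
    return 0.92


def measure(timeout_seconds: float = 5.0) -> float:
    # deepeval reuses get_or_create_event_loop(); a fresh loop suffices here.
    loop = asyncio.new_event_loop()
    try:
        # wait_for cancels the task and raises asyncio.TimeoutError
        # (an alias of TimeoutError on Python 3.11+) if the coroutine
        # does not finish within timeout_seconds.
        return loop.run_until_complete(
            asyncio.wait_for(fake_a_measure(), timeout=timeout_seconds)
        )
    finally:
        loop.close()


print(measure())  # 0.92
```
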
deepeval/metrics/role_adherence/role_adherence.py
CHANGED

```diff
@@ -1,6 +1,7 @@
 from typing import Optional, Union, List
 
 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, role: str) -> str:
```

deepeval/metrics/role_violation/role_violation.py
CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
```