deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +97 -42
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/utils.py +1 -1
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/tracing.py +51 -3
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py CHANGED

```diff
@@ -53,6 +53,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
@@ -71,6 +72,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -97,6 +99,7 @@ class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
             test_case, self._required_params, None, None, self
```
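The hunks above are the template for this whole release: a new `_log_metric_to_confident` keyword is threaded through `measure()` and `a_measure()` of every metric. Below is a minimal caller-side sketch of the flag, assuming deepeval's public `MLLMTestCase` fields and inferring the opt-out semantics from the flag's name (the diff itself only shows the keyword being forwarded):

```python
# Hedged sketch: opting one measurement out of Confident AI logging.
from deepeval.metrics import MultimodalFaithfulnessMetric
from deepeval.test_case import MLLMTestCase

metric = MultimodalFaithfulnessMetric()
test_case = MLLMTestCase(
    input=["What animal is in the photo?"],
    actual_output=["A cat sitting on a mat."],
    retrieval_context=["The photo shows a cat resting on a woven mat."],
)

# The keyword defaults to True, so existing callers keep their current
# behavior; False should keep the result local (assumption from the name).
metric.measure(test_case, _log_metric_to_confident=False)
print(metric.score, metric.reason)
```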
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py CHANGED

```diff
@@ -78,6 +78,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:

@@ -96,6 +97,7 @@ class MultimodalGEval(BaseMultimodalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                     _additional_context=_additional_context,
                 )
             )
@@ -132,6 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_mllm_test_case_params(
```
deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py CHANGED

```diff
@@ -3,7 +3,7 @@ from typing import List, Dict
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
+    check_mllm_test_case_params,
 )
 from deepeval.test_case import (
     MLLMTestCase,
@@ -11,10 +11,10 @@ from deepeval.test_case import (
     ToolCallParams,
     ToolCall,
 )
-from deepeval.metrics import BaseMetric
+from deepeval.metrics import BaseMultimodalMetric


-class MultimodalToolCorrectnessMetric(BaseMetric):
+class MultimodalToolCorrectnessMetric(BaseMultimodalMetric):

     _required_params: List[MLLMTestCaseParams] = [
         MLLMTestCaseParams.INPUT,
@@ -46,8 +46,11 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
+        check_mllm_test_case_params(
+            test_case, self._required_params, None, None, self
+        )
         self.test_case = test_case
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -90,11 +93,13 @@ class MultimodalToolCorrectnessMetric(BaseMetric):
         test_case: MLLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
             _show_indicator=_show_indicator,
             _in_component=_in_component,
+            _log_metric_to_confident=_log_metric_to_confident,
         )

     ##################################################
@@ -278,7 +283,7 @@ class MultimodalToolCorrectnessMetric(BaseMetric):

     @property
     def __name__(self):
-        return "Tool Correctness"
+        return "Multi Modal Tool Correctness"

     def indent_multiline_string(self, s, indent_level=4):
         indent = " " * indent_level
```
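Besides the logging flag, this file rebases the metric from `BaseMetric` onto `BaseMultimodalMetric`, validates the required MLLM params up front, and renames the metric's display name. A sketch of the observable effects, assuming the usual `MLLMTestCase`/`ToolCall` fields:

```python
# Not verbatim library code; a sketch of the behavior changes in 3.6.7.
from deepeval.metrics import MultimodalToolCorrectnessMetric
from deepeval.test_case import MLLMTestCase, ToolCall

metric = MultimodalToolCorrectnessMetric()
print(metric.__name__)  # "Multi Modal Tool Correctness" (was "Tool Correctness")

test_case = MLLMTestCase(
    input=["Book a table for two at 7pm."],
    actual_output=["Done, your table is booked."],
    tools_called=[ToolCall(name="book_table")],
    expected_tools=[ToolCall(name="book_table")],
)
# check_mllm_test_case_params now runs before scoring, so a test case
# missing a required param should raise instead of scoring silently.
metric.measure(test_case)
```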
deepeval/metrics/non_advice/non_advice.py CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.non_advice.template import NonAdviceTemplate
 from deepeval.metrics.non_advice.schema import *
+from deepeval.metrics.api import metric_data_manager


 class NonAdviceMetric(BaseMetric):
@@ -58,6 +59,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class NonAdviceMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -93,6 +96,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -101,6 +108,7 @@ class NonAdviceMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -129,6 +137,10 @@ class NonAdviceMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score
```
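The four added lines at the end of each `measure()`/`a_measure()` body recur verbatim in every metric file below. A minimal sketch of the pattern in isolation: `metric_data_manager` comes from the new `deepeval/metrics/api.py` (+281 lines in the summary above), and the `if_enabled` suffix presumably adds a second, settings-level gate inside the manager (assumption, not shown in this diff):

```python
from deepeval.metrics.api import metric_data_manager

def _finish_measure(metric, test_case, _log_metric_to_confident: bool = True):
    """Illustrative helper mirroring the tail added to every metric."""
    # Caller-level opt-out: the new keyword gates the call entirely.
    if _log_metric_to_confident:
        # Environment-level opt-out is left to the manager itself.
        metric_data_manager.post_metric_if_enabled(metric, test_case=test_case)
    return metric.score
```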
deepeval/metrics/pii_leakage/pii_leakage.py CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.pii_leakage.template import PIILeakageTemplate
 from deepeval.metrics.pii_leakage.schema import *
+from deepeval.metrics.api import metric_data_manager


 class PIILeakageMetric(BaseMetric):
@@ -49,6 +50,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -64,6 +66,7 @@ class PIILeakageMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -84,6 +87,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class PIILeakageMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -120,7 +128,10 @@ class PIILeakageMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:
```
deepeval/metrics/prompt_alignment/prompt_alignment.py CHANGED

```diff
@@ -20,6 +20,8 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
 from deepeval.config.settings import get_settings

+from deepeval.metrics.api import metric_data_manager
+

 class PromptAlignmentMetric(BaseMetric):

@@ -55,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -93,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -101,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -128,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str, actual_output: str) -> str:
```
deepeval/metrics/role_adherence/role_adherence.py CHANGED

```diff
@@ -1,6 +1,7 @@
 from typing import Optional, Union, List

 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, role: str) -> str:
```
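Conversational metrics get the identical flag. A sketch on a multi-turn metric, assuming deepeval's `ConversationalTestCase`/`Turn` fields and that `chatbot_role` is required by `RoleAdherenceMetric`:

```python
from deepeval.metrics import RoleAdherenceMetric
from deepeval.test_case import ConversationalTestCase, Turn

convo = ConversationalTestCase(
    chatbot_role="a formal medieval knight",
    turns=[
        Turn(role="user", content="Who are you?"),
        Turn(role="assistant", content="I am Sir Kay, at thy service."),
    ],
)

# Same opt-out keyword as the single-turn metrics above.
RoleAdherenceMetric().measure(convo, _log_metric_to_confident=False)
```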
deepeval/metrics/role_violation/role_violation.py CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager


 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score
```
deepeval/metrics/summarization/summarization.py CHANGED

```diff
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 import asyncio

+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -73,6 +74,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -88,6 +90,7 @@ class SummarizationMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -121,7 +124,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -129,6 +135,7 @@ class SummarizationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -171,6 +178,10 @@ class SummarizationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score
```
deepeval/metrics/task_completion/task_completion.py CHANGED

```diff
@@ -50,6 +50,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:
@@ -66,6 +67,7 @@ class TaskCompletionMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -89,6 +91,7 @@ class TaskCompletionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         has_trace: bool = isinstance(test_case._trace_dict, Dict)
         if not has_trace:
```
deepeval/metrics/tool_correctness/tool_correctness.py CHANGED

```diff
@@ -12,6 +12,7 @@ from deepeval.test_case import (
     ToolCall,
 )
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager


 class ToolCorrectnessMetric(BaseMetric):
@@ -45,6 +46,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -83,6 +85,11 @@ class ToolCorrectnessMetric(BaseMetric):
             ]
             steps.append(f"Score: {self.score}\nReason: {self.reason}")
             self.verbose_logs = construct_verbose_logs(self, steps=steps)
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -90,6 +97,7 @@ class ToolCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         return self.measure(
             test_case,
```
deepeval/metrics/toxicity/toxicity.py CHANGED

```diff
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.toxicity.template import ToxicityTemplate
 from deepeval.metrics.toxicity.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ToxicityMetric(BaseMetric):
@@ -50,6 +51,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -65,6 +67,7 @@ class ToxicityMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -84,6 +87,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

@@ -92,6 +99,7 @@ class ToxicityMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,6 +130,10 @@ class ToxicityMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score
```
deepeval/metrics/turn_relevancy/turn_relevancy.py CHANGED

```diff
@@ -20,6 +20,7 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.turn_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class TurnRelevancyMetric(BaseConversationalMetric):
@@ -49,6 +50,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -65,6 +67,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,6 +94,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def a_measure(
@@ -98,6 +105,7 @@ class TurnRelevancyMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -134,6 +142,10 @@ class TurnRelevancyMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:
```
deepeval/models/llms/grok_model.py CHANGED

```diff
@@ -56,8 +56,8 @@ model_pricing = {
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
```
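The only change here swaps the order of the first two constructor parameters, which silently changes the meaning of positional calls: `GrokModel("xai-...")` used to bind `api_key` and now binds `model`. Keyword arguments are unaffected (the model name below is illustrative):

```python
from deepeval.models import GrokModel

# Positional style changes meaning across 3.6.6 -> 3.6.7:
#   3.6.6: GrokModel("xai-abc123")  -> api_key="xai-abc123"
#   3.6.7: GrokModel("xai-abc123")  -> model="xai-abc123"
# Keyword style is order-independent and safe on both versions:
model = GrokModel(model="grok-3", api_key="xai-abc123", temperature=0)
```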
deepeval/openai/__init__.py CHANGED

```diff
@@ -1,37 +1,19 @@
-
-import
-
-
-
+try:
+    import openai  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "Please install OpenAI to use this feature: 'pip install openai'"
+    )


-
-
-
-
-
-    loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
-    new_spec = importlib.util.spec_from_loader(
-        "deepeval_openai",
-        loader,
-        origin=openai_spec.origin,
-        is_package=True,
-    )
-    deepeval_openai = importlib.util.module_from_spec(new_spec)
-    deepeval_openai.__path__ = package_dirs
-    sys.modules["deepeval_openai"] = deepeval_openai
-    loader.exec_module(deepeval_openai)
-    patch_openai(deepeval_openai)
-    return deepeval_openai
+try:
+    from openai import OpenAI, AsyncOpenAI  # noqa: F401
+except ImportError:
+    OpenAI = None  # type: ignore
+    AsyncOpenAI = None  # type: ignore


-
-openai
-OpenAI = patched_openai.OpenAI
-AsyncOpenAI = patched_openai.AsyncOpenAI
+if OpenAI or AsyncOpenAI:
+    from deepeval.openai.patch import patch_openai_classes

-__all__ = [
-    "openai",
-    "OpenAI",
-    "AsyncOpenAI",
-]
+    patch_openai_classes()
```