deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/contextual_relevancy/contextual_relevancy.py
CHANGED

@@ -19,6 +19,7 @@ from deepeval.metrics.contextual_relevancy.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ContextualRelevancyMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ContextualRelevancyMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,6 +88,10 @@ class ContextualRelevancyMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -93,6 +100,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class ContextualRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self, input: str):
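Every metric file in this release follows the pattern shown above: measure() and a_measure() gain a _log_metric_to_confident keyword (underscore-prefixed, so internal by convention), and on completion they call metric_data_manager.post_metric_if_enabled(self, test_case=test_case) from the new deepeval/metrics/api.py. A minimal sketch of what this looks like from the caller's side; whether the flag counts as public API is not stated in this diff, and the "if enabled" gating inside the manager is assumed from its name:

    from deepeval.metrics import ContextualRelevancyMetric
    from deepeval.test_case import LLMTestCase

    metric = ContextualRelevancyMetric()
    test_case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="Paris.",
        retrieval_context=["Paris is the capital of France."],
    )

    # Default behavior after this release: once scored, the metric asks the
    # manager to post the result; the name post_metric_if_enabled suggests
    # the manager itself checks whether Confident AI reporting is turned on.
    metric.measure(test_case)

    # Opting a single call out of reporting:
    metric.measure(test_case, _log_metric_to_confident=False)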
deepeval/metrics/conversation_completeness/conversation_completeness.py
CHANGED

@@ -19,6 +19,7 @@ from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.conversation_completeness.schema import *
+from deepeval.metrics.api import metric_data_manager


 class ConversationCompletenessMetric(BaseConversationalMetric):
@@ -48,6 +49,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -64,6 +66,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -129,6 +137,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str:
deepeval/metrics/conversational_dag/conversational_dag.py
CHANGED

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     extract_required_params,
     copy_graph,
 )
+from deepeval.metrics.api import metric_data_manager


 class ConversationalDAGMetric(BaseConversationalMetric):
@@ -59,6 +60,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def is_successful(self) -> bool:
deepeval/metrics/conversational_dag/nodes.py
CHANGED

@@ -141,7 +141,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
         copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

         copied_convo_g_eval.measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -157,7 +159,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
         copied_metric.verbose_mode = False

         copied_metric.measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_metric)
@@ -213,7 +217,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
         copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)

         await copied_convo_g_eval.a_measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -229,7 +235,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
         copied_metric.verbose_mode = False

         await copied_metric.a_measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_metric)
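In both DAG node files (this one and deepeval/metrics/dag/nodes.py below), nested child-metric calls now pass _show_indicator=False and _log_metric_to_confident=False. The apparent intent, inferred from the call sites rather than stated in the diff, is that only the top-level DAG metric renders a progress indicator and posts to Confident, so a run is reported once rather than once per verdict node. A rough, library-independent sketch of that parent/child shape (all names below are hypothetical, not deepeval's):

    def post_to_confident(metric, test_case):
        # Hypothetical stand-in for metric_data_manager.post_metric_if_enabled.
        print(f"posted {type(metric).__name__} score={metric.score}")

    class ChildMetric:
        def measure(self, test_case, _show_indicator=True, _log_metric_to_confident=True):
            self.score = 0.9  # pretend evaluation
            if _log_metric_to_confident:
                post_to_confident(self, test_case)
            return self.score

    class ParentDAGMetric:
        def __init__(self, children):
            self.children = children

        def measure(self, test_case):
            # Children are internal steps: suppress their indicators and
            # reporting so the run is posted exactly once, by the parent.
            scores = [
                child.measure(
                    test_case,
                    _show_indicator=False,
                    _log_metric_to_confident=False,
                )
                for child in self.children
            ]
            self.score = min(scores)
            post_to_confident(self, test_case)
            return self.score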
deepeval/metrics/conversational_g_eval/conversational_g_eval.py
CHANGED

@@ -11,7 +11,6 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
 )
 from deepeval.test_case import (
-    Turn,
     TurnParams,
     ConversationalTestCase,
 )
@@ -28,7 +27,8 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-
+import deepeval.metrics.conversational_g_eval.schema as cgschema
+from deepeval.metrics.api import metric_data_manager


 class ConversationalGEval(BaseConversationalMetric):
@@ -92,6 +92,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -108,6 +109,7 @@ class ConversationalGEval(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -132,6 +134,10 @@ class ConversationalGEval(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -140,6 +146,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -173,6 +180,10 @@ class ConversationalGEval(BaseConversationalMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )

         return self.score

@@ -187,12 +198,16 @@ class ConversationalGEval(BaseConversationalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(
+                prompt, schema=cgschema.Steps
+            )
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate(
+                res: cgschema.Steps = await self.model.a_generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)
@@ -210,12 +225,14 @@ class ConversationalGEval(BaseConversationalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Steps)
+            res, cost = self.model.generate(prompt, schema=cgschema.Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate(
+                res: cgschema.Steps = self.model.generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -270,21 +287,21 @@ class ConversationalGEval(BaseConversationalMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except (
             AttributeError
         ):  # This catches the case where a_generate_raw_response doesn't exist.
             if self.using_native_model:
                 res, cost = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                    prompt, schema=cgschema.ReasonScore
                 )
                 self.evaluation_cost += cost
                 return res.score, res.reason
             else:
                 try:
-                    res: ReasonScore = await self.model.a_generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = await self.model.a_generate(
+                        prompt, schema=cgschema.ReasonScore
                     )
                     return res.score, res.reason
                 except TypeError:
@@ -340,18 +357,20 @@ class ConversationalGEval(BaseConversationalMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
             if self.using_native_model:
-                res, cost = self.model.generate(
+                res, cost = self.model.generate(
+                    prompt, schema=cgschema.ReasonScore
+                )
                 self.evaluation_cost += cost
                 return res.score, res.reason
             else:
                 try:
-                    res: ReasonScore = self.model.generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = self.model.generate(
+                        prompt, schema=cgschema.ReasonScore
                     )
                     return res.score, res.reason
                 except TypeError:
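The cgschema alias introduced above, and threaded through the hunks below, replaces direct references to Steps and ReasonScore. This file also pulls names in from deepeval.metrics.g_eval modules, so a plausible motive, an inference rather than something the diff states, is avoiding silent collisions between identically named schema classes imported from different modules. A self-contained illustration of how the last binding quietly wins when names are imported directly:

    import types

    # Two schema modules that both define a class named Steps.
    g_eval_schema = types.SimpleNamespace(
        Steps=type("Steps", (), {"origin": "g_eval"})
    )
    convo_schema = types.SimpleNamespace(
        Steps=type("Steps", (), {"origin": "conversational_g_eval"})
    )

    # Wildcard-style rebinding: whichever import runs last owns the name.
    Steps = g_eval_schema.Steps
    Steps = convo_schema.Steps
    print(Steps.origin)  # "conversational_g_eval": the g_eval class is shadowed

    # Namespace alias, as 3.6.7 does with `import ...schema as cgschema`:
    cgschema = convo_schema
    print(cgschema.Steps.origin)  # unambiguous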
@@ -362,49 +381,44 @@ class ConversationalGEval(BaseConversationalMetric):
     def generate_weighted_summed_score(
         self, raw_score: int, raw_response: ChatCompletion
     ) -> Union[int, float]:
-                sum_of_weighted_scores / sum_linear_probability
-            )
-            return weighted_summed_score
-        except:
-            raise
+        generated_logprobs = raw_response.choices[0].logprobs.content
+        # First, locate the token that we care for logprobs, i.e., the token matching the score
+        score_logprobs = None
+        for token_logprobs in generated_logprobs:
+            if token_logprobs.token == str(raw_score):
+                score_logprobs = token_logprobs
+                break
+        # Then, calculate the score based on the logprobs
+        token_linear_probability: Dict[int, float] = {}
+        sum_linear_probability = 0
+        # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
+        min_logprob = math.log(0.01)
+        for token_logprob in score_logprobs.top_logprobs:
+            logprob = token_logprob.logprob
+
+            # Filter out low probability tokens
+            if logprob < min_logprob:
+                continue
+            # Filter out non-decimal token to prevent errors in later int(token) conversion
+            if not token_logprob.token.isdecimal():
+                continue
+
+            # Calculate the linear probability
+            linear_prob = math.exp(logprob)
+            token_score = int(token_logprob.token)
+            if token_linear_probability.get(token_score):
+                token_linear_probability[token_score] += linear_prob
+            else:
+                token_linear_probability[token_score] = linear_prob
+            sum_linear_probability += linear_prob
+
+        sum_of_weighted_scores = 0.0
+        for score, prob in token_linear_probability.items():
+            sum_of_weighted_scores += score * prob
+
+        # Scale the sum of linear probability to 1
+        weighted_summed_score = sum_of_weighted_scores / sum_linear_probability
+        return weighted_summed_score

     def number_evaluation_steps(self):
         evaluation_steps = """"""
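The rewritten generate_weighted_summed_score above drops the old try/except wrapper (the removed tail ends in a bare `except:` that simply re-raised) and flattens the computation: locate the logprob entry for the score token the model emitted, keep top alternatives above 1% linear probability that parse as integers, convert logprobs to linear probabilities, and return the probability-weighted mean renormalized over what survived the filters. A self-contained sketch of that computation on fake data; the Token dataclass below mimics, but is not, the OpenAI logprob types, and the real method first locates the emitted token's top_logprobs before this step:

    import math
    from dataclasses import dataclass
    from typing import Dict, List

    @dataclass
    class Token:
        token: str
        logprob: float

    def weighted_summed_score(top_logprobs: List[Token]) -> float:
        token_linear_probability: Dict[int, float] = {}
        sum_linear_probability = 0.0
        min_logprob = math.log(0.01)  # drop alternatives under 1% probability
        for tl in top_logprobs:
            if tl.logprob < min_logprob or not tl.token.isdecimal():
                continue
            linear_prob = math.exp(tl.logprob)
            score = int(tl.token)
            token_linear_probability[score] = (
                token_linear_probability.get(score, 0.0) + linear_prob
            )
            sum_linear_probability += linear_prob
        weighted = sum(s * p for s, p in token_linear_probability.items())
        return weighted / sum_linear_probability  # renormalize to 1

    # The model emitted "8", but "7" and "9" carried real probability mass:
    alts = [
        Token("8", math.log(0.6)),
        Token("7", math.log(0.25)),
        Token("9", math.log(0.1)),
        Token("high", math.log(0.05)),  # non-decimal token, filtered out
    ]
    print(round(weighted_summed_score(alts), 2))  # 7.84, smoother than a flat 8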
@@ -417,8 +431,8 @@ class ConversationalGEval(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

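The is_successful hunk just above fixes a genuine bug: in 3.6.5 the line `self.score >= self.threshold` computed the comparison and discarded the result, so self.success was never assigned on that path; 3.6.7 stores it. Narrowing the bare `except:` to `except TypeError:` matches the one failure that line can actually produce, since ordering comparisons against None raise TypeError in Python 3:

    score = None  # e.g. the metric errored before scoring
    try:
        success = score >= 0.5  # TypeError: '>=' not supported between
                                # instances of 'NoneType' and 'float'
    except TypeError:
        success = False
    print(success)  # False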
deepeval/metrics/dag/dag.py
CHANGED

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     is_valid_dag_from_roots,
     extract_required_params,
 )
+from deepeval.metrics.api import metric_data_manager


 class DAGMetric(BaseMetric):
@@ -59,6 +60,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class DAGMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class DAGMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class DAGMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     def is_successful(self) -> bool:
deepeval/metrics/dag/nodes.py
CHANGED

@@ -111,7 +111,9 @@ class VerdictNode(BaseNode):
         copied_g_eval = GEval(**g_eval_args)

         copied_g_eval.measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_g_eval)
@@ -124,7 +126,9 @@ class VerdictNode(BaseNode):
         copied_metric.verbose_mode = False

         copied_metric.measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_metric)
@@ -174,7 +178,9 @@ class VerdictNode(BaseNode):
         copied_g_eval = GEval(**g_eval_args)

         await copied_g_eval.a_measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_g_eval)
@@ -188,7 +194,9 @@ class VerdictNode(BaseNode):
         copied_metric.verbose_mode = False

         await copied_metric.a_measure(
-            test_case=test_case,
+            test_case=test_case,
+            _show_indicator=False,
+            _log_metric_to_confident=False,
         )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, depth, copied_metric)
deepeval/metrics/faithfulness/faithfulness.py
CHANGED

@@ -23,6 +23,7 @@ from deepeval.metrics.faithfulness.schema import (
     Truths,
     Claims,
 )
+from deepeval.metrics.api import metric_data_manager


 class FaithfulnessMetric(BaseMetric):
@@ -63,6 +64,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -78,6 +80,7 @@ class FaithfulnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -96,6 +99,10 @@ class FaithfulnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -104,6 +111,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -132,7 +140,10 @@ class FaithfulnessMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
-
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
         return self.score

     async def _a_generate_reason(self) -> str: