deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/bias/bias.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Optional, Type, Union
 
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -48,8 +49,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -63,6 +64,7 @@ class BiasMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -81,7 +83,10 @@ class BiasMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,8 +94,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -116,6 +121,10 @@ class BiasMetric(BaseMetric):
                 ],
             )
 
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
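Every metric diff in this release repeats the pattern visible in bias.py above: measure() and a_measure() gain a private _log_metric_to_confident keyword (default True), thread it through nested calls, and, when it is left on, post the finished metric via metric_data_manager.post_metric_if_enabled. A minimal caller-side sketch of the new knob, assuming a configured evaluation model; the test-case strings are placeholders:

from deepeval.metrics import BiasMetric
from deepeval.test_case import LLMTestCase

metric = BiasMetric()
test_case = LLMTestCase(
    input="Summarize the candidate review.",           # placeholder
    actual_output="The candidate is well qualified.",  # placeholder
)

# Default: after scoring, the result may be posted to Confident AI
# through metric_data_manager.post_metric_if_enabled(...).
score = metric.measure(test_case)

# The leading underscore marks the kwarg as internal API; passing False
# skips the Confident AI post (the DAG node diffs below rely on this).
score = metric.measure(test_case, _log_metric_to_confident=False)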
deepeval/metrics/contextual_precision/contextual_precision.py
CHANGED

@@ -17,7 +17,8 @@ from deepeval.metrics.contextual_precision.template import (
     ContextualPrecisionTemplate,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.contextual_precision.schema import *
+import deepeval.metrics.contextual_precision.schema as cpschema
+from deepeval.metrics.api import metric_data_manager
 
 
 class ContextualPrecisionMetric(BaseMetric):
@@ -53,8 +54,8 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,10 +69,11 @@ class ContextualPrecisionMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
-                self.verdicts: List[ContextualPrecisionVerdict] = (
+                self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                     self._generate_verdicts(
                         test_case.input,
                         test_case.expected_output,
@@ -88,7 +90,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -96,6 +101,7 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -107,7 +113,7 @@ class ContextualPrecisionMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: List[ContextualPrecisionVerdict] = (
+            self.verdicts: List[cpschema.ContextualPrecisionVerdict] = (
                 await self._a_generate_verdicts(
                     test_case.input,
                     test_case.expected_output,
@@ -124,7 +130,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str):
@@ -132,7 +141,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -143,15 +152,15 @@ class ContextualPrecisionMetric(BaseMetric):
 
         if self.using_native_model:
             res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = (
+                res: cpschema.ContextualPrecisionScoreReason = (
                     await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
                     )
                 )
                 return res.reason
@@ -165,7 +174,7 @@ class ContextualPrecisionMetric(BaseMetric):
             return None
 
         retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "
+            {"verdict": verdict.verdict, "reason": verdict.reason}
             for verdict in self.verdicts
         ]
         prompt = self.evaluation_template.generate_reason(
@@ -176,14 +185,16 @@ class ContextualPrecisionMetric(BaseMetric):
 
         if self.using_native_model:
             res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
+                prompt, schema=cpschema.ContextualPrecisionScoreReason
             )
             self.evaluation_cost += cost
             return res.reason
         else:
             try:
-                res: ContextualPrecisionScoreReason = self.model.generate(
-                    prompt, schema=ContextualPrecisionScoreReason
+                res: cpschema.ContextualPrecisionScoreReason = (
+                    self.model.generate(
+                        prompt, schema=cpschema.ContextualPrecisionScoreReason
+                    )
                 )
                 return res.reason
             except TypeError:
@@ -193,21 +204,23 @@ class ContextualPrecisionMetric(BaseMetric):
 
     async def _a_generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            res, cost = await self.model.a_generate(
+                prompt, schema=cpschema.Verdicts
+            )
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
+                res: cpschema.Verdicts = await self.model.a_generate(
+                    prompt, schema=cpschema.Verdicts
                 )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
@@ -215,34 +228,36 @@ class ContextualPrecisionMetric(BaseMetric):
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
 
     def _generate_verdicts(
         self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[ContextualPrecisionVerdict]:
+    ) -> List[cpschema.ContextualPrecisionVerdict]:
         prompt = self.evaluation_template.generate_verdicts(
             input=input,
             expected_output=expected_output,
             retrieval_context=retrieval_context,
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
+            res, cost = self.model.generate(prompt, schema=cpschema.Verdicts)
             self.evaluation_cost += cost
             verdicts = [item for item in res.verdicts]
             return verdicts
         else:
             try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                res: cpschema.Verdicts = self.model.generate(
+                    prompt, schema=cpschema.Verdicts
+                )
                 verdicts = [item for item in res.verdicts]
                 return verdicts
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 verdicts = [
-                    ContextualPrecisionVerdict(**item)
+                    cpschema.ContextualPrecisionVerdict(**item)
                     for item in data["verdicts"]
                 ]
                 return verdicts
@@ -279,7 +294,7 @@ class ContextualPrecisionMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
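Alongside the logging hook, contextual_precision.py also drops its wildcard schema import for a module alias, which is why every ContextualPrecisionVerdict, ContextualPrecisionScoreReason, and Verdicts reference above is now qualified with cpschema. A short sketch of the scoping benefit; the field names come from the verdict dicts in the diff, and the values are placeholders:

# Before: a star import put schema names straight into module scope,
# where another metric's identically named classes (several metrics
# define their own Verdicts) could shadow them.
# from deepeval.metrics.contextual_precision.schema import *

# After: every reference stays namespaced to this metric's schema.
import deepeval.metrics.contextual_precision.schema as cpschema

verdict = cpschema.ContextualPrecisionVerdict(
    verdict="yes",                                # placeholder
    reason="Relevant context is ranked first.",   # placeholder
)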
deepeval/metrics/contextual_recall/contextual_recall.py
CHANGED

@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_recall.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ContextualRecallMetric(BaseMetric):
@@ -52,8 +53,8 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -67,6 +68,7 @@ class ContextualRecallMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,7 +87,10 @@ class ContextualRecallMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -93,6 +98,7 @@ class ContextualRecallMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -121,7 +127,10 @@ class ContextualRecallMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, expected_output: str):
deepeval/metrics/contextual_relevancy/contextual_relevancy.py
CHANGED

@@ -19,6 +19,7 @@ from deepeval.metrics.contextual_relevancy.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ContextualRelevancyMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ContextualRelevancyMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -85,6 +88,10 @@ class ContextualRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -93,6 +100,7 @@ class ContextualRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -122,7 +130,10 @@ class ContextualRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str):
deepeval/metrics/conversation_completeness/conversation_completeness.py
CHANGED

@@ -19,6 +19,7 @@ from deepeval.test_case import TurnParams
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.conversation_completeness.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ConversationCompletenessMetric(BaseConversationalMetric):
@@ -48,6 +49,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -64,6 +66,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self._required_test_case_params, self
@@ -129,6 +137,10 @@ class ConversationCompletenessMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
deepeval/metrics/conversational_dag/conversational_dag.py
CHANGED

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     extract_required_params,
     copy_graph,
 )
+from deepeval.metrics.api import metric_data_manager
 
 
 class ConversationalDAGMetric(BaseConversationalMetric):
@@ -59,6 +60,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -96,6 +103,7 @@ class ConversationalDAGMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class ConversationalDAGMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     def is_successful(self) -> bool:
deepeval/metrics/conversational_dag/nodes.py
CHANGED

@@ -141,7 +141,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)
 
             copied_convo_g_eval.measure(
-                test_case=test_case,
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -157,7 +159,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False
 
             copied_metric.measure(
-                test_case=test_case,
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
@@ -213,7 +217,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_convo_g_eval = ConversationalGEval(**convo_g_eval_args)
 
             await copied_convo_g_eval.a_measure(
-                test_case=test_case,
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_convo_g_eval)
@@ -229,7 +235,9 @@ class ConversationalVerdictNode(ConversationalBaseNode):
             copied_metric.verbose_mode = False
 
             await copied_metric.a_measure(
-                test_case=test_case,
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
|